-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
157 lines (122 loc) · 4.39 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python
import numpy as np
import scipy.linalg
def get_bins(bin_arr, min_bin_size, max_bin_count):
"""
Select equal width bins along bin_arr such that there are >min_bin_size rows
per bin but at most max_bin_count bins.
Parameters:
bin_arr : array-like
Array over which to generate bins. Must have length N > min_bin_size.
min_bin_size : int
Target minimum number of rows per bin.
max_bin_count : int
Maximum number of bins to consider.
Returns:
bins : array-ndarray
Array of bin boundaries.
"""
# raise error if there aren't enough rows to make one bin
if len(bin_arr)<min_bin_size:
raise ValueError("Not enough rows in bin_arr to make one bin.")
else:
# make sure input is in appropriate formats
bin_arr = np.asarray(bin_arr, dtype=np.float64)
min_bin_size = int(min_bin_size)
max_bin_count = int(max_bin_count)
# bin count to start with
bin_count = max_bin_count
# variable in which to store current smallest bin size
smallest_bin_size = 0.
# while the bins are too small...
while smallest_bin_size<min_bin_size:
# generate new bins
bins = np.linspace(bin_arr.min()-1E-9, bin_arr.max()+1E-9, bin_count)
# count rows per bin
bin_sizes, _ = np.histogram(bin_arr, bins=bins)
# update current smallest bin size
smallest_bin_size = np.min(bin_sizes)
#decrement bin count
bin_count -= 1
# return final bin array
return bins
def Chebfit(x, y,
val, err,
ndeg=2,
domain=[-1, 1],
fitsubset=None):
"""
Fit Chebyshev polynomial.
Authors: Sergey Koposov, Leigh Smith
Parameters:
x, y : array_like
1D arrays of shape (n,) containing floating point coordinates.
val : array_like
1D array of shape (n,) containing floating point target values.
err : array_like
Weight the fit with the provided 1d array of errors.
ndeg : int, optional
Number of degrees to use for the Chebyshev polynomial (default 2).
domain : (2,) array_like, optional
The domain to use for the Chebyshev polynomial (default [-1,1]).
fitsubset : boolean array, optional
Use only this subset of rows for the LSQ fit.
Returns:
coeff : ndarray
An array containing fitted coefficients.
val_pred : ndarray
An array containing predicted values.
"""
# build design matrix
M1 = []
for i in range(ndeg + 1):
for j in range(i + 1):
M1.append(
np.polynomial.Chebyshev(
(np.arange(ndeg + 1) == j).astype(int),
domain=domain)(x) * np.polynomial.Chebyshev(
(np.arange(ndeg + 1) == (i - j)).astype(int),
domain=domain)(y))
M1 = np.array(M1).T
# fit subset is all if none set
if fitsubset is None:
fitsubset = slice(None)
# divide through by errors
err = np.asarray(err)
_val = val/err
Mv = M1/err[:,None]
# run the lsq fit, and subsequent prediction
coeff = scipy.linalg.lstsq(Mv[fitsubset], _val[fitsubset])[0]
val_pred = np.dot(Mv, coeff)*err
return coeff, val_pred
def Qn(arr, c_n=2.219144):
"""
Rousseeuw and Croux Qn scale estimator, Gaussian efficiency 82%.
Parameters:
arr : array-like
Array of size N over which to compute Qn.
c_n : float, optional
Multiplier to use. Optimal value is dependant on N. Where N is large the
optimal value is 2.219144 (the default).
Returns:
Q_n : float
Rousseeuw and Croux Qn scale estimator for arr.
"""
if len(arr)==0:
return None
elif len(arr)==1:
return 0
else:
if len(arr)<10 and c_n==2.219144:
warnings.warn("Array is small, default Cn might not be appropriate.")
arr = np.asarray(arr)
# create array mask where upper triangle are ones, zeros elsewhere
# i.e. 1 where i<j else 0
msk = np.tri(arr.size, k=-1, dtype=np.bool).T
_i, _j = map(lambda a: a[msk].flatten(), np.mgrid[0:arr.size, 0:arr.size])
# differences
diff = np.abs(arr[_i] - arr[_j])
# Qn is c_n * Q_25(diff)
# c_n is 2.219144 for normally distributed data of moderate length and above
Q_n = c_n * np.quantile(diff, 0.25, interpolation='nearest')
return Q_n