1 /*
2 * The baseCode project
3 *
4 * Copyright (c) 2006-2021 University of British Columbia
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19 package ubic.basecode.math;
20
21 import java.util.HashSet;
22 import java.util.Set;
23
24 import cern.colt.list.DoubleArrayList;
25 import cern.jet.stat.Descriptive;
26
27 /**
28 * Miscellaneous functions used for statistical analysis. Some are optimized or specialized versions of methods that can
29 * be found elsewhere.
30 *
31 * @author Paul Pavlidis
32 * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/math/package-summary.html">cern.jet.math
33 * </a>
34 * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/package-summary.html">cern.jet.stat
35 * </a>
36 */
37 public class Stats {
38
39
40 /**
41 * Convert an array into a cumulative density function (CDF). This assumes that the input contains counts
42 * representing the distribution in question.
43 *
44 * @param x The input of counts (i.e. a histogram).
45 * @return DoubleArrayList the CDF.
46 */
47 public static DoubleArrayList cdf(DoubleArrayList x) {
48 return cumulateRight(normalize(x));
49 }
50
51 /**
52 * Convert an array into a cumulative array. Summing is from the left hand side. Use this to make CDFs where the
53 * concern is the left tail.
54 *
55 * @param x DoubleArrayList
56 * @return cern.colt.list.DoubleArrayList
57 */
58 public static DoubleArrayList cumulate(DoubleArrayList x) {
59 if (x.size() == 0) {
60 return new DoubleArrayList(0);
61 }
62
63 DoubleArrayList r = new DoubleArrayList();
64
65 double sum = 0.0;
66 for (int i = 0; i < x.size(); i++) {
67 sum += x.get(i);
68 r.add(sum);
69 }
70 return r;
71 }
72
73 /**
74 * Convert an array into a cumulative array. Summing is from the right hand side. This is useful for creating
75 * upper-tail cumulative density histograms from count histograms, where the upper tail is expected to have very
76 * small numbers that could be lost to rounding.
77 *
78 * @param x the array of data to be cumulated.
79 * @return cern.colt.list.DoubleArrayList
80 */
81 public static DoubleArrayList cumulateRight(DoubleArrayList x) {
82 if (x.size() == 0) {
83 return new DoubleArrayList(0);
84 }
85
86 DoubleArrayList r = new DoubleArrayList(new double[x.size()]);
87
88 double sum = 0.0;
89 for (int i = x.size() - 1; i >= 0; i--) {
90 sum += x.get(i);
91 r.set(i, sum);
92 }
93 return r;
94 }
95
96 /**
97 * Compute the coefficient of variation of an array (standard deviation / mean). If the variance is zero, this
98 * returns zero. If the mean is zero, NaN is returned. If the mean is negative, the CV is computed relative to the
99 * absolute value of the mean; that is, negative values are treated as magnitudes.
100 *
101 * @param data DoubleArrayList
102 * @return the cv
103 * @todo offer a regularized version of this function.
104 */
105 public static double cv(DoubleArrayList data) {
106 double mean = DescriptiveWithMissing.mean(data);
107
108 double sampleVariance = DescriptiveWithMissing.sampleVariance(data, mean);
109
110 if (sampleVariance == 0.0) return 0.0;
111
112 if (mean == 0.0) {
113 return 0.0;
114 }
115
116 return Math.sqrt(sampleVariance) / Math.abs(mean);
117 }
118
119 /**
120 * Test whether a value is a valid fractional or probability value.
121 *
122 * @param value
123 * @return true if the value is in the interval 0 to 1.
124 */
125 public static boolean isValidFraction(double value) {
126 if (value > 1.0 || value < 0.0) {
127 return false;
128 }
129 return true;
130 }
131
132 /**
133 * calculate the mean of the values above (NOT greater or equal to) a particular index rank of an array. Quantile
134 * must be a value from 0 to 100.
135 *
136 * @param index the rank of the value we wish to average above.
137 * @param array Array for which we want to get the quantile.
138 * @param effectiveSize The size of the array, not including NaNs.
139 * @return double
140 * @see DescriptiveWithMissing#meanAboveQuantile
141 */
142 public static double meanAboveQuantile(int index, double[] array, int effectiveSize) {
143
144 double[] temp = new double[effectiveSize];
145 double median;
146 double returnvalue = 0.0;
147 int k = 0;
148
149 temp = array;
150 median = quantile(index, array, effectiveSize);
151
152 for (int i = 0; i < effectiveSize; i++) {
153 if (temp[i] > median) {
154 returnvalue += temp[i];
155 k++;
156 }
157 }
158 return returnvalue / k;
159 }
160
161 /**
162 * Adjust the elements of an array so they total to 1.0.
163 *
164 * @param x Input array.
165 * @return Normalized array.
166 */
167 public static DoubleArrayList normalize(DoubleArrayList x) {
168 return normalize(x, Descriptive.sum(x));
169 }
170
171 /**
172 * Divide the elements of an array by a given factor.
173 *
174 * @param x Input array.
175 * @param normfactor double
176 * @return Normalized array.
177 */
178 public static DoubleArrayList normalize(DoubleArrayList x, double normfactor) {
179 if (x.size() == 0) {
180 return new DoubleArrayList(0);
181 }
182
183 DoubleArrayList r = new DoubleArrayList();
184
185 for (int i = 0; i < x.size(); i++) {
186 r.add(x.get(i) / normfactor);
187 }
188 return r;
189
190 }
191
192 /**
193 * @param array input data
194 * @param tolerance a small constant
195 * @return number of distinct values in the array, within tolerance. Double.NaN is counted as a distinct
196 * value.
197 */
198 public static Integer numberofDistinctValues(DoubleArrayList array, double tolerance) {
199
200 Set<Double> distinct = new HashSet<>();
201 int r = 1;
202 if (tolerance > 0.0) {
203 r = (int) Math.ceil(1.0 / tolerance);
204 }
205 for (int i = 0; i < array.size(); i++) {
206 double v = array.get(i);
207 if (tolerance > 0) {
208 // this might not be foolproof
209 distinct.add((double) Math.round(v * r) / r);
210 } else {
211 distinct.add(v);
212 }
213 }
214 return Math.max(0, distinct.size());
215
216 }
217
218
219 /**
220 * @param tolerance a small constant
221 * @return number of distinct values in the array, within tolerance. Double.NaN is ignored entirely
222 */
223 public static Integer numberofDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
224
225 Set<Double> distinct = new HashSet<>();
226 int r = 1;
227 if (tolerance > 0.0) {
228 r = (int) Math.ceil(1.0 / tolerance);
229 }
230 for (int i = 0; i < array.size(); i++) {
231 double v = array.get(i);
232 if (Double.isNaN(v)) {
233 continue;
234 }
235 if (tolerance > 0) {
236 // this might not be foolproof
237 distinct.add((double) Math.round(v * r) / r);
238 } else {
239 distinct.add(v);
240 }
241 }
242 return Math.max(0, distinct.size());
243
244 }
245
246 /**
247 * Compute the fraction of values which are distinct. NaNs are ignored entirely. If the data are all NaN, 0.0 is returned.
248 *
249 * @param array input data
250 * @param tolerance a small constant to define the difference that is "distinct"
251 * @return
252 */
253 public static Double fractionDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
254 double numNonNA = (double) numNonMissing(array);
255 if (numNonNA == 0) return 0.0;
256 return (double) numberofDistinctValuesNonNA(array, tolerance) / numNonNA;
257 }
258
259 private static Integer numNonMissing(DoubleArrayList array) {
260 int nm = 0;
261 for (int i = 0; i < array.size(); i++) {
262 if (Double.isNaN(array.get(i))) continue;
263 nm++;
264 }
265 return nm;
266 }
267
268
269 /**
270 * Given a double array, calculate the quantile requested. Note that no interpolation is done and missing values are ignored.
271 *
272 * @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
273 * we should enter 100.
274 * @param values double[] - array of data we want quantile of
275 * @param effectiveSize int the effective size of the array
276 * @return double the value at the requested quantile
277 * @see DescriptiveWithMissing#quantile
278 */
279 public static double quantile(int index, double[] values, int effectiveSize) {
280 double pivot = -1.0;
281 if (index == 0) {
282 double ans = values[0];
283 for (int i = 1; i < effectiveSize; i++) {
284 if (ans > values[i]) {
285 ans = values[i];
286 }
287 }
288 return ans;
289 }
290
291 double[] temp = new double[effectiveSize];
292
293 for (int i = 0; i < effectiveSize; i++) {
294 temp[i] = values[i];
295 }
296
297 pivot = temp[0];
298
299 double[] smaller = new double[effectiveSize];
300 double[] bigger = new double[effectiveSize];
301 int itrSm = 0;
302 int itrBg = 0;
303 for (int i = 1; i < effectiveSize; i++) {
304 if (temp[i] <= pivot) {
305 smaller[itrSm] = temp[i];
306 itrSm++;
307 } else if (temp[i] > pivot) {
308 bigger[itrBg] = temp[i];
309 itrBg++;
310 }
311 }
312 if (itrSm > index) { // quantile must be in the 'smaller' array
313 return quantile(index, smaller, itrSm);
314 } else if (itrSm < index) { // quantile is in the 'bigger' array
315 return quantile(index - itrSm - 1, bigger, itrBg);
316 } else {
317 return pivot;
318 }
319
320 }
321
322 /**
323 * Compute the range of an array. Missing values are ignored.
324 *
325 * @param data DoubleArrayList
326 * @return double
327 */
328 public static double range(DoubleArrayList data) {
329 return DescriptiveWithMissing.max(data) - DescriptiveWithMissing.min(data);
330 }
331
332 private Stats() { /* block instantiation */
333 }
334
335 }