1 /*
2 * The baseCode project
3 *
4 * Copyright (c) 2006-2021 University of British Columbia
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19 package ubic.basecode.math;
20
21 import java.util.HashSet;
22 import java.util.Set;
23
24 import cern.colt.list.DoubleArrayList;
25 import cern.jet.stat.Descriptive;
26 import org.apache.commons.math3.util.DoubleArray;
27
28 /**
29 * Miscellaneous functions used for statistical analysis. Some are optimized or specialized versions of methods that can
30 * be found elsewhere.
31 *
32 * @author Paul Pavlidis
33 * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/math/package-summary.html">cern.jet.math
34 * </a>
35 * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/package-summary.html">cern.jet.stat
36 * </a>
37 */
38 public class Stats {
39
40
41 /**
42 * Convert an array into a cumulative density function (CDF). This assumes that the input contains counts
43 * representing the distribution in question.
44 *
45 * @param x The input of counts (i.e. a histogram).
46 * @return DoubleArrayList the CDF.
47 */
48 public static DoubleArrayList cdf(DoubleArrayList x) {
49 return cumulateRight(normalize(x));
50 }
51
52 /**
53 * Convert an array into a cumulative array. Summing is from the left hand side. Use this to make CDFs where the
54 * concern is the left tail.
55 *
56 * @param x DoubleArrayList
57 * @return cern.colt.list.DoubleArrayList
58 */
59 public static DoubleArrayList cumulate(DoubleArrayList x) {
60 if (x.size() == 0) {
61 return new DoubleArrayList(0);
62 }
63
64 DoubleArrayList r = new DoubleArrayList();
65
66 double sum = 0.0;
67 for (int i = 0; i < x.size(); i++) {
68 sum += x.get(i);
69 r.add(sum);
70 }
71 return r;
72 }
73
74 /**
75 * Convert an array into a cumulative array. Summing is from the right hand side. This is useful for creating
76 * upper-tail cumulative density histograms from count histograms, where the upper tail is expected to have very
77 * small numbers that could be lost to rounding.
78 *
79 * @param x the array of data to be cumulated.
80 * @return cern.colt.list.DoubleArrayList
81 */
82 public static DoubleArrayList cumulateRight(DoubleArrayList x) {
83 if (x.size() == 0) {
84 return new DoubleArrayList(0);
85 }
86
87 DoubleArrayList r = new DoubleArrayList(new double[x.size()]);
88
89 double sum = 0.0;
90 for (int i = x.size() - 1; i >= 0; i--) {
91 sum += x.get(i);
92 r.set(i, sum);
93 }
94 return r;
95 }
96
97 /**
98 * Compute the coefficient of variation of an array (standard deviation / mean). If the variance is zero, this
99 * returns zero. If the mean is zero, NaN is returned. If the mean is negative, the CV is computed relative to the
100 * absolute value of the mean; that is, negative values are treated as magnitudes.
101 *
102 * @param data DoubleArrayList
103 * @return the cv
104 * @todo offer a regularized version of this function.
105 */
106 public static double cv(DoubleArrayList data) {
107 double mean = DescriptiveWithMissing.mean(data);
108
109 double sampleVariance = DescriptiveWithMissing.sampleVariance(data, mean);
110
111 if (sampleVariance == 0.0) return 0.0;
112
113 if (mean == 0.0) {
114 return 0.0;
115 }
116
117 return Math.sqrt(sampleVariance) / Math.abs(mean);
118 }
119
120 /**
121 * Test whether a value is a valid fractional or probability value.
122 *
123 * @param value
124 * @return true if the value is in the interval 0 to 1.
125 */
126 public static boolean isValidFraction(double value) {
127 if (value > 1.0 || value < 0.0) {
128 return false;
129 }
130 return true;
131 }
132
133 /**
134 * calculate the mean of the values above (NOT greater or equal to) a particular index rank of an array. Quantile
135 * must be a value from 0 to 100.
136 *
137 * @param index the rank of the value we wish to average above.
138 * @param array Array for which we want to get the quantile.
139 * @param effectiveSize The size of the array, not including NaNs.
140 * @return double
141 * @see DescriptiveWithMissing#meanAboveQuantile
142 */
143 public static double meanAboveQuantile(int index, double[] array, int effectiveSize) {
144
145 double[] temp = new double[effectiveSize];
146 double median;
147 double returnvalue = 0.0;
148 int k = 0;
149
150 temp = array;
151 median = quantile(index, array, effectiveSize);
152
153 for (int i = 0; i < effectiveSize; i++) {
154 if (temp[i] > median) {
155 returnvalue += temp[i];
156 k++;
157 }
158 }
159 return returnvalue / k;
160 }
161
162 /**
163 * Adjust the elements of an array so they total to 1.0.
164 *
165 * @param x Input array.
166 * @return Normalized array.
167 */
168 public static DoubleArrayList normalize(DoubleArrayList x) {
169 return normalize(x, Descriptive.sum(x));
170 }
171
172 /**
173 * Divide the elements of an array by a given factor.
174 *
175 * @param x Input array.
176 * @param normfactor double
177 * @return Normalized array.
178 */
179 public static DoubleArrayList normalize(DoubleArrayList x, double normfactor) {
180 if (x.size() == 0) {
181 return new DoubleArrayList(0);
182 }
183
184 DoubleArrayList r = new DoubleArrayList();
185
186 for (int i = 0; i < x.size(); i++) {
187 r.add(x.get(i) / normfactor);
188 }
189 return r;
190
191 }
192
193 /**
194 * @param array input data
195 * @param tolerance a small constant
196 * @return number of distinct values in the array, within tolerance. Double.NaN is counted as a distinct
197 * value.
198 */
199 public static Integer numberofDistinctValues(DoubleArrayList array, double tolerance) {
200
201 Set<Double> distinct = new HashSet<>();
202 int r = 1;
203 if (tolerance > 0.0) {
204 r = (int) Math.ceil(1.0 / tolerance);
205 }
206 for (int i = 0; i < array.size(); i++) {
207 double v = array.get(i);
208 if (tolerance > 0) {
209 // this might not be foolproof
210 distinct.add((double) Math.round(v * r) / r);
211 } else {
212 distinct.add(v);
213 }
214 }
215 return Math.max(0, distinct.size());
216
217 }
218
219
220 /**
221 * @param tolerance a small constant
222 * @return number of distinct values in the array, within tolerance. Double.NaN is ignored entirely
223 */
224 public static Integer numberofDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
225
226 Set<Double> distinct = new HashSet<>();
227 int r = 1;
228 if (tolerance > 0.0) {
229 r = (int) Math.ceil(1.0 / tolerance);
230 }
231 for (int i = 0; i < array.size(); i++) {
232 double v = array.get(i);
233 if (Double.isNaN(v)) {
234 continue;
235 }
236 if (tolerance > 0) {
237 // this might not be foolproof
238 distinct.add((double) Math.round(v * r) / r);
239 } else {
240 distinct.add(v);
241 }
242 }
243 return Math.max(0, distinct.size());
244
245 }
246
247 /**
248 * Compute the fraction of values which are distinct. NaNs are ignored entirely. If the data are all NaN, 0.0 is returned.
249 *
250 * @param array input data
251 * @param tolerance a small constant to define the difference that is "distinct"
252 * @return
253 */
254 public static Double fractionDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
255 double numNonNA = (double) numNonMissing(array);
256 if (numNonNA == 0) return 0.0;
257 return (double) numberofDistinctValuesNonNA(array, tolerance) / numNonNA;
258 }
259
260 private static Integer numNonMissing(DoubleArrayList array) {
261 int nm = 0;
262 for (int i = 0; i < array.size(); i++) {
263 if (Double.isNaN(array.get(i))) continue;
264 nm++;
265 }
266 return nm;
267 }
268
269
270 /**
271 * Given a double array, calculate the quantile requested. Note that no interpolation is done and missing values are ignored.
272 *
273 * @param index - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
274 * we should enter 100.
275 * @param values double[] - array of data we want quantile of
276 * @param effectiveSize int the effective size of the array
277 * @return double the value at the requested quantile
278 * @see DescriptiveWithMissing#quantile
279 */
280 public static double quantile(int index, double[] values, int effectiveSize) {
281 double pivot = -1.0;
282 if (index == 0) {
283 double ans = values[0];
284 for (int i = 1; i < effectiveSize; i++) {
285 if (ans > values[i]) {
286 ans = values[i];
287 }
288 }
289 return ans;
290 }
291
292 double[] temp = new double[effectiveSize];
293
294 for (int i = 0; i < effectiveSize; i++) {
295 temp[i] = values[i];
296 }
297
298 pivot = temp[0];
299
300 double[] smaller = new double[effectiveSize];
301 double[] bigger = new double[effectiveSize];
302 int itrSm = 0;
303 int itrBg = 0;
304 for (int i = 1; i < effectiveSize; i++) {
305 if (temp[i] <= pivot) {
306 smaller[itrSm] = temp[i];
307 itrSm++;
308 } else if (temp[i] > pivot) {
309 bigger[itrBg] = temp[i];
310 itrBg++;
311 }
312 }
313 if (itrSm > index) { // quantile must be in the 'smaller' array
314 return quantile(index, smaller, itrSm);
315 } else if (itrSm < index) { // quantile is in the 'bigger' array
316 return quantile(index - itrSm - 1, bigger, itrBg);
317 } else {
318 return pivot;
319 }
320
321 }
322
323 /**
324 * Compute the range of an array. Missing values are ignored.
325 *
326 * @param data DoubleArrayList
327 * @return double
328 */
329 public static double range(DoubleArrayList data) {
330 return DescriptiveWithMissing.max(data) - DescriptiveWithMissing.min(data);
331 }
332
333 private Stats() { /* block instantiation */
334 }
335
336 }