View Javadoc
1   /*
2    * The baseCode project
3    *
4    * Copyright (c) 2006-2021 University of British Columbia
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.math;
20  
21  import java.util.HashSet;
22  import java.util.Set;
23  
24  import cern.colt.list.DoubleArrayList;
25  import cern.jet.stat.Descriptive;
26  import org.apache.commons.math3.util.DoubleArray;
27  
28  /**
29   * Miscellaneous functions used for statistical analysis. Some are optimized or specialized versions of methods that can
30   * be found elsewhere.
31   *
32   * @author Paul Pavlidis
33   * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/math/package-summary.html">cern.jet.math
34   * </a>
35   * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/package-summary.html">cern.jet.stat
36   * </a>
37   */
38  public class Stats {
39  
40  
41      /**
42       * Convert an array into a cumulative density function (CDF). This assumes that the input contains counts
43       * representing the distribution in question.
44       *
45       * @param x The input of counts (i.e. a histogram).
46       * @return DoubleArrayList the CDF.
47       */
48      public static DoubleArrayList cdf(DoubleArrayList x) {
49          return cumulateRight(normalize(x));
50      }
51  
52      /**
53       * Convert an array into a cumulative array. Summing is from the left hand side. Use this to make CDFs where the
54       * concern is the left tail.
55       *
56       * @param x DoubleArrayList
57       * @return cern.colt.list.DoubleArrayList
58       */
59      public static DoubleArrayList cumulate(DoubleArrayList x) {
60          if (x.size() == 0) {
61              return new DoubleArrayList(0);
62          }
63  
64          DoubleArrayList r = new DoubleArrayList();
65  
66          double sum = 0.0;
67          for (int i = 0; i < x.size(); i++) {
68              sum += x.get(i);
69              r.add(sum);
70          }
71          return r;
72      }
73  
74      /**
75       * Convert an array into a cumulative array. Summing is from the right hand side. This is useful for creating
76       * upper-tail cumulative density histograms from count histograms, where the upper tail is expected to have very
77       * small numbers that could be lost to rounding.
78       *
79       * @param x the array of data to be cumulated.
80       * @return cern.colt.list.DoubleArrayList
81       */
82      public static DoubleArrayList cumulateRight(DoubleArrayList x) {
83          if (x.size() == 0) {
84              return new DoubleArrayList(0);
85          }
86  
87          DoubleArrayList r = new DoubleArrayList(new double[x.size()]);
88  
89          double sum = 0.0;
90          for (int i = x.size() - 1; i >= 0; i--) {
91              sum += x.get(i);
92              r.set(i, sum);
93          }
94          return r;
95      }
96  
97      /**
98       * Compute the coefficient of variation of an array (standard deviation / mean). If the variance is zero, this
99       * returns zero. If the mean is zero, NaN is returned. If the mean is negative, the CV is computed relative to the
100      * absolute value of the mean; that is, negative values are treated as magnitudes.
101      *
102      * @param data DoubleArrayList
103      * @return the cv
104      * @todo offer a regularized version of this function.
105      */
106     public static double cv(DoubleArrayList data) {
107         double mean = DescriptiveWithMissing.mean(data);
108 
109         double sampleVariance = DescriptiveWithMissing.sampleVariance(data, mean);
110 
111         if (sampleVariance == 0.0) return 0.0;
112 
113         if (mean == 0.0) {
114             return 0.0;
115         }
116 
117         return Math.sqrt(sampleVariance) / Math.abs(mean);
118     }
119 
120     /**
121      * Test whether a value is a valid fractional or probability value.
122      *
123      * @param value
124      * @return true if the value is in the interval 0 to 1.
125      */
126     public static boolean isValidFraction(double value) {
127         if (value > 1.0 || value < 0.0) {
128             return false;
129         }
130         return true;
131     }
132 
133     /**
134      * calculate the mean of the values above (NOT greater or equal to) a particular index rank of an array. Quantile
135      * must be a value from 0 to 100.
136      *
137      * @param index         the rank of the value we wish to average above.
138      * @param array         Array for which we want to get the quantile.
139      * @param effectiveSize The size of the array, not including NaNs.
140      * @return double
141      * @see DescriptiveWithMissing#meanAboveQuantile
142      */
143     public static double meanAboveQuantile(int index, double[] array, int effectiveSize) {
144 
145         double[] temp = new double[effectiveSize];
146         double median;
147         double returnvalue = 0.0;
148         int k = 0;
149 
150         temp = array;
151         median = quantile(index, array, effectiveSize);
152 
153         for (int i = 0; i < effectiveSize; i++) {
154             if (temp[i] > median) {
155                 returnvalue += temp[i];
156                 k++;
157             }
158         }
159         return returnvalue / k;
160     }
161 
162     /**
163      * Adjust the elements of an array so they total to 1.0.
164      *
165      * @param x Input array.
166      * @return Normalized array.
167      */
168     public static DoubleArrayList normalize(DoubleArrayList x) {
169         return normalize(x, Descriptive.sum(x));
170     }
171 
172     /**
173      * Divide the elements of an array by a given factor.
174      *
175      * @param x          Input array.
176      * @param normfactor double
177      * @return Normalized array.
178      */
179     public static DoubleArrayList normalize(DoubleArrayList x, double normfactor) {
180         if (x.size() == 0) {
181             return new DoubleArrayList(0);
182         }
183 
184         DoubleArrayList r = new DoubleArrayList();
185 
186         for (int i = 0; i < x.size(); i++) {
187             r.add(x.get(i) / normfactor);
188         }
189         return r;
190 
191     }
192 
193     /**
194      * @param array input data
195      * @param tolerance a small constant
196      * @return number of distinct values in the array, within tolerance. Double.NaN is counted as a distinct
197      * value.
198      */
199     public static Integer numberofDistinctValues(DoubleArrayList array, double tolerance) {
200 
201         Set<Double> distinct = new HashSet<>();
202         int r = 1;
203         if (tolerance > 0.0) {
204             r = (int) Math.ceil(1.0 / tolerance);
205         }
206         for (int i = 0; i < array.size(); i++) {
207             double v = array.get(i);
208             if (tolerance > 0) {
209                 // this might not be foolproof
210                 distinct.add((double) Math.round(v * r) / r);
211             } else {
212                 distinct.add(v);
213             }
214         }
215         return Math.max(0, distinct.size());
216 
217     }
218 
219 
220     /**
221      * @param tolerance a small constant
222      * @return number of distinct values in the array, within tolerance. Double.NaN is ignored entirely
223      */
224     public static Integer numberofDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
225 
226         Set<Double> distinct = new HashSet<>();
227         int r = 1;
228         if (tolerance > 0.0) {
229             r = (int) Math.ceil(1.0 / tolerance);
230         }
231         for (int i = 0; i < array.size(); i++) {
232             double v = array.get(i);
233             if (Double.isNaN(v)) {
234                 continue;
235             }
236             if (tolerance > 0) {
237                 // this might not be foolproof
238                 distinct.add((double) Math.round(v * r) / r);
239             } else {
240                 distinct.add(v);
241             }
242         }
243         return Math.max(0, distinct.size());
244 
245     }
246 
247     /**
248      * Compute the fraction of values which are distinct. NaNs are ignored entirely. If the data are all NaN, 0.0 is returned.
249      *
250      * @param array input data
251      * @param tolerance a small constant to define the difference that is "distinct"
252      * @return
253      */
254     public static Double fractionDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
255         double numNonNA = (double) numNonMissing(array);
256         if (numNonNA == 0) return 0.0;
257         return (double) numberofDistinctValuesNonNA(array, tolerance) / numNonNA;
258     }
259 
260     private static Integer numNonMissing(DoubleArrayList array) {
261         int nm = 0;
262         for (int i = 0; i < array.size(); i++) {
263             if (Double.isNaN(array.get(i))) continue;
264             nm++;
265         }
266         return nm;
267     }
268 
269 
270     /**
271      * Given a double array, calculate the quantile requested. Note that no interpolation is done and missing values are ignored.
272      *
273      * @param index         - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
274      *                      we should enter 100.
275      * @param values        double[] - array of data we want quantile of
276      * @param effectiveSize int the effective size of the array
277      * @return double the value at the requested quantile
278      * @see DescriptiveWithMissing#quantile
279      */
280     public static double quantile(int index, double[] values, int effectiveSize) {
281         double pivot = -1.0;
282         if (index == 0) {
283             double ans = values[0];
284             for (int i = 1; i < effectiveSize; i++) {
285                 if (ans > values[i]) {
286                     ans = values[i];
287                 }
288             }
289             return ans;
290         }
291 
292         double[] temp = new double[effectiveSize];
293 
294         for (int i = 0; i < effectiveSize; i++) {
295             temp[i] = values[i];
296         }
297 
298         pivot = temp[0];
299 
300         double[] smaller = new double[effectiveSize];
301         double[] bigger = new double[effectiveSize];
302         int itrSm = 0;
303         int itrBg = 0;
304         for (int i = 1; i < effectiveSize; i++) {
305             if (temp[i] <= pivot) {
306                 smaller[itrSm] = temp[i];
307                 itrSm++;
308             } else if (temp[i] > pivot) {
309                 bigger[itrBg] = temp[i];
310                 itrBg++;
311             }
312         }
313         if (itrSm > index) { // quantile must be in the 'smaller' array
314             return quantile(index, smaller, itrSm);
315         } else if (itrSm < index) { // quantile is in the 'bigger' array
316             return quantile(index - itrSm - 1, bigger, itrBg);
317         } else {
318             return pivot;
319         }
320 
321     }
322 
323     /**
324      * Compute the range of an array. Missing values are ignored.
325      *
326      * @param data DoubleArrayList
327      * @return double
328      */
329     public static double range(DoubleArrayList data) {
330         return DescriptiveWithMissing.max(data) - DescriptiveWithMissing.min(data);
331     }
332 
333     private Stats() { /* block instantiation */
334     }
335 
336 }