View Javadoc
1   /*
2    * The baseCode project
3    *
4    * Copyright (c) 2006-2021 University of British Columbia
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.math;
20  
21  import java.util.HashSet;
22  import java.util.Set;
23  
24  import cern.colt.list.DoubleArrayList;
25  import cern.jet.stat.Descriptive;
26  
27  /**
28   * Miscellaneous functions used for statistical analysis. Some are optimized or specialized versions of methods that can
29   * be found elsewhere.
30   *
31   * @author Paul Pavlidis
32   * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/math/package-summary.html">cern.jet.math
33   * </a>
34   * @see <a href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/package-summary.html">cern.jet.stat
35   * </a>
36   */
37  public class Stats {
38  
39  
40      /**
41       * Convert an array into a cumulative density function (CDF). This assumes that the input contains counts
42       * representing the distribution in question.
43       *
44       * @param x The input of counts (i.e. a histogram).
45       * @return DoubleArrayList the CDF.
46       */
47      public static DoubleArrayList cdf(DoubleArrayList x) {
48          return cumulateRight(normalize(x));
49      }
50  
51      /**
52       * Convert an array into a cumulative array. Summing is from the left hand side. Use this to make CDFs where the
53       * concern is the left tail.
54       *
55       * @param x DoubleArrayList
56       * @return cern.colt.list.DoubleArrayList
57       */
58      public static DoubleArrayList cumulate(DoubleArrayList x) {
59          if (x.size() == 0) {
60              return new DoubleArrayList(0);
61          }
62  
63          DoubleArrayList r = new DoubleArrayList();
64  
65          double sum = 0.0;
66          for (int i = 0; i < x.size(); i++) {
67              sum += x.get(i);
68              r.add(sum);
69          }
70          return r;
71      }
72  
73      /**
74       * Convert an array into a cumulative array. Summing is from the right hand side. This is useful for creating
75       * upper-tail cumulative density histograms from count histograms, where the upper tail is expected to have very
76       * small numbers that could be lost to rounding.
77       *
78       * @param x the array of data to be cumulated.
79       * @return cern.colt.list.DoubleArrayList
80       */
81      public static DoubleArrayList cumulateRight(DoubleArrayList x) {
82          if (x.size() == 0) {
83              return new DoubleArrayList(0);
84          }
85  
86          DoubleArrayList r = new DoubleArrayList(new double[x.size()]);
87  
88          double sum = 0.0;
89          for (int i = x.size() - 1; i >= 0; i--) {
90              sum += x.get(i);
91              r.set(i, sum);
92          }
93          return r;
94      }
95  
96      /**
97       * Compute the coefficient of variation of an array (standard deviation / mean). If the variance is zero, this
98       * returns zero. If the mean is zero, NaN is returned. If the mean is negative, the CV is computed relative to the
99       * absolute value of the mean; that is, negative values are treated as magnitudes.
100      *
101      * @param data DoubleArrayList
102      * @return the cv
103      * @todo offer a regularized version of this function.
104      */
105     public static double cv(DoubleArrayList data) {
106         double mean = DescriptiveWithMissing.mean(data);
107 
108         double sampleVariance = DescriptiveWithMissing.sampleVariance(data, mean);
109 
110         if (sampleVariance == 0.0) return 0.0;
111 
112         if (mean == 0.0) {
113             return 0.0;
114         }
115 
116         return Math.sqrt(sampleVariance) / Math.abs(mean);
117     }
118 
119     /**
120      * Test whether a value is a valid fractional or probability value.
121      *
122      * @param value
123      * @return true if the value is in the interval 0 to 1.
124      */
125     public static boolean isValidFraction(double value) {
126         if (value > 1.0 || value < 0.0) {
127             return false;
128         }
129         return true;
130     }
131 
132     /**
133      * calculate the mean of the values above (NOT greater or equal to) a particular index rank of an array. Quantile
134      * must be a value from 0 to 100.
135      *
136      * @param index         the rank of the value we wish to average above.
137      * @param array         Array for which we want to get the quantile.
138      * @param effectiveSize The size of the array, not including NaNs.
139      * @return double
140      * @see DescriptiveWithMissing#meanAboveQuantile
141      */
142     public static double meanAboveQuantile(int index, double[] array, int effectiveSize) {
143 
144         double[] temp = new double[effectiveSize];
145         double median;
146         double returnvalue = 0.0;
147         int k = 0;
148 
149         temp = array;
150         median = quantile(index, array, effectiveSize);
151 
152         for (int i = 0; i < effectiveSize; i++) {
153             if (temp[i] > median) {
154                 returnvalue += temp[i];
155                 k++;
156             }
157         }
158         return returnvalue / k;
159     }
160 
161     /**
162      * Adjust the elements of an array so they total to 1.0.
163      *
164      * @param x Input array.
165      * @return Normalized array.
166      */
167     public static DoubleArrayList normalize(DoubleArrayList x) {
168         return normalize(x, Descriptive.sum(x));
169     }
170 
171     /**
172      * Divide the elements of an array by a given factor.
173      *
174      * @param x          Input array.
175      * @param normfactor double
176      * @return Normalized array.
177      */
178     public static DoubleArrayList normalize(DoubleArrayList x, double normfactor) {
179         if (x.size() == 0) {
180             return new DoubleArrayList(0);
181         }
182 
183         DoubleArrayList r = new DoubleArrayList();
184 
185         for (int i = 0; i < x.size(); i++) {
186             r.add(x.get(i) / normfactor);
187         }
188         return r;
189 
190     }
191 
192     /**
193      * @param array input data
194      * @param tolerance a small constant
195      * @return number of distinct values in the array, within tolerance. Double.NaN is counted as a distinct
196      * value.
197      */
198     public static Integer numberofDistinctValues(DoubleArrayList array, double tolerance) {
199 
200         Set<Double> distinct = new HashSet<>();
201         int r = 1;
202         if (tolerance > 0.0) {
203             r = (int) Math.ceil(1.0 / tolerance);
204         }
205         for (int i = 0; i < array.size(); i++) {
206             double v = array.get(i);
207             if (tolerance > 0) {
208                 // this might not be foolproof
209                 distinct.add((double) Math.round(v * r) / r);
210             } else {
211                 distinct.add(v);
212             }
213         }
214         return Math.max(0, distinct.size());
215 
216     }
217 
218 
219     /**
220      * @param tolerance a small constant
221      * @return number of distinct values in the array, within tolerance. Double.NaN is ignored entirely
222      */
223     public static Integer numberofDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
224 
225         Set<Double> distinct = new HashSet<>();
226         int r = 1;
227         if (tolerance > 0.0) {
228             r = (int) Math.ceil(1.0 / tolerance);
229         }
230         for (int i = 0; i < array.size(); i++) {
231             double v = array.get(i);
232             if (Double.isNaN(v)) {
233                 continue;
234             }
235             if (tolerance > 0) {
236                 // this might not be foolproof
237                 distinct.add((double) Math.round(v * r) / r);
238             } else {
239                 distinct.add(v);
240             }
241         }
242         return Math.max(0, distinct.size());
243 
244     }
245 
246     /**
247      * Compute the fraction of values which are distinct. NaNs are ignored entirely. If the data are all NaN, 0.0 is returned.
248      *
249      * @param array input data
250      * @param tolerance a small constant to define the difference that is "distinct"
251      * @return
252      */
253     public static Double fractionDistinctValuesNonNA(DoubleArrayList array, double tolerance) {
254         double numNonNA = (double) numNonMissing(array);
255         if (numNonNA == 0) return 0.0;
256         return (double) numberofDistinctValuesNonNA(array, tolerance) / numNonNA;
257     }
258 
259     private static Integer numNonMissing(DoubleArrayList array) {
260         int nm = 0;
261         for (int i = 0; i < array.size(); i++) {
262             if (Double.isNaN(array.get(i))) continue;
263             nm++;
264         }
265         return nm;
266     }
267 
268 
269     /**
270      * Given a double array, calculate the quantile requested. Note that no interpolation is done and missing values are ignored.
271      *
272      * @param index         - the rank of the value we wish to get. Thus if we have 200 items in the array, and want the median,
273      *                      we should enter 100.
274      * @param values        double[] - array of data we want quantile of
275      * @param effectiveSize int the effective size of the array
276      * @return double the value at the requested quantile
277      * @see DescriptiveWithMissing#quantile
278      */
279     public static double quantile(int index, double[] values, int effectiveSize) {
280         double pivot = -1.0;
281         if (index == 0) {
282             double ans = values[0];
283             for (int i = 1; i < effectiveSize; i++) {
284                 if (ans > values[i]) {
285                     ans = values[i];
286                 }
287             }
288             return ans;
289         }
290 
291         double[] temp = new double[effectiveSize];
292 
293         for (int i = 0; i < effectiveSize; i++) {
294             temp[i] = values[i];
295         }
296 
297         pivot = temp[0];
298 
299         double[] smaller = new double[effectiveSize];
300         double[] bigger = new double[effectiveSize];
301         int itrSm = 0;
302         int itrBg = 0;
303         for (int i = 1; i < effectiveSize; i++) {
304             if (temp[i] <= pivot) {
305                 smaller[itrSm] = temp[i];
306                 itrSm++;
307             } else if (temp[i] > pivot) {
308                 bigger[itrBg] = temp[i];
309                 itrBg++;
310             }
311         }
312         if (itrSm > index) { // quantile must be in the 'smaller' array
313             return quantile(index, smaller, itrSm);
314         } else if (itrSm < index) { // quantile is in the 'bigger' array
315             return quantile(index - itrSm - 1, bigger, itrBg);
316         } else {
317             return pivot;
318         }
319 
320     }
321 
322     /**
323      * Compute the range of an array. Missing values are ignored.
324      *
325      * @param data DoubleArrayList
326      * @return double
327      */
328     public static double range(DoubleArrayList data) {
329         return DescriptiveWithMissing.max(data) - DescriptiveWithMissing.min(data);
330     }
331 
332     private Stats() { /* block instantiation */
333     }
334 
335 }