View Javadoc
1   /*
2    * The baseCode project
3    *
4    * Copyright (c) 2006 University of British Columbia
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.math;
20  
21  import cern.colt.list.DoubleArrayList;
22  import cern.jet.stat.Descriptive;
23  
24  /**
25   * Mathematical functions for statistics that allow missing values without scotching the calculations.
26   * <p>
27   * Be careful because some methods from cern.jet.stat.Descriptive have not been overridden and will yield a
28   * UnsupportedOperationException if used.
29   * </p>
30   * <p>
31   * Some functions that come with DoubleArrayLists will not work in an entirely compatible way with missing values. For
32   * examples, size() reports the total number of elements, including missing values. To get a count of non-missing
33   * values, use this.sizeWithoutMissingValues(). The right one to use may vary.
34   * </p>
35   * <p>
36   * Not all methods need to be overridden. However, all methods that take a "size" parameter should be passed the results
37   * of sizeWithoutMissingValues(data), instead of data.size().
38   * <p>
39   * Based in part on code from the colt package: Copyright &copy; 1999 CERN - European Organization for Nuclear Research.
40   *
41   * @author Paul Pavlidis
42   * @see <a
43   * href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/Descriptive.html">cern.jet.stat.Descriptive
44   * </a>
45   */
46  public class DescriptiveWithMissing extends cern.jet.stat.Descriptive {
47  
48      /**
49       * <b>Not supported. </b>
50       *
51       * @param data     DoubleArrayList
52       * @param lag      int
53       * @param mean     double
54       * @param variance double
55       * @return double
56       */
57      public static double autoCorrelation(DoubleArrayList data, int lag, double mean, double variance) {
58          throw new UnsupportedOperationException("autoCorrelation not supported with missing values");
59      }
60  
61      /**
62       * Highly optimized version of the correlation computation, where as much information is precomputed as possible.
63       * Use of this method only makes sense if many comparisons with the inputs x and y are being performed.
64       * <p>
65       * Implementation note: In correlation(DoubleArrayList x, DoubleArrayList y), profiling shows that calls to
66       * Double.NaN consume half the CPU time. The precomputation of the element-by-element squared values is another
67       * obvious optimization. There is also no checking for matching lengths of the arrays.
68       *
69       * @param x
70       * @param y
71       * @param selfSquaredX double array containing values of x_i^2 for each x.
72       * @param selfSquaredY
73       * @param nanStatusX   boolean array containing value of Double.isNaN() for each X.
74       * @param nanStatusY
75       * @return
76       */
77      public static double correlation(double[] x, double[] y, double[] selfSquaredX, double[] selfSquaredY, boolean[] nanStatusX, boolean[] nanStatusY) {
78  
79          double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
80          int numused = 0;
81          syy = 0.0;
82          sxy = 0.0;
83          sxx = 0.0;
84          sx = 0.0;
85          sy = 0.0;
86  
87          int length = x.length;
88          for (int j = 0; j < length; j++) {
89              xj = x[j];
90              yj = y[j];
91  
92              if (nanStatusX[j] || nanStatusY[j]) {
93                  continue;
94              }
95              sx += xj;
96              sy += yj;
97              sxy += xj * yj;
98              sxx += selfSquaredX[j];
99              syy += selfSquaredY[j];
100             numused++;
101         }
102 
103         if (numused > 0) {
104             ay = sy / numused;
105             ax = sx / numused;
106             return (sxy - sx * ay) / Math.sqrt((sxx - sx * ax) * (syy - sy * ay));
107         }
108         return Double.NaN; // signifies that it could not be calculated.
109     }
110 
111     /**
112      * Returns the correlation of two data sequences. That is
113      * <tt>covariance(data1,data2)/(standardDev1*standardDev2)</tt>. Missing values are ignored. This method is
114      * overridden to stop users from using the method in the superclass when missing values are present. The problem is
115      * that the standard deviation cannot be computed without knowning which values are not missing in both vectors to
116      * be compared. Thus the standardDev parameters are thrown away by this method.
117      *
118      * @param data1        DoubleArrayList
119      * @param standardDev1 double - not used
120      * @param data2        DoubleArrayList
121      * @param standardDev2 double - not used
122      * @return double
123      */
124     public static double correlation(DoubleArrayList data1, double standardDev1, DoubleArrayList data2, double standardDev2) {
125         return correlation(data1, data2);
126     }
127 
128     /**
129      * Calculate the pearson correlation of two arrays. Missing values (NaNs) are ignored.
130      *
131      * @param x DoubleArrayList
132      * @param y DoubleArrayList
133      * @return double
134      */
135     public static double correlation(DoubleArrayList x, DoubleArrayList y) {
136         int j;
137         double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
138         int numused;
139         syy = 0.0;
140         sxy = 0.0;
141         sxx = 0.0;
142         sx = 0.0;
143         sy = 0.0;
144         numused = 0;
145         if (x.size() != y.size()) {
146             throw new ArithmeticException("Unequal vector sizes: " + x.size() + " != " + y.size());
147         }
148 
149         double[] xel = x.elements();
150         double[] yel = y.elements();
151 
152         int length = x.size();
153         for (j = 0; j < length; j++) {
154             xj = xel[j];
155             yj = yel[j];
156 
157             if (!Double.isNaN(xj) && !Double.isNaN(yj)) {
158                 sx += xj;
159                 sy += yj;
160                 sxy += xj * yj;
161                 sxx += xj * xj;
162                 syy += yj * yj;
163                 numused++;
164             }
165         }
166 
167         if (numused > 0) {
168             ay = sy / numused;
169             ax = sx / numused;
170             return (sxy - sx * ay) / Math.sqrt((sxx - sx * ax) * (syy - sy * ay));
171         }
172         return Double.NaN; // signifies that it could not be calculated.
173 
174     }
175 
176     /**
177      * Returns the SAMPLE covariance of two data sequences. Pairs of values are only considered if both are not NaN. If
178      * there are no non-missing pairs, the covariance is zero.
179      *
180      * @param data1 the first vector
181      * @param data2 the second vector
182      * @return double
183      */
184     public static double covariance(DoubleArrayList data1, DoubleArrayList data2) {
185         int size = data1.size();
186         if (size != data2.size() || size == 0) {
187             throw new IllegalArgumentException();
188         }
189         double[] elements1 = data1.elements();
190         double[] elements2 = data2.elements();
191 
192         /* initialize sumx and sumy to the first non-NaN pair of values */
193 
194         int i = 0;
195         double sumx = 0.0, sumy = 0.0, Sxy = 0.0;
196         while (i < size) {
197             sumx = elements1[i];
198             sumy = elements2[i];
199             if (!Double.isNaN(elements1[i]) && !Double.isNaN(elements2[i])) {
200                 break;
201             }
202             i++;
203         }
204         i++;
205         int usedPairs = 1;
206         for (; i < size; ++i) {
207             double x = elements1[i];
208             double y = elements2[i];
209             if (Double.isNaN(x) || Double.isNaN(y)) {
210                 continue;
211             }
212 
213             sumx += x;
214             Sxy += (x - sumx / (usedPairs + 1)) * (y - sumy / usedPairs);
215             sumy += y;
216             usedPairs++;
217         }
218         return Sxy / (usedPairs - 1);
219     }
220 
221     /**
222      * Durbin-Watson computation. This measures the serial correlation in a data series.
223      *
224      * @param data DoubleArrayList
225      * @return double
226      * @todo this will still break in some situations where there are missing values
227      */
228     public static double durbinWatson(DoubleArrayList data) {
229         int size = data.size();
230         if (size < 2) {
231             throw new IllegalArgumentException("data sequence must contain at least two values.");
232         }
233 
234         double[] elements = data.elements();
235         double run = 0;
236         double run_sq = 0;
237         int firstNotNaN = 0;
238         while (firstNotNaN < size) {
239             run_sq = elements[firstNotNaN] * elements[firstNotNaN];
240 
241             if (!Double.isNaN(elements[firstNotNaN])) {
242                 break;
243             }
244 
245             firstNotNaN++;
246         }
247 
248         if (firstNotNaN > 0 && size - firstNotNaN < 2) {
249             throw new IllegalArgumentException("data sequence must contain at least two non-missing values.");
250 
251         }
252 
253         for (int i = firstNotNaN + 1; i < size; i++) {
254             int gap = 1;
255             while (i < size && Double.isNaN(elements[i])) {
256                 gap++;
257                 i++;
258                 continue;
259             }
260             if (i >= size) continue; // missing value at end will cause this.
261             double x = elements[i] - elements[i - gap];
262             /**  */
263             run += x * x;
264             run_sq += elements[i] * elements[i];
265         }
266 
267         return run / run_sq;
268     }
269 
270     /**
271      * Returns the geometric mean of a data sequence. Missing values are ignored. Note that for a geometric mean to be
272      * meaningful, the minimum of the data sequence must not be less or equal to zero. <br>
273      * The geometric mean is given by <tt>pow( Product( data[i] ),
274      * 1/data.size())</tt>. This method tries to avoid overflows at the expense of an equivalent but somewhat slow
275      * definition: <tt>geo = Math.exp( Sum(
276      * Log(data[i]) ) / data.size())</tt>.
277      *
278      * @param data DoubleArrayList
279      * @return double
280      */
281     public static double geometricMean(DoubleArrayList data) {
282         return geometricMean(sizeWithoutMissingValues(data), sumOfLogarithms(data, 0, data.size() - 1));
283     }
284 
285     /**
286      * Incrementally maintains and updates minimum, maximum, sum and sum of squares of a data sequence.
287      * <p>
288      * Assume we have already recorded some data sequence elements and know their minimum, maximum, sum and sum of
289      * squares. Assume further, we are to record some more elements and to derive updated values of minimum, maximum,
290      * sum and sum of squares. This method computes those updated values without needing to know the already recorded
291      * elements. This is interesting for interactive online monitoring and/or applications that cannot keep the entire
292      * huge data sequence in memory.
293      *
294      * @param data  the additional elements to be incorporated into min, max, etc.
295      * @param from  the index of the first element within <tt>data</tt> to consider.
296      * @param to    the index of the last element within <tt>data</tt> to consider. The method incorporates elements
297      *              <tt>data[from], ..., data[to]</tt>.
298      * @param inOut the old values in the following format:
299      *              <ul>
300      *              <li><tt>inOut[0]</tt> is the old minimum. <li><tt>inOut[1]</tt> is the old maximum. <li><tt>inOut[2]</tt>
301      *              is the old sum. <li><tt>inOut[3]</tt> is the old sum of squares.
302      *              </ul>
303      *              If no data sequence elements have so far been recorded set the values as follows
304      *              <ul>
305      *              <li><tt>inOut[0] = Double.POSITIVE_INFINITY</tt> as the old minimum. <li><tt>inOut[1] =
306      *              Double.NEGATIVE_INFINITY</tt> as the old maximum. <li><tt>inOut[2] = 0.0</tt> as the old sum. <li><tt>
307      *              inOut[3] = 0.0</tt> as the old sum of squares.
308      *              </ul>
309      */
310     public static void incrementalUpdate(DoubleArrayList data, int from, int to, double[] inOut) {
311         throw new UnsupportedOperationException("incrementalUpdate not supported with missing values");
312     }
313 
314     /**
315      * <b>Not supported. </b>
316      *
317      * @param data         DoubleArrayList
318      * @param from         int
319      * @param to           int
320      * @param fromSumIndex int
321      * @param toSumIndex   int
322      * @param sumOfPowers  double[]
323      */
324     public static void incrementalUpdateSumsOfPowers(DoubleArrayList data, int from, int to, int fromSumIndex, int toSumIndex, double[] sumOfPowers) {
325         throw new UnsupportedOperationException("incrementalUpdateSumsOfPowers not supported with missing values");
326     }
327 
328     /**
329      * <b>Not supported. </b>
330      *
331      * @param data    DoubleArrayList
332      * @param weights DoubleArrayList
333      * @param from    int
334      * @param to      int
335      * @param inOut   double[]
336      */
337     public static void incrementalWeightedUpdate(DoubleArrayList data, DoubleArrayList weights, int from, int to, double[] inOut) {
338         throw new UnsupportedOperationException("incrementalWeightedUpdate not supported with missing values");
339     }
340 
341     /**
342      * Returns the kurtosis (aka excess) of a data sequence.
343      *
344      * @param moment4           the fourth central moment, which is <tt>moment(data,4,mean)</tt>.
345      * @param standardDeviation the standardDeviation.
346      * @return double
347      */
348     public static double kurtosis(double moment4, double standardDeviation) {
349         return -3 + moment4 / (standardDeviation * standardDeviation * standardDeviation * standardDeviation);
350     }
351 
352     /**
353      * Returns the kurtosis (aka excess) of a data sequence, which is <tt>-3 +
354      * moment(data,4,mean) / standardDeviation<sup>4</sup></tt>.
355      *
356      * @param data              DoubleArrayList
357      * @param mean              double
358      * @param standardDeviation double
359      * @return double
360      */
361     public static double kurtosis(DoubleArrayList data, double mean, double standardDeviation) {
362         return kurtosis(moment(data, 4, mean), standardDeviation);
363     }
364 
365     /**
366      * <b>Not supported. </b>
367      *
368      * @param data DoubleArrayList
369      * @param mean double
370      * @return double
371      */
372     public static double lag1(DoubleArrayList data, double mean) {
373         throw new UnsupportedOperationException("lag1 not supported with missing values");
374     }
375 
376     /**
377      * @param dat
378      * @return the median absolute deviation from the median.
379      */
380     public static double mad(DoubleArrayList dat) {
381         DoubleArrayList copy = removeMissing(dat.copy());
382 
383         double median = median(copy);
384 
385         DoubleArrayList devsum = new DoubleArrayList(copy.size());
386         for (int i = 0; i < copy.size(); i++) {
387             double v = copy.getQuick(i);
388             devsum.add(Math.abs(v - median));
389         }
390 
391         devsum.sort();
392         return Descriptive.median(devsum);
393     }
394 
395     public static double max(DoubleArrayList input) {
396         int size = input.size();
397         if (size == 0) throw new IllegalArgumentException();
398 
399         double[] elements = input.elements();
400         double max = Double.MIN_VALUE;
401         for (int i = 0; i < size; i++) {
402             if (Double.isNaN(elements[i])) continue;
403             if (elements[i] > max) max = elements[i];
404         }
405 
406         return max;
407     }
408 
409     /**
410      * Special mean calculation where we use the effective size as an input.
411      *
412      * @param elements      The data double array.
413      * @param effectiveSize The effective size used for the mean calculation.
414      * @return double
415      */
416     public static double mean(double[] elements, int effectiveSize) {
417 
418         int length = elements.length;
419 
420         if (0 == effectiveSize) {
421             return Double.NaN;
422         }
423 
424         double sum = 0.0;
425         int i, count;
426         count = 0;
427         double value;
428         for (i = 0; i < length; i++) {
429             value = elements[i];
430             if (Double.isNaN(value)) {
431                 continue;
432             }
433             sum += value;
434             count++;
435         }
436         if (0.0 == count) {
437             return Double.NaN;
438         }
439         return sum / effectiveSize;
440     }
441 
442     /**
443      * @param data Values to be analyzed.
444      * @return Mean of the values in x. Missing values are ignored in the analysis.
445      */
446     public static double mean(DoubleArrayList data) {
447         return sum(data) / sizeWithoutMissingValues(data);
448     }
449 
450     /**
451      * Special mean calculation where we use the effective size as an input.
452      *
453      * @param x             The data
454      * @param effectiveSize The effective size used for the mean calculation.
455      * @return double
456      */
457     public static double mean(DoubleArrayList x, int effectiveSize) {
458 
459         int length = x.size();
460 
461         if (0 == effectiveSize) {
462             return Double.NaN;
463         }
464 
465         double sum = 0.0;
466         int i, count;
467         count = 0;
468         double value;
469         double[] elements = x.elements();
470         for (i = 0; i < length; i++) {
471             value = elements[i];
472             if (Double.isNaN(value)) {
473                 continue;
474             }
475             sum += value;
476             count++;
477         }
478         if (0.0 == count) {
479             return Double.NaN;
480         }
481         return sum / effectiveSize;
482 
483     }
484 
485     /**
486      * Calculate the mean of the values above to a particular quantile of an array.
487      *
488      * @param quantile A value from 0 to 1
489      * @param array    Array for which we want to get the quantile.
490      * @return double
491      */
492     public static double meanAboveQuantile(double quantile, DoubleArrayList array) {
493 
494         if (quantile < 0.0 || quantile > 1.0) {
495             throw new IllegalArgumentException("Quantile must be between 0 and 1");
496         }
497 
498         double returnvalue = 0.0;
499         int k = 0;
500 
501         double quantileValue = DescriptiveWithMissing.quantile(array, quantile);
502 
503         for (int i = 0; i < array.size(); i++) {
504             if (array.get(i) > quantileValue) {
505                 returnvalue += array.get(i);
506                 k++;
507             }
508         }
509 
510         if (k == 0) {
511             throw new ArithmeticException("No values found above quantile");
512         }
513 
514         return returnvalue / k;
515     }
516 
517     /**
518      * Returns the median. Missing values are ignored entirely.
519      *
520      * @param data the data sequence, does not have to be sorted.
521      * @return double
522      */
523     public static double median(DoubleArrayList data) {
524         return DescriptiveWithMissing.quantile(data, 0.5);
525     }
526 
527     public static double min(DoubleArrayList input) {
528         int size = input.size();
529         if (size == 0) throw new IllegalArgumentException();
530 
531         double[] elements = input.elements();
532         double min = Double.MAX_VALUE;
533         for (int i = 0; i < size; i++) {
534             if (Double.isNaN(elements[i])) continue;
535             if (elements[i] < min) min = elements[i];
536         }
537 
538         return min;
539     }
540 
541     /**
542      * Returns the moment of <tt>k</tt> -th order with constant <tt>c</tt> of a data sequence, which is
543      * <tt>Sum( (data[i]-c)<sup>k</sup> ) /
544      * data.size()</tt>.
545      *
546      * @param data DoubleArrayList
547      * @param k    int
548      * @param c    double
549      * @return double
550      */
551     public static double moment(DoubleArrayList data, int k, double c) {
552         return sumOfPowerDeviations(data, k, c) / sizeWithoutMissingValues(data);
553     }
554 
555     /**
556      * Returns the product of a data sequence, which is <tt>Prod( data[i] )</tt>. Missing values are ignored. In other
557      * words: <tt>data[0]*data[1]*...*data[data.size()-1]</tt>. Note that you may easily get numeric overflows.
558      *
559      * @param data DoubleArrayList
560      * @return double
561      */
562     public static double product(DoubleArrayList data) {
563         int size = data.size();
564         double[] elements = data.elements();
565 
566         double product = 1;
567         for (int i = size; --i >= 0; ) {
568             if (Double.isNaN(elements[i])) {
569                 continue;
570             }
571             product *= elements[i];
572 
573         }
574         return product;
575     }
576 
577     /**
578      * Returns the <tt>phi-</tt> quantile; that is, an element <tt>elem</tt> for which holds that <tt>phi</tt> percent
579      * of data elements are less than <tt>elem</tt>. Missing values are ignored. The quantile need not necessarily be
580      * contained in the data sequence, it can be a linear interpolation.
581      *
582      * @param data the data sequence, does not have to be sorted.
583      * @param phi  the percentage; must satisfy <tt>0 &lt;= phi &lt;= 1</tt>.
584      * @return double
585      * @todo possibly implement so a copy is not made.
586      */
587     public static double quantile(DoubleArrayList data, double phi) {
588         DoubleArrayList sortedData = removeMissing(data.copy());
589         sortedData.sort();
590         return Descriptive.quantile(sortedData, phi);
591     }
592 
593     /**
594      * Returns how many percent of the elements contained in the receiver are <tt>&lt;= element</tt>. Does linear
595      * interpolation if the element is not contained but lies in between two contained elements. Missing values are
596      * ignored.
597      *
598      * @param data    the list to be searched
599      * @param element the element to search for.
600      * @return the percentage <tt>phi</tt> of elements <tt>&lt;= element</tt>(<tt>0.0 &lt;= phi &lt;= 1.0)</tt>.
601      */
602     public static double quantileInverse(DoubleArrayList data, double element) {
603         DoubleArrayList sortedData = removeMissing(data.copy());
604         sortedData.sort();
605         return rankInterpolated(sortedData, element) / sortedData.size();
606     }
607 
608     /**
609      * Returns the quantiles of the specified percentages. The quantiles need not necessarily be contained in the data
610      * sequence, it can be a linear interpolation.
611      *
612      * @param sortedData  the data sequence; <b>must be sorted ascending </b>.
613      * @param percentages the percentages for which quantiles are to be computed. Each percentage must be in the
614      *                    interval <tt>[0.0,1.0]</tt>.
615      * @return the quantiles.
616      */
617     public static DoubleArrayList quantiles(DoubleArrayList sortedData, DoubleArrayList percentages) {
618         int s = percentages.size();
619         DoubleArrayList quantiles = new DoubleArrayList(s);
620 
621         for (int i = 0; i < s; i++) {
622             quantiles.add(quantile(sortedData, percentages.get(i)));
623         }
624 
625         return quantiles;
626     }
627 
628     /**
629      * Returns the linearly interpolated number of elements in a list less or equal to a given element. Missing values
630      * are ignored. The rank is the number of elements <= element. Ranks are of the form
631      * <tt>{0, 1, 2,..., sortedList.size()}</tt>. If no element is <= element, then the rank is zero. If the element
632      * lies in between two contained elements, then linear interpolation is used and a non integer value is returned.
633      *
634      * @param sortedList the list to be searched (must be sorted ascending).
635      * @param element    the element to search for.
636      * @return the rank of the element.
637      * @todo possibly implement so a copy is not made.
638      */
639     public static double rankInterpolated(DoubleArrayList sortedList, double element) {
640         return Descriptive.rankInterpolated(removeMissing(sortedList), element);
641     }
642 
643     /**
644      * Makes a copy of the list that doesn't have the missing values.
645      *
646      * @param data DoubleArrayList
647      * @return DoubleArrayList
648      */
649     public static DoubleArrayList removeMissing(DoubleArrayList data) {
650         DoubleArrayList r = new DoubleArrayList(sizeWithoutMissingValues(data));
651         double[] elements = data.elements();
652         int size = data.size();
653         for (int i = 0; i < size; i++) {
654             if (Double.isNaN(elements[i])) {
655                 continue;
656             }
657             r.add(elements[i]);
658         }
659         return r;
660     }
661 
662     /**
663      * Returns the sample kurtosis (aka excess) of a data sequence.
664      *
665      * @param data           DoubleArrayList
666      * @param mean           double
667      * @param sampleVariance double
668      * @return double
669      */
670     public static double sampleKurtosis(DoubleArrayList data, double mean, double sampleVariance) {
671         return sampleKurtosis(sizeWithoutMissingValues(data), moment(data, 4, mean), sampleVariance);
672     }
673 
674     /**
675      * Returns the sample skew of a data sequence.
676      *
677      * @param data           DoubleArrayList
678      * @param mean           double
679      * @param sampleVariance double
680      * @return double
681      */
682     public static double sampleSkew(DoubleArrayList data, double mean, double sampleVariance) {
683         return sampleSkew(sizeWithoutMissingValues(data), moment(data, 3, mean), sampleVariance);
684     }
685 
686     /**
687      * Returns the sample standard deviation.
688      * <p>
689      * This is included for compatibility with the superclass, but does not implement the correction used there.
690      *
691      * @param size           the number of elements of the data sequence.
692      * @param sampleVariance the <b>sample variance </b>.
693      * @see cern.jet.stat.Descriptive#sampleStandardDeviation(int, double)
694      */
695     public static double sampleStandardDeviation(int size, double sampleVariance) {
696         return Math.sqrt(sampleVariance);
697     }
698 
699     /**
700      * Returns the sample variance of a data sequence. That is <tt>Sum (
701      * (data[i]-mean)^2 ) / (data.size()-1)</tt>.
702      *
703      * @param data DoubleArrayList
704      * @param mean double
705      * @return double
706      */
707     public static double sampleVariance(DoubleArrayList data, double mean) {
708         double[] elements = data.elements();
709         int effsize = sizeWithoutMissingValues(data);
710         int size = data.size();
711         double sum = 0;
712         // find the sum of the squares
713         for (int i = size; --i >= 0; ) {
714             if (Double.isNaN(elements[i])) {
715                 continue;
716             }
717             double delta = elements[i] - mean;
718             sum += delta * delta;
719         }
720 
721         return sum / (effsize - 1);
722     }
723 
724     /**
725      * Return the size of the list, ignoring missing values.
726      *
727      * @param list DoubleArrayList
728      * @return int
729      */
730     public static int sizeWithoutMissingValues(DoubleArrayList list) {
731 
732         int size = 0;
733         for (int i = 0; i < list.size(); i++) {
734             if (!Double.isNaN(list.get(i))) {
735                 size++;
736             }
737         }
738         return size;
739     }
740 
741     /**
742      * Returns the skew of a data sequence, which is <tt>moment(data,3,mean) /
743      * standardDeviation<sup>3</sup></tt>.
744      *
745      * @param data              DoubleArrayList
746      * @param mean              double
747      * @param standardDeviation double
748      * @return double
749      */
750     public static double skew(DoubleArrayList data, double mean, double standardDeviation) {
751         return skew(moment(data, 3, mean), standardDeviation);
752     }
753 
754     /**
755      * Standardize. Note that this does something slightly different than standardize in the superclass, because our
756      * sampleStandardDeviation does not use the correction of the superclass (which isn't really standard).
757      *
758      * @param data DoubleArrayList
759      */
760     public static void standardize(DoubleArrayList data) {
761         double mean = mean(data);
762         double stdev = Math.sqrt(sampleVariance(data, mean));
763         DescriptiveWithMissing.standardize(data, mean, stdev);
764     }
765 
766     /**
767      * Modifies a data sequence to be standardized. Missing values are ignored. Changes each element <tt>data[i]</tt> as
768      * follows: <tt>data[i] = (data[i]-mean)/standardDeviation</tt> unless the standard deviation is 0.00 or very close to zero (indicating the data are constant),
769      * in which case we return a vector of zeros (in effect just doing mean subtraction)
770      *
771      * @param data              DoubleArrayList
772      * @param mean              mean of data
773      * @param standardDeviation stdev of data. |stdev| < Constants.TINY is treated as zero.
774      */
775     public static void standardize(DoubleArrayList data, double mean, double standardDeviation) {
776 
777         if (Math.abs(0.0 - standardDeviation) < Constants.TINY) {
778             for (int i = 0; i < data.size(); i++) {
779                 if (!Double.isNaN(data.get(i))) {
780                     data.set(i, 0.0);
781                 }
782             }
783             return;
784         }
785 
786         double[] elements = data.elements();
787 
788         for (int i = 0; i < elements.length; i++) {
789             if (Double.isNaN(elements[i])) {
790                 continue;
791             }
792             elements[i] = (elements[i] - mean) / standardDeviation;
793         }
794     }
795 
796     /**
797      * Returns the sum of a data sequence. That is <tt>Sum( data[i] )</tt>.
798      *
799      * @param data DoubleArrayList
800      * @return double
801      */
802     public static double sum(DoubleArrayList data) {
803         return sumOfPowerDeviations(data, 1, 0.0);
804     }
805 
806     /**
807      * Returns the sum of inversions of a data sequence, which is <tt>Sum( 1.0 /
808      * data[i])</tt>.
809      *
810      * @param data the data sequence.
811      * @param from the index of the first data element (inclusive).
812      * @param to   the index of the last data element (inclusive).
813      * @return double
814      */
815     public static double sumOfInversions(DoubleArrayList data, int from, int to) {
816         return sumOfPowerDeviations(data, -1, 0.0, from, to);
817     }
818 
819     /**
820      * Returns the sum of logarithms of a data sequence, which is <tt>Sum(
821      * Log(data[i])</tt>. Missing values are ignored.
822      *
823      * @param data the data sequence.
824      * @param from the index of the first data element (inclusive).
825      * @param to   the index of the last data element (inclusive).
826      * @return double
827      */
828     public static double sumOfLogarithms(DoubleArrayList data, int from, int to) {
829         double[] elements = data.elements();
830         double logsum = 0;
831         for (int i = from - 1; ++i <= to; ) {
832             if (Double.isNaN(elements[i])) {
833                 continue;
834             }
835             logsum += Math.log(elements[i]);
836         }
837         return logsum;
838     }
839 
840     /**
841      * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or
842      * <tt>k == -2 .. 4</tt>.
843      *
844      * @param data DoubleArrayList
845      * @param k    int
846      * @param c    double
847      * @return double
848      */
849     public static double sumOfPowerDeviations(DoubleArrayList data, int k, double c) {
850         return sumOfPowerDeviations(data, k, c, 0, data.size() - 1);
851     }
852 
853     /**
854      * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt> for all <tt>i = from ..
855      * to</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or <tt>k == -2 .. 5</tt>. Missing values are
856      * ignored.
857      *
858      * @param data DoubleArrayList
859      * @param k    int
860      * @param c    double
861      * @param from int
862      * @param to   int
863      * @return double
864      */
865     public static double sumOfPowerDeviations(final DoubleArrayList data, final int k, final double c, final int from, final int to) {
866         final double[] elements = data.elements();
867         double sum = 0;
868         double v;
869         int i;
870         switch (k) { // optimized for speed
871             case -2:
872                 if (c == 0.0) {
873                     for (i = from - 1; ++i <= to; ) {
874                         if (Double.isNaN(elements[i])) {
875                             continue;
876                         }
877                         v = elements[i];
878                         sum += 1 / (v * v);
879                     }
880                 } else {
881                     for (i = from - 1; ++i <= to; ) {
882                         if (Double.isNaN(elements[i])) {
883                             continue;
884                         }
885                         v = elements[i] - c;
886                         sum += 1 / (v * v);
887                     }
888                 }
889                 break;
890             case -1:
891                 if (c == 0.0) {
892                     for (i = from - 1; ++i <= to; ) {
893                         if (Double.isNaN(elements[i])) {
894                             continue;
895                         }
896                         sum += 1 / elements[i];
897                     }
898                 } else {
899                     for (i = from - 1; ++i <= to; ) {
900                         if (Double.isNaN(elements[i])) {
901                             continue;
902                         }
903                         sum += 1 / (elements[i] - c);
904                     }
905                 }
906                 break;
907             case 0:
908                 for (i = from - 1; ++i <= to; ) {
909                     if (Double.isNaN(elements[i])) {
910                         continue;
911                     }
912                     sum += 1;
913                 }
914                 break;
915             case 1:
916                 if (c == 0.0) {
917                     for (i = from - 1; ++i <= to; ) {
918                         if (Double.isNaN(elements[i])) {
919                             continue;
920                         }
921                         sum += elements[i];
922                     }
923                 } else {
924                     for (i = from - 1; ++i <= to; ) {
925                         if (Double.isNaN(elements[i])) {
926                             continue;
927                         }
928                         sum += elements[i] - c;
929                     }
930                 }
931                 break;
932             case 2:
933                 if (c == 0.0) {
934                     for (i = from - 1; ++i <= to; ) {
935                         if (Double.isNaN(elements[i])) {
936                             continue;
937                         }
938                         v = elements[i];
939                         sum += v * v;
940                     }
941                 } else {
942                     for (i = from - 1; ++i <= to; ) {
943                         if (Double.isNaN(elements[i])) {
944                             continue;
945                         }
946                         v = elements[i] - c;
947                         sum += v * v;
948                     }
949                 }
950                 break;
951             case 3:
952                 if (c == 0.0) {
953                     for (i = from - 1; ++i <= to; ) {
954                         if (Double.isNaN(elements[i])) {
955                             continue;
956                         }
957                         v = elements[i];
958                         sum += v * v * v;
959                     }
960                 } else {
961                     for (i = from - 1; ++i <= to; ) {
962                         if (Double.isNaN(elements[i])) {
963                             continue;
964                         }
965                         v = elements[i] - c;
966                         sum += v * v * v;
967                     }
968                 }
969                 break;
970             case 4:
971                 if (c == 0.0) {
972                     for (i = from - 1; ++i <= to; ) {
973                         if (Double.isNaN(elements[i])) {
974                             continue;
975                         }
976                         v = elements[i];
977                         sum += v * v * v * v;
978                     }
979                 } else {
980                     for (i = from - 1; ++i <= to; ) {
981                         if (Double.isNaN(elements[i])) {
982                             continue;
983                         }
984                         v = elements[i] - c;
985                         sum += v * v * v * v;
986                     }
987                 }
988                 break;
989             case 5:
990                 if (c == 0.0) {
991                     for (i = from - 1; ++i <= to; ) {
992                         if (Double.isNaN(elements[i])) {
993                             continue;
994                         }
995                         v = elements[i];
996                         sum += v * v * v * v * v;
997                     }
998                 } else {
999                     for (i = from - 1; ++i <= to; ) {
1000                         if (Double.isNaN(elements[i])) {
1001                             continue;
1002                         }
1003                         v = elements[i] - c;
1004                         sum += v * v * v * v * v;
1005                     }
1006                 }
1007                 break;
1008             default:
1009                 for (i = from - 1; ++i <= to; ) {
1010                     if (Double.isNaN(elements[i])) {
1011                         continue;
1012                     }
1013                     sum += Math.pow(elements[i] - c, k);
1014                 }
1015                 break;
1016         }
1017         return sum;
1018     }
1019 
1020     /**
1021      * Returns the sum of powers of a data sequence, which is <tt>Sum (
1022      * data[i]<sup>k</sup> )</tt>.
1023      *
1024      * @param data DoubleArrayList
1025      * @param k    int
1026      * @return double
1027      */
1028     public static double sumOfPowers(DoubleArrayList data, int k) {
1029         return sumOfPowerDeviations(data, k, 0);
1030     }
1031 
1032     /**
1033      * Compute the sum of the squared deviations from the mean of a data sequence. Missing values are ignored.
1034      *
1035      * @param data DoubleArrayList
1036      * @return double
1037      */
1038     public static double sumOfSquaredDeviations(DoubleArrayList data) {
1039         return sumOfSquaredDeviations(sizeWithoutMissingValues(data), variance(sizeWithoutMissingValues(data), sum(data), sumOfSquares(data)));
1040     }
1041 
1042     /**
1043      * Returns the sum of squares of a data sequence. Skips missing values.
1044      *
1045      * @param data DoubleArrayList
1046      * @return double
1047      */
1048     public static double sumOfSquares(DoubleArrayList data) {
1049         return sumOfPowerDeviations(data, 2, 0.0);
1050     }
1051 
1052     /**
1053      * Returns the trimmed mean of a sorted data sequence. Missing values are completely ignored.
1054      *
1055      * @param sortedData the data sequence; <b>must be sorted ascending </b>.
1056      * @param mean       the mean of the (full) sorted data sequence.
1057      * @param left       int the number of leading elements to trim.
1058      * @param right      int number of trailing elements to trim.
1059      * @return double
1060      */
1061     public static double trimmedMean(DoubleArrayList sortedData, double mean, int left, int right) {
1062         return Descriptive.trimmedMean(removeMissing(sortedData), mean, left, right);
1063     }
1064 
1065     /**
1066      * Provided for convenience!
1067      *
1068      * @param data DoubleArrayList
1069      * @return double
1070      */
1071     public static double variance(DoubleArrayList data) {
1072         return variance(sizeWithoutMissingValues(data), sum(data), sumOfSquares(data));
1073     }
1074 
1075     /**
1076      *
1077      */
1078     public static double variance(int sizeWithoutMissing, double sum, double sumOfSquares) {
1079         return Descriptive.variance(sizeWithoutMissing, sum, sumOfSquares);
1080     }
1081 
1082     /**
1083      * Returns the weighted mean of a data sequence. That is <tt> Sum (data[i] *
1084      * weights[i]) / Sum ( weights[i] )</tt>.
1085      *
1086      * @param data    DoubleArrayList
1087      * @param weights DoubleArrayList
1088      * @return double
1089      */
1090     public static double weightedMean(DoubleArrayList data, DoubleArrayList weights) {
1091         int size = data.size();
1092         if (size != weights.size() || size == 0) {
1093             throw new IllegalArgumentException();
1094         }
1095 
1096         double[] elements = data.elements();
1097         double[] theWeights = weights.elements();
1098         double sum = 0.0;
1099         double weightsSum = 0.0;
1100         for (int i = size; --i >= 0; ) {
1101             double w = theWeights[i];
1102             if (Double.isNaN(elements[i]) || Double.isNaN(w)) {
1103                 continue;
1104             }
1105             sum += elements[i] * w;
1106             weightsSum += w;
1107         }
1108 
1109         return sum / weightsSum;
1110     }
1111 
1112     /**
1113      * Returns the winsorized mean of a sorted data sequence.
1114      *
1115      * @param sortedData DoubleArrayList, must already be sorted ascending
1116      * @param mean       the mean of the (full) sorted data sequence.
1117      * @param left       the number of leading elements to trim. Refers to the number of elements to trim
1118      *                   <em>excluding any missing values</em>
1119      * @param right      the number of trailing elements to trim <em>excluding any missing values</em>
1120      * @return double
1121      */
1122     public static double winsorizedMean(DoubleArrayList sortedData, double mean, int left, int right) {
1123         DoubleArrayList sdm = removeMissing(sortedData);
1124         return Descriptive.winsorizedMean(sdm, mean(sdm), left, right);
1125     }
1126 
1127     /* private methods */
1128 
1129     private DescriptiveWithMissing() {
1130     }
1131 
1132 } // end of class