View Javadoc
1   /*
2    * The baseCode project
3    * 
4    * Copyright (c) 2006 University of British Columbia
5    * 
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.math;
20  
21  import cern.colt.list.DoubleArrayList;
22  import cern.jet.stat.Descriptive;
23  
24  /**
25   * Mathematical functions for statistics that allow missing values without scotching the calculations.
26   * <p>
27   * Be careful because some methods from cern.jet.stat.Descriptive have not been overridden and will yield a
28   * UnsupportedOperationException if used.
29   * </p>
30   * <p>
31   * Some functions that come with DoubleArrayLists will not work in an entirely compatible way with missing values. For
32   * examples, size() reports the total number of elements, including missing values. To get a count of non-missing
33   * values, use this.sizeWithoutMissingValues(). The right one to use may vary.
34   * </p>
35   * <p>
36   * Not all methods need to be overridden. However, all methods that take a "size" parameter should be passed the results
37   * of sizeWithoutMissingValues(data), instead of data.size().
38   * <p>
39   * Based in part on code from the colt package: Copyright &copy; 1999 CERN - European Organization for Nuclear Research.
40   * 
41   * @see <a
42   *      href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/Descriptive.html">cern.jet.stat.Descriptive
43   *      </a>
44   * @author Paul Pavlidis
45   * 
46   */
47  public class DescriptiveWithMissing extends cern.jet.stat.Descriptive {
48  
49      /**
50       * <b>Not supported. </b>
51       * 
52       * @param data DoubleArrayList
53       * @param lag int
54       * @param mean double
55       * @param variance double
56       * @return double
57       */
58      public static double autoCorrelation( DoubleArrayList data, int lag, double mean, double variance ) {
59          throw new UnsupportedOperationException( "autoCorrelation not supported with missing values" );
60      }
61  
62      /**
63       * Highly optimized version of the correlation computation, where as much information is precomputed as possible.
64       * Use of this method only makes sense if many comparisons with the inputs x and y are being performed.
65       * <p>
66       * Implementation note: In correlation(DoubleArrayList x, DoubleArrayList y), profiling shows that calls to
67       * Double.NaN consume half the CPU time. The precomputation of the element-by-element squared values is another
68       * obvious optimization. There is also no checking for matching lengths of the arrays.
69       * 
70       * @param x
71       * @param y
72       * @param selfSquaredX double array containing values of x_i^2 for each x.
73       * @param selfSquaredY
74       * @param nanStatusX boolean array containing value of Double.isNaN() for each X.
75       * @param nanStatusY
76       * @return
77       */
78      public static double correlation( double[] x, double[] y, double[] selfSquaredX, double[] selfSquaredY,
79              boolean[] nanStatusX, boolean[] nanStatusY ) {
80  
81          double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
82          int numused = 0;
83          syy = 0.0;
84          sxy = 0.0;
85          sxx = 0.0;
86          sx = 0.0;
87          sy = 0.0;
88  
89          int length = x.length;
90          for ( int j = 0; j < length; j++ ) {
91              xj = x[j];
92              yj = y[j];
93  
94              if ( nanStatusX[j] || nanStatusY[j] ) {
95                  continue;
96              }
97              sx += xj;
98              sy += yj;
99              sxy += xj * yj;
100             sxx += selfSquaredX[j];
101             syy += selfSquaredY[j];
102             numused++;
103         }
104 
105         if ( numused > 0 ) {
106             ay = sy / numused;
107             ax = sx / numused;
108             return ( sxy - sx * ay ) / Math.sqrt( ( sxx - sx * ax ) * ( syy - sy * ay ) );
109         }
110         return Double.NaN; // signifies that it could not be calculated.
111     }
112 
113     /**
114      * Returns the correlation of two data sequences. That is
115      * <tt>covariance(data1,data2)/(standardDev1*standardDev2)</tt>. Missing values are ignored. This method is
116      * overridden to stop users from using the method in the superclass when missing values are present. The problem is
117      * that the standard deviation cannot be computed without knowning which values are not missing in both vectors to
118      * be compared. Thus the standardDev parameters are thrown away by this method.
119      * 
120      * @param data1 DoubleArrayList
121      * @param standardDev1 double - not used
122      * @param data2 DoubleArrayList
123      * @param standardDev2 double - not used
124      * @return double
125      */
126     public static double correlation( DoubleArrayList data1, double standardDev1, DoubleArrayList data2,
127             double standardDev2 ) {
128         return correlation( data1, data2 );
129     }
130 
131     /**
132      * Calculate the pearson correlation of two arrays. Missing values (NaNs) are ignored.
133      * 
134      * @param x DoubleArrayList
135      * @param y DoubleArrayList
136      * @return double
137      */
138     public static double correlation( DoubleArrayList x, DoubleArrayList y ) {
139         int j;
140         double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
141         int numused;
142         syy = 0.0;
143         sxy = 0.0;
144         sxx = 0.0;
145         sx = 0.0;
146         sy = 0.0;
147         numused = 0;
148         if ( x.size() != y.size() ) {
149             throw new ArithmeticException( "Unequal vector sizes: " + x.size() + " != " + y.size() );
150         }
151 
152         double[] xel = x.elements();
153         double[] yel = y.elements();
154 
155         int length = x.size();
156         for ( j = 0; j < length; j++ ) {
157             xj = xel[j];
158             yj = yel[j];
159 
160             if ( !Double.isNaN( xj ) && !Double.isNaN( yj ) ) {
161                 sx += xj;
162                 sy += yj;
163                 sxy += xj * yj;
164                 sxx += xj * xj;
165                 syy += yj * yj;
166                 numused++;
167             }
168         }
169 
170         if ( numused > 0 ) {
171             ay = sy / numused;
172             ax = sx / numused;
173             return ( sxy - sx * ay ) / Math.sqrt( ( sxx - sx * ax ) * ( syy - sy * ay ) );
174         }
175         return Double.NaN; // signifies that it could not be calculated.
176 
177     }
178 
179     /**
180      * Returns the SAMPLE covariance of two data sequences. Pairs of values are only considered if both are not NaN. If
181      * there are no non-missing pairs, the covariance is zero.
182      * 
183      * @param data1 the first vector
184      * @param data2 the second vector
185      * @return double
186      */
187     public static double covariance( DoubleArrayList data1, DoubleArrayList data2 ) {
188         int size = data1.size();
189         if ( size != data2.size() || size == 0 ) {
190             throw new IllegalArgumentException();
191         }
192         double[] elements1 = data1.elements();
193         double[] elements2 = data2.elements();
194 
195         /* initialize sumx and sumy to the first non-NaN pair of values */
196 
197         int i = 0;
198         double sumx = 0.0, sumy = 0.0, Sxy = 0.0;
199         while ( i < size ) {
200             sumx = elements1[i];
201             sumy = elements2[i];
202             if ( !Double.isNaN( elements1[i] ) && !Double.isNaN( elements2[i] ) ) {
203                 break;
204             }
205             i++;
206         }
207         i++;
208         int usedPairs = 1;
209         for ( ; i < size; ++i ) {
210             double x = elements1[i];
211             double y = elements2[i];
212             if ( Double.isNaN( x ) || Double.isNaN( y ) ) {
213                 continue;
214             }
215 
216             sumx += x;
217             Sxy += ( x - sumx / ( usedPairs + 1 ) ) * ( y - sumy / usedPairs );
218             sumy += y;
219             usedPairs++;
220         }
221         return Sxy / ( usedPairs - 1 );
222     }
223 
224     /**
225      * Durbin-Watson computation. This measures the serial correlation in a data series.
226      * 
227      * @param data DoubleArrayList
228      * @return double
229      * @todo this will still break in some situations where there are missing values
230      */
231     public static double durbinWatson( DoubleArrayList data ) {
232         int size = data.size();
233         if ( size < 2 ) {
234             throw new IllegalArgumentException( "data sequence must contain at least two values." );
235         }
236 
237         double[] elements = data.elements();
238         double run = 0;
239         double run_sq = 0;
240         int firstNotNaN = 0;
241         while ( firstNotNaN < size ) {
242             run_sq = elements[firstNotNaN] * elements[firstNotNaN];
243 
244             if ( !Double.isNaN( elements[firstNotNaN] ) ) {
245                 break;
246             }
247 
248             firstNotNaN++;
249         }
250 
251         if ( firstNotNaN > 0 && size - firstNotNaN < 2 ) {
252             throw new IllegalArgumentException( "data sequence must contain at least two non-missing values." );
253 
254         }
255 
256         for ( int i = firstNotNaN + 1; i < size; i++ ) {
257             int gap = 1;
258             while ( i < size && Double.isNaN( elements[i] ) ) {
259                 gap++;
260                 i++;
261                 continue;
262             }
263             if ( i >= size ) continue; // missing value at end will cause this.
264             double x = elements[i] - elements[i - gap];
265             /**  */
266             run += x * x;
267             run_sq += elements[i] * elements[i];
268         }
269 
270         return run / run_sq;
271     }
272 
273     /**
274      * Returns the geometric mean of a data sequence. Missing values are ignored. Note that for a geometric mean to be
275      * meaningful, the minimum of the data sequence must not be less or equal to zero. <br>
276      * The geometric mean is given by <tt>pow( Product( data[i] ),
277      * 1/data.size())</tt>. This method tries to avoid overflows at the expense of an equivalent but somewhat slow
278      * definition: <tt>geo = Math.exp( Sum(
279      * Log(data[i]) ) / data.size())</tt>.
280      * 
281      * @param data DoubleArrayList
282      * @return double
283      */
284     public static double geometricMean( DoubleArrayList data ) {
285         return geometricMean( sizeWithoutMissingValues( data ), sumOfLogarithms( data, 0, data.size() - 1 ) );
286     }
287 
288     /**
289      * Incrementally maintains and updates minimum, maximum, sum and sum of squares of a data sequence.
290      * <p>
291      * Assume we have already recorded some data sequence elements and know their minimum, maximum, sum and sum of
292      * squares. Assume further, we are to record some more elements and to derive updated values of minimum, maximum,
293      * sum and sum of squares. This method computes those updated values without needing to know the already recorded
294      * elements. This is interesting for interactive online monitoring and/or applications that cannot keep the entire
295      * huge data sequence in memory.
296      * 
297      * @param data the additional elements to be incorporated into min, max, etc.
298      * @param from the index of the first element within <tt>data</tt> to consider.
299      * @param to the index of the last element within <tt>data</tt> to consider. The method incorporates elements
300      *        <tt>data[from], ..., data[to]</tt>.
301      * @param inOut the old values in the following format:
302      *        <ul>
303      *        <li><tt>inOut[0]</tt> is the old minimum. <li><tt>inOut[1]</tt> is the old maximum. <li><tt>inOut[2]</tt>
304      *        is the old sum. <li><tt>inOut[3]</tt> is the old sum of squares.
305      *        </ul>
306      *        If no data sequence elements have so far been recorded set the values as follows
307      *        <ul>
308      *        <li><tt>inOut[0] = Double.POSITIVE_INFINITY</tt> as the old minimum. <li><tt>inOut[1] =
309      *        Double.NEGATIVE_INFINITY</tt> as the old maximum. <li><tt>inOut[2] = 0.0</tt> as the old sum. <li><tt>
310      *        inOut[3] = 0.0</tt> as the old sum of squares.
311      *        </ul>
312      */
313     public static void incrementalUpdate( DoubleArrayList data, int from, int to, double[] inOut ) {
314         throw new UnsupportedOperationException( "incrementalUpdate not supported with missing values" );
315     }
316 
317     /**
318      * <b>Not supported. </b>
319      * 
320      * @param data DoubleArrayList
321      * @param from int
322      * @param to int
323      * @param fromSumIndex int
324      * @param toSumIndex int
325      * @param sumOfPowers double[]
326      */
327     public static void incrementalUpdateSumsOfPowers( DoubleArrayList data, int from, int to, int fromSumIndex,
328             int toSumIndex, double[] sumOfPowers ) {
329         throw new UnsupportedOperationException( "incrementalUpdateSumsOfPowers not supported with missing values" );
330     }
331 
332     /**
333      * <b>Not supported. </b>
334      * 
335      * @param data DoubleArrayList
336      * @param weights DoubleArrayList
337      * @param from int
338      * @param to int
339      * @param inOut double[]
340      */
341     public static void incrementalWeightedUpdate( DoubleArrayList data, DoubleArrayList weights, int from, int to,
342             double[] inOut ) {
343         throw new UnsupportedOperationException( "incrementalWeightedUpdate not supported with missing values" );
344     }
345 
346     /**
347      * Returns the kurtosis (aka excess) of a data sequence.
348      * 
349      * @param moment4 the fourth central moment, which is <tt>moment(data,4,mean)</tt>.
350      * @param standardDeviation the standardDeviation.
351      * @return double
352      */
353     public static double kurtosis( double moment4, double standardDeviation ) {
354         return -3 + moment4 / ( standardDeviation * standardDeviation * standardDeviation * standardDeviation );
355     }
356 
357     /**
358      * Returns the kurtosis (aka excess) of a data sequence, which is <tt>-3 +
359      * moment(data,4,mean) / standardDeviation<sup>4</sup></tt>.
360      * 
361      * @param data DoubleArrayList
362      * @param mean double
363      * @param standardDeviation double
364      * @return double
365      */
366     public static double kurtosis( DoubleArrayList data, double mean, double standardDeviation ) {
367         return kurtosis( moment( data, 4, mean ), standardDeviation );
368     }
369 
370     /**
371      * <b>Not supported. </b>
372      * 
373      * @param data DoubleArrayList
374      * @param mean double
375      * @return double
376      */
377     public static double lag1( DoubleArrayList data, double mean ) {
378         throw new UnsupportedOperationException( "lag1 not supported with missing values" );
379     }
380 
381     /**
382      * @param dat
383      * @return the median absolute deviation from the median.
384      */
385     public static double mad( DoubleArrayList dat ) {
386         DoubleArrayList copy = removeMissing( dat.copy() );
387 
388         double median = median( copy );
389 
390         DoubleArrayList devsum = new DoubleArrayList( copy.size() );
391         for ( int i = 0; i < copy.size(); i++ ) {
392             double v = copy.getQuick( i );
393             devsum.add( Math.abs( v - median ) );
394         }
395 
396         devsum.sort();
397         return Descriptive.median( devsum );
398     }
399 
400     public static double max( DoubleArrayList input ) {
401         int size = input.size();
402         if ( size == 0 ) throw new IllegalArgumentException();
403 
404         double[] elements = input.elements();
405         double max = Double.MIN_VALUE;
406         for ( int i = 0; i < size; i++ ) {
407             if ( Double.isNaN( elements[i] ) ) continue;
408             if ( elements[i] > max ) max = elements[i];
409         }
410 
411         return max;
412     }
413 
414     /**
415      * Special mean calculation where we use the effective size as an input.
416      * 
417      * @param elements The data double array.
418      * @param effectiveSize The effective size used for the mean calculation.
419      * @return double
420      */
421     public static double mean( double[] elements, int effectiveSize ) {
422 
423         int length = elements.length;
424 
425         if ( 0 == effectiveSize ) {
426             return Double.NaN;
427         }
428 
429         double sum = 0.0;
430         int i, count;
431         count = 0;
432         double value;
433         for ( i = 0; i < length; i++ ) {
434             value = elements[i];
435             if ( Double.isNaN( value ) ) {
436                 continue;
437             }
438             sum += value;
439             count++;
440         }
441         if ( 0.0 == count ) {
442             return Double.NaN;
443         }
444         return sum / effectiveSize;
445     }
446 
447     /**
448      * @param data Values to be analyzed.
449      * @return Mean of the values in x. Missing values are ignored in the analysis.
450      */
451     public static double mean( DoubleArrayList data ) {
452         return sum( data ) / sizeWithoutMissingValues( data );
453     }
454 
455     /**
456      * Special mean calculation where we use the effective size as an input.
457      * 
458      * @param x The data
459      * @param effectiveSize The effective size used for the mean calculation.
460      * @return double
461      */
462     public static double mean( DoubleArrayList x, int effectiveSize ) {
463 
464         int length = x.size();
465 
466         if ( 0 == effectiveSize ) {
467             return Double.NaN;
468         }
469 
470         double sum = 0.0;
471         int i, count;
472         count = 0;
473         double value;
474         double[] elements = x.elements();
475         for ( i = 0; i < length; i++ ) {
476             value = elements[i];
477             if ( Double.isNaN( value ) ) {
478                 continue;
479             }
480             sum += value;
481             count++;
482         }
483         if ( 0.0 == count ) {
484             return Double.NaN;
485         }
486         return sum / effectiveSize;
487 
488     }
489 
490     /**
491      * Calculate the mean of the values above to a particular quantile of an array.
492      * 
493      * @param quantile A value from 0 to 1
494      * @param array Array for which we want to get the quantile.
495      * @return double
496      */
497     public static double meanAboveQuantile( double quantile, DoubleArrayList array ) {
498 
499         if ( quantile < 0.0 || quantile > 1.0 ) {
500             throw new IllegalArgumentException( "Quantile must be between 0 and 1" );
501         }
502 
503         double returnvalue = 0.0;
504         int k = 0;
505 
506         double quantileValue = DescriptiveWithMissing.quantile( array, quantile );
507 
508         for ( int i = 0; i < array.size(); i++ ) {
509             if ( array.get( i ) > quantileValue ) {
510                 returnvalue += array.get( i );
511                 k++;
512             }
513         }
514 
515         if ( k == 0 ) {
516             throw new ArithmeticException( "No values found above quantile" );
517         }
518 
519         return returnvalue / k;
520     }
521 
522     /**
523      * Returns the median. Missing values are ignored entirely.
524      * 
525      * @param data the data sequence, does not have to be sorted.
526      * @return double
527      */
528     public static double median( DoubleArrayList data ) {
529         return DescriptiveWithMissing.quantile( data, 0.5 );
530     }
531 
532     public static double min( DoubleArrayList input ) {
533         int size = input.size();
534         if ( size == 0 ) throw new IllegalArgumentException();
535 
536         double[] elements = input.elements();
537         double min = Double.MAX_VALUE;
538         for ( int i = 0; i < size; i++ ) {
539             if ( Double.isNaN( elements[i] ) ) continue;
540             if ( elements[i] < min ) min = elements[i];
541         }
542 
543         return min;
544     }
545 
546     /**
547      * Returns the moment of <tt>k</tt> -th order with constant <tt>c</tt> of a data sequence, which is
548      * <tt>Sum( (data[i]-c)<sup>k</sup> ) /
549      * data.size()</tt>.
550      * 
551      * @param data DoubleArrayList
552      * @param k int
553      * @param c double
554      * @return double
555      */
556     public static double moment( DoubleArrayList data, int k, double c ) {
557         return sumOfPowerDeviations( data, k, c ) / sizeWithoutMissingValues( data );
558     }
559 
560     /**
561      * Returns the product of a data sequence, which is <tt>Prod( data[i] )</tt>. Missing values are ignored. In other
562      * words: <tt>data[0]*data[1]*...*data[data.size()-1]</tt>. Note that you may easily get numeric overflows.
563      * 
564      * @param data DoubleArrayList
565      * @return double
566      */
567     public static double product( DoubleArrayList data ) {
568         int size = data.size();
569         double[] elements = data.elements();
570 
571         double product = 1;
572         for ( int i = size; --i >= 0; ) {
573             if ( Double.isNaN( elements[i] ) ) {
574                 continue;
575             }
576             product *= elements[i];
577 
578         }
579         return product;
580     }
581 
582     /**
583      * Returns the <tt>phi-</tt> quantile; that is, an element <tt>elem</tt> for which holds that <tt>phi</tt> percent
584      * of data elements are less than <tt>elem</tt>. Missing values are ignored. The quantile need not necessarily be
585      * contained in the data sequence, it can be a linear interpolation.
586      * 
587      * @param data the data sequence, does not have to be sorted.
588      * @param phi the percentage; must satisfy <tt>0 &lt;= phi &lt;= 1</tt>.
589      * @todo possibly implement so a copy is not made.
590      * @return double
591      */
592     public static double quantile( DoubleArrayList data, double phi ) {
593         DoubleArrayList sortedData = removeMissing( data.copy() );
594         sortedData.sort();
595         return Descriptive.quantile( sortedData, phi );
596     }
597 
598     /**
599      * Returns how many percent of the elements contained in the receiver are <tt>&lt;= element</tt>. Does linear
600      * interpolation if the element is not contained but lies in between two contained elements. Missing values are
601      * ignored.
602      * 
603      * @param data the list to be searched
604      * @param element the element to search for.
605      * @return the percentage <tt>phi</tt> of elements <tt>&lt;= element</tt>(<tt>0.0 &lt;= phi &lt;= 1.0)</tt>.
606      */
607     public static double quantileInverse( DoubleArrayList data, double element ) {
608         DoubleArrayList sortedData = removeMissing( data.copy() );
609         sortedData.sort();
610         return rankInterpolated( sortedData, element ) / sortedData.size();
611     }
612 
613     /**
614      * Returns the quantiles of the specified percentages. The quantiles need not necessarily be contained in the data
615      * sequence, it can be a linear interpolation.
616      * 
617      * @param sortedData the data sequence; <b>must be sorted ascending </b>.
618      * @param percentages the percentages for which quantiles are to be computed. Each percentage must be in the
619      *        interval <tt>[0.0,1.0]</tt>.
620      * @return the quantiles.
621      */
622     public static DoubleArrayList quantiles( DoubleArrayList sortedData, DoubleArrayList percentages ) {
623         int s = percentages.size();
624         DoubleArrayList quantiles = new DoubleArrayList( s );
625 
626         for ( int i = 0; i < s; i++ ) {
627             quantiles.add( quantile( sortedData, percentages.get( i ) ) );
628         }
629 
630         return quantiles;
631     }
632 
633     /**
634      * Returns the linearly interpolated number of elements in a list less or equal to a given element. Missing values
635      * are ignored. The rank is the number of elements <= element. Ranks are of the form
636      * <tt>{0, 1, 2,..., sortedList.size()}</tt>. If no element is <= element, then the rank is zero. If the element
637      * lies in between two contained elements, then linear interpolation is used and a non integer value is returned.
638      * 
639      * @param sortedList the list to be searched (must be sorted ascending).
640      * @param element the element to search for.
641      * @return the rank of the element.
642      * @todo possibly implement so a copy is not made.
643      */
644     public static double rankInterpolated( DoubleArrayList sortedList, double element ) {
645         return Descriptive.rankInterpolated( removeMissing( sortedList ), element );
646     }
647 
648     /**
649      * Makes a copy of the list that doesn't have the missing values.
650      * 
651      * @param data DoubleArrayList
652      * @return DoubleArrayList
653      */
654     public static DoubleArrayList removeMissing( DoubleArrayList data ) {
655         DoubleArrayList r = new DoubleArrayList( sizeWithoutMissingValues( data ) );
656         double[] elements = data.elements();
657         int size = data.size();
658         for ( int i = 0; i < size; i++ ) {
659             if ( Double.isNaN( elements[i] ) ) {
660                 continue;
661             }
662             r.add( elements[i] );
663         }
664         return r;
665     }
666 
667     /**
668      * Returns the sample kurtosis (aka excess) of a data sequence.
669      * 
670      * @param data DoubleArrayList
671      * @param mean double
672      * @param sampleVariance double
673      * @return double
674      */
675     public static double sampleKurtosis( DoubleArrayList data, double mean, double sampleVariance ) {
676         return sampleKurtosis( sizeWithoutMissingValues( data ), moment( data, 4, mean ), sampleVariance );
677     }
678 
679     /**
680      * Returns the sample skew of a data sequence.
681      * 
682      * @param data DoubleArrayList
683      * @param mean double
684      * @param sampleVariance double
685      * @return double
686      */
687     public static double sampleSkew( DoubleArrayList data, double mean, double sampleVariance ) {
688         return sampleSkew( sizeWithoutMissingValues( data ), moment( data, 3, mean ), sampleVariance );
689     }
690 
691     /**
692      * Returns the sample standard deviation.
693      * <p>
694      * This is included for compatibility with the superclass, but does not implement the correction used there.
695      * 
696      * @see cern.jet.stat.Descriptive#sampleStandardDeviation(int, double)
697      * @param size the number of elements of the data sequence.
698      * @param sampleVariance the <b>sample variance </b>.
699      */
700     public static double sampleStandardDeviation( int size, double sampleVariance ) {
701         return Math.sqrt( sampleVariance );
702     }
703 
704     /**
705      * Returns the sample variance of a data sequence. That is <tt>Sum (
706      * (data[i]-mean)^2 ) / (data.size()-1)</tt>.
707      * 
708      * @param data DoubleArrayList
709      * @param mean double
710      * @return double
711      */
712     public static double sampleVariance( DoubleArrayList data, double mean ) {
713         double[] elements = data.elements();
714         int effsize = sizeWithoutMissingValues( data );
715         int size = data.size();
716         double sum = 0;
717         // find the sum of the squares
718         for ( int i = size; --i >= 0; ) {
719             if ( Double.isNaN( elements[i] ) ) {
720                 continue;
721             }
722             double delta = elements[i] - mean;
723             sum += delta * delta;
724         }
725 
726         return sum / ( effsize - 1 );
727     }
728 
729     /**
730      * Return the size of the list, ignoring missing values.
731      * 
732      * @param list DoubleArrayList
733      * @return int
734      */
735     public static int sizeWithoutMissingValues( DoubleArrayList list ) {
736 
737         int size = 0;
738         for ( int i = 0; i < list.size(); i++ ) {
739             if ( !Double.isNaN( list.get( i ) ) ) {
740                 size++;
741             }
742         }
743         return size;
744     }
745 
746     /**
747      * Returns the skew of a data sequence, which is <tt>moment(data,3,mean) /
748      * standardDeviation<sup>3</sup></tt>.
749      * 
750      * @param data DoubleArrayList
751      * @param mean double
752      * @param standardDeviation double
753      * @return double
754      */
755     public static double skew( DoubleArrayList data, double mean, double standardDeviation ) {
756         return skew( moment( data, 3, mean ), standardDeviation );
757     }
758 
759     /**
760      * Standardize. Note that this does something slightly different than standardize in the superclass, because our
761      * sampleStandardDeviation does not use the correction of the superclass (which isn't really standard).
762      * 
763      * @param data DoubleArrayList
764      */
765     public static void standardize( DoubleArrayList data ) {
766         double mean = mean( data );
767         double stdev = Math.sqrt( sampleVariance( data, mean ) );
768         DescriptiveWithMissing.standardize( data, mean, stdev );
769     }
770 
771     /**
772      * Modifies a data sequence to be standardized. Mising values are ignored. Changes each element <tt>data[i]</tt> as
773      * follows: <tt>data[i] = (data[i]-mean)/standardDeviation</tt>.
774      * 
775      * @param data DoubleArrayList
776      * @param mean mean of data
777      * @param standardDeviation stdev of data
778      */
779     public static void standardize( DoubleArrayList data, double mean, double standardDeviation ) {
780         double[] elements = data.elements();
781         for ( int i = 0; i < elements.length; i++ ) {
782             if ( Double.isNaN( elements[i] ) ) {
783                 continue;
784             }
785             elements[i] = ( elements[i] - mean ) / standardDeviation;
786         }
787     }
788 
789     /**
790      * Returns the sum of a data sequence. That is <tt>Sum( data[i] )</tt>.
791      * 
792      * @param data DoubleArrayList
793      * @return double
794      */
795     public static double sum( DoubleArrayList data ) {
796         return sumOfPowerDeviations( data, 1, 0.0 );
797     }
798 
799     /**
800      * Returns the sum of inversions of a data sequence, which is <tt>Sum( 1.0 /
801      * data[i])</tt>.
802      * 
803      * @param data the data sequence.
804      * @param from the index of the first data element (inclusive).
805      * @param to the index of the last data element (inclusive).
806      * @return double
807      */
808     public static double sumOfInversions( DoubleArrayList data, int from, int to ) {
809         return sumOfPowerDeviations( data, -1, 0.0, from, to );
810     }
811 
812     /**
813      * Returns the sum of logarithms of a data sequence, which is <tt>Sum(
814      * Log(data[i])</tt>. Missing values are ignored.
815      * 
816      * @param data the data sequence.
817      * @param from the index of the first data element (inclusive).
818      * @param to the index of the last data element (inclusive).
819      * @return double
820      */
821     public static double sumOfLogarithms( DoubleArrayList data, int from, int to ) {
822         double[] elements = data.elements();
823         double logsum = 0;
824         for ( int i = from - 1; ++i <= to; ) {
825             if ( Double.isNaN( elements[i] ) ) {
826                 continue;
827             }
828             logsum += Math.log( elements[i] );
829         }
830         return logsum;
831     }
832 
833     /**
834      * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or
835      * <tt>k == -2 .. 4</tt>.
836      * 
837      * @param data DoubleArrayList
838      * @param k int
839      * @param c double
840      * @return double
841      */
842     public static double sumOfPowerDeviations( DoubleArrayList data, int k, double c ) {
843         return sumOfPowerDeviations( data, k, c, 0, data.size() - 1 );
844     }
845 
846     /**
847      * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt> for all <tt>i = from ..
848      * to</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or <tt>k == -2 .. 5</tt>. Missing values are
849      * ignored.
850      * 
851      * @param data DoubleArrayList
852      * @param k int
853      * @param c double
854      * @param from int
855      * @param to int
856      * @return double
857      */
858     public static double sumOfPowerDeviations( final DoubleArrayList data, final int k, final double c, final int from,
859             final int to ) {
860         final double[] elements = data.elements();
861         double sum = 0;
862         double v;
863         int i;
864         switch ( k ) { // optimized for speed
865             case -2:
866                 if ( c == 0.0 ) {
867                     for ( i = from - 1; ++i <= to; ) {
868                         if ( Double.isNaN( elements[i] ) ) {
869                             continue;
870                         }
871                         v = elements[i];
872                         sum += 1 / ( v * v );
873                     }
874                 } else {
875                     for ( i = from - 1; ++i <= to; ) {
876                         if ( Double.isNaN( elements[i] ) ) {
877                             continue;
878                         }
879                         v = elements[i] - c;
880                         sum += 1 / ( v * v );
881                     }
882                 }
883                 break;
884             case -1:
885                 if ( c == 0.0 ) {
886                     for ( i = from - 1; ++i <= to; ) {
887                         if ( Double.isNaN( elements[i] ) ) {
888                             continue;
889                         }
890                         sum += 1 / elements[i];
891                     }
892                 } else {
893                     for ( i = from - 1; ++i <= to; ) {
894                         if ( Double.isNaN( elements[i] ) ) {
895                             continue;
896                         }
897                         sum += 1 / ( elements[i] - c );
898                     }
899                 }
900                 break;
901             case 0:
902                 for ( i = from - 1; ++i <= to; ) {
903                     if ( Double.isNaN( elements[i] ) ) {
904                         continue;
905                     }
906                     sum += 1;
907                 }
908                 break;
909             case 1:
910                 if ( c == 0.0 ) {
911                     for ( i = from - 1; ++i <= to; ) {
912                         if ( Double.isNaN( elements[i] ) ) {
913                             continue;
914                         }
915                         sum += elements[i];
916                     }
917                 } else {
918                     for ( i = from - 1; ++i <= to; ) {
919                         if ( Double.isNaN( elements[i] ) ) {
920                             continue;
921                         }
922                         sum += elements[i] - c;
923                     }
924                 }
925                 break;
926             case 2:
927                 if ( c == 0.0 ) {
928                     for ( i = from - 1; ++i <= to; ) {
929                         if ( Double.isNaN( elements[i] ) ) {
930                             continue;
931                         }
932                         v = elements[i];
933                         sum += v * v;
934                     }
935                 } else {
936                     for ( i = from - 1; ++i <= to; ) {
937                         if ( Double.isNaN( elements[i] ) ) {
938                             continue;
939                         }
940                         v = elements[i] - c;
941                         sum += v * v;
942                     }
943                 }
944                 break;
945             case 3:
946                 if ( c == 0.0 ) {
947                     for ( i = from - 1; ++i <= to; ) {
948                         if ( Double.isNaN( elements[i] ) ) {
949                             continue;
950                         }
951                         v = elements[i];
952                         sum += v * v * v;
953                     }
954                 } else {
955                     for ( i = from - 1; ++i <= to; ) {
956                         if ( Double.isNaN( elements[i] ) ) {
957                             continue;
958                         }
959                         v = elements[i] - c;
960                         sum += v * v * v;
961                     }
962                 }
963                 break;
964             case 4:
965                 if ( c == 0.0 ) {
966                     for ( i = from - 1; ++i <= to; ) {
967                         if ( Double.isNaN( elements[i] ) ) {
968                             continue;
969                         }
970                         v = elements[i];
971                         sum += v * v * v * v;
972                     }
973                 } else {
974                     for ( i = from - 1; ++i <= to; ) {
975                         if ( Double.isNaN( elements[i] ) ) {
976                             continue;
977                         }
978                         v = elements[i] - c;
979                         sum += v * v * v * v;
980                     }
981                 }
982                 break;
983             case 5:
984                 if ( c == 0.0 ) {
985                     for ( i = from - 1; ++i <= to; ) {
986                         if ( Double.isNaN( elements[i] ) ) {
987                             continue;
988                         }
989                         v = elements[i];
990                         sum += v * v * v * v * v;
991                     }
992                 } else {
993                     for ( i = from - 1; ++i <= to; ) {
994                         if ( Double.isNaN( elements[i] ) ) {
995                             continue;
996                         }
997                         v = elements[i] - c;
998                         sum += v * v * v * v * v;
999                     }
1000                 }
1001                 break;
1002             default:
1003                 for ( i = from - 1; ++i <= to; ) {
1004                     if ( Double.isNaN( elements[i] ) ) {
1005                         continue;
1006                     }
1007                     sum += Math.pow( elements[i] - c, k );
1008                 }
1009                 break;
1010         }
1011         return sum;
1012     }
1013 
1014     /**
1015      * Returns the sum of powers of a data sequence, which is <tt>Sum (
1016      * data[i]<sup>k</sup> )</tt>.
1017      * 
1018      * @param data DoubleArrayList
1019      * @param k int
1020      * @return double
1021      */
1022     public static double sumOfPowers( DoubleArrayList data, int k ) {
1023         return sumOfPowerDeviations( data, k, 0 );
1024     }
1025 
1026     /**
1027      * Compute the sum of the squared deviations from the mean of a data sequence. Missing values are ignored.
1028      * 
1029      * @param data DoubleArrayList
1030      * @return double
1031      */
1032     public static double sumOfSquaredDeviations( DoubleArrayList data ) {
1033         return sumOfSquaredDeviations( sizeWithoutMissingValues( data ),
1034                 variance( sizeWithoutMissingValues( data ), sum( data ), sumOfSquares( data ) ) );
1035     }
1036 
1037     /**
1038      * Returns the sum of squares of a data sequence. Skips missing values.
1039      * 
1040      * @param data DoubleArrayList
1041      * @return double
1042      */
1043     public static double sumOfSquares( DoubleArrayList data ) {
1044         return sumOfPowerDeviations( data, 2, 0.0 );
1045     }
1046 
1047     /**
1048      * Returns the trimmed mean of a sorted data sequence. Missing values are completely ignored.
1049      * 
1050      * @param sortedData the data sequence; <b>must be sorted ascending </b>.
1051      * @param mean the mean of the (full) sorted data sequence.
1052      * @param left int the number of leading elements to trim.
1053      * @param right int number of trailing elements to trim.
1054      * @return double
1055      */
1056     public static double trimmedMean( DoubleArrayList sortedData, double mean, int left, int right ) {
1057         return Descriptive.trimmedMean( removeMissing( sortedData ), mean, left, right );
1058     }
1059 
1060     /**
1061      * Provided for convenience!
1062      * 
1063      * @param data DoubleArrayList
1064      * @return double
1065      */
1066     public static double variance( DoubleArrayList data ) {
1067         return variance( sizeWithoutMissingValues( data ), sum( data ), sumOfSquares( data ) );
1068     }
1069 
1070     /**
1071      * 
1072      */
1073     public static double variance( int sizeWithoutMissing, double sum, double sumOfSquares ) {
1074         return Descriptive.variance( sizeWithoutMissing, sum, sumOfSquares );
1075     }
1076 
1077     /**
1078      * Returns the weighted mean of a data sequence. That is <tt> Sum (data[i] *
1079      * weights[i]) / Sum ( weights[i] )</tt>.
1080      * 
1081      * @param data DoubleArrayList
1082      * @param weights DoubleArrayList
1083      * @return double
1084      */
1085     public static double weightedMean( DoubleArrayList data, DoubleArrayList weights ) {
1086         int size = data.size();
1087         if ( size != weights.size() || size == 0 ) {
1088             throw new IllegalArgumentException();
1089         }
1090 
1091         double[] elements = data.elements();
1092         double[] theWeights = weights.elements();
1093         double sum = 0.0;
1094         double weightsSum = 0.0;
1095         for ( int i = size; --i >= 0; ) {
1096             double w = theWeights[i];
1097             if ( Double.isNaN( elements[i] ) || Double.isNaN( w ) ) {
1098                 continue;
1099             }
1100             sum += elements[i] * w;
1101             weightsSum += w;
1102         }
1103 
1104         return sum / weightsSum;
1105     }
1106 
1107     /**
1108      * Returns the winsorized mean of a sorted data sequence.
1109      * 
1110      * @param sortedData DoubleArrayList, must already be sorted ascending
1111      * @param mean the mean of the (full) sorted data sequence.
1112      * @param left the number of leading elements to trim. Refers to the number of elements to trim
1113      *        <em>excluding any missing values</em>
1114      * @param right the number of trailing elements to trim <em>excluding any missing values</em>
1115      * @return double
1116      */
1117     public static double winsorizedMean( DoubleArrayList sortedData, double mean, int left, int right ) {
1118         DoubleArrayList sdm = removeMissing( sortedData );
1119         return Descriptive.winsorizedMean( sdm, mean( sdm ), left, right );
1120     }
1121 
1122     /* private methods */
1123 
1124     private DescriptiveWithMissing() {
1125     }
1126 
1127 } // end of class