1 /*
2 * The baseCode project
3 *
4 * Copyright (c) 2006 University of British Columbia
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19 package ubic.basecode.math;
20
21 import cern.colt.list.DoubleArrayList;
22 import cern.jet.stat.Descriptive;
23
24 /**
25 * Mathematical functions for statistics that allow missing values without scotching the calculations.
26 * <p>
27 * Be careful because some methods from cern.jet.stat.Descriptive have not been overridden and will yield a
28 * UnsupportedOperationException if used.
29 * </p>
30 * <p>
31 * Some functions that come with DoubleArrayLists will not work in an entirely compatible way with missing values. For
32 * examples, size() reports the total number of elements, including missing values. To get a count of non-missing
33 * values, use this.sizeWithoutMissingValues(). The right one to use may vary.
34 * </p>
35 * <p>
36 * Not all methods need to be overridden. However, all methods that take a "size" parameter should be passed the results
37 * of sizeWithoutMissingValues(data), instead of data.size().
38 * <p>
39 * Based in part on code from the colt package: Copyright © 1999 CERN - European Organization for Nuclear Research.
40 *
41 * @see <a
42 * href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/Descriptive.html">cern.jet.stat.Descriptive
43 * </a>
44 * @author Paul Pavlidis
45 *
46 */
47 public class DescriptiveWithMissing extends cern.jet.stat.Descriptive {
48
49 /**
50 * <b>Not supported. </b>
51 *
52 * @param data DoubleArrayList
53 * @param lag int
54 * @param mean double
55 * @param variance double
56 * @return double
57 */
58 public static double autoCorrelation( DoubleArrayList data, int lag, double mean, double variance ) {
59 throw new UnsupportedOperationException( "autoCorrelation not supported with missing values" );
60 }
61
62 /**
63 * Highly optimized version of the correlation computation, where as much information is precomputed as possible.
64 * Use of this method only makes sense if many comparisons with the inputs x and y are being performed.
65 * <p>
66 * Implementation note: In correlation(DoubleArrayList x, DoubleArrayList y), profiling shows that calls to
67 * Double.NaN consume half the CPU time. The precomputation of the element-by-element squared values is another
68 * obvious optimization. There is also no checking for matching lengths of the arrays.
69 *
70 * @param x
71 * @param y
72 * @param selfSquaredX double array containing values of x_i^2 for each x.
73 * @param selfSquaredY
74 * @param nanStatusX boolean array containing value of Double.isNaN() for each X.
75 * @param nanStatusY
76 * @return
77 */
78 public static double correlation( double[] x, double[] y, double[] selfSquaredX, double[] selfSquaredY,
79 boolean[] nanStatusX, boolean[] nanStatusY ) {
80
81 double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
82 int numused = 0;
83 syy = 0.0;
84 sxy = 0.0;
85 sxx = 0.0;
86 sx = 0.0;
87 sy = 0.0;
88
89 int length = x.length;
90 for ( int j = 0; j < length; j++ ) {
91 xj = x[j];
92 yj = y[j];
93
94 if ( nanStatusX[j] || nanStatusY[j] ) {
95 continue;
96 }
97 sx += xj;
98 sy += yj;
99 sxy += xj * yj;
100 sxx += selfSquaredX[j];
101 syy += selfSquaredY[j];
102 numused++;
103 }
104
105 if ( numused > 0 ) {
106 ay = sy / numused;
107 ax = sx / numused;
108 return ( sxy - sx * ay ) / Math.sqrt( ( sxx - sx * ax ) * ( syy - sy * ay ) );
109 }
110 return Double.NaN; // signifies that it could not be calculated.
111 }
112
113 /**
114 * Returns the correlation of two data sequences. That is
115 * <tt>covariance(data1,data2)/(standardDev1*standardDev2)</tt>. Missing values are ignored. This method is
116 * overridden to stop users from using the method in the superclass when missing values are present. The problem is
117 * that the standard deviation cannot be computed without knowning which values are not missing in both vectors to
118 * be compared. Thus the standardDev parameters are thrown away by this method.
119 *
120 * @param data1 DoubleArrayList
121 * @param standardDev1 double - not used
122 * @param data2 DoubleArrayList
123 * @param standardDev2 double - not used
124 * @return double
125 */
126 public static double correlation( DoubleArrayList data1, double standardDev1, DoubleArrayList data2,
127 double standardDev2 ) {
128 return correlation( data1, data2 );
129 }
130
131 /**
132 * Calculate the pearson correlation of two arrays. Missing values (NaNs) are ignored.
133 *
134 * @param x DoubleArrayList
135 * @param y DoubleArrayList
136 * @return double
137 */
138 public static double correlation( DoubleArrayList x, DoubleArrayList y ) {
139 int j;
140 double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
141 int numused;
142 syy = 0.0;
143 sxy = 0.0;
144 sxx = 0.0;
145 sx = 0.0;
146 sy = 0.0;
147 numused = 0;
148 if ( x.size() != y.size() ) {
149 throw new ArithmeticException( "Unequal vector sizes: " + x.size() + " != " + y.size() );
150 }
151
152 double[] xel = x.elements();
153 double[] yel = y.elements();
154
155 int length = x.size();
156 for ( j = 0; j < length; j++ ) {
157 xj = xel[j];
158 yj = yel[j];
159
160 if ( !Double.isNaN( xj ) && !Double.isNaN( yj ) ) {
161 sx += xj;
162 sy += yj;
163 sxy += xj * yj;
164 sxx += xj * xj;
165 syy += yj * yj;
166 numused++;
167 }
168 }
169
170 if ( numused > 0 ) {
171 ay = sy / numused;
172 ax = sx / numused;
173 return ( sxy - sx * ay ) / Math.sqrt( ( sxx - sx * ax ) * ( syy - sy * ay ) );
174 }
175 return Double.NaN; // signifies that it could not be calculated.
176
177 }
178
179 /**
180 * Returns the SAMPLE covariance of two data sequences. Pairs of values are only considered if both are not NaN. If
181 * there are no non-missing pairs, the covariance is zero.
182 *
183 * @param data1 the first vector
184 * @param data2 the second vector
185 * @return double
186 */
187 public static double covariance( DoubleArrayList data1, DoubleArrayList data2 ) {
188 int size = data1.size();
189 if ( size != data2.size() || size == 0 ) {
190 throw new IllegalArgumentException();
191 }
192 double[] elements1 = data1.elements();
193 double[] elements2 = data2.elements();
194
195 /* initialize sumx and sumy to the first non-NaN pair of values */
196
197 int i = 0;
198 double sumx = 0.0, sumy = 0.0, Sxy = 0.0;
199 while ( i < size ) {
200 sumx = elements1[i];
201 sumy = elements2[i];
202 if ( !Double.isNaN( elements1[i] ) && !Double.isNaN( elements2[i] ) ) {
203 break;
204 }
205 i++;
206 }
207 i++;
208 int usedPairs = 1;
209 for ( ; i < size; ++i ) {
210 double x = elements1[i];
211 double y = elements2[i];
212 if ( Double.isNaN( x ) || Double.isNaN( y ) ) {
213 continue;
214 }
215
216 sumx += x;
217 Sxy += ( x - sumx / ( usedPairs + 1 ) ) * ( y - sumy / usedPairs );
218 sumy += y;
219 usedPairs++;
220 }
221 return Sxy / ( usedPairs - 1 );
222 }
223
224 /**
225 * Durbin-Watson computation. This measures the serial correlation in a data series.
226 *
227 * @param data DoubleArrayList
228 * @return double
229 * @todo this will still break in some situations where there are missing values
230 */
231 public static double durbinWatson( DoubleArrayList data ) {
232 int size = data.size();
233 if ( size < 2 ) {
234 throw new IllegalArgumentException( "data sequence must contain at least two values." );
235 }
236
237 double[] elements = data.elements();
238 double run = 0;
239 double run_sq = 0;
240 int firstNotNaN = 0;
241 while ( firstNotNaN < size ) {
242 run_sq = elements[firstNotNaN] * elements[firstNotNaN];
243
244 if ( !Double.isNaN( elements[firstNotNaN] ) ) {
245 break;
246 }
247
248 firstNotNaN++;
249 }
250
251 if ( firstNotNaN > 0 && size - firstNotNaN < 2 ) {
252 throw new IllegalArgumentException( "data sequence must contain at least two non-missing values." );
253
254 }
255
256 for ( int i = firstNotNaN + 1; i < size; i++ ) {
257 int gap = 1;
258 while ( i < size && Double.isNaN( elements[i] ) ) {
259 gap++;
260 i++;
261 continue;
262 }
263 if ( i >= size ) continue; // missing value at end will cause this.
264 double x = elements[i] - elements[i - gap];
265 /** */
266 run += x * x;
267 run_sq += elements[i] * elements[i];
268 }
269
270 return run / run_sq;
271 }
272
273 /**
274 * Returns the geometric mean of a data sequence. Missing values are ignored. Note that for a geometric mean to be
275 * meaningful, the minimum of the data sequence must not be less or equal to zero. <br>
276 * The geometric mean is given by <tt>pow( Product( data[i] ),
277 * 1/data.size())</tt>. This method tries to avoid overflows at the expense of an equivalent but somewhat slow
278 * definition: <tt>geo = Math.exp( Sum(
279 * Log(data[i]) ) / data.size())</tt>.
280 *
281 * @param data DoubleArrayList
282 * @return double
283 */
284 public static double geometricMean( DoubleArrayList data ) {
285 return geometricMean( sizeWithoutMissingValues( data ), sumOfLogarithms( data, 0, data.size() - 1 ) );
286 }
287
288 /**
289 * Incrementally maintains and updates minimum, maximum, sum and sum of squares of a data sequence.
290 * <p>
291 * Assume we have already recorded some data sequence elements and know their minimum, maximum, sum and sum of
292 * squares. Assume further, we are to record some more elements and to derive updated values of minimum, maximum,
293 * sum and sum of squares. This method computes those updated values without needing to know the already recorded
294 * elements. This is interesting for interactive online monitoring and/or applications that cannot keep the entire
295 * huge data sequence in memory.
296 *
297 * @param data the additional elements to be incorporated into min, max, etc.
298 * @param from the index of the first element within <tt>data</tt> to consider.
299 * @param to the index of the last element within <tt>data</tt> to consider. The method incorporates elements
300 * <tt>data[from], ..., data[to]</tt>.
301 * @param inOut the old values in the following format:
302 * <ul>
303 * <li><tt>inOut[0]</tt> is the old minimum. <li><tt>inOut[1]</tt> is the old maximum. <li><tt>inOut[2]</tt>
304 * is the old sum. <li><tt>inOut[3]</tt> is the old sum of squares.
305 * </ul>
306 * If no data sequence elements have so far been recorded set the values as follows
307 * <ul>
308 * <li><tt>inOut[0] = Double.POSITIVE_INFINITY</tt> as the old minimum. <li><tt>inOut[1] =
309 * Double.NEGATIVE_INFINITY</tt> as the old maximum. <li><tt>inOut[2] = 0.0</tt> as the old sum. <li><tt>
310 * inOut[3] = 0.0</tt> as the old sum of squares.
311 * </ul>
312 */
313 public static void incrementalUpdate( DoubleArrayList data, int from, int to, double[] inOut ) {
314 throw new UnsupportedOperationException( "incrementalUpdate not supported with missing values" );
315 }
316
317 /**
318 * <b>Not supported. </b>
319 *
320 * @param data DoubleArrayList
321 * @param from int
322 * @param to int
323 * @param fromSumIndex int
324 * @param toSumIndex int
325 * @param sumOfPowers double[]
326 */
327 public static void incrementalUpdateSumsOfPowers( DoubleArrayList data, int from, int to, int fromSumIndex,
328 int toSumIndex, double[] sumOfPowers ) {
329 throw new UnsupportedOperationException( "incrementalUpdateSumsOfPowers not supported with missing values" );
330 }
331
332 /**
333 * <b>Not supported. </b>
334 *
335 * @param data DoubleArrayList
336 * @param weights DoubleArrayList
337 * @param from int
338 * @param to int
339 * @param inOut double[]
340 */
341 public static void incrementalWeightedUpdate( DoubleArrayList data, DoubleArrayList weights, int from, int to,
342 double[] inOut ) {
343 throw new UnsupportedOperationException( "incrementalWeightedUpdate not supported with missing values" );
344 }
345
346 /**
347 * Returns the kurtosis (aka excess) of a data sequence.
348 *
349 * @param moment4 the fourth central moment, which is <tt>moment(data,4,mean)</tt>.
350 * @param standardDeviation the standardDeviation.
351 * @return double
352 */
353 public static double kurtosis( double moment4, double standardDeviation ) {
354 return -3 + moment4 / ( standardDeviation * standardDeviation * standardDeviation * standardDeviation );
355 }
356
357 /**
358 * Returns the kurtosis (aka excess) of a data sequence, which is <tt>-3 +
359 * moment(data,4,mean) / standardDeviation<sup>4</sup></tt>.
360 *
361 * @param data DoubleArrayList
362 * @param mean double
363 * @param standardDeviation double
364 * @return double
365 */
366 public static double kurtosis( DoubleArrayList data, double mean, double standardDeviation ) {
367 return kurtosis( moment( data, 4, mean ), standardDeviation );
368 }
369
370 /**
371 * <b>Not supported. </b>
372 *
373 * @param data DoubleArrayList
374 * @param mean double
375 * @return double
376 */
377 public static double lag1( DoubleArrayList data, double mean ) {
378 throw new UnsupportedOperationException( "lag1 not supported with missing values" );
379 }
380
381 /**
382 * @param dat
383 * @return the median absolute deviation from the median.
384 */
385 public static double mad( DoubleArrayList dat ) {
386 DoubleArrayList copy = removeMissing( dat.copy() );
387
388 double median = median( copy );
389
390 DoubleArrayList devsum = new DoubleArrayList( copy.size() );
391 for ( int i = 0; i < copy.size(); i++ ) {
392 double v = copy.getQuick( i );
393 devsum.add( Math.abs( v - median ) );
394 }
395
396 devsum.sort();
397 return Descriptive.median( devsum );
398 }
399
400 public static double max( DoubleArrayList input ) {
401 int size = input.size();
402 if ( size == 0 ) throw new IllegalArgumentException();
403
404 double[] elements = input.elements();
405 double max = Double.MIN_VALUE;
406 for ( int i = 0; i < size; i++ ) {
407 if ( Double.isNaN( elements[i] ) ) continue;
408 if ( elements[i] > max ) max = elements[i];
409 }
410
411 return max;
412 }
413
414 /**
415 * Special mean calculation where we use the effective size as an input.
416 *
417 * @param elements The data double array.
418 * @param effectiveSize The effective size used for the mean calculation.
419 * @return double
420 */
421 public static double mean( double[] elements, int effectiveSize ) {
422
423 int length = elements.length;
424
425 if ( 0 == effectiveSize ) {
426 return Double.NaN;
427 }
428
429 double sum = 0.0;
430 int i, count;
431 count = 0;
432 double value;
433 for ( i = 0; i < length; i++ ) {
434 value = elements[i];
435 if ( Double.isNaN( value ) ) {
436 continue;
437 }
438 sum += value;
439 count++;
440 }
441 if ( 0.0 == count ) {
442 return Double.NaN;
443 }
444 return sum / effectiveSize;
445 }
446
447 /**
448 * @param data Values to be analyzed.
449 * @return Mean of the values in x. Missing values are ignored in the analysis.
450 */
451 public static double mean( DoubleArrayList data ) {
452 return sum( data ) / sizeWithoutMissingValues( data );
453 }
454
455 /**
456 * Special mean calculation where we use the effective size as an input.
457 *
458 * @param x The data
459 * @param effectiveSize The effective size used for the mean calculation.
460 * @return double
461 */
462 public static double mean( DoubleArrayList x, int effectiveSize ) {
463
464 int length = x.size();
465
466 if ( 0 == effectiveSize ) {
467 return Double.NaN;
468 }
469
470 double sum = 0.0;
471 int i, count;
472 count = 0;
473 double value;
474 double[] elements = x.elements();
475 for ( i = 0; i < length; i++ ) {
476 value = elements[i];
477 if ( Double.isNaN( value ) ) {
478 continue;
479 }
480 sum += value;
481 count++;
482 }
483 if ( 0.0 == count ) {
484 return Double.NaN;
485 }
486 return sum / effectiveSize;
487
488 }
489
490 /**
491 * Calculate the mean of the values above to a particular quantile of an array.
492 *
493 * @param quantile A value from 0 to 1
494 * @param array Array for which we want to get the quantile.
495 * @return double
496 */
497 public static double meanAboveQuantile( double quantile, DoubleArrayList array ) {
498
499 if ( quantile < 0.0 || quantile > 1.0 ) {
500 throw new IllegalArgumentException( "Quantile must be between 0 and 1" );
501 }
502
503 double returnvalue = 0.0;
504 int k = 0;
505
506 double quantileValue = DescriptiveWithMissing.quantile( array, quantile );
507
508 for ( int i = 0; i < array.size(); i++ ) {
509 if ( array.get( i ) > quantileValue ) {
510 returnvalue += array.get( i );
511 k++;
512 }
513 }
514
515 if ( k == 0 ) {
516 throw new ArithmeticException( "No values found above quantile" );
517 }
518
519 return returnvalue / k;
520 }
521
522 /**
523 * Returns the median. Missing values are ignored entirely.
524 *
525 * @param data the data sequence, does not have to be sorted.
526 * @return double
527 */
528 public static double median( DoubleArrayList data ) {
529 return DescriptiveWithMissing.quantile( data, 0.5 );
530 }
531
532 public static double min( DoubleArrayList input ) {
533 int size = input.size();
534 if ( size == 0 ) throw new IllegalArgumentException();
535
536 double[] elements = input.elements();
537 double min = Double.MAX_VALUE;
538 for ( int i = 0; i < size; i++ ) {
539 if ( Double.isNaN( elements[i] ) ) continue;
540 if ( elements[i] < min ) min = elements[i];
541 }
542
543 return min;
544 }
545
546 /**
547 * Returns the moment of <tt>k</tt> -th order with constant <tt>c</tt> of a data sequence, which is
548 * <tt>Sum( (data[i]-c)<sup>k</sup> ) /
549 * data.size()</tt>.
550 *
551 * @param data DoubleArrayList
552 * @param k int
553 * @param c double
554 * @return double
555 */
556 public static double moment( DoubleArrayList data, int k, double c ) {
557 return sumOfPowerDeviations( data, k, c ) / sizeWithoutMissingValues( data );
558 }
559
560 /**
561 * Returns the product of a data sequence, which is <tt>Prod( data[i] )</tt>. Missing values are ignored. In other
562 * words: <tt>data[0]*data[1]*...*data[data.size()-1]</tt>. Note that you may easily get numeric overflows.
563 *
564 * @param data DoubleArrayList
565 * @return double
566 */
567 public static double product( DoubleArrayList data ) {
568 int size = data.size();
569 double[] elements = data.elements();
570
571 double product = 1;
572 for ( int i = size; --i >= 0; ) {
573 if ( Double.isNaN( elements[i] ) ) {
574 continue;
575 }
576 product *= elements[i];
577
578 }
579 return product;
580 }
581
582 /**
583 * Returns the <tt>phi-</tt> quantile; that is, an element <tt>elem</tt> for which holds that <tt>phi</tt> percent
584 * of data elements are less than <tt>elem</tt>. Missing values are ignored. The quantile need not necessarily be
585 * contained in the data sequence, it can be a linear interpolation.
586 *
587 * @param data the data sequence, does not have to be sorted.
588 * @param phi the percentage; must satisfy <tt>0 <= phi <= 1</tt>.
589 * @todo possibly implement so a copy is not made.
590 * @return double
591 */
592 public static double quantile( DoubleArrayList data, double phi ) {
593 DoubleArrayList sortedData = removeMissing( data.copy() );
594 sortedData.sort();
595 return Descriptive.quantile( sortedData, phi );
596 }
597
598 /**
599 * Returns how many percent of the elements contained in the receiver are <tt><= element</tt>. Does linear
600 * interpolation if the element is not contained but lies in between two contained elements. Missing values are
601 * ignored.
602 *
603 * @param data the list to be searched
604 * @param element the element to search for.
605 * @return the percentage <tt>phi</tt> of elements <tt><= element</tt>(<tt>0.0 <= phi <= 1.0)</tt>.
606 */
607 public static double quantileInverse( DoubleArrayList data, double element ) {
608 DoubleArrayList sortedData = removeMissing( data.copy() );
609 sortedData.sort();
610 return rankInterpolated( sortedData, element ) / sortedData.size();
611 }
612
613 /**
614 * Returns the quantiles of the specified percentages. The quantiles need not necessarily be contained in the data
615 * sequence, it can be a linear interpolation.
616 *
617 * @param sortedData the data sequence; <b>must be sorted ascending </b>.
618 * @param percentages the percentages for which quantiles are to be computed. Each percentage must be in the
619 * interval <tt>[0.0,1.0]</tt>.
620 * @return the quantiles.
621 */
622 public static DoubleArrayList quantiles( DoubleArrayList sortedData, DoubleArrayList percentages ) {
623 int s = percentages.size();
624 DoubleArrayList quantiles = new DoubleArrayList( s );
625
626 for ( int i = 0; i < s; i++ ) {
627 quantiles.add( quantile( sortedData, percentages.get( i ) ) );
628 }
629
630 return quantiles;
631 }
632
633 /**
634 * Returns the linearly interpolated number of elements in a list less or equal to a given element. Missing values
635 * are ignored. The rank is the number of elements <= element. Ranks are of the form
636 * <tt>{0, 1, 2,..., sortedList.size()}</tt>. If no element is <= element, then the rank is zero. If the element
637 * lies in between two contained elements, then linear interpolation is used and a non integer value is returned.
638 *
639 * @param sortedList the list to be searched (must be sorted ascending).
640 * @param element the element to search for.
641 * @return the rank of the element.
642 * @todo possibly implement so a copy is not made.
643 */
644 public static double rankInterpolated( DoubleArrayList sortedList, double element ) {
645 return Descriptive.rankInterpolated( removeMissing( sortedList ), element );
646 }
647
648 /**
649 * Makes a copy of the list that doesn't have the missing values.
650 *
651 * @param data DoubleArrayList
652 * @return DoubleArrayList
653 */
654 public static DoubleArrayList removeMissing( DoubleArrayList data ) {
655 DoubleArrayList r = new DoubleArrayList( sizeWithoutMissingValues( data ) );
656 double[] elements = data.elements();
657 int size = data.size();
658 for ( int i = 0; i < size; i++ ) {
659 if ( Double.isNaN( elements[i] ) ) {
660 continue;
661 }
662 r.add( elements[i] );
663 }
664 return r;
665 }
666
667 /**
668 * Returns the sample kurtosis (aka excess) of a data sequence.
669 *
670 * @param data DoubleArrayList
671 * @param mean double
672 * @param sampleVariance double
673 * @return double
674 */
675 public static double sampleKurtosis( DoubleArrayList data, double mean, double sampleVariance ) {
676 return sampleKurtosis( sizeWithoutMissingValues( data ), moment( data, 4, mean ), sampleVariance );
677 }
678
679 /**
680 * Returns the sample skew of a data sequence.
681 *
682 * @param data DoubleArrayList
683 * @param mean double
684 * @param sampleVariance double
685 * @return double
686 */
687 public static double sampleSkew( DoubleArrayList data, double mean, double sampleVariance ) {
688 return sampleSkew( sizeWithoutMissingValues( data ), moment( data, 3, mean ), sampleVariance );
689 }
690
691 /**
692 * Returns the sample standard deviation.
693 * <p>
694 * This is included for compatibility with the superclass, but does not implement the correction used there.
695 *
696 * @see cern.jet.stat.Descriptive#sampleStandardDeviation(int, double)
697 * @param size the number of elements of the data sequence.
698 * @param sampleVariance the <b>sample variance </b>.
699 */
700 public static double sampleStandardDeviation( int size, double sampleVariance ) {
701 return Math.sqrt( sampleVariance );
702 }
703
704 /**
705 * Returns the sample variance of a data sequence. That is <tt>Sum (
706 * (data[i]-mean)^2 ) / (data.size()-1)</tt>.
707 *
708 * @param data DoubleArrayList
709 * @param mean double
710 * @return double
711 */
712 public static double sampleVariance( DoubleArrayList data, double mean ) {
713 double[] elements = data.elements();
714 int effsize = sizeWithoutMissingValues( data );
715 int size = data.size();
716 double sum = 0;
717 // find the sum of the squares
718 for ( int i = size; --i >= 0; ) {
719 if ( Double.isNaN( elements[i] ) ) {
720 continue;
721 }
722 double delta = elements[i] - mean;
723 sum += delta * delta;
724 }
725
726 return sum / ( effsize - 1 );
727 }
728
729 /**
730 * Return the size of the list, ignoring missing values.
731 *
732 * @param list DoubleArrayList
733 * @return int
734 */
735 public static int sizeWithoutMissingValues( DoubleArrayList list ) {
736
737 int size = 0;
738 for ( int i = 0; i < list.size(); i++ ) {
739 if ( !Double.isNaN( list.get( i ) ) ) {
740 size++;
741 }
742 }
743 return size;
744 }
745
746 /**
747 * Returns the skew of a data sequence, which is <tt>moment(data,3,mean) /
748 * standardDeviation<sup>3</sup></tt>.
749 *
750 * @param data DoubleArrayList
751 * @param mean double
752 * @param standardDeviation double
753 * @return double
754 */
755 public static double skew( DoubleArrayList data, double mean, double standardDeviation ) {
756 return skew( moment( data, 3, mean ), standardDeviation );
757 }
758
759 /**
760 * Standardize. Note that this does something slightly different than standardize in the superclass, because our
761 * sampleStandardDeviation does not use the correction of the superclass (which isn't really standard).
762 *
763 * @param data DoubleArrayList
764 */
765 public static void standardize( DoubleArrayList data ) {
766 double mean = mean( data );
767 double stdev = Math.sqrt( sampleVariance( data, mean ) );
768 DescriptiveWithMissing.standardize( data, mean, stdev );
769 }
770
771 /**
772 * Modifies a data sequence to be standardized. Mising values are ignored. Changes each element <tt>data[i]</tt> as
773 * follows: <tt>data[i] = (data[i]-mean)/standardDeviation</tt>.
774 *
775 * @param data DoubleArrayList
776 * @param mean mean of data
777 * @param standardDeviation stdev of data
778 */
779 public static void standardize( DoubleArrayList data, double mean, double standardDeviation ) {
780 double[] elements = data.elements();
781 for ( int i = 0; i < elements.length; i++ ) {
782 if ( Double.isNaN( elements[i] ) ) {
783 continue;
784 }
785 elements[i] = ( elements[i] - mean ) / standardDeviation;
786 }
787 }
788
789 /**
790 * Returns the sum of a data sequence. That is <tt>Sum( data[i] )</tt>.
791 *
792 * @param data DoubleArrayList
793 * @return double
794 */
795 public static double sum( DoubleArrayList data ) {
796 return sumOfPowerDeviations( data, 1, 0.0 );
797 }
798
799 /**
800 * Returns the sum of inversions of a data sequence, which is <tt>Sum( 1.0 /
801 * data[i])</tt>.
802 *
803 * @param data the data sequence.
804 * @param from the index of the first data element (inclusive).
805 * @param to the index of the last data element (inclusive).
806 * @return double
807 */
808 public static double sumOfInversions( DoubleArrayList data, int from, int to ) {
809 return sumOfPowerDeviations( data, -1, 0.0, from, to );
810 }
811
812 /**
813 * Returns the sum of logarithms of a data sequence, which is <tt>Sum(
814 * Log(data[i])</tt>. Missing values are ignored.
815 *
816 * @param data the data sequence.
817 * @param from the index of the first data element (inclusive).
818 * @param to the index of the last data element (inclusive).
819 * @return double
820 */
821 public static double sumOfLogarithms( DoubleArrayList data, int from, int to ) {
822 double[] elements = data.elements();
823 double logsum = 0;
824 for ( int i = from - 1; ++i <= to; ) {
825 if ( Double.isNaN( elements[i] ) ) {
826 continue;
827 }
828 logsum += Math.log( elements[i] );
829 }
830 return logsum;
831 }
832
833 /**
834 * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or
835 * <tt>k == -2 .. 4</tt>.
836 *
837 * @param data DoubleArrayList
838 * @param k int
839 * @param c double
840 * @return double
841 */
842 public static double sumOfPowerDeviations( DoubleArrayList data, int k, double c ) {
843 return sumOfPowerDeviations( data, k, c, 0, data.size() - 1 );
844 }
845
846 /**
847 * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt> for all <tt>i = from ..
848 * to</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or <tt>k == -2 .. 5</tt>. Missing values are
849 * ignored.
850 *
851 * @param data DoubleArrayList
852 * @param k int
853 * @param c double
854 * @param from int
855 * @param to int
856 * @return double
857 */
858 public static double sumOfPowerDeviations( final DoubleArrayList data, final int k, final double c, final int from,
859 final int to ) {
860 final double[] elements = data.elements();
861 double sum = 0;
862 double v;
863 int i;
864 switch ( k ) { // optimized for speed
865 case -2:
866 if ( c == 0.0 ) {
867 for ( i = from - 1; ++i <= to; ) {
868 if ( Double.isNaN( elements[i] ) ) {
869 continue;
870 }
871 v = elements[i];
872 sum += 1 / ( v * v );
873 }
874 } else {
875 for ( i = from - 1; ++i <= to; ) {
876 if ( Double.isNaN( elements[i] ) ) {
877 continue;
878 }
879 v = elements[i] - c;
880 sum += 1 / ( v * v );
881 }
882 }
883 break;
884 case -1:
885 if ( c == 0.0 ) {
886 for ( i = from - 1; ++i <= to; ) {
887 if ( Double.isNaN( elements[i] ) ) {
888 continue;
889 }
890 sum += 1 / elements[i];
891 }
892 } else {
893 for ( i = from - 1; ++i <= to; ) {
894 if ( Double.isNaN( elements[i] ) ) {
895 continue;
896 }
897 sum += 1 / ( elements[i] - c );
898 }
899 }
900 break;
901 case 0:
902 for ( i = from - 1; ++i <= to; ) {
903 if ( Double.isNaN( elements[i] ) ) {
904 continue;
905 }
906 sum += 1;
907 }
908 break;
909 case 1:
910 if ( c == 0.0 ) {
911 for ( i = from - 1; ++i <= to; ) {
912 if ( Double.isNaN( elements[i] ) ) {
913 continue;
914 }
915 sum += elements[i];
916 }
917 } else {
918 for ( i = from - 1; ++i <= to; ) {
919 if ( Double.isNaN( elements[i] ) ) {
920 continue;
921 }
922 sum += elements[i] - c;
923 }
924 }
925 break;
926 case 2:
927 if ( c == 0.0 ) {
928 for ( i = from - 1; ++i <= to; ) {
929 if ( Double.isNaN( elements[i] ) ) {
930 continue;
931 }
932 v = elements[i];
933 sum += v * v;
934 }
935 } else {
936 for ( i = from - 1; ++i <= to; ) {
937 if ( Double.isNaN( elements[i] ) ) {
938 continue;
939 }
940 v = elements[i] - c;
941 sum += v * v;
942 }
943 }
944 break;
945 case 3:
946 if ( c == 0.0 ) {
947 for ( i = from - 1; ++i <= to; ) {
948 if ( Double.isNaN( elements[i] ) ) {
949 continue;
950 }
951 v = elements[i];
952 sum += v * v * v;
953 }
954 } else {
955 for ( i = from - 1; ++i <= to; ) {
956 if ( Double.isNaN( elements[i] ) ) {
957 continue;
958 }
959 v = elements[i] - c;
960 sum += v * v * v;
961 }
962 }
963 break;
964 case 4:
965 if ( c == 0.0 ) {
966 for ( i = from - 1; ++i <= to; ) {
967 if ( Double.isNaN( elements[i] ) ) {
968 continue;
969 }
970 v = elements[i];
971 sum += v * v * v * v;
972 }
973 } else {
974 for ( i = from - 1; ++i <= to; ) {
975 if ( Double.isNaN( elements[i] ) ) {
976 continue;
977 }
978 v = elements[i] - c;
979 sum += v * v * v * v;
980 }
981 }
982 break;
983 case 5:
984 if ( c == 0.0 ) {
985 for ( i = from - 1; ++i <= to; ) {
986 if ( Double.isNaN( elements[i] ) ) {
987 continue;
988 }
989 v = elements[i];
990 sum += v * v * v * v * v;
991 }
992 } else {
993 for ( i = from - 1; ++i <= to; ) {
994 if ( Double.isNaN( elements[i] ) ) {
995 continue;
996 }
997 v = elements[i] - c;
998 sum += v * v * v * v * v;
999 }
1000 }
1001 break;
1002 default:
1003 for ( i = from - 1; ++i <= to; ) {
1004 if ( Double.isNaN( elements[i] ) ) {
1005 continue;
1006 }
1007 sum += Math.pow( elements[i] - c, k );
1008 }
1009 break;
1010 }
1011 return sum;
1012 }
1013
1014 /**
1015 * Returns the sum of powers of a data sequence, which is <tt>Sum (
1016 * data[i]<sup>k</sup> )</tt>.
1017 *
1018 * @param data DoubleArrayList
1019 * @param k int
1020 * @return double
1021 */
1022 public static double sumOfPowers( DoubleArrayList data, int k ) {
1023 return sumOfPowerDeviations( data, k, 0 );
1024 }
1025
1026 /**
1027 * Compute the sum of the squared deviations from the mean of a data sequence. Missing values are ignored.
1028 *
1029 * @param data DoubleArrayList
1030 * @return double
1031 */
1032 public static double sumOfSquaredDeviations( DoubleArrayList data ) {
1033 return sumOfSquaredDeviations( sizeWithoutMissingValues( data ),
1034 variance( sizeWithoutMissingValues( data ), sum( data ), sumOfSquares( data ) ) );
1035 }
1036
1037 /**
1038 * Returns the sum of squares of a data sequence. Skips missing values.
1039 *
1040 * @param data DoubleArrayList
1041 * @return double
1042 */
1043 public static double sumOfSquares( DoubleArrayList data ) {
1044 return sumOfPowerDeviations( data, 2, 0.0 );
1045 }
1046
1047 /**
1048 * Returns the trimmed mean of a sorted data sequence. Missing values are completely ignored.
1049 *
1050 * @param sortedData the data sequence; <b>must be sorted ascending </b>.
1051 * @param mean the mean of the (full) sorted data sequence.
1052 * @param left int the number of leading elements to trim.
1053 * @param right int number of trailing elements to trim.
1054 * @return double
1055 */
1056 public static double trimmedMean( DoubleArrayList sortedData, double mean, int left, int right ) {
1057 return Descriptive.trimmedMean( removeMissing( sortedData ), mean, left, right );
1058 }
1059
1060 /**
1061 * Provided for convenience!
1062 *
1063 * @param data DoubleArrayList
1064 * @return double
1065 */
1066 public static double variance( DoubleArrayList data ) {
1067 return variance( sizeWithoutMissingValues( data ), sum( data ), sumOfSquares( data ) );
1068 }
1069
1070 /**
1071 *
1072 */
1073 public static double variance( int sizeWithoutMissing, double sum, double sumOfSquares ) {
1074 return Descriptive.variance( sizeWithoutMissing, sum, sumOfSquares );
1075 }
1076
1077 /**
1078 * Returns the weighted mean of a data sequence. That is <tt> Sum (data[i] *
1079 * weights[i]) / Sum ( weights[i] )</tt>.
1080 *
1081 * @param data DoubleArrayList
1082 * @param weights DoubleArrayList
1083 * @return double
1084 */
1085 public static double weightedMean( DoubleArrayList data, DoubleArrayList weights ) {
1086 int size = data.size();
1087 if ( size != weights.size() || size == 0 ) {
1088 throw new IllegalArgumentException();
1089 }
1090
1091 double[] elements = data.elements();
1092 double[] theWeights = weights.elements();
1093 double sum = 0.0;
1094 double weightsSum = 0.0;
1095 for ( int i = size; --i >= 0; ) {
1096 double w = theWeights[i];
1097 if ( Double.isNaN( elements[i] ) || Double.isNaN( w ) ) {
1098 continue;
1099 }
1100 sum += elements[i] * w;
1101 weightsSum += w;
1102 }
1103
1104 return sum / weightsSum;
1105 }
1106
1107 /**
1108 * Returns the winsorized mean of a sorted data sequence.
1109 *
1110 * @param sortedData DoubleArrayList, must already be sorted ascending
1111 * @param mean the mean of the (full) sorted data sequence.
1112 * @param left the number of leading elements to trim. Refers to the number of elements to trim
1113 * <em>excluding any missing values</em>
1114 * @param right the number of trailing elements to trim <em>excluding any missing values</em>
1115 * @return double
1116 */
1117 public static double winsorizedMean( DoubleArrayList sortedData, double mean, int left, int right ) {
1118 DoubleArrayList sdm = removeMissing( sortedData );
1119 return Descriptive.winsorizedMean( sdm, mean( sdm ), left, right );
1120 }
1121
1122 /* private methods */
1123
1124 private DescriptiveWithMissing() {
1125 }
1126
1127 } // end of class