1 /*
2 * The baseCode project
3 *
4 * Copyright (c) 2006 University of British Columbia
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19 package ubic.basecode.math;
20
21 import cern.colt.list.DoubleArrayList;
22 import cern.jet.stat.Descriptive;
23
24 /**
25 * Mathematical functions for statistics that allow missing values without scotching the calculations.
26 * <p>
27 * Some functions that come with DoubleArrayLists will not work in an entirely compatible way with missing values. For
28 * examples, size() reports the total number of elements, including missing values. To get a count of non-missing
29 * values, use this.sizeWithoutMissingValues(). The right one to use may vary.
30 * </p>
31 * <p>
32 * Not all methods need to be overridden. However, all methods that take a "size" parameter should be passed the results
33 * of sizeWithoutMissingValues(data), instead of data.size().
34 * <p>
35 * Based in part on code from the colt package: Copyright © 1999 CERN - European Organization for Nuclear Research.
36 *
37 * @author Paul Pavlidis
38 * @see <a
39 * href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/Descriptive.html">cern.jet.stat.Descriptive
40 * </a>
41 */
42 public class DescriptiveWithMissing {
43
44 /**
45 * Highly optimized version of the correlation computation, where as much information is precomputed as possible.
46 * Use of this method only makes sense if many comparisons with the inputs x and y are being performed.
47 * <p>
48 * Implementation note: In correlation(DoubleArrayList x, DoubleArrayList y), profiling shows that calls to
49 * Double.NaN consume half the CPU time. The precomputation of the element-by-element squared values is another
50 * obvious optimization. There is also no checking for matching lengths of the arrays.
51 *
52 * @param x double array containing values of x_i for each X.
53 * @param y double array containing values of y_i for each Y.
54 * @param selfSquaredX double array containing values of x_i^2 for each X.
55 * @param selfSquaredY double array containing values of y_i^2 for each Y
56 * @param nanStatusX boolean array containing value of {@link Double#isNaN(double)} for each X.
57 * @param nanStatusY boolean array containing value of {@link Double#isNaN(double)} for each Y.
58 * @return
59 */
60 public static double correlation( double[] x, double[] y, double[] selfSquaredX, double[] selfSquaredY, boolean[] nanStatusX, boolean[] nanStatusY ) {
61 double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
62 int numused = 0;
63 syy = 0.0;
64 sxy = 0.0;
65 sxx = 0.0;
66 sx = 0.0;
67 sy = 0.0;
68
69 int length = x.length;
70
71 if ( y.length != length || selfSquaredX.length != length || selfSquaredY.length != length || nanStatusX.length != length || nanStatusY.length != length ) {
72 throw new ArithmeticException();
73 }
74
75 for ( int j = 0; j < length; j++ ) {
76 xj = x[j];
77 yj = y[j];
78
79 if ( nanStatusX[j] || nanStatusY[j] ) {
80 continue;
81 }
82 sx += xj;
83 sy += yj;
84 sxy += xj * yj;
85 sxx += selfSquaredX[j];
86 syy += selfSquaredY[j];
87 numused++;
88 }
89
90 if ( numused > 0 ) {
91 ay = sy / numused;
92 ax = sx / numused;
93 return ( sxy - sx * ay ) / Math.sqrt( ( sxx - sx * ax ) * ( syy - sy * ay ) );
94 }
95 return Double.NaN; // signifies that it could not be calculated.
96 }
97
98 /**
99 * Calculate the pearson correlation of two arrays. Missing values (NaNs) are ignored.
100 *
101 * @param x DoubleArrayList
102 * @param y DoubleArrayList
103 * @return double
104 */
105 public static double correlation( DoubleArrayList x, DoubleArrayList y ) {
106 int j;
107 double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
108 int numused;
109 syy = 0.0;
110 sxy = 0.0;
111 sxx = 0.0;
112 sx = 0.0;
113 sy = 0.0;
114 numused = 0;
115 if ( x.size() != y.size() ) {
116 throw new ArithmeticException( "Unequal vector sizes: " + x.size() + " != " + y.size() );
117 }
118
119 double[] xel = x.elements();
120 double[] yel = y.elements();
121
122 int length = x.size();
123 for ( j = 0; j < length; j++ ) {
124 xj = xel[j];
125 yj = yel[j];
126
127 if ( !Double.isNaN( xj ) && !Double.isNaN( yj ) ) {
128 sx += xj;
129 sy += yj;
130 sxy += xj * yj;
131 sxx += xj * xj;
132 syy += yj * yj;
133 numused++;
134 }
135 }
136
137 if ( numused > 0 ) {
138 ay = sy / numused;
139 ax = sx / numused;
140 return ( sxy - sx * ay ) / Math.sqrt( ( sxx - sx * ax ) * ( syy - sy * ay ) );
141 }
142
143 return Double.NaN; // signifies that it could not be calculated.
144 }
145
146 /**
147 * Returns the SAMPLE covariance of two data sequences. Pairs of values are only considered if both are not NaN. If
148 * there are no non-missing pairs, the covariance is zero.
149 *
150 * @param data1 the first vector
151 * @param data2 the second vector
152 * @return double
153 */
154 public static double covariance( DoubleArrayList data1, DoubleArrayList data2 ) {
155 int size = data1.size();
156 if ( size != data2.size() || size == 0 ) {
157 throw new IllegalArgumentException();
158 }
159 double[] elements1 = data1.elements();
160 double[] elements2 = data2.elements();
161
162 /* initialize sumx and sumy to the first non-NaN pair of values */
163
164 int i = 0;
165 double sumx = 0.0, sumy = 0.0, Sxy = 0.0;
166 while ( i < size ) {
167 sumx = elements1[i];
168 sumy = elements2[i];
169 if ( !Double.isNaN( elements1[i] ) && !Double.isNaN( elements2[i] ) ) {
170 break;
171 }
172 i++;
173 }
174 i++;
175 int usedPairs = 1;
176 for ( ; i < size; ++i ) {
177 double x = elements1[i];
178 double y = elements2[i];
179 if ( Double.isNaN( x ) || Double.isNaN( y ) ) {
180 continue;
181 }
182
183 sumx += x;
184 Sxy += ( x - sumx / ( usedPairs + 1 ) ) * ( y - sumy / usedPairs );
185 sumy += y;
186 usedPairs++;
187 }
188 return Sxy / ( usedPairs - 1 );
189 }
190
191 /**
192 * Durbin-Watson computation. This measures the serial correlation in a data series.
193 *
194 * @param data DoubleArrayList
195 * @return double
196 * @todo this will still break in some situations where there are missing values
197 */
198 public static double durbinWatson( DoubleArrayList data ) {
199 int size = data.size();
200 if ( size < 2 ) {
201 throw new IllegalArgumentException( "data sequence must contain at least two values." );
202 }
203
204 double[] elements = data.elements();
205 double run = 0;
206 double run_sq = 0;
207 int firstNotNaN = 0;
208 while ( firstNotNaN < size ) {
209 run_sq = elements[firstNotNaN] * elements[firstNotNaN];
210
211 if ( !Double.isNaN( elements[firstNotNaN] ) ) {
212 break;
213 }
214
215 firstNotNaN++;
216 }
217
218 if ( firstNotNaN > 0 && size - firstNotNaN < 2 ) {
219 throw new IllegalArgumentException( "data sequence must contain at least two non-missing values." );
220
221 }
222
223 for ( int i = firstNotNaN + 1; i < size; i++ ) {
224 int gap = 1;
225 while ( i < size && Double.isNaN( elements[i] ) ) {
226 gap++;
227 i++;
228 continue;
229 }
230 if ( i >= size ) continue; // missing value at end will cause this.
231 double x = elements[i] - elements[i - gap];
232 /** */
233 run += x * x;
234 run_sq += elements[i] * elements[i];
235 }
236
237 return run / run_sq;
238 }
239
240 /**
241 * Returns the geometric mean of a data sequence. Missing values are ignored. Note that for a geometric mean to be
242 * meaningful, the minimum of the data sequence must not be less or equal to zero. <br>
243 * The geometric mean is given by <tt>pow( Product( data[i] ),
244 * 1/data.size())</tt>. This method tries to avoid overflows at the expense of an equivalent but somewhat slow
245 * definition: <tt>geo = Math.exp( Sum(
246 * Log(data[i]) ) / data.size())</tt>.
247 *
248 * @param data DoubleArrayList
249 * @return double
250 */
251 public static double geometricMean( DoubleArrayList data ) {
252 return Descriptive.geometricMean( sizeWithoutMissingValues( data ), sumOfLogarithms( data, 0, data.size() - 1 ) );
253 }
254
255 /**
256 * Returns the kurtosis (aka excess) of a data sequence, which is <tt>-3 +
257 * moment(data,4,mean) / standardDeviation<sup>4</sup></tt>.
258 *
259 * @param data DoubleArrayList
260 * @param mean double
261 * @param standardDeviation double
262 * @return double
263 */
264 public static double kurtosis( DoubleArrayList data, double mean, double standardDeviation ) {
265 return Descriptive.kurtosis( moment( data, 4, mean ), standardDeviation );
266 }
267
268 /**
269 * Returns the median absolute deviation from the median.
270 *
271 * @param data the data, does not have to be sorted
272 */
273 public static double mad( DoubleArrayList data ) {
274 data = data.copy();
275 sortAndRemoveMissing( data );
276 double median = Descriptive.median( data );
277 DoubleArrayList devsum = new DoubleArrayList( data.size() );
278 for ( int i = 0; i < data.size(); i++ ) {
279 double v = data.getQuick( i );
280 devsum.add( Math.abs( v - median ) );
281 }
282 devsum.sort();
283 return Descriptive.median( devsum );
284 }
285
286 public static double max( DoubleArrayList input ) {
287 int size = input.size();
288 if ( size == 0 ) throw new IllegalArgumentException();
289
290 double[] elements = input.elements();
291 double max = Double.MIN_VALUE;
292 for ( int i = 0; i < size; i++ ) {
293 if ( Double.isNaN( elements[i] ) ) continue;
294 if ( elements[i] > max ) max = elements[i];
295 }
296
297 return max;
298 }
299
300 /**
301 * @param data Values to be analyzed.
302 * @return Mean of the values in x. Missing values are ignored in the analysis.
303 */
304 public static double mean( DoubleArrayList data ) {
305 return sum( data ) / sizeWithoutMissingValues( data );
306 }
307
308 /**
309 * Special mean calculation where we use the effective size as an input.
310 *
311 * @param x The data
312 * @param effectiveSize The effective size used for the mean calculation.
313 * @return double
314 */
315 public static double mean( DoubleArrayList x, int effectiveSize ) {
316
317 int length = x.size();
318
319 if ( 0 == effectiveSize ) {
320 return Double.NaN;
321 }
322
323 double sum = 0.0;
324 int i, count;
325 count = 0;
326 double value;
327 double[] elements = x.elements();
328 for ( i = 0; i < length; i++ ) {
329 value = elements[i];
330 if ( Double.isNaN( value ) ) {
331 continue;
332 }
333 sum += value;
334 count++;
335 }
336 if ( 0.0 == count ) {
337 return Double.NaN;
338 }
339 return sum / effectiveSize;
340
341 }
342
343 /**
344 * Calculate the mean of the values above to a particular quantile of an array.
345 *
346 * @param data Array for which we want to get the quantile.
347 * @param quantile A value from 0 to 1
348 * @return double
349 */
350 public static double meanAboveQuantile( DoubleArrayList data, double quantile ) {
351 if ( quantile < 0.0 || quantile > 1.0 ) {
352 throw new IllegalArgumentException( "Quantile must be between 0 and 1" );
353 }
354
355 data = data.copy();
356 sortAndRemoveMissing( data );
357 double quantileValue = Descriptive.quantile( data, quantile );
358
359 double returnvalue = 0.0;
360 int k = 0;
361
362 for ( int i = 0; i < data.size(); i++ ) {
363 if ( data.get( i ) > quantileValue ) {
364 returnvalue += data.get( i );
365 k++;
366 }
367 }
368
369 if ( k == 0 ) {
370 throw new ArithmeticException( "No values found above quantile" );
371 }
372
373 return returnvalue / k;
374 }
375
376 /**
377 * Returns the median. Missing values are ignored entirely.
378 *
379 * @param data the data sequence, does not have to be sorted.
380 * @return double
381 */
382 public static double median( DoubleArrayList data ) {
383 data = data.copy();
384 sortAndRemoveMissing( data );
385 return Descriptive.median( data );
386 }
387
388 public static double min( DoubleArrayList input ) {
389 int size = input.size();
390 if ( size == 0 ) throw new IllegalArgumentException();
391
392 double[] elements = input.elements();
393 double min = Double.MAX_VALUE;
394 for ( int i = 0; i < size; i++ ) {
395 if ( Double.isNaN( elements[i] ) ) continue;
396 if ( elements[i] < min ) min = elements[i];
397 }
398
399 return min;
400 }
401
402 /**
403 * Returns the moment of <tt>k</tt> -th order with constant <tt>c</tt> of a data sequence, which is
404 * <tt>Sum( (data[i]-c)<sup>k</sup> ) /
405 * data.size()</tt>.
406 *
407 * @param data DoubleArrayList
408 * @param k int
409 * @param c double
410 * @return double
411 */
412 public static double moment( DoubleArrayList data, int k, double c ) {
413 return sumOfPowerDeviations( data, k, c ) / sizeWithoutMissingValues( data );
414 }
415
416 /**
417 * Returns the product of a data sequence, which is <tt>Prod( data[i] )</tt>. Missing values are ignored. In other
418 * words: <tt>data[0]*data[1]*...*data[data.size()-1]</tt>. Note that you may easily get numeric overflows.
419 *
420 * @param data DoubleArrayList
421 * @return double
422 */
423 public static double product( DoubleArrayList data ) {
424 int size = data.size();
425 double[] elements = data.elements();
426
427 double product = 1;
428 for ( int i = size; --i >= 0; ) {
429 if ( Double.isNaN( elements[i] ) ) {
430 continue;
431 }
432 product *= elements[i];
433
434 }
435 return product;
436 }
437
438 /**
439 * Returns the <tt>phi-</tt> quantile; that is, an element <tt>elem</tt> for which holds that <tt>phi</tt> percent
440 * of data elements are less than <tt>elem</tt>. Missing values are ignored. The quantile need not necessarily be
441 * contained in the data sequence, it can be a linear interpolation.
442 *
443 * @param data the data sequence, does not have to be sorted.
444 * @param phi the percentage; must satisfy <tt>0 <= phi <= 1</tt>.
445 * @return double
446 * @todo possibly implement so a copy is not made.
447 */
448 public static double quantile( DoubleArrayList data, double phi ) {
449 data = data.copy();
450 sortAndRemoveMissing( data );
451 return Descriptive.quantile( data, phi );
452 }
453
454 /**
455 * Returns how many percent of the elements contained in the receiver are <tt><= element</tt>. Does linear
456 * interpolation if the element is not contained but lies in between two contained elements. Missing values are
457 * ignored.
458 *
459 * @param data the list to be searched
460 * @param element the element to search for.
461 * @return the percentage <tt>phi</tt> of elements <tt><= element</tt>(<tt>0.0 <= phi <= 1.0)</tt>.
462 */
463 public static double quantileInverse( DoubleArrayList data, double element ) {
464 data = data.copy();
465 sortAndRemoveMissing( data );
466 return rankInterpolated( data, element ) / data.size();
467 }
468
469 /**
470 * Returns the quantiles of the specified percentages. The quantiles need not necessarily be contained in the data
471 * sequence, it can be a linear interpolation.
472 *
473 * @param data the data sequence; does not have to be sorted
474 * @param percentages the percentages for which quantiles are to be computed. Each percentage must be in the
475 * interval <tt>[0.0,1.0]</tt>.
476 * @return the quantiles.
477 */
478 public static DoubleArrayList quantiles( DoubleArrayList data, DoubleArrayList percentages ) {
479 data = data.copy();
480 sortAndRemoveMissing( data );
481 int s = percentages.size();
482 DoubleArrayList quantiles = new DoubleArrayList( s );
483 for ( int i = 0; i < s; i++ ) {
484 quantiles.add( Descriptive.quantile( data, percentages.get( i ) ) );
485 }
486 return quantiles;
487 }
488
489 /**
490 * Returns the linearly interpolated number of elements in a list less or equal to a given element. Missing values
491 * are ignored. The rank is the number of elements <= element. Ranks are of the form
492 * <tt>{0, 1, 2,..., sortedList.size()}</tt>. If no element is <= element, then the rank is zero. If the element
493 * lies in between two contained elements, then linear interpolation is used and a non integer value is returned.
494 *
495 * @param data the list to be searched, does not have to be sorted
496 * @param element the element to search for.
497 * @return the rank of the element.
498 * @todo possibly implement so a copy is not made.
499 */
500 public static double rankInterpolated( DoubleArrayList data, double element ) {
501 data = data.copy();
502 sortAndRemoveMissing( data );
503 return Descriptive.rankInterpolated( data, element );
504 }
505
506 /**
507 * Returns the sample kurtosis (aka excess) of a data sequence.
508 *
509 * @param data DoubleArrayList
510 * @param mean double
511 * @param sampleVariance double
512 * @return double
513 */
514 public static double sampleKurtosis( DoubleArrayList data, double mean, double sampleVariance ) {
515 return Descriptive.sampleKurtosis( sizeWithoutMissingValues( data ), moment( data, 4, mean ), sampleVariance );
516 }
517
518 /**
519 * Returns the sample skew of a data sequence.
520 *
521 * @param data DoubleArrayList
522 * @param mean double
523 * @param sampleVariance double
524 * @return double
525 */
526 public static double sampleSkew( DoubleArrayList data, double mean, double sampleVariance ) {
527 return Descriptive.sampleSkew( sizeWithoutMissingValues( data ), moment( data, 3, mean ), sampleVariance );
528 }
529
530 /**
531 * Returns the sample variance of a data sequence. That is <tt>Sum (
532 * (data[i]-mean)^2 ) / (data.size()-1)</tt>.
533 *
534 * @param data DoubleArrayList
535 * @param mean double
536 * @return double
537 */
538 public static double sampleVariance( DoubleArrayList data, double mean ) {
539 double[] elements = data.elements();
540 int size = data.size();
541 double sum = 0;
542 int effsize = 0;
543 // find the sum of the squares
544 for ( int i = size; --i >= 0; ) {
545 if ( Double.isNaN( elements[i] ) ) {
546 continue;
547 }
548 double delta = elements[i] - mean;
549 sum += delta * delta;
550 effsize++;
551 }
552
553 return sum / ( effsize - 1 );
554 }
555
556 /**
557 * Returns the skew of a data sequence, which is <tt>moment(data,3,mean) /
558 * standardDeviation<sup>3</sup></tt>.
559 *
560 * @param data DoubleArrayList
561 * @param mean double
562 * @param standardDeviation double
563 * @return double
564 */
565 public static double skew( DoubleArrayList data, double mean, double standardDeviation ) {
566 return Descriptive.skew( moment( data, 3, mean ), standardDeviation );
567 }
568
569 /**
570 * Standardize. Note that this does something slightly different than standardize in the superclass, because our
571 * sampleStandardDeviation does not use the correction of the superclass (which isn't really standard).
572 *
573 * @param data DoubleArrayList
574 */
575 public static void standardize( DoubleArrayList data ) {
576 double mean = mean( data );
577 double stdev = Math.sqrt( sampleVariance( data, mean ) );
578 standardize( data, mean, stdev );
579 }
580
581 /**
582 * Modifies a data sequence to be standardized. Missing values are ignored. Changes each element <tt>data[i]</tt> as
583 * follows: <tt>data[i] = (data[i]-mean)/standardDeviation</tt> unless the standard deviation is 0.00 or very close to zero (indicating the data are constant),
584 * in which case we return a vector of zeros (in effect just doing mean subtraction)
585 *
586 * @param data DoubleArrayList
587 * @param mean mean of data
588 * @param standardDeviation stdev of data. |stdev| < Constants.TINY is treated as zero.
589 */
590 public static void standardize( DoubleArrayList data, double mean, double standardDeviation ) {
591 double[] elements = data.elements();
592 int size = data.size();
593
594 if ( Math.abs( 0.0 - standardDeviation ) < Constants.TINY ) {
595 for ( int i = 0; i < size; i++ ) {
596 if ( !Double.isNaN( elements[i] ) ) {
597 elements[i] = 0.0;
598 }
599 }
600 return;
601 }
602
603 for ( int i = 0; i < size; i++ ) {
604 if ( Double.isNaN( elements[i] ) ) {
605 continue;
606 }
607 elements[i] = ( elements[i] - mean ) / standardDeviation;
608 }
609 }
610
611 /**
612 * Returns the sum of a data sequence. That is <tt>Sum( data[i] )</tt>.
613 *
614 * @param data DoubleArrayList
615 * @return double
616 */
617 public static double sum( DoubleArrayList data ) {
618 return sumOfPowerDeviations( data, 1, 0.0 );
619 }
620
621 /**
622 * Returns the sum of inversions of a data sequence, which is <tt>Sum( 1.0 /
623 * data[i])</tt>.
624 *
625 * @param data the data sequence.
626 * @param from the index of the first data element (inclusive).
627 * @param to the index of the last data element (inclusive).
628 * @return double
629 */
630 public static double sumOfInversions( DoubleArrayList data, int from, int to ) {
631 return sumOfPowerDeviations( data, -1, 0.0, from, to );
632 }
633
634 /**
635 * Returns the sum of logarithms of a data sequence, which is <tt>Sum(
636 * Log(data[i])</tt>. Missing values are ignored.
637 *
638 * @param data the data sequence.
639 * @param from the index of the first data element (inclusive).
640 * @param to the index of the last data element (inclusive).
641 * @return double
642 */
643 public static double sumOfLogarithms( DoubleArrayList data, int from, int to ) {
644 double[] elements = data.elements();
645 double logsum = 0;
646 for ( int i = from - 1; ++i <= to; ) {
647 if ( Double.isNaN( elements[i] ) ) {
648 continue;
649 }
650 logsum += Math.log( elements[i] );
651 }
652 return logsum;
653 }
654
655 /**
656 * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or
657 * <tt>k == -2 .. 4</tt>.
658 *
659 * @param data DoubleArrayList
660 * @param k int
661 * @param c double
662 * @return double
663 */
664 public static double sumOfPowerDeviations( DoubleArrayList data, int k, double c ) {
665 return sumOfPowerDeviations( data, k, c, 0, data.size() - 1 );
666 }
667
668 /**
669 * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt> for all <tt>i = from ..
670 * to</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or <tt>k == -2 .. 5</tt>. Missing values are
671 * ignored.
672 *
673 * @param data DoubleArrayList
674 * @param k int
675 * @param c double
676 * @param from int
677 * @param to int
678 * @return double
679 */
680 public static double sumOfPowerDeviations( final DoubleArrayList data, final int k, final double c, final int from, final int to ) {
681 final double[] elements = data.elements();
682 double sum = 0;
683 double v;
684 int i;
685 switch ( k ) { // optimized for speed
686 case -2:
687 if ( c == 0.0 ) {
688 for ( i = from - 1; ++i <= to; ) {
689 if ( Double.isNaN( elements[i] ) ) {
690 continue;
691 }
692 v = elements[i];
693 sum += 1 / ( v * v );
694 }
695 } else {
696 for ( i = from - 1; ++i <= to; ) {
697 if ( Double.isNaN( elements[i] ) ) {
698 continue;
699 }
700 v = elements[i] - c;
701 sum += 1 / ( v * v );
702 }
703 }
704 break;
705 case -1:
706 if ( c == 0.0 ) {
707 for ( i = from - 1; ++i <= to; ) {
708 if ( Double.isNaN( elements[i] ) ) {
709 continue;
710 }
711 sum += 1 / elements[i];
712 }
713 } else {
714 for ( i = from - 1; ++i <= to; ) {
715 if ( Double.isNaN( elements[i] ) ) {
716 continue;
717 }
718 sum += 1 / ( elements[i] - c );
719 }
720 }
721 break;
722 case 0:
723 for ( i = from - 1; ++i <= to; ) {
724 if ( Double.isNaN( elements[i] ) ) {
725 continue;
726 }
727 sum += 1;
728 }
729 break;
730 case 1:
731 if ( c == 0.0 ) {
732 for ( i = from - 1; ++i <= to; ) {
733 if ( Double.isNaN( elements[i] ) ) {
734 continue;
735 }
736 sum += elements[i];
737 }
738 } else {
739 for ( i = from - 1; ++i <= to; ) {
740 if ( Double.isNaN( elements[i] ) ) {
741 continue;
742 }
743 sum += elements[i] - c;
744 }
745 }
746 break;
747 case 2:
748 if ( c == 0.0 ) {
749 for ( i = from - 1; ++i <= to; ) {
750 if ( Double.isNaN( elements[i] ) ) {
751 continue;
752 }
753 v = elements[i];
754 sum += v * v;
755 }
756 } else {
757 for ( i = from - 1; ++i <= to; ) {
758 if ( Double.isNaN( elements[i] ) ) {
759 continue;
760 }
761 v = elements[i] - c;
762 sum += v * v;
763 }
764 }
765 break;
766 case 3:
767 if ( c == 0.0 ) {
768 for ( i = from - 1; ++i <= to; ) {
769 if ( Double.isNaN( elements[i] ) ) {
770 continue;
771 }
772 v = elements[i];
773 sum += v * v * v;
774 }
775 } else {
776 for ( i = from - 1; ++i <= to; ) {
777 if ( Double.isNaN( elements[i] ) ) {
778 continue;
779 }
780 v = elements[i] - c;
781 sum += v * v * v;
782 }
783 }
784 break;
785 case 4:
786 if ( c == 0.0 ) {
787 for ( i = from - 1; ++i <= to; ) {
788 if ( Double.isNaN( elements[i] ) ) {
789 continue;
790 }
791 v = elements[i];
792 sum += v * v * v * v;
793 }
794 } else {
795 for ( i = from - 1; ++i <= to; ) {
796 if ( Double.isNaN( elements[i] ) ) {
797 continue;
798 }
799 v = elements[i] - c;
800 sum += v * v * v * v;
801 }
802 }
803 break;
804 case 5:
805 if ( c == 0.0 ) {
806 for ( i = from - 1; ++i <= to; ) {
807 if ( Double.isNaN( elements[i] ) ) {
808 continue;
809 }
810 v = elements[i];
811 sum += v * v * v * v * v;
812 }
813 } else {
814 for ( i = from - 1; ++i <= to; ) {
815 if ( Double.isNaN( elements[i] ) ) {
816 continue;
817 }
818 v = elements[i] - c;
819 sum += v * v * v * v * v;
820 }
821 }
822 break;
823 default:
824 for ( i = from - 1; ++i <= to; ) {
825 if ( Double.isNaN( elements[i] ) ) {
826 continue;
827 }
828 sum += Math.pow( elements[i] - c, k );
829 }
830 break;
831 }
832 return sum;
833 }
834
835 /**
836 * Returns the sum of powers of a data sequence, which is <tt>Sum (
837 * data[i]<sup>k</sup> )</tt>.
838 *
839 * @param data DoubleArrayList
840 * @param k int
841 * @return double
842 */
843 public static double sumOfPowers( DoubleArrayList data, int k ) {
844 return sumOfPowerDeviations( data, k, 0 );
845 }
846
847 /**
848 * Compute the sum of the squared deviations from the mean of a data sequence. Missing values are ignored.
849 *
850 * @param data DoubleArrayList
851 * @return double
852 */
853 public static double sumOfSquaredDeviations( DoubleArrayList data ) {
854 int size = sizeWithoutMissingValues( data );
855 return Descriptive.sumOfSquaredDeviations( size, Descriptive.variance( size, sum( data ), sumOfSquares( data ) ) );
856 }
857
858 /**
859 * Returns the sum of squares of a data sequence. Skips missing values.
860 *
861 * @param data DoubleArrayList
862 * @return double
863 */
864 public static double sumOfSquares( DoubleArrayList data ) {
865 return sumOfPowerDeviations( data, 2, 0.0 );
866 }
867
868 /**
869 * Returns the trimmed mean of a data sequence. Missing values are completely ignored.
870 *
871 * @param data the data sequence
872 * @param left int the number of leading elements to trim.
873 * @param right int number of trailing elements to trim.
874 * @return double
875 */
876 public static double trimmedMean( DoubleArrayList data, int left, int right ) {
877 data = data.copy();
878 sortAndRemoveMissing( data );
879 return Descriptive.trimmedMean( data, Descriptive.mean( data ), left, right );
880 }
881
882 /**
883 * Provided for convenience!
884 *
885 * @param data DoubleArrayList
886 * @return double
887 */
888 public static double variance( DoubleArrayList data ) {
889 return Descriptive.variance( sizeWithoutMissingValues( data ), sum( data ), sumOfSquares( data ) );
890 }
891
892 /**
893 * Returns the weighted mean of a data sequence. That is <tt> Sum (data[i] *
894 * weights[i]) / Sum ( weights[i] )</tt>.
895 *
896 * @param data DoubleArrayList
897 * @param weights DoubleArrayList
898 * @return double
899 */
900 public static double weightedMean( DoubleArrayList data, DoubleArrayList weights ) {
901 int size = data.size();
902 if ( size != weights.size() || size == 0 ) {
903 throw new IllegalArgumentException();
904 }
905
906 double[] elements = data.elements();
907 double[] theWeights = weights.elements();
908 double sum = 0.0;
909 double weightsSum = 0.0;
910 for ( int i = size; --i >= 0; ) {
911 double w = theWeights[i];
912 if ( Double.isNaN( elements[i] ) || Double.isNaN( w ) ) {
913 continue;
914 }
915 sum += elements[i] * w;
916 weightsSum += w;
917 }
918
919 return sum / weightsSum;
920 }
921
922 /**
923 * Makes a copy of the list that doesn't have the missing values and sort the values.
924 * <p>
925 * This takes advantage that {@link Double#NaN} are always sorted last, so we can simply set the size of the
926 * resulting array to the first NaN value.
927 */
928 private static void sortAndRemoveMissing( DoubleArrayList data ) {
929 data.sort();
930 double[] elements = data.elements();
931 int size = data.size();
932 for ( int i = 0; i < size; i++ ) {
933 if ( Double.isNaN( elements[i] ) ) {
934 data.setSize( i );
935 break;
936 }
937 }
938 }
939
940 /**
941 * Return the size of the list, ignoring missing values.
942 *
943 * @param list DoubleArrayList
944 * @return int
945 */
946 private static int sizeWithoutMissingValues( DoubleArrayList list ) {
947 int size = 0;
948 for ( int i = 0; i < list.size(); i++ ) {
949 if ( !Double.isNaN( list.get( i ) ) ) {
950 size++;
951 }
952 }
953 return size;
954 }
955
956 /* private methods */
957
958 private DescriptiveWithMissing() {
959 }
960
961 } // end of class