1 /*
2 * The baseCode project
3 *
4 * Copyright (c) 2006 University of British Columbia
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19 package ubic.basecode.math;
20
21 import cern.colt.list.DoubleArrayList;
22 import cern.jet.stat.Descriptive;
23
24 /**
25 * Mathematical functions for statistics that allow missing values without scotching the calculations.
26 * <p>
27 * Be careful because some methods from cern.jet.stat.Descriptive have not been overridden and will yield a
28 * UnsupportedOperationException if used.
29 * </p>
30 * <p>
31 * Some functions that come with DoubleArrayLists will not work in an entirely compatible way with missing values. For
32 * examples, size() reports the total number of elements, including missing values. To get a count of non-missing
33 * values, use this.sizeWithoutMissingValues(). The right one to use may vary.
34 * </p>
35 * <p>
36 * Not all methods need to be overridden. However, all methods that take a "size" parameter should be passed the results
37 * of sizeWithoutMissingValues(data), instead of data.size().
38 * <p>
39 * Based in part on code from the colt package: Copyright © 1999 CERN - European Organization for Nuclear Research.
40 *
41 * @author Paul Pavlidis
42 * @see <a
43 * href="http://hoschek.home.cern.ch/hoschek/colt/V1.0.3/doc/cern/jet/stat/Descriptive.html">cern.jet.stat.Descriptive
44 * </a>
45 */
46 public class DescriptiveWithMissing extends cern.jet.stat.Descriptive {
47
48 /**
49 * <b>Not supported. </b>
50 *
51 * @param data DoubleArrayList
52 * @param lag int
53 * @param mean double
54 * @param variance double
55 * @return double
56 */
57 public static double autoCorrelation(DoubleArrayList data, int lag, double mean, double variance) {
58 throw new UnsupportedOperationException("autoCorrelation not supported with missing values");
59 }
60
61 /**
62 * Highly optimized version of the correlation computation, where as much information is precomputed as possible.
63 * Use of this method only makes sense if many comparisons with the inputs x and y are being performed.
64 * <p>
65 * Implementation note: In correlation(DoubleArrayList x, DoubleArrayList y), profiling shows that calls to
66 * Double.NaN consume half the CPU time. The precomputation of the element-by-element squared values is another
67 * obvious optimization. There is also no checking for matching lengths of the arrays.
68 *
69 * @param x
70 * @param y
71 * @param selfSquaredX double array containing values of x_i^2 for each x.
72 * @param selfSquaredY
73 * @param nanStatusX boolean array containing value of Double.isNaN() for each X.
74 * @param nanStatusY
75 * @return
76 */
77 public static double correlation(double[] x, double[] y, double[] selfSquaredX, double[] selfSquaredY, boolean[] nanStatusX, boolean[] nanStatusY) {
78
79 double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
80 int numused = 0;
81 syy = 0.0;
82 sxy = 0.0;
83 sxx = 0.0;
84 sx = 0.0;
85 sy = 0.0;
86
87 int length = x.length;
88 for (int j = 0; j < length; j++) {
89 xj = x[j];
90 yj = y[j];
91
92 if (nanStatusX[j] || nanStatusY[j]) {
93 continue;
94 }
95 sx += xj;
96 sy += yj;
97 sxy += xj * yj;
98 sxx += selfSquaredX[j];
99 syy += selfSquaredY[j];
100 numused++;
101 }
102
103 if (numused > 0) {
104 ay = sy / numused;
105 ax = sx / numused;
106 return (sxy - sx * ay) / Math.sqrt((sxx - sx * ax) * (syy - sy * ay));
107 }
108 return Double.NaN; // signifies that it could not be calculated.
109 }
110
111 /**
112 * Returns the correlation of two data sequences. That is
113 * <tt>covariance(data1,data2)/(standardDev1*standardDev2)</tt>. Missing values are ignored. This method is
114 * overridden to stop users from using the method in the superclass when missing values are present. The problem is
115 * that the standard deviation cannot be computed without knowning which values are not missing in both vectors to
116 * be compared. Thus the standardDev parameters are thrown away by this method.
117 *
118 * @param data1 DoubleArrayList
119 * @param standardDev1 double - not used
120 * @param data2 DoubleArrayList
121 * @param standardDev2 double - not used
122 * @return double
123 */
124 public static double correlation(DoubleArrayList data1, double standardDev1, DoubleArrayList data2, double standardDev2) {
125 return correlation(data1, data2);
126 }
127
128 /**
129 * Calculate the pearson correlation of two arrays. Missing values (NaNs) are ignored.
130 *
131 * @param x DoubleArrayList
132 * @param y DoubleArrayList
133 * @return double
134 */
135 public static double correlation(DoubleArrayList x, DoubleArrayList y) {
136 int j;
137 double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
138 int numused;
139 syy = 0.0;
140 sxy = 0.0;
141 sxx = 0.0;
142 sx = 0.0;
143 sy = 0.0;
144 numused = 0;
145 if (x.size() != y.size()) {
146 throw new ArithmeticException("Unequal vector sizes: " + x.size() + " != " + y.size());
147 }
148
149 double[] xel = x.elements();
150 double[] yel = y.elements();
151
152 int length = x.size();
153 for (j = 0; j < length; j++) {
154 xj = xel[j];
155 yj = yel[j];
156
157 if (!Double.isNaN(xj) && !Double.isNaN(yj)) {
158 sx += xj;
159 sy += yj;
160 sxy += xj * yj;
161 sxx += xj * xj;
162 syy += yj * yj;
163 numused++;
164 }
165 }
166
167 if (numused > 0) {
168 ay = sy / numused;
169 ax = sx / numused;
170 return (sxy - sx * ay) / Math.sqrt((sxx - sx * ax) * (syy - sy * ay));
171 }
172 return Double.NaN; // signifies that it could not be calculated.
173
174 }
175
176 /**
177 * Returns the SAMPLE covariance of two data sequences. Pairs of values are only considered if both are not NaN. If
178 * there are no non-missing pairs, the covariance is zero.
179 *
180 * @param data1 the first vector
181 * @param data2 the second vector
182 * @return double
183 */
184 public static double covariance(DoubleArrayList data1, DoubleArrayList data2) {
185 int size = data1.size();
186 if (size != data2.size() || size == 0) {
187 throw new IllegalArgumentException();
188 }
189 double[] elements1 = data1.elements();
190 double[] elements2 = data2.elements();
191
192 /* initialize sumx and sumy to the first non-NaN pair of values */
193
194 int i = 0;
195 double sumx = 0.0, sumy = 0.0, Sxy = 0.0;
196 while (i < size) {
197 sumx = elements1[i];
198 sumy = elements2[i];
199 if (!Double.isNaN(elements1[i]) && !Double.isNaN(elements2[i])) {
200 break;
201 }
202 i++;
203 }
204 i++;
205 int usedPairs = 1;
206 for (; i < size; ++i) {
207 double x = elements1[i];
208 double y = elements2[i];
209 if (Double.isNaN(x) || Double.isNaN(y)) {
210 continue;
211 }
212
213 sumx += x;
214 Sxy += (x - sumx / (usedPairs + 1)) * (y - sumy / usedPairs);
215 sumy += y;
216 usedPairs++;
217 }
218 return Sxy / (usedPairs - 1);
219 }
220
221 /**
222 * Durbin-Watson computation. This measures the serial correlation in a data series.
223 *
224 * @param data DoubleArrayList
225 * @return double
226 * @todo this will still break in some situations where there are missing values
227 */
228 public static double durbinWatson(DoubleArrayList data) {
229 int size = data.size();
230 if (size < 2) {
231 throw new IllegalArgumentException("data sequence must contain at least two values.");
232 }
233
234 double[] elements = data.elements();
235 double run = 0;
236 double run_sq = 0;
237 int firstNotNaN = 0;
238 while (firstNotNaN < size) {
239 run_sq = elements[firstNotNaN] * elements[firstNotNaN];
240
241 if (!Double.isNaN(elements[firstNotNaN])) {
242 break;
243 }
244
245 firstNotNaN++;
246 }
247
248 if (firstNotNaN > 0 && size - firstNotNaN < 2) {
249 throw new IllegalArgumentException("data sequence must contain at least two non-missing values.");
250
251 }
252
253 for (int i = firstNotNaN + 1; i < size; i++) {
254 int gap = 1;
255 while (i < size && Double.isNaN(elements[i])) {
256 gap++;
257 i++;
258 continue;
259 }
260 if (i >= size) continue; // missing value at end will cause this.
261 double x = elements[i] - elements[i - gap];
262 /** */
263 run += x * x;
264 run_sq += elements[i] * elements[i];
265 }
266
267 return run / run_sq;
268 }
269
270 /**
271 * Returns the geometric mean of a data sequence. Missing values are ignored. Note that for a geometric mean to be
272 * meaningful, the minimum of the data sequence must not be less or equal to zero. <br>
273 * The geometric mean is given by <tt>pow( Product( data[i] ),
274 * 1/data.size())</tt>. This method tries to avoid overflows at the expense of an equivalent but somewhat slow
275 * definition: <tt>geo = Math.exp( Sum(
276 * Log(data[i]) ) / data.size())</tt>.
277 *
278 * @param data DoubleArrayList
279 * @return double
280 */
281 public static double geometricMean(DoubleArrayList data) {
282 return geometricMean(sizeWithoutMissingValues(data), sumOfLogarithms(data, 0, data.size() - 1));
283 }
284
285 /**
286 * Incrementally maintains and updates minimum, maximum, sum and sum of squares of a data sequence.
287 * <p>
288 * Assume we have already recorded some data sequence elements and know their minimum, maximum, sum and sum of
289 * squares. Assume further, we are to record some more elements and to derive updated values of minimum, maximum,
290 * sum and sum of squares. This method computes those updated values without needing to know the already recorded
291 * elements. This is interesting for interactive online monitoring and/or applications that cannot keep the entire
292 * huge data sequence in memory.
293 *
294 * @param data the additional elements to be incorporated into min, max, etc.
295 * @param from the index of the first element within <tt>data</tt> to consider.
296 * @param to the index of the last element within <tt>data</tt> to consider. The method incorporates elements
297 * <tt>data[from], ..., data[to]</tt>.
298 * @param inOut the old values in the following format:
299 * <ul>
300 * <li><tt>inOut[0]</tt> is the old minimum. <li><tt>inOut[1]</tt> is the old maximum. <li><tt>inOut[2]</tt>
301 * is the old sum. <li><tt>inOut[3]</tt> is the old sum of squares.
302 * </ul>
303 * If no data sequence elements have so far been recorded set the values as follows
304 * <ul>
305 * <li><tt>inOut[0] = Double.POSITIVE_INFINITY</tt> as the old minimum. <li><tt>inOut[1] =
306 * Double.NEGATIVE_INFINITY</tt> as the old maximum. <li><tt>inOut[2] = 0.0</tt> as the old sum. <li><tt>
307 * inOut[3] = 0.0</tt> as the old sum of squares.
308 * </ul>
309 */
310 public static void incrementalUpdate(DoubleArrayList data, int from, int to, double[] inOut) {
311 throw new UnsupportedOperationException("incrementalUpdate not supported with missing values");
312 }
313
314 /**
315 * <b>Not supported. </b>
316 *
317 * @param data DoubleArrayList
318 * @param from int
319 * @param to int
320 * @param fromSumIndex int
321 * @param toSumIndex int
322 * @param sumOfPowers double[]
323 */
324 public static void incrementalUpdateSumsOfPowers(DoubleArrayList data, int from, int to, int fromSumIndex, int toSumIndex, double[] sumOfPowers) {
325 throw new UnsupportedOperationException("incrementalUpdateSumsOfPowers not supported with missing values");
326 }
327
328 /**
329 * <b>Not supported. </b>
330 *
331 * @param data DoubleArrayList
332 * @param weights DoubleArrayList
333 * @param from int
334 * @param to int
335 * @param inOut double[]
336 */
337 public static void incrementalWeightedUpdate(DoubleArrayList data, DoubleArrayList weights, int from, int to, double[] inOut) {
338 throw new UnsupportedOperationException("incrementalWeightedUpdate not supported with missing values");
339 }
340
341 /**
342 * Returns the kurtosis (aka excess) of a data sequence.
343 *
344 * @param moment4 the fourth central moment, which is <tt>moment(data,4,mean)</tt>.
345 * @param standardDeviation the standardDeviation.
346 * @return double
347 */
348 public static double kurtosis(double moment4, double standardDeviation) {
349 return -3 + moment4 / (standardDeviation * standardDeviation * standardDeviation * standardDeviation);
350 }
351
352 /**
353 * Returns the kurtosis (aka excess) of a data sequence, which is <tt>-3 +
354 * moment(data,4,mean) / standardDeviation<sup>4</sup></tt>.
355 *
356 * @param data DoubleArrayList
357 * @param mean double
358 * @param standardDeviation double
359 * @return double
360 */
361 public static double kurtosis(DoubleArrayList data, double mean, double standardDeviation) {
362 return kurtosis(moment(data, 4, mean), standardDeviation);
363 }
364
365 /**
366 * <b>Not supported. </b>
367 *
368 * @param data DoubleArrayList
369 * @param mean double
370 * @return double
371 */
372 public static double lag1(DoubleArrayList data, double mean) {
373 throw new UnsupportedOperationException("lag1 not supported with missing values");
374 }
375
376 /**
377 * @param dat
378 * @return the median absolute deviation from the median.
379 */
380 public static double mad(DoubleArrayList dat) {
381 DoubleArrayList copy = removeMissing(dat.copy());
382
383 double median = median(copy);
384
385 DoubleArrayList devsum = new DoubleArrayList(copy.size());
386 for (int i = 0; i < copy.size(); i++) {
387 double v = copy.getQuick(i);
388 devsum.add(Math.abs(v - median));
389 }
390
391 devsum.sort();
392 return Descriptive.median(devsum);
393 }
394
395 public static double max(DoubleArrayList input) {
396 int size = input.size();
397 if (size == 0) throw new IllegalArgumentException();
398
399 double[] elements = input.elements();
400 double max = Double.MIN_VALUE;
401 for (int i = 0; i < size; i++) {
402 if (Double.isNaN(elements[i])) continue;
403 if (elements[i] > max) max = elements[i];
404 }
405
406 return max;
407 }
408
409 /**
410 * Special mean calculation where we use the effective size as an input.
411 *
412 * @param elements The data double array.
413 * @param effectiveSize The effective size used for the mean calculation.
414 * @return double
415 */
416 public static double mean(double[] elements, int effectiveSize) {
417
418 int length = elements.length;
419
420 if (0 == effectiveSize) {
421 return Double.NaN;
422 }
423
424 double sum = 0.0;
425 int i, count;
426 count = 0;
427 double value;
428 for (i = 0; i < length; i++) {
429 value = elements[i];
430 if (Double.isNaN(value)) {
431 continue;
432 }
433 sum += value;
434 count++;
435 }
436 if (0.0 == count) {
437 return Double.NaN;
438 }
439 return sum / effectiveSize;
440 }
441
442 /**
443 * @param data Values to be analyzed.
444 * @return Mean of the values in x. Missing values are ignored in the analysis.
445 */
446 public static double mean(DoubleArrayList data) {
447 return sum(data) / sizeWithoutMissingValues(data);
448 }
449
450 /**
451 * Special mean calculation where we use the effective size as an input.
452 *
453 * @param x The data
454 * @param effectiveSize The effective size used for the mean calculation.
455 * @return double
456 */
457 public static double mean(DoubleArrayList x, int effectiveSize) {
458
459 int length = x.size();
460
461 if (0 == effectiveSize) {
462 return Double.NaN;
463 }
464
465 double sum = 0.0;
466 int i, count;
467 count = 0;
468 double value;
469 double[] elements = x.elements();
470 for (i = 0; i < length; i++) {
471 value = elements[i];
472 if (Double.isNaN(value)) {
473 continue;
474 }
475 sum += value;
476 count++;
477 }
478 if (0.0 == count) {
479 return Double.NaN;
480 }
481 return sum / effectiveSize;
482
483 }
484
485 /**
486 * Calculate the mean of the values above to a particular quantile of an array.
487 *
488 * @param quantile A value from 0 to 1
489 * @param array Array for which we want to get the quantile.
490 * @return double
491 */
492 public static double meanAboveQuantile(double quantile, DoubleArrayList array) {
493
494 if (quantile < 0.0 || quantile > 1.0) {
495 throw new IllegalArgumentException("Quantile must be between 0 and 1");
496 }
497
498 double returnvalue = 0.0;
499 int k = 0;
500
501 double quantileValue = DescriptiveWithMissing.quantile(array, quantile);
502
503 for (int i = 0; i < array.size(); i++) {
504 if (array.get(i) > quantileValue) {
505 returnvalue += array.get(i);
506 k++;
507 }
508 }
509
510 if (k == 0) {
511 throw new ArithmeticException("No values found above quantile");
512 }
513
514 return returnvalue / k;
515 }
516
517 /**
518 * Returns the median. Missing values are ignored entirely.
519 *
520 * @param data the data sequence, does not have to be sorted.
521 * @return double
522 */
523 public static double median(DoubleArrayList data) {
524 return DescriptiveWithMissing.quantile(data, 0.5);
525 }
526
527 public static double min(DoubleArrayList input) {
528 int size = input.size();
529 if (size == 0) throw new IllegalArgumentException();
530
531 double[] elements = input.elements();
532 double min = Double.MAX_VALUE;
533 for (int i = 0; i < size; i++) {
534 if (Double.isNaN(elements[i])) continue;
535 if (elements[i] < min) min = elements[i];
536 }
537
538 return min;
539 }
540
541 /**
542 * Returns the moment of <tt>k</tt> -th order with constant <tt>c</tt> of a data sequence, which is
543 * <tt>Sum( (data[i]-c)<sup>k</sup> ) /
544 * data.size()</tt>.
545 *
546 * @param data DoubleArrayList
547 * @param k int
548 * @param c double
549 * @return double
550 */
551 public static double moment(DoubleArrayList data, int k, double c) {
552 return sumOfPowerDeviations(data, k, c) / sizeWithoutMissingValues(data);
553 }
554
555 /**
556 * Returns the product of a data sequence, which is <tt>Prod( data[i] )</tt>. Missing values are ignored. In other
557 * words: <tt>data[0]*data[1]*...*data[data.size()-1]</tt>. Note that you may easily get numeric overflows.
558 *
559 * @param data DoubleArrayList
560 * @return double
561 */
562 public static double product(DoubleArrayList data) {
563 int size = data.size();
564 double[] elements = data.elements();
565
566 double product = 1;
567 for (int i = size; --i >= 0; ) {
568 if (Double.isNaN(elements[i])) {
569 continue;
570 }
571 product *= elements[i];
572
573 }
574 return product;
575 }
576
577 /**
578 * Returns the <tt>phi-</tt> quantile; that is, an element <tt>elem</tt> for which holds that <tt>phi</tt> percent
579 * of data elements are less than <tt>elem</tt>. Missing values are ignored. The quantile need not necessarily be
580 * contained in the data sequence, it can be a linear interpolation.
581 *
582 * @param data the data sequence, does not have to be sorted.
583 * @param phi the percentage; must satisfy <tt>0 <= phi <= 1</tt>.
584 * @return double
585 * @todo possibly implement so a copy is not made.
586 */
587 public static double quantile(DoubleArrayList data, double phi) {
588 DoubleArrayList sortedData = removeMissing(data.copy());
589 sortedData.sort();
590 return Descriptive.quantile(sortedData, phi);
591 }
592
593 /**
594 * Returns how many percent of the elements contained in the receiver are <tt><= element</tt>. Does linear
595 * interpolation if the element is not contained but lies in between two contained elements. Missing values are
596 * ignored.
597 *
598 * @param data the list to be searched
599 * @param element the element to search for.
600 * @return the percentage <tt>phi</tt> of elements <tt><= element</tt>(<tt>0.0 <= phi <= 1.0)</tt>.
601 */
602 public static double quantileInverse(DoubleArrayList data, double element) {
603 DoubleArrayList sortedData = removeMissing(data.copy());
604 sortedData.sort();
605 return rankInterpolated(sortedData, element) / sortedData.size();
606 }
607
608 /**
609 * Returns the quantiles of the specified percentages. The quantiles need not necessarily be contained in the data
610 * sequence, it can be a linear interpolation.
611 *
612 * @param sortedData the data sequence; <b>must be sorted ascending </b>.
613 * @param percentages the percentages for which quantiles are to be computed. Each percentage must be in the
614 * interval <tt>[0.0,1.0]</tt>.
615 * @return the quantiles.
616 */
617 public static DoubleArrayList quantiles(DoubleArrayList sortedData, DoubleArrayList percentages) {
618 int s = percentages.size();
619 DoubleArrayList quantiles = new DoubleArrayList(s);
620
621 for (int i = 0; i < s; i++) {
622 quantiles.add(quantile(sortedData, percentages.get(i)));
623 }
624
625 return quantiles;
626 }
627
628 /**
629 * Returns the linearly interpolated number of elements in a list less or equal to a given element. Missing values
630 * are ignored. The rank is the number of elements <= element. Ranks are of the form
631 * <tt>{0, 1, 2,..., sortedList.size()}</tt>. If no element is <= element, then the rank is zero. If the element
632 * lies in between two contained elements, then linear interpolation is used and a non integer value is returned.
633 *
634 * @param sortedList the list to be searched (must be sorted ascending).
635 * @param element the element to search for.
636 * @return the rank of the element.
637 * @todo possibly implement so a copy is not made.
638 */
639 public static double rankInterpolated(DoubleArrayList sortedList, double element) {
640 return Descriptive.rankInterpolated(removeMissing(sortedList), element);
641 }
642
643 /**
644 * Makes a copy of the list that doesn't have the missing values.
645 *
646 * @param data DoubleArrayList
647 * @return DoubleArrayList
648 */
649 public static DoubleArrayList removeMissing(DoubleArrayList data) {
650 DoubleArrayList r = new DoubleArrayList(sizeWithoutMissingValues(data));
651 double[] elements = data.elements();
652 int size = data.size();
653 for (int i = 0; i < size; i++) {
654 if (Double.isNaN(elements[i])) {
655 continue;
656 }
657 r.add(elements[i]);
658 }
659 return r;
660 }
661
662 /**
663 * Returns the sample kurtosis (aka excess) of a data sequence.
664 *
665 * @param data DoubleArrayList
666 * @param mean double
667 * @param sampleVariance double
668 * @return double
669 */
670 public static double sampleKurtosis(DoubleArrayList data, double mean, double sampleVariance) {
671 return sampleKurtosis(sizeWithoutMissingValues(data), moment(data, 4, mean), sampleVariance);
672 }
673
674 /**
675 * Returns the sample skew of a data sequence.
676 *
677 * @param data DoubleArrayList
678 * @param mean double
679 * @param sampleVariance double
680 * @return double
681 */
682 public static double sampleSkew(DoubleArrayList data, double mean, double sampleVariance) {
683 return sampleSkew(sizeWithoutMissingValues(data), moment(data, 3, mean), sampleVariance);
684 }
685
686 /**
687 * Returns the sample standard deviation.
688 * <p>
689 * This is included for compatibility with the superclass, but does not implement the correction used there.
690 *
691 * @param size the number of elements of the data sequence.
692 * @param sampleVariance the <b>sample variance </b>.
693 * @see cern.jet.stat.Descriptive#sampleStandardDeviation(int, double)
694 */
695 public static double sampleStandardDeviation(int size, double sampleVariance) {
696 return Math.sqrt(sampleVariance);
697 }
698
699 /**
700 * Returns the sample variance of a data sequence. That is <tt>Sum (
701 * (data[i]-mean)^2 ) / (data.size()-1)</tt>.
702 *
703 * @param data DoubleArrayList
704 * @param mean double
705 * @return double
706 */
707 public static double sampleVariance(DoubleArrayList data, double mean) {
708 double[] elements = data.elements();
709 int effsize = sizeWithoutMissingValues(data);
710 int size = data.size();
711 double sum = 0;
712 // find the sum of the squares
713 for (int i = size; --i >= 0; ) {
714 if (Double.isNaN(elements[i])) {
715 continue;
716 }
717 double delta = elements[i] - mean;
718 sum += delta * delta;
719 }
720
721 return sum / (effsize - 1);
722 }
723
724 /**
725 * Return the size of the list, ignoring missing values.
726 *
727 * @param list DoubleArrayList
728 * @return int
729 */
730 public static int sizeWithoutMissingValues(DoubleArrayList list) {
731
732 int size = 0;
733 for (int i = 0; i < list.size(); i++) {
734 if (!Double.isNaN(list.get(i))) {
735 size++;
736 }
737 }
738 return size;
739 }
740
741 /**
742 * Returns the skew of a data sequence, which is <tt>moment(data,3,mean) /
743 * standardDeviation<sup>3</sup></tt>.
744 *
745 * @param data DoubleArrayList
746 * @param mean double
747 * @param standardDeviation double
748 * @return double
749 */
750 public static double skew(DoubleArrayList data, double mean, double standardDeviation) {
751 return skew(moment(data, 3, mean), standardDeviation);
752 }
753
754 /**
755 * Standardize. Note that this does something slightly different than standardize in the superclass, because our
756 * sampleStandardDeviation does not use the correction of the superclass (which isn't really standard).
757 *
758 * @param data DoubleArrayList
759 */
760 public static void standardize(DoubleArrayList data) {
761 double mean = mean(data);
762 double stdev = Math.sqrt(sampleVariance(data, mean));
763 DescriptiveWithMissing.standardize(data, mean, stdev);
764 }
765
766 /**
767 * Modifies a data sequence to be standardized. Missing values are ignored. Changes each element <tt>data[i]</tt> as
768 * follows: <tt>data[i] = (data[i]-mean)/standardDeviation</tt> unless the standard deviation is 0.00 or very close to zero (indicating the data are constant),
769 * in which case we return a vector of zeros (in effect just doing mean subtraction)
770 *
771 * @param data DoubleArrayList
772 * @param mean mean of data
773 * @param standardDeviation stdev of data. |stdev| < Constants.TINY is treated as zero.
774 */
775 public static void standardize(DoubleArrayList data, double mean, double standardDeviation) {
776
777 if (Math.abs(0.0 - standardDeviation) < Constants.TINY) {
778 for (int i = 0; i < data.size(); i++) {
779 if (!Double.isNaN(data.get(i))) {
780 data.set(i, 0.0);
781 }
782 }
783 return;
784 }
785
786 double[] elements = data.elements();
787
788 for (int i = 0; i < elements.length; i++) {
789 if (Double.isNaN(elements[i])) {
790 continue;
791 }
792 elements[i] = (elements[i] - mean) / standardDeviation;
793 }
794 }
795
796 /**
797 * Returns the sum of a data sequence. That is <tt>Sum( data[i] )</tt>.
798 *
799 * @param data DoubleArrayList
800 * @return double
801 */
802 public static double sum(DoubleArrayList data) {
803 return sumOfPowerDeviations(data, 1, 0.0);
804 }
805
806 /**
807 * Returns the sum of inversions of a data sequence, which is <tt>Sum( 1.0 /
808 * data[i])</tt>.
809 *
810 * @param data the data sequence.
811 * @param from the index of the first data element (inclusive).
812 * @param to the index of the last data element (inclusive).
813 * @return double
814 */
815 public static double sumOfInversions(DoubleArrayList data, int from, int to) {
816 return sumOfPowerDeviations(data, -1, 0.0, from, to);
817 }
818
819 /**
820 * Returns the sum of logarithms of a data sequence, which is <tt>Sum(
821 * Log(data[i])</tt>. Missing values are ignored.
822 *
823 * @param data the data sequence.
824 * @param from the index of the first data element (inclusive).
825 * @param to the index of the last data element (inclusive).
826 * @return double
827 */
828 public static double sumOfLogarithms(DoubleArrayList data, int from, int to) {
829 double[] elements = data.elements();
830 double logsum = 0;
831 for (int i = from - 1; ++i <= to; ) {
832 if (Double.isNaN(elements[i])) {
833 continue;
834 }
835 logsum += Math.log(elements[i]);
836 }
837 return logsum;
838 }
839
840 /**
841 * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or
842 * <tt>k == -2 .. 4</tt>.
843 *
844 * @param data DoubleArrayList
845 * @param k int
846 * @param c double
847 * @return double
848 */
849 public static double sumOfPowerDeviations(DoubleArrayList data, int k, double c) {
850 return sumOfPowerDeviations(data, k, c, 0, data.size() - 1);
851 }
852
853 /**
854 * Returns <tt>Sum( (data[i]-c)<sup>k</sup> )</tt> for all <tt>i = from ..
855 * to</tt>; optimized for common parameters like <tt>c == 0.0</tt> and/or <tt>k == -2 .. 5</tt>. Missing values are
856 * ignored.
857 *
858 * @param data DoubleArrayList
859 * @param k int
860 * @param c double
861 * @param from int
862 * @param to int
863 * @return double
864 */
865 public static double sumOfPowerDeviations(final DoubleArrayList data, final int k, final double c, final int from, final int to) {
866 final double[] elements = data.elements();
867 double sum = 0;
868 double v;
869 int i;
870 switch (k) { // optimized for speed
871 case -2:
872 if (c == 0.0) {
873 for (i = from - 1; ++i <= to; ) {
874 if (Double.isNaN(elements[i])) {
875 continue;
876 }
877 v = elements[i];
878 sum += 1 / (v * v);
879 }
880 } else {
881 for (i = from - 1; ++i <= to; ) {
882 if (Double.isNaN(elements[i])) {
883 continue;
884 }
885 v = elements[i] - c;
886 sum += 1 / (v * v);
887 }
888 }
889 break;
890 case -1:
891 if (c == 0.0) {
892 for (i = from - 1; ++i <= to; ) {
893 if (Double.isNaN(elements[i])) {
894 continue;
895 }
896 sum += 1 / elements[i];
897 }
898 } else {
899 for (i = from - 1; ++i <= to; ) {
900 if (Double.isNaN(elements[i])) {
901 continue;
902 }
903 sum += 1 / (elements[i] - c);
904 }
905 }
906 break;
907 case 0:
908 for (i = from - 1; ++i <= to; ) {
909 if (Double.isNaN(elements[i])) {
910 continue;
911 }
912 sum += 1;
913 }
914 break;
915 case 1:
916 if (c == 0.0) {
917 for (i = from - 1; ++i <= to; ) {
918 if (Double.isNaN(elements[i])) {
919 continue;
920 }
921 sum += elements[i];
922 }
923 } else {
924 for (i = from - 1; ++i <= to; ) {
925 if (Double.isNaN(elements[i])) {
926 continue;
927 }
928 sum += elements[i] - c;
929 }
930 }
931 break;
932 case 2:
933 if (c == 0.0) {
934 for (i = from - 1; ++i <= to; ) {
935 if (Double.isNaN(elements[i])) {
936 continue;
937 }
938 v = elements[i];
939 sum += v * v;
940 }
941 } else {
942 for (i = from - 1; ++i <= to; ) {
943 if (Double.isNaN(elements[i])) {
944 continue;
945 }
946 v = elements[i] - c;
947 sum += v * v;
948 }
949 }
950 break;
951 case 3:
952 if (c == 0.0) {
953 for (i = from - 1; ++i <= to; ) {
954 if (Double.isNaN(elements[i])) {
955 continue;
956 }
957 v = elements[i];
958 sum += v * v * v;
959 }
960 } else {
961 for (i = from - 1; ++i <= to; ) {
962 if (Double.isNaN(elements[i])) {
963 continue;
964 }
965 v = elements[i] - c;
966 sum += v * v * v;
967 }
968 }
969 break;
970 case 4:
971 if (c == 0.0) {
972 for (i = from - 1; ++i <= to; ) {
973 if (Double.isNaN(elements[i])) {
974 continue;
975 }
976 v = elements[i];
977 sum += v * v * v * v;
978 }
979 } else {
980 for (i = from - 1; ++i <= to; ) {
981 if (Double.isNaN(elements[i])) {
982 continue;
983 }
984 v = elements[i] - c;
985 sum += v * v * v * v;
986 }
987 }
988 break;
989 case 5:
990 if (c == 0.0) {
991 for (i = from - 1; ++i <= to; ) {
992 if (Double.isNaN(elements[i])) {
993 continue;
994 }
995 v = elements[i];
996 sum += v * v * v * v * v;
997 }
998 } else {
999 for (i = from - 1; ++i <= to; ) {
1000 if (Double.isNaN(elements[i])) {
1001 continue;
1002 }
1003 v = elements[i] - c;
1004 sum += v * v * v * v * v;
1005 }
1006 }
1007 break;
1008 default:
1009 for (i = from - 1; ++i <= to; ) {
1010 if (Double.isNaN(elements[i])) {
1011 continue;
1012 }
1013 sum += Math.pow(elements[i] - c, k);
1014 }
1015 break;
1016 }
1017 return sum;
1018 }
1019
1020 /**
1021 * Returns the sum of powers of a data sequence, which is <tt>Sum (
1022 * data[i]<sup>k</sup> )</tt>.
1023 *
1024 * @param data DoubleArrayList
1025 * @param k int
1026 * @return double
1027 */
1028 public static double sumOfPowers(DoubleArrayList data, int k) {
1029 return sumOfPowerDeviations(data, k, 0);
1030 }
1031
1032 /**
1033 * Compute the sum of the squared deviations from the mean of a data sequence. Missing values are ignored.
1034 *
1035 * @param data DoubleArrayList
1036 * @return double
1037 */
1038 public static double sumOfSquaredDeviations(DoubleArrayList data) {
1039 return sumOfSquaredDeviations(sizeWithoutMissingValues(data), variance(sizeWithoutMissingValues(data), sum(data), sumOfSquares(data)));
1040 }
1041
1042 /**
1043 * Returns the sum of squares of a data sequence. Skips missing values.
1044 *
1045 * @param data DoubleArrayList
1046 * @return double
1047 */
1048 public static double sumOfSquares(DoubleArrayList data) {
1049 return sumOfPowerDeviations(data, 2, 0.0);
1050 }
1051
1052 /**
1053 * Returns the trimmed mean of a sorted data sequence. Missing values are completely ignored.
1054 *
1055 * @param sortedData the data sequence; <b>must be sorted ascending </b>.
1056 * @param mean the mean of the (full) sorted data sequence.
1057 * @param left int the number of leading elements to trim.
1058 * @param right int number of trailing elements to trim.
1059 * @return double
1060 */
1061 public static double trimmedMean(DoubleArrayList sortedData, double mean, int left, int right) {
1062 return Descriptive.trimmedMean(removeMissing(sortedData), mean, left, right);
1063 }
1064
1065 /**
1066 * Provided for convenience!
1067 *
1068 * @param data DoubleArrayList
1069 * @return double
1070 */
1071 public static double variance(DoubleArrayList data) {
1072 return variance(sizeWithoutMissingValues(data), sum(data), sumOfSquares(data));
1073 }
1074
1075 /**
1076 *
1077 */
1078 public static double variance(int sizeWithoutMissing, double sum, double sumOfSquares) {
1079 return Descriptive.variance(sizeWithoutMissing, sum, sumOfSquares);
1080 }
1081
1082 /**
1083 * Returns the weighted mean of a data sequence. That is <tt> Sum (data[i] *
1084 * weights[i]) / Sum ( weights[i] )</tt>.
1085 *
1086 * @param data DoubleArrayList
1087 * @param weights DoubleArrayList
1088 * @return double
1089 */
1090 public static double weightedMean(DoubleArrayList data, DoubleArrayList weights) {
1091 int size = data.size();
1092 if (size != weights.size() || size == 0) {
1093 throw new IllegalArgumentException();
1094 }
1095
1096 double[] elements = data.elements();
1097 double[] theWeights = weights.elements();
1098 double sum = 0.0;
1099 double weightsSum = 0.0;
1100 for (int i = size; --i >= 0; ) {
1101 double w = theWeights[i];
1102 if (Double.isNaN(elements[i]) || Double.isNaN(w)) {
1103 continue;
1104 }
1105 sum += elements[i] * w;
1106 weightsSum += w;
1107 }
1108
1109 return sum / weightsSum;
1110 }
1111
1112 /**
1113 * Returns the winsorized mean of a sorted data sequence.
1114 *
1115 * @param sortedData DoubleArrayList, must already be sorted ascending
1116 * @param mean the mean of the (full) sorted data sequence.
1117 * @param left the number of leading elements to trim. Refers to the number of elements to trim
1118 * <em>excluding any missing values</em>
1119 * @param right the number of trailing elements to trim <em>excluding any missing values</em>
1120 * @return double
1121 */
1122 public static double winsorizedMean(DoubleArrayList sortedData, double mean, int left, int right) {
1123 DoubleArrayList sdm = removeMissing(sortedData);
1124 return Descriptive.winsorizedMean(sdm, mean(sdm), left, right);
1125 }
1126
1127 /* private methods */
1128
1129 private DescriptiveWithMissing() {
1130 }
1131
1132 } // end of class