1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package ubic.basecode.math;
20
21 import cern.colt.list.DoubleArrayList;
22 import cern.jet.stat.Descriptive;
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46 public class DescriptiveWithMissing extends cern.jet.stat.Descriptive {
47
48
49
50
51
52
53
54
55
56
57 public static double autoCorrelation(DoubleArrayList data, int lag, double mean, double variance) {
58 throw new UnsupportedOperationException("autoCorrelation not supported with missing values");
59 }
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77 public static double correlation(double[] x, double[] y, double[] selfSquaredX, double[] selfSquaredY, boolean[] nanStatusX, boolean[] nanStatusY) {
78
79 double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
80 int numused = 0;
81 syy = 0.0;
82 sxy = 0.0;
83 sxx = 0.0;
84 sx = 0.0;
85 sy = 0.0;
86
87 int length = x.length;
88 for (int j = 0; j < length; j++) {
89 xj = x[j];
90 yj = y[j];
91
92 if (nanStatusX[j] || nanStatusY[j]) {
93 continue;
94 }
95 sx += xj;
96 sy += yj;
97 sxy += xj * yj;
98 sxx += selfSquaredX[j];
99 syy += selfSquaredY[j];
100 numused++;
101 }
102
103 if (numused > 0) {
104 ay = sy / numused;
105 ax = sx / numused;
106 return (sxy - sx * ay) / Math.sqrt((sxx - sx * ax) * (syy - sy * ay));
107 }
108 return Double.NaN;
109 }
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124 public static double correlation(DoubleArrayList data1, double standardDev1, DoubleArrayList data2, double standardDev2) {
125 return correlation(data1, data2);
126 }
127
128
129
130
131
132
133
134
135 public static double correlation(DoubleArrayList x, DoubleArrayList y) {
136 int j;
137 double syy, sxy, sxx, sx, sy, xj, yj, ay, ax;
138 int numused;
139 syy = 0.0;
140 sxy = 0.0;
141 sxx = 0.0;
142 sx = 0.0;
143 sy = 0.0;
144 numused = 0;
145 if (x.size() != y.size()) {
146 throw new ArithmeticException("Unequal vector sizes: " + x.size() + " != " + y.size());
147 }
148
149 double[] xel = x.elements();
150 double[] yel = y.elements();
151
152 int length = x.size();
153 for (j = 0; j < length; j++) {
154 xj = xel[j];
155 yj = yel[j];
156
157 if (!Double.isNaN(xj) && !Double.isNaN(yj)) {
158 sx += xj;
159 sy += yj;
160 sxy += xj * yj;
161 sxx += xj * xj;
162 syy += yj * yj;
163 numused++;
164 }
165 }
166
167 if (numused > 0) {
168 ay = sy / numused;
169 ax = sx / numused;
170 return (sxy - sx * ay) / Math.sqrt((sxx - sx * ax) * (syy - sy * ay));
171 }
172 return Double.NaN;
173
174 }
175
176
177
178
179
180
181
182
183
184 public static double covariance(DoubleArrayList data1, DoubleArrayList data2) {
185 int size = data1.size();
186 if (size != data2.size() || size == 0) {
187 throw new IllegalArgumentException();
188 }
189 double[] elements1 = data1.elements();
190 double[] elements2 = data2.elements();
191
192
193
194 int i = 0;
195 double sumx = 0.0, sumy = 0.0, Sxy = 0.0;
196 while (i < size) {
197 sumx = elements1[i];
198 sumy = elements2[i];
199 if (!Double.isNaN(elements1[i]) && !Double.isNaN(elements2[i])) {
200 break;
201 }
202 i++;
203 }
204 i++;
205 int usedPairs = 1;
206 for (; i < size; ++i) {
207 double x = elements1[i];
208 double y = elements2[i];
209 if (Double.isNaN(x) || Double.isNaN(y)) {
210 continue;
211 }
212
213 sumx += x;
214 Sxy += (x - sumx / (usedPairs + 1)) * (y - sumy / usedPairs);
215 sumy += y;
216 usedPairs++;
217 }
218 return Sxy / (usedPairs - 1);
219 }
220
221
222
223
224
225
226
227
228 public static double durbinWatson(DoubleArrayList data) {
229 int size = data.size();
230 if (size < 2) {
231 throw new IllegalArgumentException("data sequence must contain at least two values.");
232 }
233
234 double[] elements = data.elements();
235 double run = 0;
236 double run_sq = 0;
237 int firstNotNaN = 0;
238 while (firstNotNaN < size) {
239 run_sq = elements[firstNotNaN] * elements[firstNotNaN];
240
241 if (!Double.isNaN(elements[firstNotNaN])) {
242 break;
243 }
244
245 firstNotNaN++;
246 }
247
248 if (firstNotNaN > 0 && size - firstNotNaN < 2) {
249 throw new IllegalArgumentException("data sequence must contain at least two non-missing values.");
250
251 }
252
253 for (int i = firstNotNaN + 1; i < size; i++) {
254 int gap = 1;
255 while (i < size && Double.isNaN(elements[i])) {
256 gap++;
257 i++;
258 continue;
259 }
260 if (i >= size) continue;
261 double x = elements[i] - elements[i - gap];
262
263 run += x * x;
264 run_sq += elements[i] * elements[i];
265 }
266
267 return run / run_sq;
268 }
269
270
271
272
273
274
275
276
277
278
279
280
281 public static double geometricMean(DoubleArrayList data) {
282 return geometricMean(sizeWithoutMissingValues(data), sumOfLogarithms(data, 0, data.size() - 1));
283 }
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310 public static void incrementalUpdate(DoubleArrayList data, int from, int to, double[] inOut) {
311 throw new UnsupportedOperationException("incrementalUpdate not supported with missing values");
312 }
313
314
315
316
317
318
319
320
321
322
323
324 public static void incrementalUpdateSumsOfPowers(DoubleArrayList data, int from, int to, int fromSumIndex, int toSumIndex, double[] sumOfPowers) {
325 throw new UnsupportedOperationException("incrementalUpdateSumsOfPowers not supported with missing values");
326 }
327
328
329
330
331
332
333
334
335
336
337 public static void incrementalWeightedUpdate(DoubleArrayList data, DoubleArrayList weights, int from, int to, double[] inOut) {
338 throw new UnsupportedOperationException("incrementalWeightedUpdate not supported with missing values");
339 }
340
341
342
343
344
345
346
347
348 public static double kurtosis(double moment4, double standardDeviation) {
349 return -3 + moment4 / (standardDeviation * standardDeviation * standardDeviation * standardDeviation);
350 }
351
352
353
354
355
356
357
358
359
360
361 public static double kurtosis(DoubleArrayList data, double mean, double standardDeviation) {
362 return kurtosis(moment(data, 4, mean), standardDeviation);
363 }
364
365
366
367
368
369
370
371
372 public static double lag1(DoubleArrayList data, double mean) {
373 throw new UnsupportedOperationException("lag1 not supported with missing values");
374 }
375
376
377
378
379
380 public static double mad(DoubleArrayList dat) {
381 DoubleArrayList copy = removeMissing(dat.copy());
382
383 double median = median(copy);
384
385 DoubleArrayList devsum = new DoubleArrayList(copy.size());
386 for (int i = 0; i < copy.size(); i++) {
387 double v = copy.getQuick(i);
388 devsum.add(Math.abs(v - median));
389 }
390
391 devsum.sort();
392 return Descriptive.median(devsum);
393 }
394
395 public static double max(DoubleArrayList input) {
396 int size = input.size();
397 if (size == 0) throw new IllegalArgumentException();
398
399 double[] elements = input.elements();
400 double max = Double.MIN_VALUE;
401 for (int i = 0; i < size; i++) {
402 if (Double.isNaN(elements[i])) continue;
403 if (elements[i] > max) max = elements[i];
404 }
405
406 return max;
407 }
408
409
410
411
412
413
414
415
416 public static double mean(double[] elements, int effectiveSize) {
417
418 int length = elements.length;
419
420 if (0 == effectiveSize) {
421 return Double.NaN;
422 }
423
424 double sum = 0.0;
425 int i, count;
426 count = 0;
427 double value;
428 for (i = 0; i < length; i++) {
429 value = elements[i];
430 if (Double.isNaN(value)) {
431 continue;
432 }
433 sum += value;
434 count++;
435 }
436 if (0.0 == count) {
437 return Double.NaN;
438 }
439 return sum / effectiveSize;
440 }
441
442
443
444
445
446 public static double mean(DoubleArrayList data) {
447 return sum(data) / sizeWithoutMissingValues(data);
448 }
449
450
451
452
453
454
455
456
457 public static double mean(DoubleArrayList x, int effectiveSize) {
458
459 int length = x.size();
460
461 if (0 == effectiveSize) {
462 return Double.NaN;
463 }
464
465 double sum = 0.0;
466 int i, count;
467 count = 0;
468 double value;
469 double[] elements = x.elements();
470 for (i = 0; i < length; i++) {
471 value = elements[i];
472 if (Double.isNaN(value)) {
473 continue;
474 }
475 sum += value;
476 count++;
477 }
478 if (0.0 == count) {
479 return Double.NaN;
480 }
481 return sum / effectiveSize;
482
483 }
484
485
486
487
488
489
490
491
492 public static double meanAboveQuantile(double quantile, DoubleArrayList array) {
493
494 if (quantile < 0.0 || quantile > 1.0) {
495 throw new IllegalArgumentException("Quantile must be between 0 and 1");
496 }
497
498 double returnvalue = 0.0;
499 int k = 0;
500
501 double quantileValue = DescriptiveWithMissing.quantile(array, quantile);
502
503 for (int i = 0; i < array.size(); i++) {
504 if (array.get(i) > quantileValue) {
505 returnvalue += array.get(i);
506 k++;
507 }
508 }
509
510 if (k == 0) {
511 throw new ArithmeticException("No values found above quantile");
512 }
513
514 return returnvalue / k;
515 }
516
517
518
519
520
521
522
523 public static double median(DoubleArrayList data) {
524 return DescriptiveWithMissing.quantile(data, 0.5);
525 }
526
527 public static double min(DoubleArrayList input) {
528 int size = input.size();
529 if (size == 0) throw new IllegalArgumentException();
530
531 double[] elements = input.elements();
532 double min = Double.MAX_VALUE;
533 for (int i = 0; i < size; i++) {
534 if (Double.isNaN(elements[i])) continue;
535 if (elements[i] < min) min = elements[i];
536 }
537
538 return min;
539 }
540
541
542
543
544
545
546
547
548
549
550
551 public static double moment(DoubleArrayList data, int k, double c) {
552 return sumOfPowerDeviations(data, k, c) / sizeWithoutMissingValues(data);
553 }
554
555
556
557
558
559
560
561
562 public static double product(DoubleArrayList data) {
563 int size = data.size();
564 double[] elements = data.elements();
565
566 double product = 1;
567 for (int i = size; --i >= 0; ) {
568 if (Double.isNaN(elements[i])) {
569 continue;
570 }
571 product *= elements[i];
572
573 }
574 return product;
575 }
576
577
578
579
580
581
582
583
584
585
586
587 public static double quantile(DoubleArrayList data, double phi) {
588 DoubleArrayList sortedData = removeMissing(data.copy());
589 sortedData.sort();
590 return Descriptive.quantile(sortedData, phi);
591 }
592
593
594
595
596
597
598
599
600
601
602 public static double quantileInverse(DoubleArrayList data, double element) {
603 DoubleArrayList sortedData = removeMissing(data.copy());
604 sortedData.sort();
605 return rankInterpolated(sortedData, element) / sortedData.size();
606 }
607
608
609
610
611
612
613
614
615
616
617 public static DoubleArrayList quantiles(DoubleArrayList sortedData, DoubleArrayList percentages) {
618 int s = percentages.size();
619 DoubleArrayList quantiles = new DoubleArrayList(s);
620
621 for (int i = 0; i < s; i++) {
622 quantiles.add(quantile(sortedData, percentages.get(i)));
623 }
624
625 return quantiles;
626 }
627
628
629
630
631
632
633
634
635
636
637
638
639 public static double rankInterpolated(DoubleArrayList sortedList, double element) {
640 return Descriptive.rankInterpolated(removeMissing(sortedList), element);
641 }
642
643
644
645
646
647
648
649 public static DoubleArrayList removeMissing(DoubleArrayList data) {
650 DoubleArrayList r = new DoubleArrayList(sizeWithoutMissingValues(data));
651 double[] elements = data.elements();
652 int size = data.size();
653 for (int i = 0; i < size; i++) {
654 if (Double.isNaN(elements[i])) {
655 continue;
656 }
657 r.add(elements[i]);
658 }
659 return r;
660 }
661
662
663
664
665
666
667
668
669
670 public static double sampleKurtosis(DoubleArrayList data, double mean, double sampleVariance) {
671 return sampleKurtosis(sizeWithoutMissingValues(data), moment(data, 4, mean), sampleVariance);
672 }
673
674
675
676
677
678
679
680
681
682 public static double sampleSkew(DoubleArrayList data, double mean, double sampleVariance) {
683 return sampleSkew(sizeWithoutMissingValues(data), moment(data, 3, mean), sampleVariance);
684 }
685
686
687
688
689
690
691
692
693
694
695 public static double sampleStandardDeviation(int size, double sampleVariance) {
696 return Math.sqrt(sampleVariance);
697 }
698
699
700
701
702
703
704
705
706
707 public static double sampleVariance(DoubleArrayList data, double mean) {
708 double[] elements = data.elements();
709 int effsize = sizeWithoutMissingValues(data);
710 int size = data.size();
711 double sum = 0;
712
713 for (int i = size; --i >= 0; ) {
714 if (Double.isNaN(elements[i])) {
715 continue;
716 }
717 double delta = elements[i] - mean;
718 sum += delta * delta;
719 }
720
721 return sum / (effsize - 1);
722 }
723
724
725
726
727
728
729
730 public static int sizeWithoutMissingValues(DoubleArrayList list) {
731
732 int size = 0;
733 for (int i = 0; i < list.size(); i++) {
734 if (!Double.isNaN(list.get(i))) {
735 size++;
736 }
737 }
738 return size;
739 }
740
741
742
743
744
745
746
747
748
749
750 public static double skew(DoubleArrayList data, double mean, double standardDeviation) {
751 return skew(moment(data, 3, mean), standardDeviation);
752 }
753
754
755
756
757
758
759
760 public static void standardize(DoubleArrayList data) {
761 double mean = mean(data);
762 double stdev = Math.sqrt(sampleVariance(data, mean));
763 DescriptiveWithMissing.standardize(data, mean, stdev);
764 }
765
766
767
768
769
770
771
772
773
774
775 public static void standardize(DoubleArrayList data, double mean, double standardDeviation) {
776
777 if (Math.abs(0.0 - standardDeviation) < Constants.TINY) {
778 for (int i = 0; i < data.size(); i++) {
779 if (!Double.isNaN(data.get(i))) {
780 data.set(i, 0.0);
781 }
782 }
783 return;
784 }
785
786 double[] elements = data.elements();
787
788 for (int i = 0; i < elements.length; i++) {
789 if (Double.isNaN(elements[i])) {
790 continue;
791 }
792 elements[i] = (elements[i] - mean) / standardDeviation;
793 }
794 }
795
796
797
798
799
800
801
802 public static double sum(DoubleArrayList data) {
803 return sumOfPowerDeviations(data, 1, 0.0);
804 }
805
806
807
808
809
810
811
812
813
814
815 public static double sumOfInversions(DoubleArrayList data, int from, int to) {
816 return sumOfPowerDeviations(data, -1, 0.0, from, to);
817 }
818
819
820
821
822
823
824
825
826
827
828 public static double sumOfLogarithms(DoubleArrayList data, int from, int to) {
829 double[] elements = data.elements();
830 double logsum = 0;
831 for (int i = from - 1; ++i <= to; ) {
832 if (Double.isNaN(elements[i])) {
833 continue;
834 }
835 logsum += Math.log(elements[i]);
836 }
837 return logsum;
838 }
839
840
841
842
843
844
845
846
847
848
849 public static double sumOfPowerDeviations(DoubleArrayList data, int k, double c) {
850 return sumOfPowerDeviations(data, k, c, 0, data.size() - 1);
851 }
852
853
854
855
856
857
858
859
860
861
862
863
864
865 public static double sumOfPowerDeviations(final DoubleArrayList data, final int k, final double c, final int from, final int to) {
866 final double[] elements = data.elements();
867 double sum = 0;
868 double v;
869 int i;
870 switch (k) {
871 case -2:
872 if (c == 0.0) {
873 for (i = from - 1; ++i <= to; ) {
874 if (Double.isNaN(elements[i])) {
875 continue;
876 }
877 v = elements[i];
878 sum += 1 / (v * v);
879 }
880 } else {
881 for (i = from - 1; ++i <= to; ) {
882 if (Double.isNaN(elements[i])) {
883 continue;
884 }
885 v = elements[i] - c;
886 sum += 1 / (v * v);
887 }
888 }
889 break;
890 case -1:
891 if (c == 0.0) {
892 for (i = from - 1; ++i <= to; ) {
893 if (Double.isNaN(elements[i])) {
894 continue;
895 }
896 sum += 1 / elements[i];
897 }
898 } else {
899 for (i = from - 1; ++i <= to; ) {
900 if (Double.isNaN(elements[i])) {
901 continue;
902 }
903 sum += 1 / (elements[i] - c);
904 }
905 }
906 break;
907 case 0:
908 for (i = from - 1; ++i <= to; ) {
909 if (Double.isNaN(elements[i])) {
910 continue;
911 }
912 sum += 1;
913 }
914 break;
915 case 1:
916 if (c == 0.0) {
917 for (i = from - 1; ++i <= to; ) {
918 if (Double.isNaN(elements[i])) {
919 continue;
920 }
921 sum += elements[i];
922 }
923 } else {
924 for (i = from - 1; ++i <= to; ) {
925 if (Double.isNaN(elements[i])) {
926 continue;
927 }
928 sum += elements[i] - c;
929 }
930 }
931 break;
932 case 2:
933 if (c == 0.0) {
934 for (i = from - 1; ++i <= to; ) {
935 if (Double.isNaN(elements[i])) {
936 continue;
937 }
938 v = elements[i];
939 sum += v * v;
940 }
941 } else {
942 for (i = from - 1; ++i <= to; ) {
943 if (Double.isNaN(elements[i])) {
944 continue;
945 }
946 v = elements[i] - c;
947 sum += v * v;
948 }
949 }
950 break;
951 case 3:
952 if (c == 0.0) {
953 for (i = from - 1; ++i <= to; ) {
954 if (Double.isNaN(elements[i])) {
955 continue;
956 }
957 v = elements[i];
958 sum += v * v * v;
959 }
960 } else {
961 for (i = from - 1; ++i <= to; ) {
962 if (Double.isNaN(elements[i])) {
963 continue;
964 }
965 v = elements[i] - c;
966 sum += v * v * v;
967 }
968 }
969 break;
970 case 4:
971 if (c == 0.0) {
972 for (i = from - 1; ++i <= to; ) {
973 if (Double.isNaN(elements[i])) {
974 continue;
975 }
976 v = elements[i];
977 sum += v * v * v * v;
978 }
979 } else {
980 for (i = from - 1; ++i <= to; ) {
981 if (Double.isNaN(elements[i])) {
982 continue;
983 }
984 v = elements[i] - c;
985 sum += v * v * v * v;
986 }
987 }
988 break;
989 case 5:
990 if (c == 0.0) {
991 for (i = from - 1; ++i <= to; ) {
992 if (Double.isNaN(elements[i])) {
993 continue;
994 }
995 v = elements[i];
996 sum += v * v * v * v * v;
997 }
998 } else {
999 for (i = from - 1; ++i <= to; ) {
1000 if (Double.isNaN(elements[i])) {
1001 continue;
1002 }
1003 v = elements[i] - c;
1004 sum += v * v * v * v * v;
1005 }
1006 }
1007 break;
1008 default:
1009 for (i = from - 1; ++i <= to; ) {
1010 if (Double.isNaN(elements[i])) {
1011 continue;
1012 }
1013 sum += Math.pow(elements[i] - c, k);
1014 }
1015 break;
1016 }
1017 return sum;
1018 }
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028 public static double sumOfPowers(DoubleArrayList data, int k) {
1029 return sumOfPowerDeviations(data, k, 0);
1030 }
1031
1032
1033
1034
1035
1036
1037
1038 public static double sumOfSquaredDeviations(DoubleArrayList data) {
1039 return sumOfSquaredDeviations(sizeWithoutMissingValues(data), variance(sizeWithoutMissingValues(data), sum(data), sumOfSquares(data)));
1040 }
1041
1042
1043
1044
1045
1046
1047
1048 public static double sumOfSquares(DoubleArrayList data) {
1049 return sumOfPowerDeviations(data, 2, 0.0);
1050 }
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061 public static double trimmedMean(DoubleArrayList sortedData, double mean, int left, int right) {
1062 return Descriptive.trimmedMean(removeMissing(sortedData), mean, left, right);
1063 }
1064
1065
1066
1067
1068
1069
1070
1071 public static double variance(DoubleArrayList data) {
1072 return variance(sizeWithoutMissingValues(data), sum(data), sumOfSquares(data));
1073 }
1074
1075
1076
1077
1078 public static double variance(int sizeWithoutMissing, double sum, double sumOfSquares) {
1079 return Descriptive.variance(sizeWithoutMissing, sum, sumOfSquares);
1080 }
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090 public static double weightedMean(DoubleArrayList data, DoubleArrayList weights) {
1091 int size = data.size();
1092 if (size != weights.size() || size == 0) {
1093 throw new IllegalArgumentException();
1094 }
1095
1096 double[] elements = data.elements();
1097 double[] theWeights = weights.elements();
1098 double sum = 0.0;
1099 double weightsSum = 0.0;
1100 for (int i = size; --i >= 0; ) {
1101 double w = theWeights[i];
1102 if (Double.isNaN(elements[i]) || Double.isNaN(w)) {
1103 continue;
1104 }
1105 sum += elements[i] * w;
1106 weightsSum += w;
1107 }
1108
1109 return sum / weightsSum;
1110 }
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122 public static double winsorizedMean(DoubleArrayList sortedData, double mean, int left, int right) {
1123 DoubleArrayList sdm = removeMissing(sortedData);
1124 return Descriptive.winsorizedMean(sdm, mean(sdm), left, right);
1125 }
1126
1127
1128
1129 private DescriptiveWithMissing() {
1130 }
1131
1132 }