1 /*
2 * The baseCode project
3 *
4 * Copyright (c) 2006 University of British Columbia
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19 package ubic.basecode.math;
20
21 import cern.colt.list.DoubleArrayList;
22 import cern.colt.matrix.DoubleMatrix2D;
23 import cern.colt.matrix.impl.SparseDoubleMatrix2D;
24 import cern.jet.math.Arithmetic;
25 import cern.jet.stat.Probability;
26
27 /**
28 * Statistical evaluation and transformation tools for correlations.
29 *
30 * @author Paul Pavlidis
31 *
32 */
33 public class CorrelationStats {
34
35 /* for spearman - for n <= this, we compute exact probabilities. */
36 final static int n_small = 9;
37 private static final double BINSIZE = 0.005; // resolution of correlation.
38 // Differences smaller than this
39 // are considered meaningless.
40
41 /* Edgeworth coefficients for spearman p-value computation : */
42 private static final double c1 = 0.2274, c2 = 0.2531, c3 = 0.1745, c4 = 0.0758, c5 = 0.1033, c6 = 0.3932,
43 c7 = 0.0879, c8 = 0.0151, c9 = 0.0072, c10 = 0.0831, c11 = 0.0131, c12 = 4.6e-4;
44 private static DoubleMatrix2D correlationPvalLookup;
45
46 private static final int MAXCOUNT = 1000; // maximum number of things.
47
48 private static final double PVALCHOP = 8.0; // value by which log(pvalues)
49 // are scaled before storing as
50 // bytes. Values less than
51 // 10^e-256/PVALCHOP are
52 // 'clipped'.
53
54 private static DoubleMatrix2D spearmanPvalLookup;
55
56 static {
57 int numbins = ( int ) Math.ceil( 1.0 / BINSIZE );
58 correlationPvalLookup = new SparseDoubleMatrix2D( numbins, MAXCOUNT + 1 );
59 spearmanPvalLookup = new SparseDoubleMatrix2D( numbins, MAXCOUNT + 1 );
60 }
61
62 /**
63 * @param correlByte int
64 * @return double
65 */
66 public static double byteToCorrel( int correlByte ) {
67 return correlByte / 128.0 - 1.0;
68 }
69
70 /**
71 * @param pvalByte int
72 * @return double
73 */
74 public static double byteToPvalue( int pvalByte ) {
75 return Math.pow( 10.0, -( double ) pvalByte / PVALCHOP );
76 }
77
78 /**
79 * Statistical comparison of two Pearson correlations. Assumes data are bivariate normal. Null hypothesis is that
80 * the two correlations are equal. See Zar (Biostatistics)
81 *
82 * @param correl1 First correlation
83 * @param n1 Number of values used to compute correl1
84 * @param correl2 Second correlation
85 * @param n2 Number of values used to compute correl2
86 * @return double p value.
87 */
88 public static double compare( double correl1, int n1, double correl2, int n2 ) {
89
90 double Z;
91 double sigma;
92 double p;
93
94 sigma = Math.sqrt( 1 / ( ( double ) n1 - 3 ) + 1 / ( ( double ) n2 - 3 ) );
95
96 Z = Math.abs( correl1 - correl2 ) / sigma;
97
98 p = Probability.normal( -Z ); // upper tail.
99
100 if ( p > 0.5 ) {
101 return 1.0 - p;
102 }
103 return p;
104 }
105
106 /**
107 * Compute the Pearson correlation, missing values are permitted.
108 *
109 * @param ival
110 * @param jval
111 * @return
112 */
113 public static double correl( double[] ival, double[] jval ) {
114 /* do it the old fashioned way */
115 int numused = 0;
116 double sxy = 0.0, sxx = 0.0, syy = 0.0, sx = 0.0, sy = 0.0;
117 int length = Math.min( ival.length, jval.length );
118 for ( int k = 0; k < length; k++ ) {
119 double xj = ival[k];
120 double yj = jval[k];
121 if ( !Double.isNaN( ival[k] ) && !Double.isNaN( jval[k] ) ) {
122 sx += xj;
123 sy += yj;
124 sxy += xj * yj;
125 sxx += xj * xj;
126 syy += yj * yj;
127 numused++;
128 }
129 }
130 if ( numused < 2 ) return Double.NaN;
131 double denom = ( sxx - sx * sx / numused ) * ( syy - sy * sy / numused );
132 if ( denom <= 0 ) return Double.NaN;
133 double correl = ( sxy - sx * sy / numused ) / Math.sqrt( denom );
134 return correl;
135 }
136
137 /**
138 * @param correl double
139 * @return int
140 */
141 public static int correlAsByte( double correl ) {
142 if ( correl == -1.0 ) {
143 return 0;
144 }
145 return ( int ) ( Math.ceil( ( correl + 1.0 ) * 128 ) - 1 );
146 }
147
148 /**
149 * Find the approximate Pearson correlation required to meet a particular pvalue. If the pvalue is <=0 or >= 1,
150 * returns 1 and 0 respectively.
151 *
152 * @param pval double
153 * @param count int
154 * @return double
155 */
156 public static double correlationForPvalue( double pval, int count ) {
157
158 if ( pval >= 1.0 ) {
159 return 0.0;
160 }
161
162 if ( pval <= 0.0 ) {
163 return 1.0;
164 }
165
166 if ( count < 3 ) {
167 return 0.0; // warn?
168 }
169
170 double z = Math.abs( Probability.normalInverse( pval ) );
171
172 double v = z / Math.sqrt( count - 3.0 );
173
174 double corrguess = unFisherTransform( v );
175 assert Math.abs( corrguess ) <= 1.1 : "Invalid correlation: " + corrguess; // sanity.
176 // double goback = pvalue( corrguess, count );
177
178 // System.err.println( "est corr=" + corrguess + " with pvalue " + goback + " ==? original p=" + pval );
179
180 return Math.min( corrguess, 1.0 );
181 }
182
183 /**
184 * Compute the t-statistic associated with a Pearson correlation.
185 *
186 * @param correl Pearson correlation
187 * @param dof Degrees of freedom (n - 2)
188 * @return double
189 */
190 public static double correlationTstat( double correl, int dof ) {
191 return correl / Math.sqrt( ( 1.0 - correl * correl ) / dof );
192 }
193
194 /**
195 * Compute Pearson correlation when there are no missing values.
196 *
197 * @param ival
198 * @param jval
199 * @param meani
200 * @param meanj
201 * @param sqrti
202 * @param sqrtj
203 * @return
204 */
205 public static double correlFast( double[] ival, double[] jval, double meani, double meanj, double sqrti,
206 double sqrtj ) {
207 double sxy = 0.0;
208 for ( int k = 0, n = ival.length; k < n; k++ ) {
209 sxy += ( ival[k] - meani ) * ( jval[k] - meanj );
210 }
211 return sxy / ( sqrti * sqrtj );
212 }
213
214 /**
215 * Compute the Fisher z transform of the Pearson correlation.
216 *
217 * @param r Correlation coefficient.
218 * @return Fisher transform of the Correlation.
219 */
220 public static double fisherTransform( double r ) {
221
222 if ( Math.abs( r ) == 1.0 ) {
223 return Double.POSITIVE_INFINITY;
224 }
225
226 if ( r == 0.0 ) return 0.0;
227
228 if ( !isValidPearsonCorrelation( r ) ) {
229 throw new IllegalArgumentException( "Invalid correlation " + r );
230 }
231
232 return 0.5 * Math.log( ( 1.0 + r ) / ( 1.0 - r ) );
233 }
234
235 /**
236 * Fisher-transform a list of Pearson correlations.
237 *
238 * @param e
239 * @return
240 */
241 public static DoubleArrayList fisherTransform( DoubleArrayList e ) {
242 DoubleArrayList r = new DoubleArrayList( e.size() );
243 for ( int i = 0; i < e.size(); i++ ) {
244 r.add( CorrelationStats.fisherTransform( e.getQuick( i ) ) );
245 }
246 return r;
247 }
248
249 /**
250 * Test if a value is a reasonable Pearson correlation (in the range -1 to 1; values outside of this range are
251 * acceptable within a small roundoff.
252 *
253 * @param r
254 * @return
255 */
256 public static boolean isValidPearsonCorrelation( double r ) {
257 return r + Constants.SMALL >= -1.0 && r - Constants.SMALL <= 1.0;
258 }
259
260 /**
261 * Compute pvalue for the pearson correlation, using the t distribution method.
262 *
263 * @param correl Pearson correlation.
264 * @param count Number of items used to calculate the correlation. NOT the degrees of freedom.
265 * @return double one-side pvalue
266 */
267 public static double pvalue( double correl, int count ) {
268
269 double acorrel = Math.abs( correl );
270
271 if ( acorrel == 1.0 ) {
272 return 0.0;
273 }
274
275 if ( acorrel == 0.0 ) {
276 return 1.0;
277 }
278
279 int dof = count - 2;
280
281 if ( dof <= 0 ) {
282 return 1.0;
283 }
284
285 int bin = ( int ) Math.ceil( acorrel / BINSIZE );
286 if ( count <= MAXCOUNT && correlationPvalLookup.getQuick( bin, dof ) != 0.0 ) {
287 return correlationPvalLookup.getQuick( bin, dof );
288 }
289 double t = correlationTstat( acorrel, dof );
290 double p = Probability.studentT( dof, -t );
291
292 /*
293 * Alternate method: Fisher's transform.
294 */
295 // double rr = 0.5 * Math.log( ( 1 + correl ) / ( 1 - correl ) );
296 // double z = rr * Math.sqrt( count - 3.0 );
297 // double altp = 1.0 - Probability.normal( z );
298 // System.err.println( "P for " + correl + ", n=" + count + " =" + String.format( "%.2g", p ) + " alt: "
299 // + String.format( "%.2g", altp ) );
300
301 if ( count < MAXCOUNT ) {
302 correlationPvalLookup.setQuick( bin, dof, p );
303 }
304
305 return p;
306
307 }
308
309 /**
310 * Convert a p value into a value between 0 and 255 inclusive. This is done by taking the log, multiplying it by a
311 * fixed value (currently 8). This means that pvalues less than 10^-32 are rounded to 10^-32.
312 *
313 * @param correl double
314 * @param count int
315 * @return int
316 */
317 public static int pvalueAsByte( double correl, int count ) {
318 int p = -( int ) Math.floor( PVALCHOP * Arithmetic.log10( pvalue( correl, count ) ) );
319
320 if ( p < 0 ) {
321 return 0;
322 } else if ( p > 255 ) {
323 return 255;
324 }
325 return p;
326 }
327
328 /**
329 * @param correl Spearman's correlation
330 * @param count
331 * @return pvalue, two-tailed pvalue, computed using t distribution for large sample sizes or exact computation for
332 * small sample sizes.
333 */
334 public static double spearmanPvalue( double correl, int count ) {
335 double acorrel = Math.abs( correl );
336 int dof = count - 2;
337
338 if ( dof <= 0 ) {
339 return 1.0;
340 }
341 int bin = ( int ) Math.ceil( acorrel / BINSIZE );
342 if ( count <= MAXCOUNT && spearmanPvalLookup.getQuick( bin, dof ) != 0.0 ) {
343 return spearmanPvalLookup.getQuick( bin, dof );
344 }
345
346 double p;
347 if ( count > 1290 ) { // this is the threshold used by R (cor.test.R), to avoid overflows.
348 double t = correlationTstat( acorrel, dof );
349 p = Probability.studentT( dof, -t );
350 // studentT returns the lower tail so we use -t to effectively get the
351 // upper tail.
352 } else {
353 p = spearmanPvalueSmallSample( acorrel, count ); // returns the upper tail for rho>0.
354 }
355
356 assert p <= 0.5 : "Pvalue was " + p + " for correl=" + correl + ", count=" + count
357 + ", expected value less than 0.5";
358 p = Math.min( 1.0, 2.0 * p ); // two-tailed.
359 if ( count < MAXCOUNT ) {
360 spearmanPvalLookup.setQuick( bin, dof, p );
361 }
362 return p;
363 }
364
365 /**
366 * Reverse the Fisher z-transform of Pearson correlations
367 *
368 * @param z
369 * @return r
370 */
371 public static double unFisherTransform( double z ) {
372 if ( Math.abs( z ) < Constants.SMALLISH ) {
373 return 0.0;
374 }
375
376 if ( Math.abs( z ) > 20.0 ) { // ridiculously large
377 return 1.0;
378 }
379
380 return ( Math.exp( 2.0 * z ) - 1.0 ) / ( Math.exp( 2.0 * z ) + 1.0 );
381 }
382
383 /**
384 * Ported from R prho.c and cor.test.R which is in turn a port of a Fortran method (AS 89, Best and Roberts, Applied
385 * Statistics 1975 p 377-379). We compute exact probabilities for very small values (< 9) and use a special
386 * algorithm for larger values. At very large values the t-distribution can be used, this method will be slow.
387 * <p>
388 * The pvalues returned are based on the assumption of no tied ranks.
389 *
390 * @param rho Spearman rank correlation
391 * @param n number of samples -- NOT degrees of freedom. If there were missing values, you should compute n as the
392 * number of complete cases.
393 * @return one-sided pvalue. This is the upper tail if rho is positive, lower tail if rho is negative (this is
394 * potentially confusing, basically we always return a value <=0.5)
395 */
396 private static double spearmanPvalueSmallSample( double rho, int n ) {
397
398 /*
399 * In R, sStat (is) is the S statistic, and gets computed in cor.test.R and passed into prho(). It's the sum
400 * squared error of the ranks (the usual spearman test stat). We backcompute it here. As in the R version we
401 * round it off, as properly S should be an integer.
402 */
403 int sStat = ( int ) Math.round( ( Math.pow( n, 3 ) - n ) * ( 1.0 - Math.abs( rho ) ) / 6.0 );
404
405 // invaluable for debugging!
406 // System.out.println( "rho=" + rho + "; N=" + n + "; S=" + sStat );
407
408 if ( n <= 1 ) {
409 throw new IllegalArgumentException();
410 }
411
412 if ( sStat <= 0.0 ) {
413 if ( rho > 0 ) {
414 return 0.0;
415 }
416 return 1.0;
417 }
418
419 double pv = 1.0;
420
421 /*
422 * Exact evaluation of probability for very small n
423 */
424 if ( n <= n_small ) {
425 int[] ar = new int[n_small];
426 int ifr;
427
428 int n3 = n;
429 n3 *= ( n3 * n3 - 1 ) / 3.0;/* = (n^3 - n)/3 */
430 if ( sStat > n3 ) { /* larger than maximal value */
431 return 0.0; // best possible pvalue...
432 }
433
434 int nfac = 1;
435 for ( int i = 1; i <= n; ++i ) {
436 nfac *= i;
437 ar[i - 1] = i;
438 }
439
440 if ( sStat == n3 ) {
441 ifr = 1;
442 } else {
443 int n1, mt;
444 ifr = 0;
445 for ( int m = 0; m < nfac; ++m ) {
446 int ise = 0;
447 for ( int i = 0; i < n; ++i ) {
448 n1 = i + 1 - ar[i];
449 ise += n1 * n1;
450 }
451 if ( sStat <= ise ) {
452 ++ifr;
453 }
454
455 n1 = n;
456 do {
457 mt = ar[0];
458 for ( int i = 1; i < n1; ++i ) {
459 ar[i - 1] = ar[i];
460 }
461 --n1;
462 ar[n1] = mt;
463 } while ( mt == n1 + 1 && n1 > 1 );
464 }
465 }
466 pv = ifr / ( double ) nfac;
467
468 } else {
469 /*
470 * Evaluation by Edgeworth series expansion. For most sample-wise genomics data sets this is what is used,
471 * as N is typically on the order of magnitude 10-100.
472 */
473
474 double b = 1.0 / n;
475
476 /*
477 * This wasteful backcomputation of rho is designed to mimic the behaviour of R's implementation
478 */
479 double x = ( 6.0 * ( sStat - 1 ) * b / ( n * n - 1 ) - 1 ) * Math.sqrt( n - 1 );
480
481 double y = x * x;
482 double u = x
483 * b
484 * ( c1 + b * ( c2 + c3 * b ) + y
485 * ( -c4 + b * ( c5 + c6 * b ) - y * b
486 * ( c7 + c8 * b - y * ( c9 - c10 * b + y * b * ( c11 - c12 * y ) ) ) ) );
487 y = u / Math.exp( y / 2.0 );
488
489 double pp = 1.0 - Probability.normal( x ); // mean 0, variance 1.
490 pv = y + pp;
491 }
492
493 if ( pv > 0.5 ) pv = 1.0 - pv; // here we enforce our possibly confusing tail rule.
494 if ( pv < 0.0 ) pv = 0.0;
495 if ( pv > 1.0 ) pv = 1.0;
496 return pv;
497
498 }
499
500 }