View Javadoc
1   /*
2    * The baseCode project
3    * 
4    * Copyright (c) 2006 University of British Columbia
5    * 
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.math;
20  
21  import cern.colt.list.DoubleArrayList;
22  import cern.colt.matrix.DoubleMatrix2D;
23  import cern.colt.matrix.impl.SparseDoubleMatrix2D;
24  import cern.jet.math.Arithmetic;
25  import cern.jet.stat.Probability;
26  
27  /**
28   * Statistical evaluation and transformation tools for correlations.
29   * 
30   * @author Paul Pavlidis
31   * 
32   */
33  public class CorrelationStats {
34  
35      /* for spearman - for n <= this, we compute exact probabilities. */
36      final static int n_small = 9;
37      private static final double BINSIZE = 0.005; // resolution of correlation.
38      // Differences smaller than this
39      // are considered meaningless.
40  
41      /* Edgeworth coefficients for spearman p-value computation : */
42      private static final double c1 = 0.2274, c2 = 0.2531, c3 = 0.1745, c4 = 0.0758, c5 = 0.1033, c6 = 0.3932,
43              c7 = 0.0879, c8 = 0.0151, c9 = 0.0072, c10 = 0.0831, c11 = 0.0131, c12 = 4.6e-4;
44      private static DoubleMatrix2D correlationPvalLookup;
45  
46      private static final int MAXCOUNT = 1000; // maximum number of things.
47  
48      private static final double PVALCHOP = 8.0; // value by which log(pvalues)
49      // are scaled before storing as
50      // bytes. Values less than
51      // 10^e-256/PVALCHOP are
52      // 'clipped'.
53  
54      private static DoubleMatrix2D spearmanPvalLookup;
55  
56      static {
57          int numbins = ( int ) Math.ceil( 1.0 / BINSIZE );
58          correlationPvalLookup = new SparseDoubleMatrix2D( numbins, MAXCOUNT + 1 );
59          spearmanPvalLookup = new SparseDoubleMatrix2D( numbins, MAXCOUNT + 1 );
60      }
61  
62      /**
63       * @param correlByte int
64       * @return double
65       */
66      public static double byteToCorrel( int correlByte ) {
67          return correlByte / 128.0 - 1.0;
68      }
69  
70      /**
71       * @param pvalByte int
72       * @return double
73       */
74      public static double byteToPvalue( int pvalByte ) {
75          return Math.pow( 10.0, -( double ) pvalByte / PVALCHOP );
76      }
77  
78      /**
79       * Statistical comparison of two Pearson correlations. Assumes data are bivariate normal. Null hypothesis is that
80       * the two correlations are equal. See Zar (Biostatistics)
81       * 
82       * @param correl1 First correlation
83       * @param n1 Number of values used to compute correl1
84       * @param correl2 Second correlation
85       * @param n2 Number of values used to compute correl2
86       * @return double p value.
87       */
88      public static double compare( double correl1, int n1, double correl2, int n2 ) {
89  
90          double Z;
91          double sigma;
92          double p;
93  
94          sigma = Math.sqrt( 1 / ( ( double ) n1 - 3 ) + 1 / ( ( double ) n2 - 3 ) );
95  
96          Z = Math.abs( correl1 - correl2 ) / sigma;
97  
98          p = Probability.normal( -Z ); // upper tail.
99  
100         if ( p > 0.5 ) {
101             return 1.0 - p;
102         }
103         return p;
104     }
105 
106     /**
107      * Compute the Pearson correlation, missing values are permitted.
108      * 
109      * @param ival
110      * @param jval
111      * @return
112      */
113     public static double correl( double[] ival, double[] jval ) {
114         /* do it the old fashioned way */
115         int numused = 0;
116         double sxy = 0.0, sxx = 0.0, syy = 0.0, sx = 0.0, sy = 0.0;
117         int length = Math.min( ival.length, jval.length );
118         for ( int k = 0; k < length; k++ ) {
119             double xj = ival[k];
120             double yj = jval[k];
121             if ( !Double.isNaN( ival[k] ) && !Double.isNaN( jval[k] ) ) {
122                 sx += xj;
123                 sy += yj;
124                 sxy += xj * yj;
125                 sxx += xj * xj;
126                 syy += yj * yj;
127                 numused++;
128             }
129         }
130         if ( numused < 2 ) return Double.NaN;
131         double denom = ( sxx - sx * sx / numused ) * ( syy - sy * sy / numused );
132         if ( denom <= 0 ) return Double.NaN;
133         double correl = ( sxy - sx * sy / numused ) / Math.sqrt( denom );
134         return correl;
135     }
136 
137     /**
138      * @param correl double
139      * @return int
140      */
141     public static int correlAsByte( double correl ) {
142         if ( correl == -1.0 ) {
143             return 0;
144         }
145         return ( int ) ( Math.ceil( ( correl + 1.0 ) * 128 ) - 1 );
146     }
147 
148     /**
149      * Find the approximate Pearson correlation required to meet a particular pvalue. If the pvalue is <=0 or >= 1,
150      * returns 1 and 0 respectively.
151      * 
152      * @param pval double
153      * @param count int
154      * @return double
155      */
156     public static double correlationForPvalue( double pval, int count ) {
157 
158         if ( pval >= 1.0 ) {
159             return 0.0;
160         }
161 
162         if ( pval <= 0.0 ) {
163             return 1.0;
164         }
165 
166         if ( count < 3 ) {
167             return 0.0; // warn?
168         }
169 
170         double z = Math.abs( Probability.normalInverse( pval ) );
171 
172         double v = z / Math.sqrt( count - 3.0 );
173 
174         double corrguess = unFisherTransform( v );
175         assert Math.abs( corrguess ) <= 1.1 : "Invalid correlation: " + corrguess; // sanity.
176         // double goback = pvalue( corrguess, count );
177 
178         // System.err.println( "est corr=" + corrguess + " with pvalue " + goback + " ==? original p=" + pval );
179 
180         return Math.min( corrguess, 1.0 );
181     }
182 
183     /**
184      * Compute the t-statistic associated with a Pearson correlation.
185      * 
186      * @param correl Pearson correlation
187      * @param dof Degrees of freedom (n - 2)
188      * @return double
189      */
190     public static double correlationTstat( double correl, int dof ) {
191         return correl / Math.sqrt( ( 1.0 - correl * correl ) / dof );
192     }
193 
194     /**
195      * Compute Pearson correlation when there are no missing values.
196      * 
197      * @param ival
198      * @param jval
199      * @param meani
200      * @param meanj
201      * @param sqrti
202      * @param sqrtj
203      * @return
204      */
205     public static double correlFast( double[] ival, double[] jval, double meani, double meanj, double sqrti,
206             double sqrtj ) {
207         double sxy = 0.0;
208         for ( int k = 0, n = ival.length; k < n; k++ ) {
209             sxy += ( ival[k] - meani ) * ( jval[k] - meanj );
210         }
211         return sxy / ( sqrti * sqrtj );
212     }
213 
214     /**
215      * Compute the Fisher z transform of the Pearson correlation.
216      * 
217      * @param r Correlation coefficient.
218      * @return Fisher transform of the Correlation.
219      */
220     public static double fisherTransform( double r ) {
221 
222         if ( Math.abs( r ) == 1.0 ) {
223             return Double.POSITIVE_INFINITY;
224         }
225 
226         if ( r == 0.0 ) return 0.0;
227 
228         if ( !isValidPearsonCorrelation( r ) ) {
229             throw new IllegalArgumentException( "Invalid correlation " + r );
230         }
231 
232         return 0.5 * Math.log( ( 1.0 + r ) / ( 1.0 - r ) );
233     }
234 
235     /**
236      * Fisher-transform a list of Pearson correlations.
237      * 
238      * @param e
239      * @return
240      */
241     public static DoubleArrayList fisherTransform( DoubleArrayList e ) {
242         DoubleArrayList r = new DoubleArrayList( e.size() );
243         for ( int i = 0; i < e.size(); i++ ) {
244             r.add( CorrelationStats.fisherTransform( e.getQuick( i ) ) );
245         }
246         return r;
247     }
248 
249     /**
250      * Test if a value is a reasonable Pearson correlation (in the range -1 to 1; values outside of this range are
251      * acceptable within a small roundoff.
252      * 
253      * @param r
254      * @return
255      */
256     public static boolean isValidPearsonCorrelation( double r ) {
257         return r + Constants.SMALL >= -1.0 && r - Constants.SMALL <= 1.0;
258     }
259 
260     /**
261      * Compute pvalue for the pearson correlation, using the t distribution method.
262      * 
263      * @param correl Pearson correlation.
264      * @param count Number of items used to calculate the correlation. NOT the degrees of freedom.
265      * @return double one-side pvalue
266      */
267     public static double pvalue( double correl, int count ) {
268 
269         double acorrel = Math.abs( correl );
270 
271         if ( acorrel == 1.0 ) {
272             return 0.0;
273         }
274 
275         if ( acorrel == 0.0 ) {
276             return 1.0;
277         }
278 
279         int dof = count - 2;
280 
281         if ( dof <= 0 ) {
282             return 1.0;
283         }
284 
285         int bin = ( int ) Math.ceil( acorrel / BINSIZE );
286         if ( count <= MAXCOUNT && correlationPvalLookup.getQuick( bin, dof ) != 0.0 ) {
287             return correlationPvalLookup.getQuick( bin, dof );
288         }
289         double t = correlationTstat( acorrel, dof );
290         double p = Probability.studentT( dof, -t );
291 
292         /*
293          * Alternate method: Fisher's transform.
294          */
295         // double rr = 0.5 * Math.log( ( 1 + correl ) / ( 1 - correl ) );
296         // double z = rr * Math.sqrt( count - 3.0 );
297         // double altp = 1.0 - Probability.normal( z );
298         // System.err.println( "P for " + correl + ", n=" + count + " =" + String.format( "%.2g", p ) + " alt: "
299         // + String.format( "%.2g", altp ) );
300 
301         if ( count < MAXCOUNT ) {
302             correlationPvalLookup.setQuick( bin, dof, p );
303         }
304 
305         return p;
306 
307     }
308 
309     /**
310      * Convert a p value into a value between 0 and 255 inclusive. This is done by taking the log, multiplying it by a
311      * fixed value (currently 8). This means that pvalues less than 10^-32 are rounded to 10^-32.
312      * 
313      * @param correl double
314      * @param count int
315      * @return int
316      */
317     public static int pvalueAsByte( double correl, int count ) {
318         int p = -( int ) Math.floor( PVALCHOP * Arithmetic.log10( pvalue( correl, count ) ) );
319 
320         if ( p < 0 ) {
321             return 0;
322         } else if ( p > 255 ) {
323             return 255;
324         }
325         return p;
326     }
327 
328     /**
329      * @param correl Spearman's correlation
330      * @param count
331      * @return pvalue, two-tailed pvalue, computed using t distribution for large sample sizes or exact computation for
332      *         small sample sizes.
333      */
334     public static double spearmanPvalue( double correl, int count ) {
335         double acorrel = Math.abs( correl );
336         int dof = count - 2;
337 
338         if ( dof <= 0 ) {
339             return 1.0;
340         }
341         int bin = ( int ) Math.ceil( acorrel / BINSIZE );
342         if ( count <= MAXCOUNT && spearmanPvalLookup.getQuick( bin, dof ) != 0.0 ) {
343             return spearmanPvalLookup.getQuick( bin, dof );
344         }
345 
346         double p;
347         if ( count > 1290 ) { // this is the threshold used by R (cor.test.R), to avoid overflows.
348             double t = correlationTstat( acorrel, dof );
349             p = Probability.studentT( dof, -t );
350             // studentT returns the lower tail so we use -t to effectively get the
351             // upper tail.
352         } else {
353             p = spearmanPvalueSmallSample( acorrel, count ); // returns the upper tail for rho>0.
354         }
355 
356         assert p <= 0.5 : "Pvalue was " + p + " for correl=" + correl + ", count=" + count
357                 + ", expected value less than 0.5";
358         p = Math.min( 1.0, 2.0 * p ); // two-tailed.
359         if ( count < MAXCOUNT ) {
360             spearmanPvalLookup.setQuick( bin, dof, p );
361         }
362         return p;
363     }
364 
365     /**
366      * Reverse the Fisher z-transform of Pearson correlations
367      * 
368      * @param z
369      * @return r
370      */
371     public static double unFisherTransform( double z ) {
372         if ( Math.abs( z ) < Constants.SMALLISH ) {
373             return 0.0;
374         }
375 
376         if ( Math.abs( z ) > 20.0 ) { // ridiculously large
377             return 1.0;
378         }
379 
380         return ( Math.exp( 2.0 * z ) - 1.0 ) / ( Math.exp( 2.0 * z ) + 1.0 );
381     }
382 
383     /**
384      * Ported from R prho.c and cor.test.R which is in turn a port of a Fortran method (AS 89, Best and Roberts, Applied
385      * Statistics 1975 p 377-379). We compute exact probabilities for very small values (< 9) and use a special
386      * algorithm for larger values. At very large values the t-distribution can be used, this method will be slow.
387      * <p>
388      * The pvalues returned are based on the assumption of no tied ranks.
389      * 
390      * @param rho Spearman rank correlation
391      * @param n number of samples -- NOT degrees of freedom. If there were missing values, you should compute n as the
392      *        number of complete cases.
393      * @return one-sided pvalue. This is the upper tail if rho is positive, lower tail if rho is negative (this is
394      *         potentially confusing, basically we always return a value <=0.5)
395      */
396     private static double spearmanPvalueSmallSample( double rho, int n ) {
397 
398         /*
399          * In R, sStat (is) is the S statistic, and gets computed in cor.test.R and passed into prho(). It's the sum
400          * squared error of the ranks (the usual spearman test stat). We backcompute it here. As in the R version we
401          * round it off, as properly S should be an integer.
402          */
403         int sStat = ( int ) Math.round( ( Math.pow( n, 3 ) - n ) * ( 1.0 - Math.abs( rho ) ) / 6.0 );
404 
405         // invaluable for debugging!
406         // System.out.println( "rho=" + rho + "; N=" + n + "; S=" + sStat );
407 
408         if ( n <= 1 ) {
409             throw new IllegalArgumentException();
410         }
411 
412         if ( sStat <= 0.0 ) {
413             if ( rho > 0 ) {
414                 return 0.0;
415             }
416             return 1.0;
417         }
418 
419         double pv = 1.0;
420 
421         /*
422          * Exact evaluation of probability for very small n
423          */
424         if ( n <= n_small ) {
425             int[] ar = new int[n_small];
426             int ifr;
427 
428             int n3 = n;
429             n3 *= ( n3 * n3 - 1 ) / 3.0;/* = (n^3 - n)/3 */
430             if ( sStat > n3 ) { /* larger than maximal value */
431                 return 0.0; // best possible pvalue...
432             }
433 
434             int nfac = 1;
435             for ( int i = 1; i <= n; ++i ) {
436                 nfac *= i;
437                 ar[i - 1] = i;
438             }
439 
440             if ( sStat == n3 ) {
441                 ifr = 1;
442             } else {
443                 int n1, mt;
444                 ifr = 0;
445                 for ( int m = 0; m < nfac; ++m ) {
446                     int ise = 0;
447                     for ( int i = 0; i < n; ++i ) {
448                         n1 = i + 1 - ar[i];
449                         ise += n1 * n1;
450                     }
451                     if ( sStat <= ise ) {
452                         ++ifr;
453                     }
454 
455                     n1 = n;
456                     do {
457                         mt = ar[0];
458                         for ( int i = 1; i < n1; ++i ) {
459                             ar[i - 1] = ar[i];
460                         }
461                         --n1;
462                         ar[n1] = mt;
463                     } while ( mt == n1 + 1 && n1 > 1 );
464                 }
465             }
466             pv = ifr / ( double ) nfac;
467 
468         } else {
469             /*
470              * Evaluation by Edgeworth series expansion. For most sample-wise genomics data sets this is what is used,
471              * as N is typically on the order of magnitude 10-100.
472              */
473 
474             double b = 1.0 / n;
475 
476             /*
477              * This wasteful backcomputation of rho is designed to mimic the behaviour of R's implementation
478              */
479             double x = ( 6.0 * ( sStat - 1 ) * b / ( n * n - 1 ) - 1 ) * Math.sqrt( n - 1 );
480 
481             double y = x * x;
482             double u = x
483                     * b
484                     * ( c1 + b * ( c2 + c3 * b ) + y
485                             * ( -c4 + b * ( c5 + c6 * b ) - y * b
486                                     * ( c7 + c8 * b - y * ( c9 - c10 * b + y * b * ( c11 - c12 * y ) ) ) ) );
487             y = u / Math.exp( y / 2.0 );
488 
489             double pp = 1.0 - Probability.normal( x ); // mean 0, variance 1.
490             pv = y + pp;
491         }
492 
493         if ( pv > 0.5 ) pv = 1.0 - pv; // here we enforce our possibly confusing tail rule.
494         if ( pv < 0.0 ) pv = 0.0;
495         if ( pv > 1.0 ) pv = 1.0;
496         return pv;
497 
498     }
499 
500 }