View Javadoc
1   /*
2    * The baseCode project
3    * 
4    * Copyright (c) 2006 University of British Columbia
5    * 
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.math;
20  
21  import cern.colt.list.DoubleArrayList;
22  
23  /**
24   * Alternative distance and similarity metrics for vectors.
25   * 
26   * @author Paul Pavlidis
27   * 
28   */
29  public class Distance {
30  
31      /**
32       * Highly optimized implementation of the Pearson correlation. The inputs must be standardized - mean zero, variance
33       * one, without any missing values.
34       * 
35       * @param xe A standardized vector
36       * @param ye A standardized vector
37       * @return Pearson correlation coefficient.
38       */
39      public static double correlationOfStandardized( double[] xe, double[] ye ) {
40          double sxy = 0.0;
41          for ( int i = 0, n = xe.length; i < n; i++ ) {
42              double xj = xe[i];
43              double yj = ye[i];
44              sxy += xj * yj;
45          }
46  
47          return sxy / ( xe.length - 1 );
48      }
49  
50      /**
51       * Like correlationofNormedFast, but takes DoubleArrayLists as inputs, handles missing values correctly, and does
52       * more error checking. Assumes the data has been converted to z scores already.
53       * 
54       * @param x A standardized vector
55       * @param y A standardized vector
56       * @return The Pearson correlation between x and y.
57       */
58      public static double correlationOfStandardized( DoubleArrayList x, DoubleArrayList y ) {
59  
60          if ( x.size() != y.size() ) {
61              throw new IllegalArgumentException( "Array lengths must be the same" );
62          }
63  
64          double[] xe = x.elements();
65          double[] ye = y.elements();
66          double sxy = 0.0;
67          int length = 0;
68          for ( int i = 0, n = x.size(); i < n; i++ ) {
69              double xj = xe[i];
70              double yj = ye[i];
71  
72              if ( Double.isNaN( xj ) || Double.isNaN( yj ) ) {
73                  continue;
74              }
75  
76              sxy += xj * yj;
77              length++;
78          }
79  
80          if ( length == 0 ) {
81              return -2.0; // flag of illegal value.
82          }
83          return sxy / ( length - 1 );
84      }
85  
86      /**
87       * Calculate the Euclidean distance between two vectors.
88       * 
89       * @param x DoubleArrayList
90       * @param y DoubleArrayList
91       * @return Euclidean distance between x and y
92       */
93      public static double euclDistance( DoubleArrayList x, DoubleArrayList y ) {
94          int j;
95          double sum;
96          sum = 0.0;
97          if ( x.size() != y.size() ) {
98              throw new ArithmeticException();
99          }
100 
101         int length = x.size();
102 
103         for ( j = 0; j < length; j++ ) {
104             if ( !Double.isNaN( x.elements()[j] ) && !Double.isNaN( y.elements()[j] ) ) {
105                 sum += Math.pow( ( x.elements()[j] - y.elements()[j] ), 2 );
106             }
107         }
108         if ( sum == 0.0 ) {
109             return 0.0;
110         }
111         return Math.sqrt( sum );
112     }
113 
114     /**
115      * Calculate the Manhattan distance between two vectors.
116      * 
117      * @param x DoubleArrayList
118      * @param y DoubleArrayList
119      * @return Manhattan distance between x and y
120      */
121     public static double manhattanDistance( DoubleArrayList x, DoubleArrayList y ) {
122         int j;
123         double sum = 0.0;
124 
125         if ( x.size() != y.size() ) {
126             throw new ArithmeticException();
127         }
128 
129         int length = x.size();
130         for ( j = 0; j < length; j++ ) {
131             if ( !Double.isNaN( x.elements()[j] ) && !Double.isNaN( y.elements()[j] ) ) {
132                 sum += Math.abs( x.elements()[j] - y.elements()[j] );
133             }
134         }
135         return sum;
136     }
137 
138     /**
139      * Convenience function to compute the rank correlation when we just want to know if the values are "in order".
140      * Values in perfect ascending order are a correlation of 1, descending is -1.
141      * 
142      * @param x
143      * @return
144      */
145     public static double spearmanRankCorrelation( DoubleArrayList x ) {
146         DoubleArrayList ranks = Rank.rankTransform( x );
147 
148         double s = 0.0;
149         int n = ranks.size();
150         for ( int i = 0; i < n; i++ ) {
151             double r = ranks.get( i ); // 1 is the minimum.
152             double d = Math.pow( r - ( i + 1 ), 2 );
153             s += d;
154         }
155 
156         return 1.0 - ( 6.0 * s ) / ( n * ( Math.pow( n, 2 ) - 1 ) );
157     }
158 
159     //
160     // public static double mutualInformation(DoubleMatrix1D x, DoubleMatrix1D y) {
161     // double h = 0.1; // toto: estimate this. Tricky.
162     //
163     //
164     //
165     // }
166 
167     /**
168      * Spearman Rank Correlation. This does the rank transformation of the data. Only mutually non-NaN values are used.
169      * 
170      * @param x DoubleArrayList
171      * @param y DoubleArrayList
172      * @return Spearman's rank correlation between x and y or NaN if it could not be computed.
173      */
174     public static double spearmanRankCorrelation( DoubleArrayList x, DoubleArrayList y ) {
175 
176         if ( x.size() != y.size() ) {
177             throw new IllegalArgumentException( "x and y must have equal sizes." );
178         }
179 
180         DoubleArrayList mx = new DoubleArrayList();
181         DoubleArrayList my = new DoubleArrayList();
182 
183         for ( int i = 0; i < x.size(); i++ ) {
184             if ( Double.isNaN( x.get( i ) ) || Double.isNaN( y.get( i ) ) ) {
185                 continue;
186             }
187             mx.add( x.get( i ) );
188             my.add( y.get( i ) );
189         }
190 
191         assert mx.size() == my.size();
192         if ( mx.size() < 2 ) return Double.NaN;
193 
194         DoubleArrayList rx = Rank.rankTransform( mx );
195         DoubleArrayList ry = Rank.rankTransform( my );
196 
197         return DescriptiveWithMissing.correlation( rx, ry );
198     }
199 }