1 /*
2 * The baseCode project
3 *
4 * Copyright (c) 2006 University of British Columbia
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19 package ubic.basecode.math;
20
21 import cern.colt.list.DoubleArrayList;
22
23 /**
24 * Alternative distance and similarity metrics for vectors.
25 *
26 * @author Paul Pavlidis
27 *
28 */
29 public class Distance {
30
31 /**
32 * Highly optimized implementation of the Pearson correlation. The inputs must be standardized - mean zero, variance
33 * one, without any missing values.
34 *
35 * @param xe A standardized vector
36 * @param ye A standardized vector
37 * @return Pearson correlation coefficient.
38 */
39 public static double correlationOfStandardized( double[] xe, double[] ye ) {
40 double sxy = 0.0;
41 for ( int i = 0, n = xe.length; i < n; i++ ) {
42 double xj = xe[i];
43 double yj = ye[i];
44 sxy += xj * yj;
45 }
46
47 return sxy / ( xe.length - 1 );
48 }
49
50 /**
51 * Like correlationofNormedFast, but takes DoubleArrayLists as inputs, handles missing values correctly, and does
52 * more error checking. Assumes the data has been converted to z scores already.
53 *
54 * @param x A standardized vector
55 * @param y A standardized vector
56 * @return The Pearson correlation between x and y.
57 */
58 public static double correlationOfStandardized( DoubleArrayList x, DoubleArrayList y ) {
59
60 if ( x.size() != y.size() ) {
61 throw new IllegalArgumentException( "Array lengths must be the same" );
62 }
63
64 double[] xe = x.elements();
65 double[] ye = y.elements();
66 double sxy = 0.0;
67 int length = 0;
68 for ( int i = 0, n = x.size(); i < n; i++ ) {
69 double xj = xe[i];
70 double yj = ye[i];
71
72 if ( Double.isNaN( xj ) || Double.isNaN( yj ) ) {
73 continue;
74 }
75
76 sxy += xj * yj;
77 length++;
78 }
79
80 if ( length == 0 ) {
81 return -2.0; // flag of illegal value.
82 }
83 return sxy / ( length - 1 );
84 }
85
86 /**
87 * Calculate the Euclidean distance between two vectors.
88 *
89 * @param x DoubleArrayList
90 * @param y DoubleArrayList
91 * @return Euclidean distance between x and y
92 */
93 public static double euclDistance( DoubleArrayList x, DoubleArrayList y ) {
94 int j;
95 double sum;
96 sum = 0.0;
97 if ( x.size() != y.size() ) {
98 throw new ArithmeticException();
99 }
100
101 int length = x.size();
102
103 for ( j = 0; j < length; j++ ) {
104 if ( !Double.isNaN( x.elements()[j] ) && !Double.isNaN( y.elements()[j] ) ) {
105 sum += Math.pow( ( x.elements()[j] - y.elements()[j] ), 2 );
106 }
107 }
108 if ( sum == 0.0 ) {
109 return 0.0;
110 }
111 return Math.sqrt( sum );
112 }
113
114 /**
115 * Calculate the Manhattan distance between two vectors.
116 *
117 * @param x DoubleArrayList
118 * @param y DoubleArrayList
119 * @return Manhattan distance between x and y
120 */
121 public static double manhattanDistance( DoubleArrayList x, DoubleArrayList y ) {
122 int j;
123 double sum = 0.0;
124
125 if ( x.size() != y.size() ) {
126 throw new ArithmeticException();
127 }
128
129 int length = x.size();
130 for ( j = 0; j < length; j++ ) {
131 if ( !Double.isNaN( x.elements()[j] ) && !Double.isNaN( y.elements()[j] ) ) {
132 sum += Math.abs( x.elements()[j] - y.elements()[j] );
133 }
134 }
135 return sum;
136 }
137
138 /**
139 * Convenience function to compute the rank correlation when we just want to know if the values are "in order".
140 * Values in perfect ascending order are a correlation of 1, descending is -1.
141 *
142 * @param x
143 * @return
144 */
145 public static double spearmanRankCorrelation( DoubleArrayList x ) {
146 DoubleArrayList ranks = Rank.rankTransform( x );
147
148 double s = 0.0;
149 int n = ranks.size();
150 for ( int i = 0; i < n; i++ ) {
151 double r = ranks.get( i ); // 1 is the minimum.
152 double d = Math.pow( r - ( i + 1 ), 2 );
153 s += d;
154 }
155
156 return 1.0 - ( 6.0 * s ) / ( n * ( Math.pow( n, 2 ) - 1 ) );
157 }
158
159 //
160 // public static double mutualInformation(DoubleMatrix1D x, DoubleMatrix1D y) {
161 // double h = 0.1; // toto: estimate this. Tricky.
162 //
163 //
164 //
165 // }
166
167 /**
168 * Spearman Rank Correlation. This does the rank transformation of the data. Only mutually non-NaN values are used.
169 *
170 * @param x DoubleArrayList
171 * @param y DoubleArrayList
172 * @return Spearman's rank correlation between x and y or NaN if it could not be computed.
173 */
174 public static double spearmanRankCorrelation( DoubleArrayList x, DoubleArrayList y ) {
175
176 if ( x.size() != y.size() ) {
177 throw new IllegalArgumentException( "x and y must have equal sizes." );
178 }
179
180 DoubleArrayList mx = new DoubleArrayList();
181 DoubleArrayList my = new DoubleArrayList();
182
183 for ( int i = 0; i < x.size(); i++ ) {
184 if ( Double.isNaN( x.get( i ) ) || Double.isNaN( y.get( i ) ) ) {
185 continue;
186 }
187 mx.add( x.get( i ) );
188 my.add( y.get( i ) );
189 }
190
191 assert mx.size() == my.size();
192 if ( mx.size() < 2 ) return Double.NaN;
193
194 DoubleArrayList rx = Rank.rankTransform( mx );
195 DoubleArrayList ry = Rank.rankTransform( my );
196
197 return DescriptiveWithMissing.correlation( rx, ry );
198 }
199 }