1 /*
2 * The baseCode project
3 *
4 * Copyright (c) 2006 University of British Columbia
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19 package ubic.basecode.math;
20
21 /**
22 * @author pavlidis
23 *
24 */
25 public class StringDistance {
26
27 /**
28 * The edit distance counts the differences between two strings, where we would count a difference not only when
29 * strings have different characters but also when one has a character whereas the other does not. The formal
30 * definition follows.
31 * <p>
32 * For a string s, let s(i) stand for its ith character. For two characters a and b, define r(a, b) = 0 if a = b.
33 * Let r(a, b) = 1, otherwise.
34 * <p>
35 * Assume we are given two strings s and t of length n and m, respectively. We are going to fill an (n+1) x (m+1)
36 * array d with integers such that the low right corner element d(n+1, m+1) will furnish the required values of the
37 * Levenshtein distance L(s, t).
38 * <p>
39 * The definition of entries of d is recursive. First set d(i, 0) = i, i = 0, 1,..., n, and d(0, j) = j, j = 0, 1,
40 * ..., m. For other pairs i, j use (1) d(i, j) = min(d(i-1, j)+1, d(i, j-1)+1, d(i-1, j-1) + r(s(i), t(j)))
41 * <p>
42 * That last step is also described as:
43 * <li>Set cell d[i,j] of the matrix equal to the minimum of:<br>
44 * a. The cell immediately above plus 1: d[i-1,j] + 1.<br>
45 * b. The cell immediately to the left plus 1: d[i,j-1] + 1.<br>
46 * c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost.<br>
47 * <p>
48 * (Description partly cribbed from http://www.cut-the-knot.org/do_you_know/Strings.shtml and
49 * http://www.merriampark.com/ld.htm)
50 * </p>
51 *
52 * @param a
53 * @param b
54 * @return
55 */
56 public static int editDistance( String s, String t ) {
57 int n = s.length();
58 int m = t.length();
59
60 if ( n == 0 ) return m;
61 if ( m == 0 ) return n;
62
63 // matrix has m+1 rows and n+1 columns.
64 int[][] mat = new int[m + 1][n + 1];
65
66 // initialize all to zero except for first row and columns.
67 for ( int i = 0; i < mat.length; i++ ) {
68 for ( int j = 0; j < mat[i].length; j++ ) {
69 mat[i][j] = 0;
70 mat[0][j] = j;
71 }
72 mat[i][0] = i;
73 }
74
75 // recursively fill in the matrix
76
77 for ( int col = 1; col <= n; col++ ) {
78 char sc = s.charAt( col - 1 );
79
80 for ( int row = 1; row <= m; row++ ) {
81 char tc = t.charAt( row - 1 );
82
83 // minimum of: cell directly above + 1, the cell to the left + 1 and the cell above to the left + cost.
84 int p = mat[row - 1][col] + 1; // above
85 int q = mat[row][col - 1] + 1; // to the left
86 int r = mat[row - 1][col - 1] + ( sc == tc ? 0 : 1 ); // above left +
87 // cost
88
89 mat[row][col] = Math.min( Math.min( p, q ), r );
90 }
91
92 // // debug
93 // for ( int k = 0; k < mat.length; k++ ) {
94 // for ( int e = 0; e < mat[k].length; e++ ) {
95 // System.err.print( mat[k][e] + " " );
96 // }
97 // System.err.print( "\n" );
98 // }
99 // System.err.print( "\n" );
100 }
101
102 return mat[m][n];
103 }
104
105 /**
106 * The Hamming distance H is defined only for strings of the same length. For two strings s and t, H(s, t) is the
107 * number of places in which the two string differ, i.e., have different characters.
108 *
109 * @param a
110 * @param b
111 * @return
112 */
113 public static int hammingDistance( String a, String b ) {
114 if ( a.length() != b.length() ) throw new IllegalArgumentException( "Strings must be the same length" );
115 int result = 0;
116 for ( int i = 0; i < a.length(); i++ ) {
117 result += a.charAt( i ) == b.charAt( i ) ? 0 : 1;
118 }
119 return result;
120 }
121
122 /**
123 * Compute the Hamming distance between two strings of equal length (if they are of unequal length the longer one is
124 * trimmed), giving higher weight to characters at the start of the strings, so strings that are similar at the
125 * starts are given higher scores (shorter distances) than strings that are similar at the ends.
126 *
127 * @param s
128 * @param t
129 * @param weight Controls how quickly (normalized by the length of the string) the bias towards the front drops off.
130 * The penalty for mismatches drops off linearly for the fraction of the String represented by
131 * <code>weight</code>. A weight of 1.0 indicates the bias should be linear across the length of the string.
132 * Intermediate values (e.g., 0.25) mean that differences beyond the first 1/4 of the string results in no
133 * penalty.
134 * @return
135 */
136 public static double prefixWeightedHammingDistance( String s, String t, double weight ) {
137
138 if ( weight <= 0.0 || weight > 1.0 )
139 throw new IllegalArgumentException( "weight must be between zero and one" );
140
141 String trimmedS = s;
142 String trimmedT = t;
143 if ( s.length() != t.length() ) {
144 if ( s.length() > t.length() ) {
145 trimmedS = s.substring( 0, t.length() );
146 } else {
147 trimmedT = t.substring( 0, s.length() );
148 }
149 }
150
151 double result = 0;
152 for ( int i = 0; i < trimmedS.length(); i++ ) {
153 double penalty = Math.max( 0.0, ( 1.0 - i / ( weight * trimmedS.length() ) ) );
154 result += trimmedT.charAt( i ) == trimmedS.charAt( i ) ? 0 : penalty;
155 }
156 return result;
157 }
158
159 /**
160 * @param s
161 * @param t
162 * @param weight
163 * @return
164 * @see prefixWeightedHammingDistance
165 */
166 public static double suffixWeightedHammingDistance( String s, String t, double weight ) {
167
168 if ( weight <= 0.0 || weight > 1.0 )
169 throw new IllegalArgumentException( "weight must be between zero and one" );
170
171 String trimmedS = s;
172 String trimmedT = t;
173 if ( s.length() != t.length() ) {
174 if ( s.length() > t.length() ) {
175 trimmedS = s.substring( 0, t.length() );
176 } else {
177 trimmedT = t.substring( 0, s.length() );
178 }
179 }
180
181 double result = 0;
182 for ( int i = 0; i < trimmedS.length(); i++ ) {
183 double rawpen = ( double ) i / ( double ) trimmedS.length() / weight - ( 1.0 - weight );
184 double penalty = Math.max( 0.0, rawpen );
185 // / System.err.println( rawpen + " " + penalty );
186 result += trimmedT.charAt( i ) == trimmedS.charAt( i ) ? 0 : penalty;
187 }
188 return result;
189 }
190 }