1 /* 2 * The baseCode project 3 * 4 * Copyright (c) 2006 University of British Columbia 5 * 6 * Licensed under the Apache License, Version 2.0 (the "License"); 7 * you may not use this file except in compliance with the License. 8 * You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, software 13 * distributed under the License is distributed on an "AS IS" BASIS, 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 * See the License for the specific language governing permissions and 16 * limitations under the License. 17 * 18 */ 19 package ubic.basecode.math; 20 21 /** 22 * @author pavlidis 23 * 24 */ 25 public class StringDistance { 26 27 /** 28 * The edit distance counts the differences between two strings, where we would count a difference not only when 29 * strings have different characters but also when one has a character whereas the other does not. The formal 30 * definition follows. 31 * <p> 32 * For a string s, let s(i) stand for its ith character. For two characters a and b, define r(a, b) = 0 if a = b. 33 * Let r(a, b) = 1, otherwise. 34 * <p> 35 * Assume we are given two strings s and t of length n and m, respectively. We are going to fill an (n+1) x (m+1) 36 * array d with integers such that the low right corner element d(n+1, m+1) will furnish the required values of the 37 * Levenshtein distance L(s, t). 38 * <p> 39 * The definition of entries of d is recursive. First set d(i, 0) = i, i = 0, 1,..., n, and d(0, j) = j, j = 0, 1, 40 * ..., m. For other pairs i, j use (1) d(i, j) = min(d(i-1, j)+1, d(i, j-1)+1, d(i-1, j-1) + r(s(i), t(j))) 41 * <p> 42 * That last step is also described as: 43 * <li>Set cell d[i,j] of the matrix equal to the minimum of:<br> 44 * a. The cell immediately above plus 1: d[i-1,j] + 1.<br> 45 * b. The cell immediately to the left plus 1: d[i,j-1] + 1.<br> 46 * c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost.<br> 47 * <p> 48 * (Description partly cribbed from http://www.cut-the-knot.org/do_you_know/Strings.shtml and 49 * http://www.merriampark.com/ld.htm) 50 * </p> 51 * 52 * @param a 53 * @param b 54 * @return 55 */ 56 public static int editDistance( String s, String t ) { 57 int n = s.length(); 58 int m = t.length(); 59 60 if ( n == 0 ) return m; 61 if ( m == 0 ) return n; 62 63 // matrix has m+1 rows and n+1 columns. 64 int[][] mat = new int[m + 1][n + 1]; 65 66 // initialize all to zero except for first row and columns. 67 for ( int i = 0; i < mat.length; i++ ) { 68 for ( int j = 0; j < mat[i].length; j++ ) { 69 mat[i][j] = 0; 70 mat[0][j] = j; 71 } 72 mat[i][0] = i; 73 } 74 75 // recursively fill in the matrix 76 77 for ( int col = 1; col <= n; col++ ) { 78 char sc = s.charAt( col - 1 ); 79 80 for ( int row = 1; row <= m; row++ ) { 81 char tc = t.charAt( row - 1 ); 82 83 // minimum of: cell directly above + 1, the cell to the left + 1 and the cell above to the left + cost. 84 int p = mat[row - 1][col] + 1; // above 85 int q = mat[row][col - 1] + 1; // to the left 86 int r = mat[row - 1][col - 1] + ( sc == tc ? 0 : 1 ); // above left + 87 // cost 88 89 mat[row][col] = Math.min( Math.min( p, q ), r ); 90 } 91 92 // // debug 93 // for ( int k = 0; k < mat.length; k++ ) { 94 // for ( int e = 0; e < mat[k].length; e++ ) { 95 // System.err.print( mat[k][e] + " " ); 96 // } 97 // System.err.print( "\n" ); 98 // } 99 // System.err.print( "\n" ); 100 } 101 102 return mat[m][n]; 103 } 104 105 /** 106 * The Hamming distance H is defined only for strings of the same length. For two strings s and t, H(s, t) is the 107 * number of places in which the two string differ, i.e., have different characters. 108 * 109 * @param a 110 * @param b 111 * @return 112 */ 113 public static int hammingDistance( String a, String b ) { 114 if ( a.length() != b.length() ) throw new IllegalArgumentException( "Strings must be the same length" ); 115 int result = 0; 116 for ( int i = 0; i < a.length(); i++ ) { 117 result += a.charAt( i ) == b.charAt( i ) ? 0 : 1; 118 } 119 return result; 120 } 121 122 /** 123 * Compute the Hamming distance between two strings of equal length (if they are of unequal length the longer one is 124 * trimmed), giving higher weight to characters at the start of the strings, so strings that are similar at the 125 * starts are given higher scores (shorter distances) than strings that are similar at the ends. 126 * 127 * @param s 128 * @param t 129 * @param weight Controls how quickly (normalized by the length of the string) the bias towards the front drops off. 130 * The penalty for mismatches drops off linearly for the fraction of the String represented by 131 * <code>weight</code>. A weight of 1.0 indicates the bias should be linear across the length of the string. 132 * Intermediate values (e.g., 0.25) mean that differences beyond the first 1/4 of the string results in no 133 * penalty. 134 * @return 135 */ 136 public static double prefixWeightedHammingDistance( String s, String t, double weight ) { 137 138 if ( weight <= 0.0 || weight > 1.0 ) 139 throw new IllegalArgumentException( "weight must be between zero and one" ); 140 141 String trimmedS = s; 142 String trimmedT = t; 143 if ( s.length() != t.length() ) { 144 if ( s.length() > t.length() ) { 145 trimmedS = s.substring( 0, t.length() ); 146 } else { 147 trimmedT = t.substring( 0, s.length() ); 148 } 149 } 150 151 double result = 0; 152 for ( int i = 0; i < trimmedS.length(); i++ ) { 153 double penalty = Math.max( 0.0, ( 1.0 - i / ( weight * trimmedS.length() ) ) ); 154 result += trimmedT.charAt( i ) == trimmedS.charAt( i ) ? 0 : penalty; 155 } 156 return result; 157 } 158 159 /** 160 * @param s 161 * @param t 162 * @param weight 163 * @return 164 * @see prefixWeightedHammingDistance 165 */ 166 public static double suffixWeightedHammingDistance( String s, String t, double weight ) { 167 168 if ( weight <= 0.0 || weight > 1.0 ) 169 throw new IllegalArgumentException( "weight must be between zero and one" ); 170 171 String trimmedS = s; 172 String trimmedT = t; 173 if ( s.length() != t.length() ) { 174 if ( s.length() > t.length() ) { 175 trimmedS = s.substring( 0, t.length() ); 176 } else { 177 trimmedT = t.substring( 0, s.length() ); 178 } 179 } 180 181 double result = 0; 182 for ( int i = 0; i < trimmedS.length(); i++ ) { 183 double rawpen = ( double ) i / ( double ) trimmedS.length() / weight - ( 1.0 - weight ); 184 double penalty = Math.max( 0.0, rawpen ); 185 // / System.err.println( rawpen + " " + penalty ); 186 result += trimmedT.charAt( i ) == trimmedS.charAt( i ) ? 0 : penalty; 187 } 188 return result; 189 } 190 }