View Javadoc
1   /*
2    * The baseCode project
3    * 
4    * Copyright (c) 2006 University of British Columbia
5    * 
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.math;
20  
21  /**
22   * @author pavlidis
23   * 
24   */
25  public class StringDistance {
26  
27      /**
28       * The edit distance counts the differences between two strings, where we would count a difference not only when
29       * strings have different characters but also when one has a character whereas the other does not. The formal
30       * definition follows.
31       * <p>
32       * For a string s, let s(i) stand for its ith character. For two characters a and b, define r(a, b) = 0 if a = b.
33       * Let r(a, b) = 1, otherwise.
34       * <p>
35       * Assume we are given two strings s and t of length n and m, respectively. We are going to fill an (n+1) x (m+1)
36       * array d with integers such that the low right corner element d(n+1, m+1) will furnish the required values of the
37       * Levenshtein distance L(s, t).
38       * <p>
39       * The definition of entries of d is recursive. First set d(i, 0) = i, i = 0, 1,..., n, and d(0, j) = j, j = 0, 1,
40       * ..., m. For other pairs i, j use (1) d(i, j) = min(d(i-1, j)+1, d(i, j-1)+1, d(i-1, j-1) + r(s(i), t(j)))
41       * <p>
42       * That last step is also described as:
43       * <li>Set cell d[i,j] of the matrix equal to the minimum of:<br>
44       * a. The cell immediately above plus 1: d[i-1,j] + 1.<br>
45       * b. The cell immediately to the left plus 1: d[i,j-1] + 1.<br>
46       * c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost.<br>
47       * <p>
48       * (Description partly cribbed from http://www.cut-the-knot.org/do_you_know/Strings.shtml and
49       * http://www.merriampark.com/ld.htm)
50       * </p>
51       * 
52       * @param a
53       * @param b
54       * @return
55       */
56      public static int editDistance( String s, String t ) {
57          int n = s.length();
58          int m = t.length();
59  
60          if ( n == 0 ) return m;
61          if ( m == 0 ) return n;
62  
63          // matrix has m+1 rows and n+1 columns.
64          int[][] mat = new int[m + 1][n + 1];
65  
66          // initialize all to zero except for first row and columns.
67          for ( int i = 0; i < mat.length; i++ ) {
68              for ( int j = 0; j < mat[i].length; j++ ) {
69                  mat[i][j] = 0;
70                  mat[0][j] = j;
71              }
72              mat[i][0] = i;
73          }
74  
75          // recursively fill in the matrix
76  
77          for ( int col = 1; col <= n; col++ ) {
78              char sc = s.charAt( col - 1 );
79  
80              for ( int row = 1; row <= m; row++ ) {
81                  char tc = t.charAt( row - 1 );
82  
83                  // minimum of: cell directly above + 1, the cell to the left + 1 and the cell above to the left + cost.
84                  int p = mat[row - 1][col] + 1; // above
85                  int q = mat[row][col - 1] + 1; // to the left
86                  int r = mat[row - 1][col - 1] + ( sc == tc ? 0 : 1 ); // above left +
87                  // cost
88  
89                  mat[row][col] = Math.min( Math.min( p, q ), r );
90              }
91  
92              // // debug
93              // for ( int k = 0; k < mat.length; k++ ) {
94              // for ( int e = 0; e < mat[k].length; e++ ) {
95              // System.err.print( mat[k][e] + " " );
96              // }
97              // System.err.print( "\n" );
98              // }
99              // System.err.print( "\n" );
100         }
101 
102         return mat[m][n];
103     }
104 
105     /**
106      * The Hamming distance H is defined only for strings of the same length. For two strings s and t, H(s, t) is the
107      * number of places in which the two string differ, i.e., have different characters.
108      * 
109      * @param a
110      * @param b
111      * @return
112      */
113     public static int hammingDistance( String a, String b ) {
114         if ( a.length() != b.length() ) throw new IllegalArgumentException( "Strings must be the same length" );
115         int result = 0;
116         for ( int i = 0; i < a.length(); i++ ) {
117             result += a.charAt( i ) == b.charAt( i ) ? 0 : 1;
118         }
119         return result;
120     }
121 
122     /**
123      * Compute the Hamming distance between two strings of equal length (if they are of unequal length the longer one is
124      * trimmed), giving higher weight to characters at the start of the strings, so strings that are similar at the
125      * starts are given higher scores (shorter distances) than strings that are similar at the ends.
126      * 
127      * @param s
128      * @param t
129      * @param weight Controls how quickly (normalized by the length of the string) the bias towards the front drops off.
130      *        The penalty for mismatches drops off linearly for the fraction of the String represented by
131      *        <code>weight</code>. A weight of 1.0 indicates the bias should be linear across the length of the string.
132      *        Intermediate values (e.g., 0.25) mean that differences beyond the first 1/4 of the string results in no
133      *        penalty.
134      * @return
135      */
136     public static double prefixWeightedHammingDistance( String s, String t, double weight ) {
137 
138         if ( weight <= 0.0 || weight > 1.0 )
139             throw new IllegalArgumentException( "weight must be between zero and one" );
140 
141         String trimmedS = s;
142         String trimmedT = t;
143         if ( s.length() != t.length() ) {
144             if ( s.length() > t.length() ) {
145                 trimmedS = s.substring( 0, t.length() );
146             } else {
147                 trimmedT = t.substring( 0, s.length() );
148             }
149         }
150 
151         double result = 0;
152         for ( int i = 0; i < trimmedS.length(); i++ ) {
153             double penalty = Math.max( 0.0, ( 1.0 - i / ( weight * trimmedS.length() ) ) );
154             result += trimmedT.charAt( i ) == trimmedS.charAt( i ) ? 0 : penalty;
155         }
156         return result;
157     }
158 
159     /**
160      * @param s
161      * @param t
162      * @param weight
163      * @return
164      * @see prefixWeightedHammingDistance
165      */
166     public static double suffixWeightedHammingDistance( String s, String t, double weight ) {
167 
168         if ( weight <= 0.0 || weight > 1.0 )
169             throw new IllegalArgumentException( "weight must be between zero and one" );
170 
171         String trimmedS = s;
172         String trimmedT = t;
173         if ( s.length() != t.length() ) {
174             if ( s.length() > t.length() ) {
175                 trimmedS = s.substring( 0, t.length() );
176             } else {
177                 trimmedT = t.substring( 0, s.length() );
178             }
179         }
180 
181         double result = 0;
182         for ( int i = 0; i < trimmedS.length(); i++ ) {
183             double rawpen = ( double ) i / ( double ) trimmedS.length() / weight - ( 1.0 - weight );
184             double penalty = Math.max( 0.0, rawpen );
185             // / System.err.println( rawpen + " " + penalty );
186             result += trimmedT.charAt( i ) == trimmedS.charAt( i ) ? 0 : penalty;
187         }
188         return result;
189     }
190 }