View Javadoc
1   /*
2    * The baseCode project
3    *
4    * Copyright (c) 2006 University of British Columbia
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.util;
20  
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
33  
34  /**
35   * @author pavlidis
36   */
37  public class StringUtil {
38  
39      private static final Logger log = LoggerFactory.getLogger( StringUtil.class );
40  
41      /**
42       * @param appendee  The string to be added to
43       * @param appendant The string to add to the end of the appendee
44       * @param separator The string to put between the joined strings, if necessary.
45       * @return appendee + separator + separator unless appendee is empty, in which case the appendant is returned.
46       */
47      public static String append( String appendee, String appendant, String separator ) {
48          if ( StringUtils.isBlank( appendee ) ) {
49              return appendant;
50          }
51          return appendee + separator + appendant;
52  
53      }
54  
55      /**
56       * Given a set of strings, identify any prefix they have in common.
57       *
58       * @param strings
59       * @return the common prefix, null if there isn't one.
60       */
61      public static String commonPrefix( Collection<String> strings ) {
62          // find the shortest string; this is the maximum length of the prefix. It is itself the prefix to look for.
63          String shortest = shortestString( strings );
64  
65          if ( shortest == null || shortest.length() == 0 ) return null;
66  
67          String test = shortest;
68          while ( test.length() > 0 ) {
69              boolean found = true;
70              for ( String string : strings ) {
71                  if ( !string.startsWith( test ) ) {
72                      found = false;
73                      break;
74                  }
75              }
76              if ( found ) return test;
77              test = test.substring( 0, test.length() - 1 );
78          }
79          return null;
80      }
81  
82      /**
83       * Given a set of strings, identify any suffix they have in common.
84       *
85       * @param strings
86       * @return the commons suffix, null if there isn't one.
87       */
88      public static String commonSuffix( Collection<String> strings ) {
89          String shortest = shortestString( strings );
90  
91          if ( shortest == null || shortest.length() == 0 ) return null;
92  
93          String test = shortest;
94          while ( test.length() > 0 ) {
95              boolean found = true;
96              for ( String string : strings ) {
97                  if ( !string.endsWith( test ) ) {
98                      found = false;
99                      break;
100                 }
101             }
102             if ( found ) return test;
103             test = test.substring( 1 );
104         }
105         return null;
106     }
107 
108     /**
109      * Checks a string to find "strange" character, used by phenocarta to check evidence description
110      *
111      * @param the string to check
112      * @return return false if something strange was found
113      * @author Nicolas?
114      */
115     public static boolean containsValidCharacter( String s ) {
116 
117         if ( s != null ) {
118 
119             for ( int i = 0; i < s.length(); i++ ) {
120 
121                 Character cha = s.charAt( i );
122 
123                 if ( !( isLatinLetter( cha ) || Character.isDigit( cha ) || cha == '=' || cha == ',' || cha == '('
124                     || cha == ')' || cha == '\'' || Character.isWhitespace( cha ) || cha == '/' || cha == '?'
125                     || cha == '+' || cha == ':' || cha == '-' || cha == '<' || cha == '>' || cha == '"'
126                     || cha == '%' || cha == '.' || cha == '*' || cha == '[' || cha == ']' || cha == ';'
127                     || cha == '_' || cha == '\\' || cha == '|' || cha == '&' || cha == '^' || cha == '#'
128                     || cha == '{' || cha == '}' || cha == '!' || cha == '~' || cha == '@' || cha == '—'
129                     || cha == '×' || cha == '–' || cha == ' ' ) ) {
130 
131                     // new cha to be added, special Öö≤≥âμ etc... TODO and check later if found
132 
133                     log.warn( "Illegal character found: " + cha + " found on description: " + s );
134 
135                     return false;
136                 }
137             }
138         }
139         return true;
140     }
141 
142     /**
143      * @param line
144      * @return
145      */
146     public static String[] csvSplit( String line ) {
147         try ( CSVParser parser = CSVParser.parse( new StringReader( line ), CSVFormat.DEFAULT ) ) {
148             for ( CSVRecord record : parser ) {
149                 return record.values();
150             }
151             throw new IllegalArgumentException( "No CSV records found in line." );
152         } catch ( IOException e ) {
153             throw new RuntimeException( e );
154         }
155     }
156 
157     /**
158      * Made by Nicolas
159      *
160      * @param a line in a file cvs format
161      * @return the same line but in tsv format
162      */
163     public static String cvs2tsv( String line ) {
164 
165         StringBuffer newLine = new StringBuffer( line );
166 
167         boolean change = true;
168 
169         for ( int position = 0; position < newLine.length(); position++ ) {
170 
171             if ( newLine.charAt( position ) == ',' && change ) {
172                 newLine.setCharAt( position, '\t' );
173             } else if ( newLine.charAt( position ) == '"' ) {
174 
175                 if ( change ) {
176                     change = false;
177                 } else {
178                     change = true;
179                 }
180             }
181         }
182         return newLine.toString().replaceAll( "\"", "" );
183     }
184 
185     public static boolean isLatinLetter( char c ) {
186         return ( c >= 'A' && c <= 'Z' ) || ( c >= 'a' && c <= 'z' );
187     }
188 
189     /**
190      * Mimics the {@code make.names} method in R (character.c) to make valid variables names; we use this for column
191      * headers in some output files.
192      * <p>
193      * This was modified in 1.1.26 to match the behavior of R more closely, if not exactly.
194      *
195      * @param s a string to be made valid for R
196      * @return modified string
197      * @author paul
198      * @deprecated use {@link #makeNames(String[], boolean)} instead
199      */
200     public static String makeValidForR( String s ) {
201         return makeNames( s );
202     }
203 
204     /**
205      * Mimics the {@code make.names} method in R when using with a vector of strings and the unique argument set to TRUE.
206      * @author poirigui
207      * @deprecated use {@link #makeNames(String[], boolean)} instead
208      */
209     @Deprecated
210     public static String[] makeValidForR( String[] strings ) {
211         return makeNames( strings, true );
212     }
213 
214     /**
215      * Mimics the {@code make.names} method in R.
216      * @param strings a list of strings to be made valid for R
217      * @param unique  if true, will ensure that the names are unique by appending a number to duplicates as per
218      * {@link #makeUnique(String[])}
219      * @author poirigui
220      */
221     public static String[] makeNames( String[] strings, boolean unique ) {
222         String[] result = new String[strings.length];
223         if ( unique ) {
224             Map<String, Integer> counts = new HashMap<>();
225             for ( int i = 0; i < strings.length; i++ ) {
226                 String s = strings[i];
227                 String rs = makeNames( s );
228                 if ( counts.containsKey( rs ) ) {
229                     int count = counts.get( rs );
230                     result[i] = rs + "." + count;
231                     counts.put( rs, count + 1 );
232                 } else {
233                     result[i] = rs;
234                     counts.put( rs, 1 );
235                 }
236             }
237         } else {
238             for ( int i = 0; i < strings.length; i++ ) {
239                 result[i] = makeNames( strings[i] );
240             }
241         }
242         return result;
243     }
244 
245     private static final String[] R_RESERVED_WORDS = {
246         "if", "else", "repeat", "while", "function", "for", "in", "next", "break",
247         "TRUE", "FALSE", "NULL", "Inf", "NaN", "NA", "NA_integer_", "NA_real_", "NA_character_", "NA_complex_",
248     };
249 
250     /**
251      * Mimics the {@code make.names} method in R for a single string.
252      * @author paul
253      */
254     public static String makeNames( String s ) {
255         if ( s == null ) {
256             return "NA";
257         }
258         if ( s.isEmpty()
259             // starts with a non-letter or non-dot
260             || ( !Character.isAlphabetic( s.charAt( 0 ) ) && s.charAt( 0 ) != '.' )
261             // dot followed by a digit
262             || ( s.charAt( 0 ) == '.' && s.length() > 1 && Character.isDigit( s.charAt( 1 ) ) ) ) {
263             return "X" + s.replaceAll( "[^A-Za-z0-9._]", "." );
264         }
265         if ( StringUtils.equalsAny( s, R_RESERVED_WORDS ) ) {
266             return s + ".";
267         }
268         return s.replaceAll( "[^A-Za-z0-9._]", "." );
269     }
270 
271     /**
272      * Mimics the {@code make.unique} method in R.
273      * <p>
274      * Duplicated values in the input array will be suffixed with a dot and a number, starting from 1.
275      * @author poirigui
276      */
277     public static String[] makeUnique( String[] strings ) {
278         Map<String, Integer> counts = new HashMap<>();
279         String[] result = new String[strings.length];
280         for ( int i = 0; i < strings.length; i++ ) {
281             String cn = strings[i];
282             if ( counts.containsKey( cn ) ) {
283                 int count = counts.get( cn );
284                 result[i] = cn + "." + count;
285                 counts.put( cn, count + 1 );
286             } else {
287                 result[i] = cn;
288                 counts.put( cn, 1 );
289             }
290         }
291         return result;
292 
293     }
294 
295     /**
296      * @param stringi
297      * @param stringj
298      * @return
299      */
300     public static Long twoStringHashKey( String stringi, String stringj ) {
301         // use arbitrary but consistent method for ordering.
302         if ( stringi.hashCode() < stringj.hashCode() ) {
303             return new Long( stringi.hashCode() | ( long ) stringj.hashCode() << 32 );
304         }
305         return new Long( stringj.hashCode() | ( long ) stringi.hashCode() << 32 );
306     }
307 
308     private static String shortestString( Collection<String> strings ) {
309         String shortest = null;
310         for ( String string : strings ) {
311             if ( shortest == null || string.length() < shortest.length() ) shortest = string;
312         }
313         return shortest;
314     }
315 
316 }