/*
 * The baseCode project
 *
 * Copyright (c) 2006 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
19  package ubic.basecode.io.reader;
20  
21  import java.io.BufferedReader;
22  import java.io.File;
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.io.InputStreamReader;
26  import java.text.DecimalFormat;
27  import java.text.NumberFormat;
28  import java.text.ParseException;
29  import java.util.*;
30  
31  import org.apache.commons.lang3.StringUtils;
32  
33  import ubic.basecode.dataStructure.matrix.DoubleMatrix;
34  import ubic.basecode.dataStructure.matrix.DoubleMatrixFactory;
35  import ubic.basecode.util.FileTools;
36  import cern.colt.list.DoubleArrayList;
37  
38  /**
39   * Reader for {@link basecode.dataStructure.matrix.DoubleMatrix}. Lines beginning with "#" or "!" will be ignored.
40   *
41   * @author Paul Pavlidis
42   *
43   */
44  public class DoubleMatrixReader extends AbstractMatrixReader<DoubleMatrix<String, String>, Double> {
45  
46      private List<String> colNames;
47      private int numHeadings;
48  
49      /**
50       * @param stream InputStream stream to read from
51       * @return NamedMatrix object constructed from the data file
52       * @throws IOException
53       */
54      @Override
55      public DoubleMatrix<String, String> read( InputStream stream ) throws IOException {
56          return read( stream, null, 0 );
57      }
58  
59      /**
60       * @param stream InputStream
61       * @param wantedRowNames Set
62       * @return <code>read( stream, wantedRowNames, createEmptyRows )</code> with <code>createEmptyRows</code> set to
63       *         true.
64       * @throws IOException
65       */
66      public DoubleMatrix<String, String> read( InputStream stream, Collection<String> wantedRowNames )
67              throws IOException {
68          return read( stream, wantedRowNames, true, 0, -1 );
69      }
70  
71      /**
72       * @param stream InputStream
73       * @param wantedRowNames Set
74       * @param createEmptyRows if a row contained in <code>wantedRowNames</code> is not found in the file, create an
75       *        empty row filled with Double.NaN iff this param is true.
76       * @param maxRows
77       * @return matrix
78       * @throws IOException
79       */
80      @SuppressWarnings("resource")
81      public DoubleMatrix<String, String> read( InputStream stream, Collection<String> wantedRowNames,
82              boolean createEmptyRows, int skipColumns, int maxRows ) throws IOException {
83  
84          NumberFormat nf = NumberFormat.getInstance( Locale.ENGLISH );
85          if ( nf instanceof DecimalFormat ) {
86              // ( ( DecimalFormat ) nf ).setDecimalSeparatorAlwaysShown( true );
87          }
88  
89          BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
90  
91          List<DoubleArrayList> MTemp = new Vector<DoubleArrayList>();
92  
93          List<String> rowNames = new Vector<String>();
94          String row;
95  
96          //
97          // We need to keep track of which row names we actually found in the file
98          // because will want to add empty rows for each row name we didn't find
99          // (if createEmptyRows == true).
100         //
101         Collection<String> wantedRowsFound = new HashSet<String>();
102 
103         colNames = readHeader( dis, skipColumns );
104 
105         numHeadings = colNames.size();
106 
107         int rowNumber = 0;
108 
109         while ( ( row = dis.readLine() ) != null ) {
110 
111             if ( StringUtils.isBlank( row ) ) {
112                 continue;
113             }
114 
115             String rowName = parseRow( row, rowNames, MTemp, wantedRowNames, skipColumns, nf );
116 
117             if ( rowName == null ) {
118                 // signals a blank or skipped row.
119                 continue;
120             }
121 
122             if ( wantedRowNames != null ) {
123 
124                 // if we already have all the rows we want, then bail out
125                 if ( wantedRowsFound.size() >= wantedRowNames.size() ) {
126                     assert wantedRowsFound.containsAll( wantedRowNames );
127                     log.info( "Found all rows needed" );
128                     return createMatrix( MTemp, rowNames, colNames );
129                 }
130 
131                 if ( wantedRowNames.contains( rowName ) ) {
132                     wantedRowsFound.add( rowName );
133                 }
134             }
135 
136             if ( maxRows > 0 && ++rowNumber == maxRows ) break;
137 
138         }
139         stream.close();
140 
141         //
142         // Add empty rows for each row name we didn't find in the file
143         //
144         if ( wantedRowNames != null && wantedRowNames.size() != wantedRowsFound.size() && createEmptyRows ) {
145             Iterator<String> iterator = wantedRowNames.iterator();
146             while ( iterator.hasNext() ) {
147                 String s = iterator.next();
148                 if ( !wantedRowsFound.contains( s ) ) {
149                     if ( log.isDebugEnabled() ) log.debug( s + " was not found, adding empty row" );
150                     DoubleArrayList emptyRow = createEmptyRow( numHeadings );
151                     rowNames.add( s );
152                     MTemp.add( emptyRow );
153                 }
154             }
155         }
156         assert rowNames.size() == MTemp.size();
157         return createMatrix( MTemp, rowNames, colNames );
158 
159     }
160 
161     /**
162      * @param stream
163      * @param wantedRowNames
164      * @param numberOfColumnsToSkip
165      * @return
166      * @throws IOException
167      */
168     public DoubleMatrix<String, String> read( InputStream stream, Collection<String> wantedRowNames,
169             int numberOfColumnsToSkip ) throws IOException {
170         return read( stream, wantedRowNames, true, numberOfColumnsToSkip, -1 );
171     }
172 
173     /**
174      * @param filename data file to read from (can be compressed)
175      * @return NamedMatrix object constructed from the data file
176      * @throws IOException
177      */
178     @Override
179     public DoubleMatrix<String, String> read( String filename ) throws IOException {
180         return read( filename, null, -1 );
181     }
182 
183     /**
184      * Read a matrix from a file, subject to filtering criteria.
185      *
186      * @param filename data file to read from (can be compressed)
187      * @param wantedRowNames contains names of rows we want to get
188      * @return NamedMatrix object constructed from the data file
189      * @throws IOException
190      */
191     @SuppressWarnings("resource")
192     public DoubleMatrix<String, String> read( String filename, Collection<String> wantedRowNames ) throws IOException {
193         File infile = new File( filename );
194         if ( !infile.exists() || !infile.canRead() ) {
195             throw new IOException( "Could not read from file " + filename );
196         }
197         InputStream stream = FileTools.getInputStreamFromPlainOrCompressedFile( filename );
198         return read( stream, wantedRowNames, -1 );
199     } // end read
200 
201     /**
202      * @param fileName
203      * @param wantedRowNames if null, takes all rows
204      * @param numberOfColumnsToSkip how many columns to skip -- not counting the first column. So if you set this to 4,
205      *        the first four data columns will be skipped. If you set it to zero, only the first column will be skipped.
206      * @return
207      * @throws IOException
208      */
209     @SuppressWarnings("resource")
210     public DoubleMatrix<String, String> read( String fileName, Collection<String> wantedRowNames,
211             int numberOfColumnsToSkip ) throws IOException {
212         File infile = new File( fileName );
213         if ( !infile.exists() || !infile.canRead() ) {
214             throw new IOException( "Could not read from file " + fileName );
215         }
216         InputStream stream = FileTools.getInputStreamFromPlainOrCompressedFile( fileName );
217         return read( stream, wantedRowNames, true, numberOfColumnsToSkip, -1 );
218     }
219 
220     @Override
221     public DoubleMatrix<String, String> read( String filename, int maxRows ) throws IOException {
222         return read( filename, null, maxRows );
223     }
224 
225     protected DoubleArrayList createEmptyRow( int numColumns ) {
226 
227         DoubleArrayList row = new DoubleArrayList();
228         for ( int i = 0; i < numColumns; i++ ) {
229             row.add( Double.NaN );
230         }
231         return row;
232     }
233 
234     // -----------------------------------------------------------------
235     // protected methods
236     // -----------------------------------------------------------------
237 
238     protected DoubleMatrix<String, String> createMatrix( List<DoubleArrayList> MTemp, List<String> rowNames,
239             List<String> colNames1 ) {
240 
241         if ( MTemp.isEmpty() ) {
242             throw new IllegalArgumentException( "Must provide vectors" );
243         }
244         DoubleMatrix<String, String> matrix = DoubleMatrixFactory.fastrow( MTemp.size(), MTemp.get( 0 ).size() );
245 
246         for ( int i = 0; i < matrix.rows(); i++ ) {
247             for ( int j = 0; j < matrix.columns(); j++ ) {
248                 if ( MTemp.get( i ).size() < j + 1 ) {
249                     matrix.set( i, j, Double.NaN );
250                     // this allows the input file to have ragged ends.
251                     // todo I'm not sure allowing ragged inputs is a good idea -PP
252                 } else {
253                     matrix.set( i, j, MTemp.get( i ).elements()[j] );
254                 }
255             }
256         }
257 
258         assert matrix.rows() == MTemp.size();
259         assert matrix.rows() == rowNames.size();
260         assert matrix.columns() == colNames.size() : "Got " + matrix.columns() + " != " + colNames.size();
261 
262         matrix.setRowNames( rowNames );
263         matrix.setColumnNames( colNames1 );
264         return matrix;
265 
266     } // end createMatrix
267 
268     /**
269      * @param row
270      * @param rowNames
271      * @param MTemp
272      * @param wantedRowNames
273      * @param skipColumns the number of columns after the first to ignore (for example, Gemma output that includes gene
274      *        information as well as numeric data)
275      * @return
276      * @throws IOException
277      */
278     private String parseRow( String row, Collection<String> rowNames, List<DoubleArrayList> MTemp,
279             Collection<String> wantedRowNames, int skipColumns, NumberFormat nf ) throws IOException {
280 
281         if ( row.startsWith( "#" ) || row.startsWith( "!" ) ) {
282             return null;
283         }
284 
285         String[] tokens = StringUtils.splitPreserveAllTokens( row, "\t" );
286 
287         DoubleArrayList rowTemp = new DoubleArrayList();
288         int columnNumber = 0;
289         String previousToken = "";
290         String currentRowName = null;
291         for ( int i = 0; i < tokens.length; i++ ) {
292             String tok = tokens[i];
293             boolean missing = false;
294 
295             if ( tok.compareTo( "\t" ) == 0 ) {
296                 /* two tabs in a row */
297                 if ( previousToken.compareTo( "\t" ) == 0 ) {
298                     missing = true;
299                 } else if ( i == tokens.length - 1 ) { // at end of line.
300                     missing = true;
301                 } else {
302                     previousToken = tok;
303                     continue;
304                 }
305             } else if ( StringUtils.isBlank( tok ) || tok.compareTo( "NaN" ) == 0 || tok.compareTo( "NA" ) == 0 ) {
306                 missing = true;
307             }
308 
309             if ( columnNumber > 0 ) {
310 
311                 if ( skipColumns > 0 && columnNumber <= skipColumns ) {
312                     // skip.
313                 } else if ( missing ) {
314                     rowTemp.add( Double.NaN );
315                 } else {
316                     try {
317                         /*
318                          * NumberFormat.parse thinks things like 9101001_at are okay. Try to catch such cases. Note that
319                          * we can't use Double.parseDouble because that doesn't seem to handle locale-specific number
320                          * formats like european decimals (0,001 etc.)
321                          */
322                         // if ( tok.matches( ".*[a-zA-Z_=].*" ) ) {
323                         // throw new NumberFormatException( "Unexpected non-numeric value found in column "
324                         // + columnNumber + ": " + tok );
325                         // }
326                         rowTemp.add( nf.parse( tok.toUpperCase() ).doubleValue() );
327                     } catch ( ParseException e ) {
328                         throw new RuntimeException( e );
329                     }
330                 }
331             } else {
332                 // First field is the row label.
333 
334                 if ( missing ) {
335                     throw new IOException( "Missing values not allowed for row labels ("
336                             + StringUtils.abbreviate( row, 20 ) + ")" );
337                 }
338 
339                 currentRowName = tok;
340 
341                 // Skip rows. Return the row name anyway.
342                 if ( wantedRowNames != null && !wantedRowNames.contains( currentRowName ) ) {
343                     return currentRowName;
344                 }
345 
346                 rowNames.add( currentRowName );
347             }
348 
349             columnNumber++;
350             previousToken = tok;
351         } // end while (st.hasMoreTokens())
352           // done parsing one row -- no more tokens
353 
354         if ( rowTemp.size() > numHeadings ) {
355             throw new IOException( "Too many values (" + rowTemp.size() + ") in row  (based on headings count of "
356                     + numHeadings + ")" );
357         }
358 
359         MTemp.add( rowTemp );
360         return currentRowName;
361 
362     }
363 
364 } // end class DoubleMatrixReader