View Javadoc
1   /*
2    * The baseCode project
3    * 
4    * Copyright (c) 2006 University of British Columbia
5    * 
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.io.reader;
20  
21  import java.io.BufferedReader;
22  import java.io.File;
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.io.InputStreamReader;
26  import java.text.DecimalFormat;
27  import java.text.NumberFormat;
28  import java.text.ParseException;
29  import java.util.*;
30  
31  import org.apache.commons.lang3.StringUtils;
32  
33  import ubic.basecode.dataStructure.matrix.DoubleMatrix;
34  import ubic.basecode.dataStructure.matrix.DoubleMatrixFactory;
35  import ubic.basecode.util.FileTools;
36  import cern.colt.list.DoubleArrayList;
37  
38  /**
39   * Reader for {@link ubic.basecode.dataStructure.matrix.DoubleMatrix}. Lines beginning with "#" or "!" will be ignored.
40   * 
41   * @author Paul Pavlidis
42   * 
43   */
44  public class DoubleMatrixReader extends AbstractMatrixReader<DoubleMatrix<String, String>, Double> {
45  
46      private static NumberFormat nf = NumberFormat.getInstance( Locale.ENGLISH );
47      static {
48          if ( nf instanceof DecimalFormat ) {
49              // ( ( DecimalFormat ) nf ).setDecimalSeparatorAlwaysShown( true );
50          }
51      }
52      private List<String> colNames;
53      private int numHeadings;
54  
55      /**
56       * @param stream InputStream stream to read from
57       * @return NamedMatrix object constructed from the data file
58       * @throws IOException
59       */
60      @Override
61      public DoubleMatrix<String, String> read( InputStream stream ) throws IOException {
62          return read( stream, null, 0 );
63      }
64  
65      /**
66       * @param stream InputStream
67       * @param wantedRowNames Set
68       * @return <code>read( stream, wantedRowNames, createEmptyRows )</code> with <code>createEmptyRows</code> set to
69       *         true.
70       * @throws IOException
71       */
72      public DoubleMatrix<String, String> read( InputStream stream, Collection<String> wantedRowNames )
73              throws IOException {
74          return read( stream, wantedRowNames, true, 0, -1 );
75      }
76  
77      /**
78       * @param stream InputStream
79       * @param wantedRowNames Set
80       * @param createEmptyRows if a row contained in <code>wantedRowNames</code> is not found in the file, create an
81       *        empty row filled with Double.NaN iff this param is true.
82       * @param maxRows
83       * @return matrix
84       * @throws IOException
85       */
86      @SuppressWarnings("resource")
87      public DoubleMatrix<String, String> read( InputStream stream, Collection<String> wantedRowNames,
88              boolean createEmptyRows, int skipColumns, int maxRows ) throws IOException {
89  
90          BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
91  
92          List<DoubleArrayList> MTemp = new Vector<DoubleArrayList>();
93  
94          List<String> rowNames = new Vector<String>();
95          String row;
96  
97          //
98          // We need to keep track of which row names we actually found in the file
99          // because will want to add empty rows for each row name we didn't find
100         // (if createEmptyRows == true).
101         //
102         Collection<String> wantedRowsFound = new HashSet<String>();
103 
104         colNames = readHeader( dis, skipColumns );
105 
106         numHeadings = colNames.size();
107 
108         int rowNumber = 0;
109 
110         while ( ( row = dis.readLine() ) != null ) {
111 
112             if ( StringUtils.isBlank( row ) ) {
113                 continue;
114             }
115 
116             String rowName = parseRow( row, rowNames, MTemp, wantedRowNames, skipColumns );
117 
118             if ( rowName == null ) {
119                 // signals a blank or skipped row.
120                 continue;
121             }
122 
123             if ( wantedRowNames != null ) {
124 
125                 // if we already have all the rows we want, then bail out
126                 if ( wantedRowsFound.size() >= wantedRowNames.size() ) {
127                     assert wantedRowsFound.containsAll( wantedRowNames );
128                     log.info( "Found all rows needed" );
129                     return createMatrix( MTemp, rowNames, colNames );
130                 }
131 
132                 if ( wantedRowNames.contains( rowName ) ) {
133                     wantedRowsFound.add( rowName );
134                 }
135             }
136 
137             if ( maxRows > 0 && ++rowNumber == maxRows ) break;
138 
139         }
140         stream.close();
141 
142         //
143         // Add empty rows for each row name we didn't find in the file
144         //
145         if ( wantedRowNames != null && wantedRowNames.size() != wantedRowsFound.size() && createEmptyRows ) {
146             Iterator<String> iterator = wantedRowNames.iterator();
147             while ( iterator.hasNext() ) {
148                 String s = iterator.next();
149                 if ( !wantedRowsFound.contains( s ) ) {
150                     if ( log.isDebugEnabled() ) log.debug( s + " was not found, adding empty row" );
151                     DoubleArrayList emptyRow = createEmptyRow( numHeadings );
152                     rowNames.add( s );
153                     MTemp.add( emptyRow );
154                 }
155             }
156         }
157         assert rowNames.size() == MTemp.size();
158         return createMatrix( MTemp, rowNames, colNames );
159 
160     }
161 
162     /**
163      * @param stream
164      * @param wantedRowNames
165      * @param numberOfColumnsToSkip
166      * @return
167      * @throws IOException
168      */
169     public DoubleMatrix<String, String> read( InputStream stream, Collection<String> wantedRowNames,
170             int numberOfColumnsToSkip ) throws IOException {
171         return read( stream, wantedRowNames, true, numberOfColumnsToSkip, -1 );
172     }
173 
174     /**
175      * @param filename data file to read from (can be compressed)
176      * @return NamedMatrix object constructed from the data file
177      * @throws IOException
178      */
179     @Override
180     public DoubleMatrix<String, String> read( String filename ) throws IOException {
181         return read( filename, null, -1 );
182     }
183 
184     /**
185      * Read a matrix from a file, subject to filtering criteria.
186      * 
187      * @param filename data file to read from (can be compressed)
188      * @param wantedRowNames contains names of rows we want to get
189      * @return NamedMatrix object constructed from the data file
190      * @throws IOException
191      */
192     @SuppressWarnings("resource")
193     public DoubleMatrix<String, String> read( String filename, Collection<String> wantedRowNames ) throws IOException {
194         File infile = new File( filename );
195         if ( !infile.exists() || !infile.canRead() ) {
196             throw new IOException( "Could not read from file " + filename );
197         }
198         InputStream stream = FileTools.getInputStreamFromPlainOrCompressedFile( filename );
199         return read( stream, wantedRowNames, -1 );
200     } // end read
201 
202     /**
203      * @param fileName
204      * @param wantedRowNames if null, takes all rows
205      * @param numberOfColumnsToSkip how many columns to skip -- not counting the first column. So if you set this to 4,
206      *        the first four data columns will be skipped. If you set it to zero, only the first column will be skipped.
207      * @return
208      * @throws IOException
209      */
210     @SuppressWarnings("resource")
211     public DoubleMatrix<String, String> read( String fileName, Collection<String> wantedRowNames,
212             int numberOfColumnsToSkip ) throws IOException {
213         File infile = new File( fileName );
214         if ( !infile.exists() || !infile.canRead() ) {
215             throw new IOException( "Could not read from file " + fileName );
216         }
217         InputStream stream = FileTools.getInputStreamFromPlainOrCompressedFile( fileName );
218         return read( stream, wantedRowNames, true, numberOfColumnsToSkip, -1 );
219     }
220 
221     @Override
222     public DoubleMatrix<String, String> read( String filename, int maxRows ) throws IOException {
223         return read( filename, null, maxRows );
224     }
225 
226     protected DoubleArrayList createEmptyRow( int numColumns ) {
227 
228         DoubleArrayList row = new DoubleArrayList();
229         for ( int i = 0; i < numColumns; i++ ) {
230             row.add( Double.NaN );
231         }
232         return row;
233     }
234 
235     // -----------------------------------------------------------------
236     // protected methods
237     // -----------------------------------------------------------------
238 
239     protected DoubleMatrix<String, String> createMatrix( List<DoubleArrayList> MTemp, List<String> rowNames,
240             List<String> colNames1 ) {
241 
242         if ( MTemp.isEmpty() ) {
243             throw new IllegalArgumentException( "Must provide vectors" );
244         }
245         DoubleMatrix<String, String> matrix = DoubleMatrixFactory.fastrow( MTemp.size(), MTemp.get( 0 ).size() );
246 
247         for ( int i = 0; i < matrix.rows(); i++ ) {
248             for ( int j = 0; j < matrix.columns(); j++ ) {
249                 if ( MTemp.get( i ).size() < j + 1 ) {
250                     matrix.set( i, j, Double.NaN );
251                     // this allows the input file to have ragged ends.
252                     // todo I'm not sure allowing ragged inputs is a good idea -PP
253                 } else {
254                     matrix.set( i, j, MTemp.get( i ).elements()[j] );
255                 }
256             }
257         }
258 
259         assert matrix.rows() == MTemp.size();
260         assert matrix.rows() == rowNames.size();
261         assert matrix.columns() == colNames.size() : "Got " + matrix.columns() + " != " + colNames.size();
262 
263         matrix.setRowNames( rowNames );
264         matrix.setColumnNames( colNames1 );
265         return matrix;
266 
267     } // end createMatrix
268 
269     /**
270      * @param row
271      * @param rowNames
272      * @param MTemp
273      * @param wantedRowNames
274      * @param skipColumns the number of columns after the first to ignore (for example, Gemma output that includes gene
275      *        information as well as numeric data)
276      * @return
277      * @throws IOException
278      */
279     private String parseRow( String row, Collection<String> rowNames, List<DoubleArrayList> MTemp,
280             Collection<String> wantedRowNames, int skipColumns ) throws IOException {
281 
282         if ( row.startsWith( "#" ) || row.startsWith( "!" ) ) {
283             return null;
284         }
285 
286         String[] tokens = StringUtils.splitPreserveAllTokens( row, "\t" );
287 
288         DoubleArrayList rowTemp = new DoubleArrayList();
289         int columnNumber = 0;
290         String previousToken = "";
291         String currentRowName = null;
292         for ( int i = 0; i < tokens.length; i++ ) {
293             String tok = tokens[i];
294             boolean missing = false;
295 
296             if ( tok.compareTo( "\t" ) == 0 ) {
297                 /* two tabs in a row */
298                 if ( previousToken.compareTo( "\t" ) == 0 ) {
299                     missing = true;
300                 } else if ( i == tokens.length - 1 ) { // at end of line.
301                     missing = true;
302                 } else {
303                     previousToken = tok;
304                     continue;
305                 }
306             } else if ( StringUtils.isBlank( tok ) || tok.compareTo( "NaN" ) == 0 || tok.compareTo( "NA" ) == 0 ) {
307                 missing = true;
308             }
309 
310             if ( columnNumber > 0 ) {
311 
312                 if ( skipColumns > 0 && columnNumber <= skipColumns ) {
313                     // skip.
314                 } else if ( missing ) {
315                     rowTemp.add( Double.NaN );
316                 } else {
317                     try {
318                         /*
319                          * NumberFormat.parse thinks things like 9101001_at are okay. Try to catch such cases. Note that
320                          * we can't use Double.parseDouble because that doesn't seem to handle locale-specific number
321                          * formats like european decimals (0,001 etc.)
322                          */
323                         // if ( tok.matches( ".*[a-zA-Z_=].*" ) ) {
324                         // throw new NumberFormatException( "Unexpected non-numeric value found in column "
325                         // + columnNumber + ": " + tok );
326                         // }
327                         rowTemp.add( nf.parse( tok.toUpperCase() ).doubleValue() );
328                     } catch ( ParseException e ) {
329                         throw new RuntimeException( e );
330                     }
331                 }
332             } else {
333                 // First field is the row label.
334 
335                 if ( missing ) {
336                     throw new IOException( "Missing values not allowed for row labels ("
337                             + StringUtils.abbreviate( row, 20 ) + ")" );
338                 }
339 
340                 currentRowName = tok;
341 
342                 // Skip rows. Return the row name anyway.
343                 if ( wantedRowNames != null && !wantedRowNames.contains( currentRowName ) ) {
344                     return currentRowName;
345                 }
346 
347                 rowNames.add( currentRowName );
348             }
349 
350             columnNumber++;
351             previousToken = tok;
352         } // end while (st.hasMoreTokens())
353           // done parsing one row -- no more tokens
354 
355         if ( rowTemp.size() > numHeadings ) {
356             throw new IOException( "Too many values (" + rowTemp.size() + ") in row  (based on headings count of "
357                     + numHeadings + ")" );
358         }
359 
360         MTemp.add( rowTemp );
361         return currentRowName;
362 
363     }
364 
365 } // end class DoubleMatrixReader