View Javadoc
1   /*
2    * The baseCode project
3    * 
4    * Copyright (c) 2006 University of British Columbia
5    * 
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.io.reader;
20  
21  import java.io.BufferedReader;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.InputStreamReader;
25  import java.util.*;
26  
27  import ubic.basecode.dataStructure.matrix.DoubleMatrix;
28  import ubic.basecode.dataStructure.matrix.SparseDoubleMatrix;
29  
30  /**
31   * @author pavlidis
32   * 
33   */
34  public class SparseDoubleMatrixReader extends DoubleMatrixReader {
35  
36      /**
37       * Read a sparse matrix that is expressed as an adjacency list in a tab-delimited file:
38       * 
39       * <pre>
40       *  item1 item2 weight
41       *  item1 item5 weight
42       * </pre>
43       * <p>
44       * By definition the resulting matrix is square and symmetric.
45       * </p>
46       * <p>
47       * Note that the ordering of the items will be as they are encountered in the file.
48       * 
49       * @param stream InputStream
50       * @return NamedMatrix
51       * @throws IOException
52       */
53      @Override
54      public DoubleMatrix<String, String> read( InputStream stream ) throws IOException {
55  
56          Set<String> itemNames = new HashSet<String>();
57          Map<String, Collection<IndexScoreDyad>> rows = new HashMap<String, Collection<IndexScoreDyad>>();
58  
59          BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
60  
61          String row;
62          int index = 0;
63          Map<String, Integer> nameIndexMap = new HashMap<String, Integer>(); // name --> eventual row index
64          while ( ( row = dis.readLine() ) != null ) {
65              StringTokenizer st = new StringTokenizer( row, " \t", false );
66  
67              String itemA = "";
68  
69              if ( st.hasMoreTokens() ) {
70                  itemA = st.nextToken();
71  
72                  if ( !itemNames.contains( itemA ) ) {
73                      rows.put( itemA, new HashSet<IndexScoreDyad>() );
74                      itemNames.add( itemA );
75                      nameIndexMap.put( itemA, index );
76                      index++;
77                  }
78              } else {
79                  // continue;
80              }
81  
82              String itemB = "";
83              if ( st.hasMoreTokens() ) {
84                  itemB = st.nextToken();
85                  if ( !itemNames.contains( itemB ) ) {
86                      rows.put( itemB, new HashSet<IndexScoreDyad>() );
87                      itemNames.add( itemB );
88                      nameIndexMap.put( itemB, index );
89                      index++;
90                  }
91              } else {
92                  // continue;
93              }
94  
95              double weight;
96              if ( st.hasMoreTokens() ) {
97                  weight = Double.parseDouble( st.nextToken() );
98              } else {
99                  weight = 1.0; // just make it a binary matrix.
100             }
101 
102             rows.get( itemA ).add( new IndexScoreDyad( nameIndexMap.get( itemB ).intValue(), weight ) );
103             rows.get( itemB ).add( new IndexScoreDyad( nameIndexMap.get( itemA ).intValue(), weight ) );
104         }
105 
106         SparseDoubleMatrix<String, String> matrix = new SparseDoubleMatrix<String, String>( itemNames.size(),
107                 itemNames.size() );
108 
109         List<String> itemVec = new Vector<String>( itemNames );
110         Collections.sort( itemVec );
111 
112         matrix.setColumnNames( itemVec );
113         matrix.setRowNames( itemVec );
114         for ( Object element2 : itemNames ) {
115             String itemA = ( String ) element2;
116             int rowIndex = matrix.getRowIndexByName( itemA );
117             Collection<IndexScoreDyad> arow = rows.get( itemA );
118             for ( Iterator<IndexScoreDyad> iterator = arow.iterator(); iterator.hasNext(); ) {
119                 IndexScoreDyad element = iterator.next();
120                 int ind = element.getKey();
121                 double weight = element.getValue();
122 
123                 matrix.set( rowIndex, ind, weight );
124                 matrix.set( ind, rowIndex, weight );
125             }
126 
127         }
128 
129         dis.close();
130         return matrix;
131     }
132 
133     @Override
134     public DoubleMatrix<String, String> read( InputStream stream, Collection<String> wantedRowNames ) {
135         throw new UnsupportedOperationException();
136     }
137 
138     @Override
139     public DoubleMatrix<String, String> read( InputStream stream, Collection<String> wantedRowNames,
140             boolean createEmptyRows, int skipColumns, int maxRows ) {
141         throw new UnsupportedOperationException();
142     }
143 
144     /**
145      * Read a sparse matrix in "JW" (Jason Weston) format. The format is like this:
146      * 
147      * <pre>
148      * 2          &lt;--- number of items - the first line of the file only. NOTE - this line is often blank or not present.
149      * 1 2        &lt;--- items 1 has 2 edges
150      * 1 2        &lt;--- edge indices are to items 1 &amp; 2
151      * 0.1 100    &lt;--- with the following weights
152      * 2 2        &lt;--- items 2 also has 2 edges
153      * 1 2        &lt;--- edge indices are also to items 1 &amp; 2 (fully connected)
154      * 100 0.1    &lt;--- with the following weights
155      * </pre>
156      * <p>
157      * Note that the item numbering starts at 1. This is a requirement.
158      * <p>
159      * Note that this cannot handle very large matrices - the limit to rows x columns is the number Integer.MAX_VALUE.
160      * This is an implementation problem for colt's sparse matrix.
161      * 
162      * @param stream
163      * @param wantedRowNames
164      * @return
165      * @throws IOException
166      */
167     @SuppressWarnings("resource")
168     public DoubleMatrix<String, String> readJW( InputStream stream ) throws IOException {
169 
170         BufferedReader dis = new BufferedReader( new InputStreamReader( stream ) );
171 
172         Scanner ff = new Scanner( dis ).useLocale( Locale.ENGLISH );
173 
174         int index = 0;
175         int amount = 0;
176         double eval = 0;
177 
178         int dim = Integer.parseInt( dis.readLine() );
179         SparseDoubleMatrix<String, String> returnVal = new SparseDoubleMatrix<String, String>( dim, dim );
180 
181         for ( int k = 1; k <= dim; k++ ) {
182 
183             returnVal.setColumnName( new Integer( k ).toString(), k - 1 );
184             returnVal.setRowName( new Integer( k ).toString(), k - 1 );
185 
186             index = ff.nextInt(); // "item 1 has 2 edges"
187             // log.info( index );
188             amount = ff.nextInt();
189 
190             if ( index % 500 == 0 ) {
191                 log.debug( String.format( "loading %2.1f%% complete (%dth entry)... \n", 100.0 * index / dim, index ) );
192             }
193 
194             int[] rowind = new int[amount];
195             for ( int i = 0; i < amount; i++ ) { // "edge indices are to 1 and 2"
196 
197                 index = ff.nextInt();
198                 int ind = index;
199 
200                 if ( ind > dim || ind < 1 ) {
201                     ff.close();
202                     throw new IllegalStateException( "Illegal value " + ind + " found in index list for item " + k );
203                 }
204                 rowind[i] = ind;
205             }
206 
207             for ( int i = 0; i < amount; i++ ) { // "with the following weights"
208                 eval = ff.nextDouble();
209                 returnVal.set( k - 1, rowind[i] - 1, eval );
210             }
211 
212         }
213         ff.close();
214         return returnVal;
215     }
216 
217 }