1   
2   
3   
4   
5   
6   
7   
8   
9   
10  
11  
12  
13  
14  
15  package ubic.basecode.math;
16  
17  import java.util.LinkedHashMap;
18  import java.util.Map;
19  
20  import ubic.basecode.dataStructure.matrix.DenseDoubleMatrix;
21  import ubic.basecode.dataStructure.matrix.DoubleMatrix;
22  import ubic.basecode.datafilter.RowMissingFilter;
23  import cern.colt.list.DoubleArrayList;
24  import cern.jet.stat.Descriptive;
25  
26  
27  
28  
29  
30  public class MatrixNormalizer<R, C> {
31  
32      
33  
34  
35  
36  
37  
38  
39  
40  
41  
42      public DoubleMatrix<R, C> quantileNormalize( DoubleMatrix<R, C> matrix ) {
43  
44          RowMissingFilter<DoubleMatrix<R, C>, R, C, Double> f = new RowMissingFilter<>();
45          f.setMinPresentCount( 1 );
46          DoubleMatrix<R, C> fM = f.filter( matrix );
47  
48          DoubleMatrix<R, C> missingValueStatus = imputeMissing( fM );
49  
50          
51  
52  
53          Map<Integer, DoubleArrayList> ranks = new LinkedHashMap<>();
54  
55          DoubleMatrix<R, C> sortedData = fM.copy();
56          for ( int i = 0; i < fM.columns(); i++ ) {
57              DoubleArrayList dataColumn = new DoubleArrayList( fM.getColumn( i ) );
58  
59              DoubleArrayList sortedColumn = dataColumn.copy();
60              sortedColumn.sort();
61              for ( int j = 0; j < sortedColumn.size(); j++ ) {
62                  sortedData.set( j, i, sortedColumn.get( j ) );
63              }
64  
65              DoubleArrayList r = Rank.rankTransform( dataColumn );
66              assert r != null;
67              ranks.put( i, r );
68          }
69  
70          
71  
72  
73          DoubleArrayList rowMeans = new DoubleArrayList( sortedData.rows() );
74          for ( int i = 0; i < sortedData.rows(); i++ ) {
75              double mean = Descriptive.mean( new DoubleArrayList( sortedData.getRow( i ) ) );
76              rowMeans.add( mean );
77          }
78  
79          for ( int j = 0; j < sortedData.columns(); j++ ) {
80  
81              for ( int i = 0; i < sortedData.rows(); i++ ) {
82  
83                  if ( Double.isNaN( fM.get( i, j ) ) ) {
84                      sortedData.set( i, j, Double.NaN );
85                      continue;
86                  }
87  
88                  double rank = ranks.get( j ).get( i ) - 1.0;
89  
90                  int intrank = ( int ) Math.floor( rank );
91  
92                  Double value = null;
93                  if ( rank - intrank > 0.4 && intrank > 0 ) {
94                      
95                      value = ( rowMeans.get( intrank ) + rowMeans.get( intrank - 1 ) ) / 2.0;
96                  } else {
97                      value = rowMeans.get( intrank );
98                  }
99                  assert value != null : "No mean value for rank=" + rank;
100                 sortedData.set( i, j, value );
101 
102             }
103         }
104 
105         assert missingValueStatus.rows() == sortedData.rows() && missingValueStatus.columns() == sortedData.columns();
106 
107         
108         for ( int i = 0; i < missingValueStatus.rows(); i++ ) {
109             for ( int j = 0; j < missingValueStatus.columns(); j++ ) {
110                 if ( Double.isNaN( missingValueStatus.get( i, j ) ) ) {
111                     sortedData.set( i, j, Double.NaN );
112                 }
113             }
114         }
115 
116         return sortedData;
117 
118     }
119 
120     
121 
122 
123 
124 
125 
126 
127 
128 
129 
130     private DoubleMatrix<R, C> imputeMissing( DoubleMatrix<R, C> matrix ) {
131         
132 
133 
134         DoubleMatrix<R, C> missingValueInfo = new DenseDoubleMatrix<>( matrix.rows(), matrix.columns() );
135         for ( int i = 0; i < matrix.rows(); i++ ) {
136             DoubleArrayList v = new DoubleArrayList( matrix.getRow( i ) );
137             double m = DescriptiveWithMissing.mean( v );
138             for ( int j = 0; j < matrix.columns(); j++ ) {
139                 double d = matrix.get( i, j );
140                 if ( Double.isNaN( d ) ) {
141                     missingValueInfo.set( i, j, Double.NaN );
142                     matrix.set( i, j, m );
143                 } else {
144                     missingValueInfo.set( i, j, 1.0 );
145                 }
146             }
147         }
148         return missingValueInfo;
149     }
150 }