View Javadoc
1   /*
2    * The baseCode project
3    *
4    * Copyright (c) 2011 University of British Columbia
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
7    * the License. You may obtain a copy of the License at
8    *
9    * http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
12   * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
13   * specific language governing permissions and limitations under the License.
14   */
15  package ubic.basecode.math;
16  
17  import java.util.LinkedHashMap;
18  import java.util.Map;
19  
20  import ubic.basecode.dataStructure.matrix.DenseDoubleMatrix;
21  import ubic.basecode.dataStructure.matrix.DoubleMatrix;
22  import ubic.basecode.datafilter.RowMissingFilter;
23  import cern.colt.list.DoubleArrayList;
24  import cern.jet.stat.Descriptive;
25  
26  /**
27   * @author paul
28   * 
29   */
30  public class MatrixNormalizer<R, C> {
31  
32      /**
33       * Rows with all missing will not be returned. Otherwise, missing values are imputed, used for estimating quantiles,
34       * and then replaced with missing values at the end.
35       * <p>
36       * Note that the Bioconductor implementation deals with missing values differently, and in a much more complex way.
37       * Therefore this gives different answers in the missing value case from Bioconductor (normalize.quantiles).
38       *
39       * @param matrix
40       * @return
41       */
42      public DoubleMatrix<R, C> quantileNormalize( DoubleMatrix<R, C> matrix ) {
43  
44          RowMissingFilter<DoubleMatrix<R, C>, R, C, Double> f = new RowMissingFilter<>();
45          f.setMinPresentCount( 1 );
46          DoubleMatrix<R, C> fM = f.filter( matrix );
47  
48          DoubleMatrix<R, C> missingValueStatus = imputeMissing( fM );
49  
50          /*
51           * Compute ranks of each column. Missing values are wherever they end up, which is a bit odd.
52           */
53          Map<Integer, DoubleArrayList> ranks = new LinkedHashMap<>();
54  
55          DoubleMatrix<R, C> sortedData = fM.copy();
56          for ( int i = 0; i < fM.columns(); i++ ) {
57              DoubleArrayList dataColumn = new DoubleArrayList( fM.getColumn( i ) );
58  
59              DoubleArrayList sortedColumn = dataColumn.copy();
60              sortedColumn.sort();
61              for ( int j = 0; j < sortedColumn.size(); j++ ) {
62                  sortedData.set( j, i, sortedColumn.get( j ) );
63              }
64  
65              DoubleArrayList r = Rank.rankTransform( dataColumn );
66              assert r != null;
67              ranks.put( i, r );
68          }
69  
70          /*
71           * Compute the mean at each rank
72           */
73          DoubleArrayList rowMeans = new DoubleArrayList( sortedData.rows() );
74          for ( int i = 0; i < sortedData.rows(); i++ ) {
75              double mean = Descriptive.mean( new DoubleArrayList( sortedData.getRow( i ) ) );
76              rowMeans.add( mean );
77          }
78  
79          for ( int j = 0; j < sortedData.columns(); j++ ) {
80  
81              for ( int i = 0; i < sortedData.rows(); i++ ) {
82  
83                  if ( Double.isNaN( fM.get( i, j ) ) ) {
84                      sortedData.set( i, j, Double.NaN );
85                      continue;
86                  }
87  
88                  double rank = ranks.get( j ).get( i ) - 1.0;
89  
90                  int intrank = ( int ) Math.floor( rank );
91  
92                  Double value = null;
93                  if ( rank - intrank > 0.4 && intrank > 0 ) {
94                      // cope with tied ranks. 0.4 is the threshold R uses.
95                      value = ( rowMeans.get( intrank ) + rowMeans.get( intrank - 1 ) ) / 2.0;
96                  } else {
97                      value = rowMeans.get( intrank );
98                  }
99                  assert value != null : "No mean value for rank=" + rank;
100                 sortedData.set( i, j, value );
101 
102             }
103         }
104 
105         assert missingValueStatus.rows() == sortedData.rows() && missingValueStatus.columns() == sortedData.columns();
106 
107         // mask the missing values.
108         for ( int i = 0; i < missingValueStatus.rows(); i++ ) {
109             for ( int j = 0; j < missingValueStatus.columns(); j++ ) {
110                 if ( Double.isNaN( missingValueStatus.get( i, j ) ) ) {
111                     sortedData.set( i, j, Double.NaN );
112                 }
113             }
114         }
115 
116         return sortedData;
117 
118     }
119 
120     /**
121      * Simple imputation method. Generally (but not always), missing values correspond to "low expression". Therefore
122      * imputed values of zero are defensible. However, because at this point the matrix has probably already been
123      * filtered, the row mean is better.
124      * <p>
125      * FIXME this should be factored out
126      *
127      * @param matrix
128      * @return missing value status
129      */
130     private DoubleMatrix<R, C> imputeMissing( DoubleMatrix<R, C> matrix ) {
131         /*
132          * keep track of the missing values so they can be re-masked later.
133          */
134         DoubleMatrix<R, C> missingValueInfo = new DenseDoubleMatrix<>( matrix.rows(), matrix.columns() );
135         for ( int i = 0; i < matrix.rows(); i++ ) {
136             DoubleArrayList v = new DoubleArrayList( matrix.getRow( i ) );
137             double m = DescriptiveWithMissing.mean( v );
138             for ( int j = 0; j < matrix.columns(); j++ ) {
139                 double d = matrix.get( i, j );
140                 if ( Double.isNaN( d ) ) {
141                     missingValueInfo.set( i, j, Double.NaN );
142                     matrix.set( i, j, m );
143                 } else {
144                     missingValueInfo.set( i, j, 1.0 );
145                 }
146             }
147         }
148         return missingValueInfo;
149     }
150 }