1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 package ubic.basecode.math;
16
17 import java.util.LinkedHashMap;
18 import java.util.Map;
19
20 import ubic.basecode.dataStructure.matrix.DenseDoubleMatrix;
21 import ubic.basecode.dataStructure.matrix.DoubleMatrix;
22 import ubic.basecode.datafilter.RowMissingFilter;
23 import cern.colt.list.DoubleArrayList;
24 import cern.jet.stat.Descriptive;
25
26
27
28
29
30 public class MatrixNormalizer<R, C> {
31
32
33
34
35
36
37
38
39
40
41
42 public DoubleMatrix<R, C> quantileNormalize( DoubleMatrix<R, C> matrix ) {
43
44 RowMissingFilter<DoubleMatrix<R, C>, R, C, Double> f = new RowMissingFilter<>();
45 f.setMinPresentCount( 1 );
46 DoubleMatrix<R, C> fM = f.filter( matrix );
47
48 DoubleMatrix<R, C> missingValueStatus = imputeMissing( fM );
49
50
51
52
53 Map<Integer, DoubleArrayList> ranks = new LinkedHashMap<>();
54
55 DoubleMatrix<R, C> sortedData = fM.copy();
56 for ( int i = 0; i < fM.columns(); i++ ) {
57 DoubleArrayList dataColumn = new DoubleArrayList( fM.getColumn( i ) );
58
59 DoubleArrayList sortedColumn = dataColumn.copy();
60 sortedColumn.sort();
61 for ( int j = 0; j < sortedColumn.size(); j++ ) {
62 sortedData.set( j, i, sortedColumn.get( j ) );
63 }
64
65 DoubleArrayList r = Rank.rankTransform( dataColumn );
66 assert r != null;
67 ranks.put( i, r );
68 }
69
70
71
72
73 DoubleArrayList rowMeans = new DoubleArrayList( sortedData.rows() );
74 for ( int i = 0; i < sortedData.rows(); i++ ) {
75 double mean = Descriptive.mean( new DoubleArrayList( sortedData.getRow( i ) ) );
76 rowMeans.add( mean );
77 }
78
79 for ( int j = 0; j < sortedData.columns(); j++ ) {
80
81 for ( int i = 0; i < sortedData.rows(); i++ ) {
82
83 if ( Double.isNaN( fM.get( i, j ) ) ) {
84 sortedData.set( i, j, Double.NaN );
85 continue;
86 }
87
88 double rank = ranks.get( j ).get( i ) - 1.0;
89
90 int intrank = ( int ) Math.floor( rank );
91
92 Double value = null;
93 if ( rank - intrank > 0.4 && intrank > 0 ) {
94
95 value = ( rowMeans.get( intrank ) + rowMeans.get( intrank - 1 ) ) / 2.0;
96 } else {
97 value = rowMeans.get( intrank );
98 }
99 assert value != null : "No mean value for rank=" + rank;
100 sortedData.set( i, j, value );
101
102 }
103 }
104
105 assert missingValueStatus.rows() == sortedData.rows() && missingValueStatus.columns() == sortedData.columns();
106
107
108 for ( int i = 0; i < missingValueStatus.rows(); i++ ) {
109 for ( int j = 0; j < missingValueStatus.columns(); j++ ) {
110 if ( Double.isNaN( missingValueStatus.get( i, j ) ) ) {
111 sortedData.set( i, j, Double.NaN );
112 }
113 }
114 }
115
116 return sortedData;
117
118 }
119
120
121
122
123
124
125
126
127
128
129
130 private DoubleMatrix<R, C> imputeMissing( DoubleMatrix<R, C> matrix ) {
131
132
133
134 DoubleMatrix<R, C> missingValueInfo = new DenseDoubleMatrix<>( matrix.rows(), matrix.columns() );
135 for ( int i = 0; i < matrix.rows(); i++ ) {
136 DoubleArrayList v = new DoubleArrayList( matrix.getRow( i ) );
137 double m = DescriptiveWithMissing.mean( v );
138 for ( int j = 0; j < matrix.columns(); j++ ) {
139 double d = matrix.get( i, j );
140 if ( Double.isNaN( d ) ) {
141 missingValueInfo.set( i, j, Double.NaN );
142 matrix.set( i, j, m );
143 } else {
144 missingValueInfo.set( i, j, 1.0 );
145 }
146 }
147 }
148 return missingValueInfo;
149 }
150 }