View Javadoc
1   /*
2    * The baseCode project
3    *
4    * Copyright (c) 2017 University of British Columbia
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  
20  package ubic.basecode.math.linearmodels;
21  
22  import java.io.File;
23  import java.io.IOException;
24  import java.io.OutputStream;
25  import java.io.PrintStream;
26  import java.util.List;
27  
28  import cern.colt.function.DoubleFunction;
29  import cern.colt.list.BooleanArrayList;
30  import cern.colt.list.DoubleArrayList;
31  import org.apache.commons.math3.special.Gamma;
32  
33  import cern.colt.matrix.DoubleMatrix1D;
34  import cern.colt.matrix.impl.DenseDoubleMatrix1D;
35  import cern.jet.math.Functions;
36  import ubic.basecode.dataStructure.matrix.MatrixUtil;
37  import ubic.basecode.math.DescriptiveWithMissing;
38  import ubic.basecode.math.SpecFunc;
39  
40  /**
41   * Implements methods described in
42   * <p>
43   * Smyth, G. K. (2004). Linear models and empirical Bayes methods for assessing differential expression in microarray
44   * experiments. Statistical Applications in Genetics and Molecular Biology Volume 3, Issue 1, Article 3.
45   * <p>
46   * R code snippets in comments are from squeezeVar.R in the limma source code.
47   *
48   * @author paul
49   */
50  public class ModeratedTstat {
51  
52      private final static DoubleFunction digamma = new DoubleFunction() {
53          @Override
54          public double apply(double v) {
55              return Gamma.digamma(v);
56          }
57      };
58      private final static DoubleFunction trigammainverse = new DoubleFunction() {
59          @Override
60          public double apply(double v) {
61              return SpecFunc.trigammaInverse(v);
62          }
63      };
64      private final static DoubleFunction trigamma = new DoubleFunction() {
65          @Override
66          public double apply(double v) {
67              return Gamma.trigamma(v);
68          }
69      };
70      public static final double TOOSMALL = Math.pow(10, -15);
71  
72      /**
73       * Does essentially the same thing as limma::ebayes
74       *
75       * @param fit which will be modified
76       */
77      public static void ebayes(LeastSquaresFit fit) {
78          List<LinearModelSummary> summaries = fit.summarize();
79          DoubleMatrix1D sigmas = new DenseDoubleMatrix1D(new double[summaries.size()]);
80          DoubleMatrix1D dofs = new DenseDoubleMatrix1D(new double[summaries.size()]);
81          int i = 0;
82          // corner case can get nulls, example: GSE10778
83          for (LinearModelSummary lms : summaries) {
84              if (lms.getSigma() == null) {
85                  sigmas.set(i, Double.NaN);
86                  dofs.set(i, Double.NaN);
87              } else {
88                  sigmas.set(i, lms.getSigma());
89                  dofs.set(i, lms.getResidualDof());  // collect vector of dofs instead of assuming fixed value.
90              }
91              i++;
92  
93          }
94          squeezeVar(sigmas.copy().assign(Functions.square), dofs, fit);
95      }
96  
97      /**
98       * defining 'ok' as non-missing and non-infinite and not very close to zero as per limma implementation
99       *
100      * @param x
101      * @return
102      */
103     private static final BooleanArrayList okVars(DoubleMatrix1D x) {
104         assert x.size() > 0;
105         BooleanArrayList answer = new BooleanArrayList(x.size());
106         for (int i = 0; i < x.size(); i++) {
107             double a = x.getQuick(i);
108             answer.add(!(Double.isNaN(a) || Double.isInfinite(a) || a < -1e-15));
109         }
110         return answer;
111     }
112 
113     private static final BooleanArrayList okDfs(DoubleMatrix1D x) {
114         assert x.size() > 0;
115         BooleanArrayList answer = new BooleanArrayList(x.size());
116 
117         for (int i = 0; i < x.size(); i++) {
118             double a = x.getQuick(i);
119             answer.add(!(Double.isNaN(a) || Double.isInfinite(a) || a < TOOSMALL));
120         }
121         return answer;
122     }
123 
124     /*
125      * @param vars variances
126      * @param df1s vector of degrees of freedom
127      * @return the scale (aka s2.prior or s20 in limma) and df2 (aka df.prior) in a double array of length 2
128      */
129     protected static double[] fitFDist(final DoubleMatrix1D vars, final DoubleMatrix1D df1s) {
130 
131         BooleanArrayList ok = MatrixUtil.conjunction(okVars(vars), okDfs(df1s));
132 
133         DoubleMatrix1D x = MatrixUtil.stripNonOK(vars, ok);
134         DoubleMatrix1D df1 = MatrixUtil.stripNonOK(df1s, ok);
135 
136         if (x.size() == 0) {
137             throw new IllegalStateException("There were no valid values of variance to perform eBayes parameter estimation");
138         }
139 
140         // stay away from zero variance
141         x = x.assign(Functions.max(1e-5 * DescriptiveWithMissing.median(new DoubleArrayList(vars.toArray()))));
142 
143         // z <- log(x)
144         DoubleMatrix1D z = x.copy().assign(Functions.log);
145 
146         // e <- z-digamma(df1/2)+log(df1/2)
147         DoubleMatrix1D e1 = df1.copy().assign(Functions.div(2.0)).assign(digamma);
148         DoubleMatrix1D e2 = df1.copy().assign(Functions.div(2.0)).assign(Functions.log);
149         DoubleMatrix1D e = z.copy().assign(e1, Functions.minus).assign(e2, Functions.plus);
150 
151         int n = x.size();
152         if (n < 2) {
153             throw new IllegalStateException("Too few valid variance values to do eBayes parameter estimation (require at least 2)");
154         }
155 
156         //  emean <- mean(e)
157         double emean = e.zSum() / n;
158         // evar <- sum((e-emean)^2)/(n-1)
159         double evar = e.copy().assign(Functions.minus(emean)).aggregate(Functions.plus, Functions.square)
160                 / (n - 1);
161 
162         // evar <- evar - mean(trigamma(df1/2))
163         evar = evar - df1.copy().assign(Functions.div(2.0)).assign(trigamma).zSum() / df1.size();
164         double df2;
165         double s20;
166         if (evar > 0.0) {
167             // df2 <- 2*trigammaInverse(evar)
168             df2 = 2 * SpecFunc.trigammaInverse(evar);
169 
170             // s20 <- exp(emean+digamma(df2/2)-log(df2/2))
171             s20 = Math.exp(emean + Gamma.digamma(df2 / 2.0) - Math.log(df2 / 2.0));
172 
173         } else {
174             df2 = Double.POSITIVE_INFINITY;
175             // s20 <- mean(x)
176             s20 = x.copy().aggregate(Functions.plus, Functions.identity) / x.size();
177         }
178 
179         return new double[]{s20, df2};
180     }
181 
182 
183     /*
184      * Return the scale and df2. Original implementation, does not handle missing values, kept here for posterity/comparison/debugging.
185      */
186     @Deprecated
187     private static double[] fitFDistNoMissing(DoubleMatrix1D x, double df1) {
188 
189         // stay away from zero valuess
190         x = x.copy().assign(Functions.max(1e-5));
191         // z <- log(x)
192         // e <- z-digamma(df1/2)+log(df1/2)
193         DoubleMatrix1D e = x.copy().assign(Functions.log).assign(Functions.minus(Gamma.digamma(df1 / 2.0)))
194                 .assign(Functions.plus(Math.log(df1 / 2.0)));
195 
196         int nok = x.size(); // number of oks - need to implement checks
197         //  emean <- mean(e)
198         double emean = e.copy().zSum() / nok;
199         // evar <- sum((e-emean)^2)/(nok-1L)
200         double evar = e.copy().assign(Functions.minus(emean)).aggregate(Functions.plus, Functions.square)
201                 / (nok - 1);
202 
203         evar = evar - Gamma.trigamma(df1 / 2.0);
204 
205         double df2;
206         double s20;
207         if (evar > 0) {
208             // df2 <- 2*trigammaInverse(evar)
209             df2 = 2 * SpecFunc.trigammaInverse(evar);
210 
211             // s20 <- exp(emean+digamma(df2/2)-log(df2/2)) 
212             s20 = Math.exp(emean + Gamma.digamma(df2 / 2.0) - Math.log(df2 / 2.0));
213 
214         } else {
215             df2 = Double.POSITIVE_INFINITY;
216             // s20 <- mean(x)
217             s20 = x.copy().zSum() / x.size();
218         }
219 
220         return new double[]{s20, df2};
221     }
222 
223     /**
224      * Ignoring robust and covariate for now
225      *
226      * @param var initial values of estimated residual variance = sigma^2 = rssq/rdof; this will be moderated
227      * @param df  vector of degrees of freedom
228      * @param fit will be updated with new info; call fit.summarize() to get updated pvalues etc.
229      * @return varPost for testing mostly
230      */
231     protected static DoubleMatrix1D squeezeVar(DoubleMatrix1D var, DoubleMatrix1D df, LeastSquaresFit fit) {
232 
233         double[] ffit = fitFDist(var, df);
234         double dfPrior = ffit[1];
235 
236         DoubleMatrix1D varPost = squeezeVariances(var, df, ffit);
237 
238         if (fit != null)
239             fit.ebayesUpdate(dfPrior, ffit[0], varPost);
240 
241         return varPost;
242     }
243 
244     /**
245      * @param var vector of estimated residual variances from original model fit
246      * @param df  vector of dfs
247      * @param fit result of fitFDist() (s2.prior and dfPrior)
248      * @return vector of squeezed variances (varPost or s2.post)
249      */
250     protected static DoubleMatrix1D squeezeVariances(DoubleMatrix1D var, DoubleMatrix1D df, double[] fit) {
251         double varPrior = fit[0];
252         double dfPrior = fit[1];
253 
254         //   out$var.post <- (df*var + out$df.prior*out$var.prior) / df.total
255 
256         DoubleMatrix1D dfTotal = df.copy().assign(Functions.plus(dfPrior));
257 
258         return var.copy().assign(df, Functions.mult)
259                 .assign(Functions.plus(dfPrior * varPrior)).assign(dfTotal, Functions.div);
260 
261     }
262 
263 }