View Javadoc
1   /*
2    * The baseCode project
3    *
4    * Copyright (c) 2017 University of British Columbia
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  
20  package ubic.basecode.math.linearmodels;
21  
22  import java.util.List;
23  
24  import cern.colt.function.DoubleFunction;
25  import cern.colt.list.BooleanArrayList;
26  import cern.colt.list.DoubleArrayList;
27  import org.apache.commons.math3.special.Gamma;
28  
29  import cern.colt.matrix.DoubleMatrix1D;
30  import cern.colt.matrix.impl.DenseDoubleMatrix1D;
31  import cern.jet.math.Functions;
32  import ubic.basecode.dataStructure.matrix.MatrixUtil;
33  import ubic.basecode.math.DescriptiveWithMissing;
34  import ubic.basecode.math.SpecFunc;
35  
36  /**
37   * Implements methods described in
38   * <p>
39   * Smyth, G. K. (2004). Linear models and empirical Bayes methods for assessing differential expression in microarray
40   * experiments. Statistical Applications in Genetics and Molecular Biology Volume 3, Issue 1, Article 3.
41   * <p>
42   * R code snippets in comments are from squeezeVar.R in the limma source code.
43   *
44   * @author paul
45   */
46  public class ModeratedTstat {
47  
48      private final static DoubleFunction digamma = new DoubleFunction() {
49          @Override
50          public double apply(double v) {
51              return Gamma.digamma(v);
52          }
53      };
54      private final static DoubleFunction trigammainverse = new DoubleFunction() {
55          @Override
56          public double apply(double v) {
57              return SpecFunc.trigammaInverse(v);
58          }
59      };
60      private final static DoubleFunction trigamma = new DoubleFunction() {
61          @Override
62          public double apply(double v) {
63              return Gamma.trigamma(v);
64          }
65      };
66      public static final double TOOSMALL = Math.pow(10, -15);
67  
68      /**
69       * Does essentially the same thing as limma::ebayes
70       *
71       * @param fit which will be modified
72       */
73      public static void ebayes(LeastSquaresFit fit) {
74          List<LinearModelSummary> summaries = fit.summarize();
75          DoubleMatrix1D sigmas = new DenseDoubleMatrix1D(new double[summaries.size()]);
76          DoubleMatrix1D dofs = new DenseDoubleMatrix1D(new double[summaries.size()]);
77          int i = 0;
78          // corner case can get nulls, example: GSE10778
79          for ( LinearModelSummary lms : summaries) {
80              if (Double.isNaN(lms.getSigma())) {
81                  sigmas.set(i, Double.NaN);
82                  dofs.set(i, Double.NaN);
83              } else {
84                  sigmas.set(i, lms.getSigma());
85                  dofs.set(i, lms.getResidualsDof());  // collect vector of dofs instead of assuming fixed value.
86              }
87              i++;
88  
89          }
90          squeezeVar(sigmas.copy().assign(Functions.square), dofs, fit);
91      }
92  
93      /**
94       * defining 'ok' as non-missing and non-infinite and not very close to zero as per limma implementation
95       *
96       * @param x
97       * @return
98       */
99      private static final BooleanArrayList okVars(DoubleMatrix1D x) {
100         assert x.size() > 0;
101         BooleanArrayList answer = new BooleanArrayList(x.size());
102         for (int i = 0; i < x.size(); i++) {
103             double a = x.getQuick(i);
104             answer.add(!(Double.isNaN(a) || Double.isInfinite(a) || a < -1e-15));
105         }
106         return answer;
107     }
108 
109     private static final BooleanArrayList okDfs(DoubleMatrix1D x) {
110         assert x.size() > 0;
111         BooleanArrayList answer = new BooleanArrayList(x.size());
112 
113         for (int i = 0; i < x.size(); i++) {
114             double a = x.getQuick(i);
115             answer.add(!(Double.isNaN(a) || Double.isInfinite(a) || a < TOOSMALL));
116         }
117         return answer;
118     }
119 
120     /*
121      * @param vars variances
122      * @param df1s vector of degrees of freedom
123      * @return the scale (aka s2.prior or s20 in limma) and df2 (aka df.prior) in a double array of length 2
124      */
125     protected static double[] fitFDist(final DoubleMatrix1D vars, final DoubleMatrix1D df1s) {
126 
127         BooleanArrayList ok = MatrixUtil.conjunction(okVars(vars), okDfs(df1s));
128 
129         DoubleMatrix1D x = MatrixUtil.stripNonOK(vars, ok);
130         DoubleMatrix1D df1 = MatrixUtil.stripNonOK(df1s, ok);
131 
132         if (x.size() == 0) {
133             throw new IllegalStateException("There were no valid values of variance to perform eBayes parameter estimation");
134         }
135 
136         // stay away from zero variance
137         x = x.assign(Functions.max(1e-5 * DescriptiveWithMissing.median(new DoubleArrayList(vars.toArray()))));
138 
139         // z <- log(x)
140         DoubleMatrix1D z = x.copy().assign(Functions.log);
141 
142         // e <- z-digamma(df1/2)+log(df1/2)
143         DoubleMatrix1D e1 = df1.copy().assign(Functions.div(2.0)).assign(digamma);
144         DoubleMatrix1D e2 = df1.copy().assign(Functions.div(2.0)).assign(Functions.log);
145         DoubleMatrix1D e = z.copy().assign(e1, Functions.minus).assign(e2, Functions.plus);
146 
147         int n = x.size();
148         if (n < 2) {
149             throw new IllegalStateException("Too few valid variance values to do eBayes parameter estimation (require at least 2)");
150         }
151 
152         //  emean <- mean(e)
153         double emean = e.zSum() / n;
154         // evar <- sum((e-emean)^2)/(n-1)
155         double evar = e.copy().assign(Functions.minus(emean)).aggregate(Functions.plus, Functions.square)
156                 / (n - 1);
157 
158         // evar <- evar - mean(trigamma(df1/2))
159         evar = evar - df1.copy().assign(Functions.div(2.0)).assign(trigamma).zSum() / df1.size();
160         double df2;
161         double s20;
162         if (evar > 0.0) {
163             // df2 <- 2*trigammaInverse(evar)
164             df2 = 2 * SpecFunc.trigammaInverse(evar);
165 
166             // s20 <- exp(emean+digamma(df2/2)-log(df2/2))
167             s20 = Math.exp(emean + Gamma.digamma(df2 / 2.0) - Math.log(df2 / 2.0));
168 
169         } else {
170             df2 = Double.POSITIVE_INFINITY;
171             // s20 <- mean(x)
172             s20 = x.copy().aggregate(Functions.plus, Functions.identity) / x.size();
173         }
174 
175         return new double[]{s20, df2};
176     }
177 
178 
179     /*
180      * Return the scale and df2. Original implementation, does not handle missing values, kept here for posterity/comparison/debugging.
181      */
182     @Deprecated
183     private static double[] fitFDistNoMissing(DoubleMatrix1D x, double df1) {
184 
185         // stay away from zero valuess
186         x = x.copy().assign(Functions.max(1e-5));
187         // z <- log(x)
188         // e <- z-digamma(df1/2)+log(df1/2)
189         DoubleMatrix1D e = x.copy().assign(Functions.log).assign(Functions.minus(Gamma.digamma(df1 / 2.0)))
190                 .assign(Functions.plus(Math.log(df1 / 2.0)));
191 
192         int nok = x.size(); // number of oks - need to implement checks
193         //  emean <- mean(e)
194         double emean = e.copy().zSum() / nok;
195         // evar <- sum((e-emean)^2)/(nok-1L)
196         double evar = e.copy().assign(Functions.minus(emean)).aggregate(Functions.plus, Functions.square)
197                 / (nok - 1);
198 
199         evar = evar - Gamma.trigamma(df1 / 2.0);
200 
201         double df2;
202         double s20;
203         if (evar > 0) {
204             // df2 <- 2*trigammaInverse(evar)
205             df2 = 2 * SpecFunc.trigammaInverse(evar);
206 
207             // s20 <- exp(emean+digamma(df2/2)-log(df2/2))
208             s20 = Math.exp(emean + Gamma.digamma(df2 / 2.0) - Math.log(df2 / 2.0));
209 
210         } else {
211             df2 = Double.POSITIVE_INFINITY;
212             // s20 <- mean(x)
213             s20 = x.copy().zSum() / x.size();
214         }
215 
216         return new double[]{s20, df2};
217     }
218 
219     /**
220      * Ignoring robust and covariate for now
221      *
222      * @param var initial values of estimated residual variance = sigma^2 = rssq/rdof; this will be moderated
223      * @param df  vector of degrees of freedom
224      * @param fit will be updated with new info; call fit.summarize() to get updated pvalues etc.
225      * @return varPost for testing mostly
226      */
227     protected static DoubleMatrix1D squeezeVar(DoubleMatrix1D var, DoubleMatrix1D df, LeastSquaresFit fit) {
228 
229         double[] ffit = fitFDist(var, df);
230         double dfPrior = ffit[1];
231 
232         DoubleMatrix1D varPost = squeezeVariances(var, df, ffit);
233 
234         if (fit != null)
235             fit.ebayesUpdate(dfPrior, ffit[0], varPost);
236 
237         return varPost;
238     }
239 
240     /**
241      * @param var vector of estimated residual variances from original model fit
242      * @param df  vector of dfs
243      * @param fit result of fitFDist() (s2.prior and dfPrior)
244      * @return vector of squeezed variances (varPost or s2.post)
245      */
246     protected static DoubleMatrix1D squeezeVariances(DoubleMatrix1D var, DoubleMatrix1D df, double[] fit) {
247         double varPrior = fit[0];
248         double dfPrior = fit[1];
249 
250         //   out$var.post <- (df*var + out$df.prior*out$var.prior) / df.total
251 
252         DoubleMatrix1D dfTotal = df.copy().assign(Functions.plus(dfPrior));
253 
254         return var.copy().assign(df, Functions.mult)
255                 .assign(Functions.plus(dfPrior * varPrior)).assign(dfTotal, Functions.div);
256 
257     }
258 
259 }