View Javadoc
1   /*
2    * The baseCode project
3    * 
4    * Copyright (c) 2010 University of British Columbia
5    * 
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   *
18   */
19  package ubic.basecode.math.linearmodels;
20  
21  import java.io.Serializable;
22  import java.util.Arrays;
23  import java.util.Collection;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.List;
27  import java.util.Map;
28  
29  import org.apache.commons.lang3.ArrayUtils;
30  import org.apache.commons.lang3.StringUtils;
31  import org.apache.commons.math3.distribution.FDistribution;
32  import org.rosuda.REngine.REXP;
33  import org.rosuda.REngine.REXPDouble;
34  import org.rosuda.REngine.REXPMismatchException;
35  import org.rosuda.REngine.RList;
36  
37  import ubic.basecode.dataStructure.matrix.DoubleMatrix;
38  import ubic.basecode.dataStructure.matrix.DoubleMatrixFactory;
39  
40  /**
41   * Represents the results of a linear model analysis from R. Both the summary.lm and anova objects are represented.
42   * 
43   * FIXME make this have the capabilities of the "fit" object, rather than just the summary.
44   * 
45   * @author paul
46   */
47  public class LinearModelSummary implements Serializable {
48  
49      public static final String INTERCEPT_COEFFICIENT_NAME = "(Intercept)";
50  
51      private static final long serialVersionUID = 1L;
52  
53      private Double adjRSquared = Double.NaN;
54  
55      private GenericAnovaResult anovaResult;
56  
57      private Double[] coefficients = null;
58  
59      /**
60       * 
61       */
62      private DoubleMatrix<String, String> contrastCoefficients = null;
63  
64      private Integer denominatorDof = null;
65  
66      /**
67       * AKA Qty
68       */
69      private Double[] effects = null;
70  
71      private List<String> factorNames = null;
72  
73      private Map<String, List<String>> factorValueNames = new HashMap<>();
74  
75      private String formula = null;
76  
77      private Double fStat = Double.NaN;
78  
79      /**
80       * Name for this summary e.g. probe identifier
81       */
82      private String key = null;
83  
84      private Integer numeratorDof = null;
85  
86      /**
87       * Only if ebayes has been applied
88       */
89      private Double priorDof = null;
90  
91      private Double[] residuals = null;
92  
93      private Double rSquared = Double.NaN;
94  
95      /**
96       * True = ebayes has been applied
97       */
98      private boolean shrunken = false;
99  
100     private Double sigma = null;
101 
102     private Double[] stdevUnscaled;
103 
104     private Map<String, Collection<String>> term2CoefficientNames = new HashMap<>();
105 
106     /**
107      * Construct from the result of evaluation of an R call. should be deprecated because we're not using R integration
108      * 
109      * @param summaryLm
110      * @param anova
111      * @param factorNames as referred to in the model. Used as keys to keep track of coefficients etc.
112      * 
113      */
114     public LinearModelSummary( REXP summaryLm, REXP anova, String[] factorNames ) {
115         try {
116             basicSetup( summaryLm );
117 
118             this.factorNames = Arrays.asList( factorNames );
119 
120             if ( anova != null ) {
121                 this.anovaResult = new GenericAnovaResult( anova );
122             }
123 
124             setupCoefficientNames();
125 
126         } catch ( REXPMismatchException e ) {
127             throw new RuntimeException( e );
128         }
129 
130     }
131 
132     /**
133      * Construct an empty summary. Use for model fits that fail due to 0 degrees of freedom, etc.
134      * 
135      * @param key identifier
136      */
137     public LinearModelSummary( String key ) {
138         this();
139         this.key = key;
140     }
141 
142     /**
143      * @param k optional identifier
144      * @param coefficients
145      * @param residuals
146      * @param terms
147      * @param contrastCoefficients
148      * @param effects AKA Qty
149      * @param rsquared
150      * @param adjRsquared
151      * @param fstat
152      * @param ndof
153      * @param ddof
154      * @param anovaResult
155      * @param sigma
156      * @param isShrunken
157      */
158     public LinearModelSummary( String k, Double[] coefficients, Double[] residuals, List<String> terms,
159             DoubleMatrix<String, String> contrastCoefficients, Double[] effects, Double[] stdevUnscaled,
160             double rsquared,
161             double adjRsquared,
162             double fstat, Integer ndof,
163             Integer ddof, GenericAnovaResult anovaResult, double sigma, boolean isShrunken ) {
164 
165         this.residuals = residuals;
166         this.coefficients = coefficients;
167         this.contrastCoefficients = contrastCoefficients;
168         this.rSquared = rsquared;
169         this.adjRSquared = adjRsquared;
170         this.fStat = fstat;
171         this.numeratorDof = ndof;
172         this.denominatorDof = ddof;
173         this.key = k;
174         this.anovaResult = anovaResult;
175         this.effects = effects;
176         this.stdevUnscaled = stdevUnscaled;
177         this.factorNames = terms;
178         this.sigma = sigma;
179         this.shrunken = isShrunken;
180 
181         if ( anovaResult != null ) {
182             if ( anovaResult.getKey() == null ) {
183                 anovaResult.setKey( key );
184             } else {
185                 if ( !anovaResult.getKey().equals( key ) ) {
186                     throw new IllegalArgumentException( "Keys of ANOVA and holding LM must match" );
187                 }
188             }
189         }
190         this.setupCoefficientNames();
191     }
192 
193     /**
194      * Construct an empty summary. Use for model fits that fail due to 0 residual degrees of freedom, etc.
195      */
196     protected LinearModelSummary() {
197     }
198 
199     /**
200      * @return the adjRSquared
201      */
202     public Double getAdjRSquared() {
203         return adjRSquared;
204     }
205 
206     /**
207      * @return may be null if ANOVA was not run.
208      */
209     public GenericAnovaResult getAnova() {
210         return this.anovaResult;
211     }
212 
213     public Double[] getCoefficients() {
214         return coefficients;
215     }
216 
217     /**
218      * @return The contrast coefficients and associated statistics for all tested contrasts.
219      *         <p>
220      *         Row names are the contrasts, for example for a model with one
221      *         factor "f" with two
222      *         levels "a" and "b": {"(Intercept)", "fb"}. columns are always {"Estimate" ,"Std. Error", "t value",
223      *         "Pr(>|t|)"}
224      * 
225      */
226     public DoubleMatrix<String, String> getContrastCoefficients() {
227         return contrastCoefficients;
228     }
229 
230     /**
231      * 
232      * @param factorName
233      * @return
234      */
235     public Map<String, Double> getContrastCoefficients( String factorName ) {
236         Collection<String> terms = term2CoefficientNames.get( factorName );
237 
238         if ( terms == null ) return null;
239 
240         Map<String, Double> results = new HashMap<>();
241         for ( String term : terms ) {
242             results.put( term, contrastCoefficients.getByKeys( term, "Estimate" ) );
243         }
244 
245         return results;
246     }
247 
248     /**
249      * For the requested factor, return the standard errors associated with the contrast coefficient estimates.
250      * 
251      * 
252      * @param factorName
253      * @return
254      */
255     public Map<String, Double> getContrastCoefficientStderr( String factorName ) {
256         Collection<String> terms = term2CoefficientNames.get( factorName );
257 
258         if ( terms == null ) return null;
259 
260         Map<String, Double> results = new HashMap<>();
261         for ( String term : terms ) {
262             results.put( term, contrastCoefficients.getByKeys( term, "Std. Error" ) );
263         }
264 
265         return results;
266     }
267 
268     /**
269      * @param factorName
270      * @return Map of pvalues for the given factor. For continuous factors or factors with only one level, there will be
271      *         just one value. For factors with N>2 levels, there will be N-1 values, one for each contrast
272      *         (since we compute treatment contrasts to the baseline)
273      */
274     public Map<String, Double> getContrastPValues( String factorName ) {
275         Collection<String> terms = term2CoefficientNames.get( factorName );
276 
277         if ( terms == null ) return null;
278 
279         Map<String, Double> results = new HashMap<>();
280         for ( String term : terms ) {
281             results.put( term, contrastCoefficients.getByKeys( term, "Pr(>|t|)" ) );
282         }
283 
284         return results;
285     }
286 
287     /**
288      * @param factorName
289      * @return Map of T statistics for the given factor. For continuous factors or factors with only one level, there
290      *         will be just one value. For factors with N>2 levels, there will be N-1 values, one for each contrast
291      *         (since we compute treatment contrasts to the baseline)
292      */
293     public Map<String, Double> getContrastTStats( String factorName ) {
294         Collection<String> terms = term2CoefficientNames.get( factorName );
295 
296         if ( terms == null ) return null;
297 
298         Map<String, Double> results = new HashMap<>();
299         for ( String term : terms ) {
300             results.put( term, contrastCoefficients.getByKeys( term, "t value" ) );
301         }
302 
303         return results;
304 
305     }
306 
307     /**
308      * @return
309      */
310     public Double[] getEffects() {
311         return this.effects;
312     }
313 
314     /**
315      * @return F statistic for overall model fit.
316      */
317     public Double getF() {
318         return this.fStat;
319     }
320 
321     /**
322      * @return the factorNames
323      */
324     public List<String> getFactorNames() {
325         return factorNames;
326     }
327 
328     /**
329      * Return the factor names in the order they are stored here. Pvalues and T statistics for this factor are in the
330      * same order, but the 'baseline' must be accounted for.
331      * 
332      * @param factorName
333      * @return
334      */
335     public List<String> getFactorValueNames( String factorName ) {
336         return factorValueNames.get( factorName );
337     }
338 
339     /**
340      * @return the formula
341      */
342     public String getFormula() {
343         return formula;
344     }
345 
346     /**
347      * @param fnames names of the factors
348      * @return
349      * @see ubic.basecode.math.linearmodels.GenericAnovaResult#getInteractionEffectP(java.lang.String)
350      */
351     public Double getInteractionEffectP( String... fnames ) {
352         if ( anovaResult == null ) return Double.NaN;
353         return anovaResult.getInteractionEffectP( fnames );
354     }
355 
356     /**
357      * @return
358      */
359     public Double getInterceptCoeff() {
360         if ( contrastCoefficients != null ) {
361             if ( contrastCoefficients.hasRow( INTERCEPT_COEFFICIENT_NAME ) ) {
362                 return contrastCoefficients.getByKeys( INTERCEPT_COEFFICIENT_NAME, "Estimate" );
363             } else if ( contrastCoefficients.rows() == 1 ) {
364                 /*
365                  * This is a bit of a kludge. When we use lm.fit instead of lm, we end up with a somewhat screwy
366                  * coefficent matrix in the case of one-sample ttest, and R put in x1 (I think it starts as 1 and it
367                  * prepends the x).
368                  */
369                 assert contrastCoefficients.getRowName( 0 ).equals( "x1" );
370                 return contrastCoefficients.getByKeys( contrastCoefficients.getRowName( 0 ), "Estimate" );
371             }
372         }
373 
374         return Double.NaN;
375     }
376 
377     /**
378      * @return
379      */
380     public Double getInterceptP() {
381         if ( contrastCoefficients != null ) {
382             if ( contrastCoefficients.hasRow( INTERCEPT_COEFFICIENT_NAME ) ) {
383                 return contrastCoefficients.getByKeys( INTERCEPT_COEFFICIENT_NAME, "Pr(>|t|)" );
384             } else if ( contrastCoefficients.rows() == 1 ) {
385                 /*
386                  * This is a bit of a kludge. When we use lm.fit instead of lm, we end up with a somewhat screwy
387                  * coefficent matrix in the case of one-sample ttest, and R put in x1 (I think it starts as 1 and it
388                  * prepends the x).
389                  */
390                 assert contrastCoefficients.getRowName( 0 ).equals( "x1" );
391                 return contrastCoefficients.getByKeys( contrastCoefficients.getRowName( 0 ), "Pr(>|t|)" );
392             }
393         }
394         return Double.NaN;
395     }
396 
397     /**
398      * @return
399      */
400     public Double getInterceptT() {
401         if ( contrastCoefficients != null ) {
402             if ( contrastCoefficients.hasRow( INTERCEPT_COEFFICIENT_NAME ) ) {
403                 return contrastCoefficients.getByKeys( INTERCEPT_COEFFICIENT_NAME, "t value" );
404             } else if ( contrastCoefficients.rows() == 1 ) {
405                 /*
406                  * This is a bit of a kludge. When we use lm.fit instead of lm, we end up with a somewhat screwy
407                  * coefficent matrix in the case of one-sample ttest, and R put in x1 (I think it starts as 1 and it
408                  * prepends the x).
409                  */
410                 assert contrastCoefficients.getRowName( 0 ).equals( "x1" );
411                 return contrastCoefficients.getByKeys( contrastCoefficients.getRowName( 0 ), "t value" );
412             }
413         }
414 
415         return Double.NaN;
416     }
417 
418     public String getKey() {
419         return key;
420     }
421 
422     /**
423      * @return
424      * @see ubic.basecode.math.linearmodels.GenericAnovaResult#getMainEffectFactorNames()
425      */
426     public Collection<String> getMainEffectFactorNames() {
427         if ( anovaResult == null ) return null;
428         return anovaResult.getMainEffectFactorNames();
429     }
430 
431     /**
432      * @param factorName
433      * @return overall p-value for the given factor
434      * @see ubic.basecode.math.linearmodels.GenericAnovaResult#getMainEffectP(java.lang.String)
435      */
436     public Double getMainEffectP( String factorName ) {
437         if ( anovaResult == null ) return Double.NaN;
438         return anovaResult.getMainEffectP( factorName );
439     }
440 
441     /**
442      * @return
443      */
444     public Integer getNumeratorDof() {
445         return this.numeratorDof;
446     }
447 
448     /**
449      * Overall p value for F stat of model fit (upper tail probability)
450      * 
451      * @return value or NaN if it can't be computed for some reason
452      */
453     public Double getP() {
454         if ( numeratorDof == null || denominatorDof == null || numeratorDof == 0 || denominatorDof == 0 )
455             return Double.NaN;
456 
457         FDistribution f = new FDistribution( numeratorDof, denominatorDof );
458 
459         return 1.0 - f.cumulativeProbability( this.getF() );
460 
461     }
462 
463     /**
464      * @return the priorDof
465      */
466     public Double getPriorDof() {
467         return priorDof;
468     }
469 
470     /**
471      * @return
472      */
473     public Integer getResidualDof() {
474         return this.denominatorDof;
475     }
476 
477     /**
478      * @return the residuals
479      */
480     public Double[] getResiduals() {
481         return residuals;
482     }
483 
484     /**
485      * @return the rSquared
486      */
487     public Double getRSquared() {
488         return rSquared;
489     }
490 
491     /**
492      * Residual standard deviation
493      * 
494      * @return
495      */
496     public Double getSigma() {
497         return sigma;
498     }
499 
500     /**
501      * Unscaled standard deviations for the coefficient estimators in same order as coefficients. The standard errors
502      * are given by stdev.unscaled * sigma (a la limma)
503      */
504     public Double[] getStdevUnscaled() {
505         return stdevUnscaled;
506     }
507 
508     /**
509      * @return
510      * @see ubic.basecode.math.linearmodels.GenericAnovaResult#hasInteractions()
511      */
512     public boolean hasInteractions() {
513         if ( anovaResult == null ) return false;
514         return anovaResult.hasInteractions();
515     }
516 
517     public boolean isBaseline( String factorValueName ) {
518         return !contrastCoefficients.hasRow( factorValueName );
519     }
520 
521     /**
522      * Whether this is the result of emprical bayes shrinkage of variance estimates
523      * 
524      * @return
525      */
526     public boolean isShrunken() {
527         return shrunken;
528     }
529 
530     /**
531      * @param genericAnovaResult
532      */
533     public void setAnova( GenericAnovaResult genericAnovaResult ) {
534         this.anovaResult = genericAnovaResult;
535     }
536 
537     public void setKey( String key ) {
538         this.key = key;
539     }
540 
541     /**
542      * @param priorDof the priorDof to set
543      */
544     public void setPriorDof( Double priorDof ) {
545         this.priorDof = priorDof;
546     }
547 
548     /*
549      * (non-Javadoc)
550      * 
551      * @see java.lang.Object#toString()
552      */
553     @Override
554     public String toString() {
555         StringBuilder buf = new StringBuilder();
556         if ( StringUtils.isNotBlank( this.key ) ) {
557             buf.append( this.key + "\n" );
558         }
559         buf.append( "F=" + String.format( "%.2f", this.fStat ) + " Rsquare=" + String.format( "%.2f", this.rSquared )
560                 + "\n" );
561 
562         buf.append( "Residuals:\n" );
563         if ( residuals != null ) {
564             for ( Double d : residuals ) {
565                 buf.append( String.format( "%.2f ", d ) );
566             }
567         } else {
568             buf.append( "Residuals are null" );
569         }
570 
571         buf.append( "\n\nCoefficients:\n" + contrastCoefficients + "\n" );
572         if ( this.anovaResult != null ) {
573             buf.append( this.anovaResult.toString() + "\n" );
574         }
575 
576         return buf.toString();
577     }
578 
579     /**
580      * @param summaryLm
581      * @return
582      * @throws REXPMismatchException
583      */
584     private RList basicSetup( REXP summaryLm ) throws REXPMismatchException {
585         RList li = summaryLm.asList();
586 
587         extractCoefficients( li );
588 
589         this.residuals = ArrayUtils.toObject( ( ( REXP ) li.get( "residuals" ) ).asDoubles() );
590 
591         this.rSquared = ( ( REXP ) li.get( "r.squared" ) ).asDouble();
592 
593         this.adjRSquared = ( ( REXP ) li.get( "adj.r.squared" ) ).asDouble();
594 
595         REXP fstats = ( REXP ) li.get( "fstatistic" ); // for overall model fit.
596         if ( fstats != null ) {
597             Double[] ff = ArrayUtils.toObject( fstats.asDoubles() );
598             this.fStat = ff[0];
599             this.numeratorDof = ff[1].intValue();
600             this.denominatorDof = ff[2].intValue();
601         }
602         // this.residualDof = ArrayUtils.toObject( ( ( REXP ) li.get( "df" ) ).asIntegers() )[1];
603 
604         // li.get( "cov.unscaled" );
605         // this.doF = ArrayUtils.toObject( ( ( REXP ) li.get( "df" ) ).asIntegers() );
606 
607         return li;
608     }
609 
610     /**
611      * @param li
612      * @throws REXPMismatchException
613      */
614     private void extractCoefficients( RList li ) throws REXPMismatchException {
615         REXPDouble coraw = ( REXPDouble ) li.get( "coefficients" );
616 
617         RList dimnames = coraw.getAttribute( "dimnames" ).asList();
618         String[] itemnames = ( ( REXP ) dimnames.get( 0 ) ).asStrings();
619         String[] colNames = ( ( REXP ) dimnames.get( 1 ) ).asStrings();
620 
621         double[][] coef = coraw.asDoubleMatrix();
622         contrastCoefficients = DoubleMatrixFactory.dense( coef );
623         contrastCoefficients.setRowNames( Arrays.asList( itemnames ) );
624         contrastCoefficients.setColumnNames( Arrays.asList( colNames ) );
625     }
626 
627     /**
628      */
629     private void setupCoefficientNames() {
630 
631         for ( String string : factorNames ) {
632             term2CoefficientNames.put( string, new HashSet<String>() );
633         }
634 
635         assert this.contrastCoefficients != null;
636 
637         List<String> coefRowNames = contrastCoefficients.getRowNames();
638 
639         for ( String coefNameFromR : coefRowNames ) {
640 
641             if ( coefNameFromR.equals( INTERCEPT_COEFFICIENT_NAME ) ) {
642                 continue; // ?
643             } else if ( coefNameFromR.contains( ":" ) ) {
644                 /*
645                  * We're counting on the interaction terms names ending up like this: f1001fv1005:f1002fv1006 (see
646                  * LinearModelAnalyzer in Gemma, which sets up the factor and factorvalue names in the way that
647                  * generates this). Risky, and it won't work for continuous factors. But R makes kind of a mess from the
648                  * interactions. If we assume there is only one interaction term we can work it out by the presence of
649                  * ":".
650                  */
651                 String cleanedInterationTermName = coefNameFromR.replaceAll( "fv_[0-9]+(?=(:|$))", "" );
652 
653                 for ( String factorName : factorNames ) {
654                     if ( !factorName.contains( ":" ) ) continue;
655 
656                     if ( factorName.equals( cleanedInterationTermName ) ) {
657                         assert term2CoefficientNames.containsKey( factorName );
658                         term2CoefficientNames.get( factorName ).add( coefNameFromR );
659                     }
660 
661                 }
662 
663             } else {
664 
665                 for ( String factorName : factorNames ) {
666                     if ( coefNameFromR.startsWith( factorName ) ) {
667                         assert term2CoefficientNames.containsKey( factorName );
668                         term2CoefficientNames.get( factorName ).add( coefNameFromR );
669 
670                     }
671                 }
672             }
673         }
674     }
675 
676 }