#!/usr/local/bin/perl # Paul Pavlidis # Calculate two-side ttest pvalues with welch option. use lib '.'; use Stats; $usage = "ttest [-r: rdb format line needs to be removed\n -w: use Welch approximate t\n -m; do mann-whitney U (a.k.a. Wilcoxon) test instead\n -rank: use rank transformation of the data for ttest\n -l: log transform the data\n -c: use class file instead of layout] \n"; die $usage unless @ARGV > 1; while ($ARGV[0] =~ /^-/) { $opt = shift @ARGV; if ($opt eq "-w") { $welch++; } elsif ($opt eq "-m") { $mann++; } elsif ($opt eq "-r") { $rdb++; } elsif ($opt eq "-l") { $log++; } elsif ($opt eq "-rank") { $ranktransform++; } elsif ($opt eq "-c") { $classfile++; } else { die "Illegal option\n"; } } ($data, $layout) = @ARGV; die "Can't do welch and MW test at once!\n" if ($welch && $mann); if ($classfile) { readclass($layout, \%cat); } else { readlayout($layout, \%cat); } # do some checking if (scalar keys %cat != 1) { die "ttests can only be done on two groups\n"; } $i = 0; foreach $m (keys %cat) { #only one foreach $k (keys %{$cat{$m}}) { # only two. $numcat[$i] = scalar @{$cat{$m}{$k}}; $i++; } } # whsehw. open (IN, "<$data") or die "couldn't open data\n"; $header = ; # remove header line. if ($rdb) { ; # if rdb print STDERR "Removed format line\n"; } else { print STDERR "Assuming this is NOT rdb format!\n"; } my $n = -1 + scalar split "\t", $header; my $totaldf = $n-2; my $fold; print STDERR "DF: Total uncorrected df $totaldf\n"; print "label\tstat\tp\tfold\n"; while () { chomp; s/\cM//; if (!$_) { print STDERR "Skipping blank line\n"; next; } ($label, @data) = split /\t/, $_; if (scalar @data != $n) { print STDERR "$label: Skipping because it is missing data\n"; next; } if ($log) { for ($i=0; $i $u2) { $t = "$u1"; } else { $t = "$u2"; } $prob = sprintf("%.g5", 2 * uprob($t, $numcat[0], $numcat[1])); } } else { # do regular, rank, or welch ttest on it. if ($ranktransform) { # convert the values in @data into ranks. my @ranks; ranks(\@data, \@ranks); @data = @ranks; } foreach $m (keys %cat) { # only one so this is silly @k = keys %{$cat{$m}}; # do fold calculation. $group1mean = calcGroupMean(\@data, $k[0], $cat{$m}); $group2mean = calcGroupMean(\@data, $k[1], $cat{$m}); $fold = foldChange($group1mean, $group2mean); $group1ssq = calcSSQ(\@data, $group1mean, $k[0], $cat{$m}); $group2ssq = calcSSQ(\@data, $group2mean, $k[1], $cat{$m}); $group1var = calcGroupVar(\@data, $group1mean, $k[0], $cat{$m}); $group2var = calcGroupVar(\@data, $group2mean, $k[1], $cat{$m}); } if ($welch) { $t = abs(( $group1mean - $group2mean ) / sqrt(($group1var/$numcat[0] + $group2var/$numcat[1]))); $totaldf = int( ($group1var/$numcat[0] + $group2var/$numcat[1])**2 / (($group1var/$numcat[0])**2/($numcat[0]-1) + ($group2var/$numcat[1])**2/($numcat[1]-1)) ); } else { $pooledvar = ($group1ssq + $group2ssq)/($numcat[0] + $numcat[1] - 2); $t = abs(( $group1mean - $group2mean ) / sqrt(($pooledvar/$numcat[0] + $pooledvar/$numcat[1]))); } $prob = sprintf("%.5g", Stats::tprob($totaldf, $t)); } print "$label\t$t\t$prob\t$fold\n"; } close IN; ################### Subroutines ############################## sub calcSSQ { my ($data, $mean, $m, $cat) = @_; my $sum = 0; my $grandtotal = 0; $catsize = scalar @{$cat->{$m}}; $sumdev = 0; foreach $k ( @{$cat->{$m}} ) { $sumdev+=($$data[$k] - $mean)**2; } return $sumdev; } sub calcGroupMean { my ($data, $m, $cat) = @_; my $sum = 0; $catsize = scalar @{$cat->{$m}} ; foreach $k ( @{$cat->{$m}} ) { $sum+=$$data[$k]; } return $sum / $catsize; } sub calcGroupVar { my ($data, $mean, $m, $cat) = @_; my $sum = 0; my $grandtotal = 0; $catsize = scalar @{$cat->{$m}}; $sumdev = 0; foreach $k ( @{$cat->{$m}} ) { $sumdev+=($$data[$k] - $mean)**2; } return $sumdev / ($catsize - 1); } sub foldChange { my ($groupmean1, $groupmean2) = @_; my $fold; if ($group1mean == 0) { $fold = "-inf"; } elsif ($group2mean == 0) { $fold = "+inf"; } else { $fold = $group1mean / $group2mean; } return $fold; }