#!/usr/local/bin/perl # Paul Pavlidis use Stats; $usage = "patternmatch \n\t[-a: use absolute value of correl;\n\t -r: rdb formatl line is present;\n\t -d: negative correlations not set to 1.0\n\t -c: pattern is a 'classification' file]\n\t \n"; my $GNUSORT = "/usr/local/bin/sort"; my $correct = 1.0; while ($ARGV[0] =~ /^-/) { $opt = shift @ARGV; if ($opt eq "-d") { $correct = 0; print STDERR "Negative correlations will not be given p values of 1.0 by default\n"; } elsif ($opt eq "-r") { $rdb++; } elsif ($opt eq "-a") { $useabs++; print STDERR "Taking the absolute value of the correlation\n"; } elsif ($opt eq "-c") { $classfile = 1; } else { die "Illegal option\n"; } } die $usage if @ARGV < 2; ($pattern, $data) = @ARGV; ($dataname = $pattern) =~ s/.+?\/(.+)?\.pat$/$1/; $dataname =~ s/.+\///; open (IN, "<$pattern") or die "Could not open $pattern:$!\n"; if ($classfile) { ; if ($rdb) { ; } while () { chomp; s/\cM//; ($sample, $val) = split "\t", $_; push @patt, $val; } } else { $patternline = ; close IN; if ($patternline =~ /^\s$/) { die "No pattern found!\n"; } chomp $patternline; @patt = split /\s+/, $patternline; } close IN; ($dataname2 = $data) =~ s/.+\/(.+)$/$1/; $dataname2 =~ s/\.txt$//; $outputname = "${dataname}-${dataname2}-patt-correlpvals.txt"; print STDERR "Output to $outputname\n"; # precalc as much as possible my $sumx = 0; my $sumxs = 0; my $m = 0; my %tobeskipped; my @pattcopy; foreach $i (@patt) { if ($i eq "-") { $tobeskipped{$m}++; } elsif ($i !~ /[0-9]/) { die "Illegal pattern symbol found ($i)\n"; } else { push @pattcopy, $i; $sumx+=$i; $sumxs+=$i*$i; } $m++; } my $meanx; if ($centered) { $meanx = 0; } else { $meanx = $sumx / $m; } open (DATA, "<$data") or die "Could not open data file $data\n";; $head = ; if ($rdb) { ; print STDERR "Removing extra rdb format line\n"; } else { print STDERR "Assuming this is not rdb format\n"; } while () { chomp; s/\cM//; if (!$_) { print STDERR "Skipping blank line\n"; next; } ($gene, @dat) = split /\t/, $_; my @datacopy; if (scalar keys %tobeskipped) { # remove the unwanted data. for ($i=0; $i<@dat; $i++) { if (!$tobeskipped{$i}) { push @datacopy, $dat[$i]; } } } else { @datacopy = @dat; } $n = scalar @datacopy; if (scalar @pattcopy != $n) { die "Mismatch in pattern size compared to data length ($n).\n"; } # measure the correlation coefficient with the pattern if ($centered) { $meany = 0; } else { $meany = mean(\@datacopy); } $correl = correlation_op($sumx, $sumxs, $meanx, \@pattcopy, $meany, \@datacopy, 3); if ($useabs) { $correl = abs($correl); } $correls{$gene} = $correl; if ($correl < 0 && $correct) { $pvals{$gene} = 1.0; } elsif ($correl == 1.0) { $pvals{$gene} = 0.0; } else { $t = $correl/sqrt((1-$correl*$correl)/($n-2)); $pvals{$gene} = sprintf ("%.8g", (tprob($n-2, $t))); } } close DATA; open(OUT, "| $GNUSORT -gk3 > $outputname"); print OUT "Gene\tCorrel\tPvalue\n"; foreach $k (keys %correls) { print OUT "$k\t$correls{$k}\t$pvals{$k}\n"; } close OUT;