#!/usr/local/bin/perl
# Paul Pavlidis. For one way analysis of variance with equal numbers of replicates in each group.
use Stats;
$usage = "anova-oneway [-l: log transform; -v: verbose output; -r: format line needs to be removed; -c: use class file instead of layout] <datafile> <layoutfile>\n";
die $usage unless @ARGV > 1;

while ($ARGV[0] =~ /^-/) {
  $opt = shift @ARGV;
  if ($opt eq "-r") {
    $rdb++;
  } elsif ($opt eq "-l") {
    $log++;
  } elsif ($opt eq "-v") {
    $verbose++;
  } elsif ($opt eq "-c") {
    $classfile++;
  } else {
    die "Illegal option\n";
  }
}


($data, $layout) = @ARGV;
my $numcategories = 0;
if ($classfile) {
  $numcategories = readclass($layout, \%cat);
} else {
  $numcategories = readlayout($layout, \%cat);
}

if (0) {
  open (IN, "<$layout") or die "Couldn't open layout $layout\n";
# = category title
# % category name
  my (@numcats);
  $/= "=";
  <IN>;
  
  my $catnum = 0;
  $firstcat = 1;
  while (<IN>) {
    chomp;
    s/\cM//;
  ($category, @dat) = split /\n/, $_;
    $category =~ s/=//;
    print STDERR "$category =";
    push @maincats, $category;
    $incat=0;
    
    foreach $m (@dat) {
      if($m=~/\%(.+)/) { # start a new category.
	$incat=1;
	$catnum++;
	$catname = $1;
	print STDERR "\n$catname:\t";
	if (!$firstcat) {
	}
	next;
      }
      if ($incat) { # subcategories.
	push @{$cat{$category}->{$catname}}, $m;
	$numcat[$catnum]++;
	print STDERR "$m\t";
	if ($firstcat) {
	  $n++;
	}
      }
    }
    $firstcat = 0;
    print STDERR "\n";
  }

  shift @numcat;
  $numcategories = scalar @numcat;
  print STDERR "N=$n. Numcats=$numcategories. Number in each category:";
  foreach $m (@numcat) {
    print STDERR " $m";
  }
  print STDERR "\n";
  close IN;
  
# construct the table of replicates. This is a bit tricky.
  foreach $m (keys %cat) {
    foreach $k (keys %{$cat{$m}}) {
      foreach $q (@{$cat{$m}->{$k}}) {
	push @{$numwithcats{$q}}, $k; # associate categories with this trial.
      }
    }
  }
  
  
  foreach $k (sort keys %numwithcats) {
    $cat = join " ", @{$numwithcats{$k}};
    push @{$replicate{$cat}}, $k; # reverse the hash...
  }

# check
  foreach $k (sort keys %replicate) {
    foreach $m (@{$replicate{$k}}) {
      print STDERR "$m $k\n";
    }
  } 
} #skip

if ($numcategories == 0) {
  die "No categories found\n";
}

# whsehw.
open (IN, "<$data") or die "couldn't open data\n";

$/="\n";
my $header = <IN>; # remove header
if ($rdb) {
  <IN>; # if rdb
  print STDERR "Removed format line\n";
} else {
  print STDERR "Assuming this is NOT rdb format!\n";
}

my $n = -1 + scalar split "\t", $header;
my $totaldf = $n-2;
#$totaldf = $n-1;
$groupsdf = $numcategories - 1;
$errordf = $totaldf - $groupsdf;
print STDERR "DF: Total $totaldf  Groups: $groupsdf Error: $errordf\n";
#print "label\ttotalssq\ttotaldf\tcellssq\tcelldf\tffssq\tffdf\tsfssq\tsfdf\tffxsfssq\tffxsfdf\terrorssq\terrordf\tfcell\tfff\tsff\tfxsf\n";
#print "label\tffdf\tsfdf\tffxsfdf\terrordf\tfff\tsff\tfxsf\n";
if ($verbose) {
  print "label\ttotalssq\ttotaldf\tgroupssq\tgroupsdf\terrorssq\terrordf\tf\tp\n";
} else {
  print "label\tf\tp\n";
}

while (<IN>) {
  chomp;
  s/\cM//;
  if (!$_) {
    print STDERR "Skipping blank line\n";
    next;
  }

  ($label, @data) = split /\t/, $_;

  if (scalar @data == 0) {
    print STDERR "$label: Skipping because it has no data\n";
    next;
  }

  if (scalar @data != $n) {
    $numfound = scalar @data;
    print STDERR "$label: Skipping because it is missing data (expect $n, found $numfound\n";
    next;
  }

  if ($log) {
    for ($i=0; $i<scalar @data; $i++) {
      if ($data[$i] <= 0) {
	die "You can't use the '-l' option when there are values less than or equal to zero in the data. (at $label)\n";
      }
      $data[$i] = log($data[$i]);
    }
  }

  # do anova on it.

  #calculate C;
  $C = calcC(\@data);
  if ($C == 0) {
    if ($verbose) {
      print STDERR "$label: Warning: Skipping because the data values are all equal\n";
    }
    print "$label\t0\t1.0\n";
    next;
  }
  
  # total SS
  $totalssq = calcTotalSSQ(\@data, $C);
  
  # groups SS
  $groupssq = calcGroupSSQ(\@data, $C, \%cat);
  $groupsmsq = $groupssq / $groupsdf;

  # error
  $errorssq = $totalssq - $groupssq;
  $errormsq = $errorssq/$errordf;

  # F
  $f = $groupsmsq/$errormsq;
  
  if ($f < 0) {
    print STDERR "Warning: negative value found for f=$f (probably roundoff error, using 0.0)\n";
    $f = 0.0;
  }

  $p = sprintf("%.6g", fprob($f, $groupsdf, $errordf));

  if ($verbose) {
    print "$label\t$totalssq\t$totaldf\t$groupssq\t$groupsdf\t$errorssq\t$errordf\t$f\t$p\n";
  } else {
    print "$label\t$f\t$p\n";
  }
}


##################################
# Subs
##################################
sub calcC {
  my ($data) = @_;
  my $n=0;
  my $sum = 0;
  foreach $m (@$data) {
    $sum+=$m;
    $n++;
  }
  die "Hey!!!! No zero divides, please.\n" if $n == 0;
  return ($sum * $sum) / $n;
}

sub calcTotalSSQ {
  my ($data, $c) = @_;
  my $sum = 0;
  foreach $m (@$data) {
    $sum+=$m*$m;
  }
  $sum-=$c;
  return $sum;
}

sub calcGroupSSQ {
  my ($data, $c, $cat) = @_;
  my $sum = 0;
  my $grandtotal = 0;
  foreach $m (sort keys %$cat) { # for each cell (only one category here )
    foreach $k (sort keys %{$$cat{$m}}) { # for each subcategory;
      $sum=0;
      $catsize = scalar @{$cat{$m}{$k}};
      foreach $m ( @{$cat{$m}{$k}} ) {
	$sum+=$$data[$m];
      }
      $grandtotal+=($sum*$sum)/$catsize;
    }
  }
  $grandtotal-=$c;
  return $grandtotal;
}

##--
