# datascan.pl 
#
# collects and outputs the data from a comp file generated by executions for Rasmus's problem, and organizes it
# in a latex table.
# Sept. 2014
# updated Feb. 5, 2015 to compute averages over the datasets examined.
#
# Usage: perl datascan.pl <compfile>
# Reads the comp file named by the first argument and writes the LaTeX
# report to a file with the same name plus the suffix "table".
die "Usage: $0 <compfile>\n" unless @ARGV && length($ARGV[0]);

# Three-argument open keeps characters such as '>' or '|' in the file name
# from being interpreted as an open mode.  Both opens are checked so that a
# bad path stops the run instead of silently producing a report with no data.
open(IN, '<', $ARGV[0])
    or die "Cannot open '$ARGV[0]' for reading: $!\n";
open(OUT, '>', "$ARGV[0]table")
    or die "Cannot open '$ARGV[0]table' for writing: $!\n";

# LaTeX emitted at the top of every table: a centered, nine-column tabular.
# Each piece is single-quoted so the backslashes reach the output untouched;
# the empty first and last items give the leading and trailing newlines.
$openinglines = join("\n",
    '',
    '\begin{table}',
    '\begin{center}',
    '\begin{tabular}{ccccccccc} ',
    '',
);

# LaTeX emitted at the bottom of every table, closing the environments
# opened by $openinglines in reverse order.
$closinglines = join("\n",
    '',
    '\end{tabular}',
    '\end{center}',
    '\end{table}',
    '',
);

# Start the LaTeX document, open the first table, and write its heading row
# followed by a rule.  The first column of the heading is left empty.
print OUT
    "\\documentclass[12pt]{article}\n",
    "\\begin{document}\n",
    $openinglines,
    "& c/p & hp & sites & HPD & PD/ub & case con & secs &  SNP-dist\\\\ \n",
    "\\hline \n";

# Row currently being assembled, as a string of LaTeX cells.
$outline = "";

# Per-page counter of unpermuted records, and the flag set when a
# "Best objective ... best bound ... gap" line has been seen.
$_ = 0 for ($count, $interrupt);

# Running sums over unpermuted records, plus the number of such records.
$_ = 0 for (
    $totalcases, $totalcontrols, $totaln, $totalm,
    $totalHPD, $totalPD, $totaltime, $totalHamming, $totalcount,
);

# Running sums over permuted records.
$_ = 0 for ($totalpHPD, $totalpPD, $totalptime, $totalpHamming);

# Running sums of best bound and gap, and how many such lines were seen.
$_ = 0 for ($totallb, $totalgap, $interruptcount);

# Main scan.  Each input line is tested against a set of patterns; a match
# appends one or more LaTeX cells to $outline and, depending on whether the
# current record is unpermuted ($correct == 1) or permuted ($correct == 0),
# adds the value to the matching running total.  A line containing "------"
# ends the record: the assembled row is written out and $outline is reset.
while ($line = <IN>) {

    # Record type: "c" rows are unpermuted, "p" rows are permuted.  A "p" row
    # starts on a new line and leaves the hp, sites and HPD columns empty.
    if ($line =~ /Using unpermuted/) {
        $outline .= "& c";
        $correct = 1;
    }

    if ($line =~ /Using a permutation/) {
        $outline .= "\n& p & & & ";
        $correct = 0;
    }

    # The parentheses are regex groups, not literal characters: $1 is the
    # whole "N, M" pair (printed as one cell), $2 and $3 are its two numbers.
    if ($line =~ /cases are ((\d+), (\d+))/) {
        $outline .= "& $1 ";
        if ($correct == 1) {
            $totalcases    += $2;
            $totalcontrols += $3;
        }
    }

    # Two numbers following "dimensions: <int>:".  The second one is kept in
    # $sites and used below to decide whether the row is printed.
    if ($line =~ /dimensions: \d+: (\d+) (\d+)/) {
        $outline .= "& $1 & $2 ";
        $sites = $2;
        if ($correct == 1) {
            $totaln += $1;
            $totalm += $2;
        }
    }

    if ($line =~ /SNPs: (\d+)/) {
        $outline .= "& $1 ";
        if ($correct == 1) {
            $totalHPD += $1;
        }
        else {
            $totalpHPD += $1;
        }
    }

    if ($line =~ /function is (\d+)/) {
        $outline .= "& $1 ";
        if ($correct == 1) {
            $totalPD += $1;
        }
        else {
            $totalpPD += $1;
        }
    }

    # Running time.  The decimal point is escaped and the fraction optional,
    # so "0.05", "7" and "123" are all captured as a number; an unescaped "."
    # would match any character and would miss one- and two-digit integers.
    if ($line =~ /found in (\d+(?:\.\d+)?)/) {
        $outline .= "& $1 ";
        if ($correct == 1) {
            $totaltime += $1;
        }
        else {
            $totalptime += $1;
        }
    }

    # First run of digits after the word "Hamming".
    if ($line =~ /Hamming.+?(\d+)/) {
        if ($correct == 1) {
            $outline .= "& $1 ";
            $totalHamming += $1;
        }
        if ($correct == 0) {
            $totalpHamming += $1;
            if ($interrupt) {
                # A bound/gap line was seen earlier: end the current row and
                # put the prepared "lb/gap" row, finished off with this
                # value, on the following line.
                $outline .= "\\\\ \n$secondline" . $1;
                $interrupt = 0;
            }
            else {
                $outline .= "& $1 ";
            }
        }
    }

    # Objective / bound / gap report.  The bound and gap are accumulated for
    # the "AVG Int." row and a partial "lb/gap" table row is prepared in
    # $secondline for the next permuted Hamming line to complete.
    if ($line =~ /Best objective (.+),.+best bound (.+),.+gap.+?(\d+\.\d\d)/) {
        $interrupt = 1;
        $interruptcount++;
        $bestobj   = int($1);
        $bestbound = int($2);
        $gap       = $3;
        $secondline = "& lb/gap &  &  &  & $bestbound, $gap\\% &  &  &  " ;
        $totallb  += $bestbound;
        $totalgap += $gap;
    }

    # End of record.
    if ($line =~ /------/) {

        if ($correct == 1) {
            $count++;
            $totalcount++;
        }

        # We only want to include datasets of sufficient size to be of
        # interest.  When the number of sites is <= 10, they are too small.
        # $sites changes only on a "dimensions" line, so a record without one
        # is filtered on the value left by the most recent record that had it.
        # NOTE(review): the running totals above are updated for every
        # record, including those not printed here - confirm that is intended.
        if ($sites > 10) {
            print OUT "$outline \\\\ \n";
            if ($correct == 0) {
                print OUT "\n\\hline \n";
            }
        }
        $outline = "";

        # After 20 unpermuted records, close the table, start a new page and
        # open a fresh table with the same heading row.
        if ($count == 20) {
            print OUT "$closinglines";
            print OUT "\\newpage\n";

            print OUT "$openinglines";
            print OUT "& c/p & hp & sites & HPD & PD/ub & case con & secs &  SNP-dist\\\\ \n" ;
            print OUT "\\hline \n";
            $count = 0;
        }
    }
}

# Summary rows and end of document.
#
# Every average, including those of the permuted-run totals, is taken over
# $totalcount, the number of unpermuted records that were completed.
# NOTE(review): this presumes one permuted record per unpermuted record -
# confirm against the comp file format.
print OUT " \\hline \\\\ \n";

if ($totalcount > 0) {
    $avgn        = $totaln / $totalcount;
    $avgm        = $totalm / $totalcount;
    $avgHPD      = $totalHPD / $totalcount;
    $avgpHPD     = $totalpHPD / $totalcount;    # computed but not shown in the table
    $avgPD       = $totalPD / $totalcount;
    $avgpPD      = $totalpPD / $totalcount;
    $avgcases    = $totalcases / $totalcount;
    $avgcontrols = $totalcontrols / $totalcount;
    $avgtime     = $totaltime / $totalcount;
    $avgptime    = $totalptime / $totalcount;
    $avgHamming  = $totalHamming / $totalcount;
    $avgpHamming = $totalpHamming / $totalcount;

    print OUT "&AVG c & $avgn & $avgm & $avgHPD & $avgPD & $avgcases, $avgcontrols & $avgtime &  $avgHamming \\\\ \n" ;
    print OUT "&AVG p &       &       &         & $avgpPD & $avgcases, $avgcontrols & $avgptime &  $avgpHamming \\\\ \n" ;
}
else {
    # No unpermuted record was completed, so there is nothing to average.
    # Dividing anyway would abort with "Illegal division by zero" and leave
    # the LaTeX file unterminated; skip the rows and finish the document.
    warn "datascan: no unpermuted records found; average rows omitted\n";
}

if ($interruptcount > 0) {
     $avglb = $totallb/$interruptcount;
     $avggap = $totalgap/$interruptcount;
     print OUT "&AVG Int. &       &       &         & $avglb, $avggap\\% &  & &  \\\\ \n" ;
}  # It is expected that these will only be obtained from permuted or non-causal genomic cases, but that is not
   # explicitly encoded.

print OUT "$closinglines";
print OUT "\\end{document}";

# Buffered write errors (for example a full disk) only surface when the
# handle is closed, so check it.
close(IN);
close(OUT) or die "Error writing '$ARGV[0]table': $!\n";
