#listcollectstats.pl
#
# This looks at all of the .sol files created by the execution of gurobi_cl, extracts the X values,
# and compares them to the real SNP values held in the file trueSNPs. It also collects the objective
# function value and the running times, and extracts the trial dimensions and the objective function
# value of the true SNPs from the file trueSNPs.
#
#December 2, 2014 modified to extract best bound in case the computation was interrupted
#June 14, 2014

use strict;
use warnings;

# Usage: perl listcollectstats.pl OUTFILE
#
# Inputs (all read from the current directory):
#   gurobi.log     the log written by gurobi_cl. It is read sequentially, one
#                  "Explored ..." entry per datalist entry, so the runs must
#                  appear in the log in the same order as in datalist.
#   datalist       the list of datafiles - we extract the extensions
#                  (<trial>.<index>) for use here.
#   trueSNPs       three lines per unpermuted trial: the dimensions, the true
#                  SNP vector, and the obj value wrt. the true SNPs. Created by
#                  multextract.pl, multextractrp.pl or multextractandrp.pl.
#                  In the case of multextractrp.pl, the third line also has the
#                  number of controls and cases.
#   <dataset>.sol  the variable values written when gurobi executes <dataset>.lp
#
# Outputs:
#   OUTFILE        the comparison results
#   trace          debugging trace

my $outfile = $ARGV[0]; # where to put the comps results
die "Usage: $0 OUTFILE\n" unless defined $outfile and length $outfile;

# Fail early with a clear message instead of silently reading from (or writing
# to) handles that never opened.
open(my $inlog, '<', 'gurobi.log') or die "Cannot open gurobi.log: $!\n";
open(my $in,    '<', 'datalist')   or die "Cannot open datalist: $!\n";
open(my $snp,   '<', 'trueSNPs')   or die "Cannot open trueSNPs: $!\n";
open(my $comps, '>', $outfile)     or die "Cannot write $outfile: $!\n";
open(my $trace, '>', 'trace')      or die "Cannot write trace: $!\n";

my @trueSNP = <$snp>;
close($snp);

my $k = 0; # a counter for the number of unpermuted datasets we process. We can't rely on the
           # trial number because these might not start at 0, and the datasets might not be
           # sorted correctly in order of trial number.

# Data of the most recent unpermuted trial. It is deliberately kept across
# iterations: the permuted datasets that follow are compared against it.
my ($dimensions, $objvalue, $numcontrols, $numcases) = ('', '', '', '');
my @trueM = ();

while (my $dataset = <$in>) {
    $dataset =~ s/\r?\n\z//;
    next if $dataset !~ /\S/;    # ignore blank lines in datalist

    my ($trialnum, $index) = parse_extension($dataset);
    die "Cannot find a <trial>.<index> extension in datalist entry '$dataset'\n"
        unless defined $index;

    if ($index == 0) { # the convention used is that index 0 is for the correct phenotypes
        print {$comps} "\nUsing unpermuted phenotypes for trial $trialnum\n";

        my $first = 3 * $k;
        warn "trueSNPs has no complete record number $k (needed for $dataset)\n"
            if $first + 2 > $#trueSNP;
        my @record = map { defined $trueSNP[$_] ? $trueSNP[$_] : '' } $first .. $first + 2;
        s/\r?\n\z// for @record;

        $dimensions = $record[0];
        @trueM      = split(//, $record[1]);

        # The counts are only present in files written by multextractrp.pl;
        # missing fields are reported as empty strings.
        my @fields = split(/ /, $record[2]);
        ($objvalue, $numcontrols, $numcases) =
            map { defined $fields[$_] ? $fields[$_] : '' } 0 .. 2;

        print {$trace} "XXX $objvalue, $numcontrols, $numcases\n";
        print {$comps} "problem dimensions: $dimensions \nThe objective function value from true SNPs: $objvalue\n";
        $k++;
    }
    else {
        print {$comps} "Using a permutation of the phenotypes for trial $trialnum\n";
    }

    # this is the file that the variable values are written to when gurobi executes $dataset.lp
    my $results = "$dataset.sol";
    my ($obj_raw, @X) = read_solution($results, scalar @trueM);

    my $obj;
    if (defined $obj_raw) {
        $obj = round_to_int($obj_raw);
    }
    else {
        # Previously a stale regex capture was reported here as if it were the objective.
        warn "No objective value found in $results\n";
        $obj_raw = $obj = 'unavailable';
    }
    print {$trace} "The obj. sol value is $obj_raw\n";
    print {$comps} "The optimal value for the objective function is $obj \n";
    print {$comps} "The numbers of controls and cases are $numcontrols, $numcases\n";

    # Advance the log to this run's "Explored ..." line, which lists the time used.
    # Stop at end of file: readline keeps returning undef there, which used to
    # make this loop spin forever.
    my $line = <$inlog>;
    $line = <$inlog> while defined $line and $line !~ /Explored/;
    die "gurobi.log ended before an 'Explored' line was found for $dataset\n"
        unless defined $line;

    my $timeused = $line =~ /(\d+(?:\.\d+)? seconds)/ ? $1 : 'an unknown time';
    print {$comps} "The solution was found in $timeused\n";

    # The termination status is taken from the third line after "Explored".
    # NOTE(review): this fixed offset matches the log layout the script was
    # written for; confirm it against the Gurobi version in use.
    for (1 .. 3) {
        $line = <$inlog>;
    }
    # For an early stop, the line following the status line is copied to the
    # report (per the header note, this is where the best bound is found).
    for my $case ([qr/interrupted/, "The computation was Interrupted\n"],
                  [qr/Time limit/,  "The Time Limit was reached\n"]) {
        my ($pattern, $message) = @{$case};
        next unless defined $line and $line =~ $pattern;
        $line = <$inlog>;
        print {$comps} $message;
        print {$comps} $line if defined $line;
    }

    print {$comps} "The true SNPs and the SNPs assigned in this solution (a * can be either 0 or 1)\n";
    print {$comps} "@trueM\n";
    print {$comps} "@X\n";

    my ($hamdist, $true0, $true1, $snp0, $snp1, $snpstar) = snp_statistics(\@trueM, \@X);

    print {$comps} "The Hamming distance between SNP vectors is:  $hamdist\n";
    print {$comps} "The number of 0s and 1s in the true SNP vectors is:  $true0, $true1\n";
    print {$comps} "The number of 0s and 1s  and *s in the computed SNP vectors is:  $snp0, $snp1, $snpstar\n";

    print {$comps} "----------------------\n";
}

close($in);
close($inlog);
close($trace) or die "Error while closing trace: $!\n";
close($comps) or die "Error while closing $outfile: $!\n";

# Extract (trial number, index) from a datalist entry such as "name.12.0".
# A literal dot between the two numbers is preferred; if there is none we fall
# back to the historical pattern, which accepted any single separator character.
# Returns an empty list when the entry contains no such extension.
sub parse_extension {
    my ($name) = @_;
    return ($1, $2) if $name =~ /(\d+)\.(\d+)/;
    return ($1, $2) if $name =~ /(\d+).(\d+)/;
    return;
}

# Round a number to the nearest integer. For non-negative input this is the
# int(0.5 + x) the script has always used; negative input rounds symmetrically.
sub round_to_int {
    my ($value) = @_;
    return $value >= 0 ? int($value + 0.5) : -int(0.5 - $value);
}

# Convert the textual value of an X variable to 0 or 1 by rounding, so that
# "1", "0.9999999" and "9.99e-01" all give 1 and "0", "-0" and "1e-10" give 0.
# Returns undef when the text is not a number.
sub x_to_bit {
    my ($text) = @_;
    return unless $text =~ /\A[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?\z/;
    return $text >= 0.5 ? 1 : 0;
}

# Read a gurobi .sol file.
#   $path    the .sol file to read
#   $length  the length of the true SNP vector
# Returns ($objective_text, @X). $objective_text is undef when the file cannot
# be read or holds no objective value. @X has at least $length entries and is
# initially filled with *s because if a column in the data contains only 0s,
# there will be no corresponding X variable for that column. Hence in the opt.
# solution that X value can be either 0 or 1, and it is ignored when computing
# the Hamming distance between the SNP vectors.
sub read_solution {
    my ($path, $length) = @_;
    my @x = ('*') x $length;
    my $objective;

    my $fh;
    unless (open($fh, '<', $path)) {
        warn "Cannot open $path: $!\n";
        return ($objective, @x);
    }

    my $lineno = 0;
    while (my $line = <$fh>) {
        $lineno++;
        # The objective value is given on the first line, or on a later
        # "# Objective ..." comment line; these lines never hold an X value.
        if ($lineno == 1 or $line =~ /^\s*#/) {
            if (!defined $objective
                and ($lineno == 1 or $line =~ /Objective/)
                and $line =~ /=\s*(-?\d+(?:\.\d*)?(?:[eE][-+]?\d+)?)/) {
                $objective = $1;
            }
            next;
        }
        if ($line =~ /^\s*X(\d+)\s+(\S+)/) {
            my ($position, $bit) = ($1, x_to_bit($2));
            $x[$position] = $bit if defined $bit;
        }
    }
    close($fh);

    # An X index beyond $length extends the vector; show the gaps as unknown.
    for my $entry (@x) {
        $entry = '*' unless defined $entry;
    }
    return ($objective, @x);
}

# Compare the true SNP vector with the computed one over the positions of the
# true vector. Both arguments are array references.
# Returns ($hamdist, $true0, $true1, $snp0, $snp1, $snpstar): the Hamming
# distance (positions holding a * are not counted), the number of 0s and 1s in
# the true vector, and the number of 0s, 1s and *s in the computed vector.
sub snp_statistics {
    my ($true_ref, $x_ref) = @_;
    my ($hamdist, $true0, $true1, $snp0, $snp1, $snpstar) = (0) x 6;

    for my $i (0 .. $#{$true_ref}) {
        my ($t, $x) = ($true_ref->[$i], $x_ref->[$i]);

        if ($x eq '*') {
            $snpstar++;
        }
        else {
            $hamdist++ if $t != $x;
            if ($x == 1) { $snp1++ } else { $snp0++ }
        }

        if ($t == 1) { $true1++ } else { $true0++ }
    }
    return ($hamdist, $true0, $true1, $snp0, $snp1, $snpstar);
}
