
# Dec. 23, 2014
#
# multextract_for_gwas.pl is adapted from multextractandps.pl. It reads a file containing multiple datasets
# produced by GWAparser2 and writes each dataset to a separate file, in the form that my ILP-generating
# programs can use. However, it does not create files where the phenotypes are permuted
# (the $permphen switch that would generate them is set to 0).
#

use diagnostics;
# Command line:  <script> <infile> [<first trial number>]
#   $ARGV[0]  input file holding the datasets; it is also used as the prefix
#             of every per-dataset output file name.
#   $ARGV[1]  number given to the first dataset; each later dataset gets the
#             next number. Defaults to 0 when omitted.
die "Usage: $0 <infile> [<first trial number>]\n" unless defined $ARGV[0];

$infile = "$ARGV[0]"; # a binary haplotype file from Rasmus's simulation.

$permphen = 0; # this turns on (1) or off (0) the creation of the permuted dataset.

# Three-argument open keeps the file name from being parsed as a mode, and
# every open is checked so a bad path stops the run with a clear message
# instead of silently producing empty output files.
open(IN, '<', $infile)
    or die "Cannot open input file '$infile': $!\n";
open(OUTD, '>', 'datalist')      # list of the files for the extracted data
    or die "Cannot open 'datalist' for writing: $!\n";
#open (STAT, '>>stats');
open(OUTSNP, '>', 'trueSNPs')    # per-dataset true-SNP line and summary counts
    or die "Cannot open 'trueSNPs' for writing: $!\n";

$trialnumberstart = defined $ARGV[1] ? $ARGV[1] : 0;
$trialnumber = $trialnumberstart;

# Running totals over all datasets (averaged after the main loop) and the
# per-dataset phenotype counters (reset by the main loop after each dataset).
$totalp1 = $totalp2 = $count = 0;
$num0targets = $num1targets = 0;

# Main loop: each iteration consumes one dataset from IN and writes it to
# "<infile><trialnumber>.0" (and, when $permphen == 1, a second file
# "<infile><trialnumber>.1" whose phenotypes are randomly permuted).
#
# Dataset layout, as consumed below:
#   header line : "<col> <numpairs>"  (echoed to the output as "<numpairs> <col> ")
#   next line   : the true-SNP string, split into one character per position
#   then, per individual, two lines:
#                 "<haplotype 1> <phenotype>"   and   "<haplotype 2>"
#   a blank line, or end of file, terminates the dataset.
while (my $header = <IN>) {
    my $outfile = "$ARGV[0]$trialnumber.0";
    # where the converted binary data is output in a form that my lp
    # generating programs can understand
    open(OUT, '>', $outfile)
        or die "Cannot open '$outfile' for writing: $!\n";
    print OUTD "$outfile\n";
    print "trial$trialnumber.0\n";
    print OUT "trial$trialnumber.0\n";

    chomp($header);
    my ($col, $numpairs) = split(/ /, $header);
    print OUT "$numpairs $col \n";
    print OUTSNP "$trialnumber: $numpairs $col \n";

    # The true-SNP line is copied verbatim (with its newline) to OUTSNP;
    # a chomped copy is split so @bin holds only the per-position characters.
    my $correctSNPs = <IN>;
    print OUTSNP "$correctSNPs";
    my $snpmask = $correctSNPs;
    chomp($snpmask);
    my @bin = split(//, $snpmask);

    # Per-dataset state. Declared here so nothing (in particular @targets)
    # leaks from one dataset into the next.
    my $obj = 0;                                   # individuals where $W != phenotype
    my ($real1, $real0, $convert1, $convert0) = (0, 0, 0, 0);
    my @accumulated;                               # raw haplotype strings, two per individual
    my @targets;                                   # phenotype (0/1) per individual

    # Read individuals until a blank line or end of file. IN is left open at
    # end of file so the outer loop condition can see EOF cleanly.
    while (defined(my $line1 = <IN>)) {
#       print "$line1";
        chomp($line1);
        last if $line1 eq "";   # the end of a dataset has been reached

        my ($hap1, $target) = split(/ /, $line1);

        # Dec. 11, GWAparser2.c now codes the phenotypes as 0 for controls
        # and 1 for cases, just as I do. Anything non-zero is treated as 1.
        if ($target == 0) {
            $target = 0;
            $num0targets++;
        }
        else {
            $target = 1;
            $num1targets++;
        }

        my $line2 = <IN>;
        chomp($line2);
        my @data1 = split(//, $hap1);
        my @data2 = split(//, $line2);

        # compute the objective value for the true SNPs: $W is 1 only when
        # BOTH haplotypes carry at least one of the true SNPs.
        my ($sum1, $sum2) = (0, 0);
        foreach my $k (0 .. $col - 1) {
            $sum1 += $bin[$k] * $data1[$k];
            $sum2 += $bin[$k] * $data2[$k];
        }
        my $W = (($sum1 > 0) && ($sum2 > 0)) ? 1 : 0;
        if ($W == 1) {
            $real1++;
        }
        else {
            $real0++;
        }

        if ($W != $target) {
            $obj++;
            if ($W == 1) {
                $convert1++;    # true SNPs say 1 (case) but the phenotype is 0 (control)
            }
            else {
                $convert0++;    # true SNPs say 0 (control) but the phenotype is 1 (case)
            }
        }

        print OUT "@data1\n";
        print OUT "@data2\n";
#       print OUT "$hap1\n";
#       print OUT "$line2\n";
        print OUT "$target\n";

        push @accumulated, $hap1, $line2;
        push @targets, $target;
    }

    print OUTSNP "$obj ";
    print OUTSNP "$num0targets $num1targets\n";
    $num0targets = $num1targets = 0;
    close(OUT);

    # now we generate data with permuted phenotypes.
    if ($permphen == 1) {
        my $permfile = "$ARGV[0]$trialnumber.1";
        print OUTD "$permfile\n";
        open(OUT, '>', $permfile)
            or die "Cannot open '$permfile' for writing: $!\n";

        # Echo the haplotypes and the unpermuted phenotypes to STDOUT.
        for (my $ii = 0; $ii < $numpairs * 2; $ii++) {
            my @chars = split(//, $accumulated[$ii]);
            print "@chars\n";
        }
        print "@targets\n";

        # Permute the targets for the haplotype pairs with a Fisher-Yates
        # shuffle: position $ri is swapped with a uniformly chosen position
        # in 0..$ri, so every index (including the last) can be selected.
        for (my $ri = $numpairs - 1; $ri > 0; $ri--) {
            my $rj = int(rand($ri + 1));
            @targets[$ri, $rj] = @targets[$rj, $ri];
        }
        print "@targets\n";

        print OUT "trial$trialnumber.1\n";
        print OUT "$numpairs $col \n";
        for (my $ii = 0; $ii < $numpairs; $ii++) {
            my @hapchars1 = split(//, $accumulated[$ii * 2]);
            my @hapchars2 = split(//, $accumulated[($ii * 2) + 1]);
            print OUT "@hapchars1\n";
            print OUT "@hapchars2\n";
            print OUT "$targets[$ii]\n";
        }
        close(OUT);
    } # end of if permphen

    $trialnumber++;
#   print OUTSNP "$obj\n";

    # Fraction of $W == 1 (resp. $W == 0) individuals whose phenotype
    # disagrees with $W; defined as 0 when the class is empty.
    my $p1 = ($real1 > 0) ? $convert1 / $real1 : 0;
    my $p2 = ($real0 > 0) ? $convert0 / $real0 : 0;
    print "$p1, $p2 \n";
    $totalp1 += $p1;
    $totalp2 += $p2;
    $count++;
}
close(IN);

# Average the per-dataset mismatch fractions over all datasets processed.
# When no dataset was read ($count is 0) the averages are reported as 0
# rather than dying with "Illegal division by zero".
if ($count > 0) {
    $averagep1 = $totalp1 / $count;
    $averagep2 = $totalp2 / $count;
}
else {
    $averagep1 = $averagep2 = 0;
}

print "$averagep1,  $averagep2 \n";
#close (OUT);
# Buffered write errors only surface at close, so report a failed close.
close(OUTD)   or warn "Error closing 'datalist': $!\n";
close(OUTSNP) or warn "Error closing 'trueSNPs': $!\n";

