#!/pkg/bin/perl -w
# Written  by Dan Gusfield  copyright 2006
# # Permission to use this program comes with no guarantees. You may use it if you don't laugh at it and
# # don't expect the author to understand or explain the code in the future.
#
#use diagnostics
# March 15, 2006 changed the description below to better document what this program does.
#
# minthap.pl
# This takes in a list of genotypes and generates the integer
# program whose solution resolves all the genotypes using the minimum
# number of distinct haplotypes. It counts the input haplotypes (homozygotes) whether they
# are used in any *other* solutions (genotype explanation) or not, hence it essentially minimizes the
# number of additional haplotypes created. That is, since the homozygotes will be paid for
# in the objective function, there is an incentive to reuse them in the explainations of the
# ambiguous genotypes, and would therefore produce better solutions that mindhap.pl which does not
# count the homozygotes unless they are used in some other solution. This version also reduces the
# number of variables used by not generating a variable for a haplotype
# unless it is used in more than one pair, or it is part of a pair where
# the other haplotype is used more than once.  Further, a pair variable
# is only created if both of the haplotype variables are created as above.
# 
# One caution: The program implements the RTIP formulation (see the published LNCS paper), which means
# that certain haplotypes can be ignored, and (less commonly) a genotype will be ignored if all the
# possible phasings of that genotpe involve ignored haplotypes. Such a genotype can be phased arbitrarilly
# in an optimal solution. The objective function only reflects the non-ignored genotypes,
# and therefore it may have a lower value than the true total number of haplotypes used. One has to examine the result of
# the ILP solution to see if any input genotypes were ignored, and then add two to the objective function
# for each such ignored genotype. As a convenience, I may add this adjustment into a future version of the program.
#
# Also, the LNCS paper describes a faster way to generate the ILP (that never generates haplotypes that will later be
# ignored), but that is not implemented in this program. The speed of the ILP solution is what was of interest, not
# the speed of generating the ILP formulation itself.
#dg
#5/24/01
#

open YLIST, ">ylist";
%index = ();
%haps = ();
@pairconsts = ();
$hapcount = $tot = $numgens = 0;
$input = $ARGV[0];
open IN, "$input";
$line1 = <IN>;
$line2 = <IN>;
@gens = <IN>; # this is the genX file.
 foreach $line (@gens) {
 chomp $line;
 $line =~ tr/ //d;
 if ( not ($line =~ /^[01 ]+$/)) { #1
 # print "$line";
 $numtwos = $line =~ tr/2/2/;

 # print "$numtwos \n";
 $ptwo = 2 ** $numtwos;
 $tot += $ptwo;
 $numgens++;
 } #1
 else { #1
 if (! defined $index{$line}) {
   $haps{$line} = 1; # hash each haplotype in the sacred set and give it a count
                     # of one.
    $const = "Y$hapcount = 1";
    push (@pairconsts, $const);
    print YLIST "$hapcount: $line\n";
    $index{$line} = $hapcount++;
 }
 } #1
} 

print "There are $numgens ambiguous genotype lines (identities not removed)\n";
print "The total number of potential haplotypes from ambiguous genotypes is $tot\n";
$numpairs = $tot/2;
print "The total number of such haplotype pairs is $numpairs\n";
print "The number of distinct homozygotes in the input (sacred set) is $hapcount\n";

@covers = ();
open OUT, ">$ARGV[1]";

($ext, $rep) = ($line1 =~ /(\d+)/g);
open TRAN, ">transfilet$ext.$rep";
print TRAN "ext, rep: $line1";
print TRAN "Each X-type LP variable is associated with a genotype vector in the input,
and two haplotype vectors whose superposition explains that vector.
The following gives first the X-LP variable subscript i, and the position in
the input (gen file) list where the associated genotype vector is. This is also
the position of the ID of that genotype in the ID list that is found at the
end of the hap(p)sav input file.  Then the genotype vector is given, followed
by two haplotype vectors that can explain it.\n\n";

foreach $line (@gens) {  #0
 if ( not ($line =~ /^[01 ]+$/)) { #1  Generate all the haplotypes consistent
                                  # with that genotype and hash the haplotypes
                                  # and count the number of times each occurs.

$numtwos = $line =~ tr/2/2/;
# print "$numtwos\n";
$xcount = 0;

$oline = $line;
@lines = $line;
@clines = $line;

for ($i = 0; $i < $numtwos; $i++) { #2
  @savlines = @lines;
  foreach $line (@lines) { #3
  $line =~ s/2/0/;
  } #3
  foreach $line (@savlines) { #3
  $line =~ s/2/1/;
  } #3
  push (@lines, @savlines);
} #2


foreach $line (@lines) {   # hash the haplotypes generated and update the count.
  if (defined $haps{$line}) {
    $haps{$line} += 1;
  }
  else {
#    print "haplotype $line is new\n";
    $haps{$line} = 1;
  }
# print "$line, XX $haps{$line} \n";
}

} # 1
} # 0


# Now go back over each genotype and generate the haplotype pairs and
# check whether the two haplotypes only occur in that genotype. If so
# do not give an index to those two haplotypes, and do not create
# a variable for that pair. But if the pair should be generated etc.
# create the index.

$paircount = 0;
$positionnumber = 0; # positionnumber is the place in the list of genotypes
                     # where this pair is derived from. The need for this is
                     # due to the fact that a genotype will be skipped
                     # if no pairs are generated for it.

foreach $line (@gens) {  #0
$pairsforthisgen = 0;
 if ( not ($line =~ /^[01 ]+$/)) { #1

$numtwos = $line =~ tr/2/2/;

$oline = $line;
@lines = $line;
@clines = $line;

for ($i = 0; $i < $numtwos; $i++) { #2
  @savlines = @lines;
  foreach $line (@lines) { #3
  $line =~ s/2/0/;
  } #3
  foreach $line (@savlines) { #3
  $line =~ s/2/1/;
  } #3
  push (@lines, @savlines);
} #2

for ($i = 0; $i < $numtwos; $i++) { #2
  @savlines = @clines;
  foreach $line (@clines) { #3
  $line =~ s/2/1/;
  } #3
  foreach $line (@savlines) { #3
  $line =~ s/2/0/;
  } #3
  push (@clines, @savlines);
} #2

$cover = "";
$ptwos = 2**$numtwos;
foreach $i (0..$ptwos-1) { #2

  $hap1 = $lines[$i];
  $hap2 = $clines[$i];
# print "\n$hap1\n$hap2 \n";

if ($hap1 lt $hap2) { #3
# print "hap1 is lt hap2\n";
# $stopit = <STDIN>;
  if (($haps{$hap1} > 1) || ($haps{$hap2} > 1)) { #4
    if (! defined $index{$hap1}) {
      print YLIST "$hapcount: $hap1\n";
      $index{$hap1} = $hapcount++;
    }
    if (! defined $index{$hap2}) {
      print YLIST "$hapcount: $hap2\n";
      $index{$hap2} = $hapcount++;
    }
# print "indices: $index{$hap1}, $index{$hap2}\n";

#print OUT "$paircount:\n";
#print OUT "$oline\n";
#print OUT "$hap1\n";
#print OUT "$hap2\n";
print TRAN "$paircount: $positionnumber\n";
#print TRAN "$paircount:\n";
$oline =~ tr/ //d;
print TRAN "$oline\n";  # print the genotype line
print TRAN "$hap1\n";
print TRAN "$hap2\n";

   $cover .= '+X' . $paircount; # generate a variable for the pair.

   $xcount++;
   if (0 == $xcount % 8) {  # new May 22
    $cover .= "\n";
   }

   foreach $hap ($hap1, $hap2) { #5
 #    print "paircount $paircount\n";  # debugging statement

     $pairconst = "X$paircount - Y$index{$hap} <= 0";
 #    print OUT "$pairconst \n";
     push (@pairconsts, $pairconst);
   } #5
  $paircount++;
  $pairsforthisgen++;
 } #4
} #3
} #2
# print OUT "End of case: $pairsforthisgen pairs produced. \n";
if ($pairsforthisgen > 0) {
   $numgensinlp++;
 #  print OUT "$cover = 1\n";
   # $stopit = <STDIN>;
   push (@covers, "$cover = 1");
}
} #1
$positionnumber++;
} #0

 print OUT "The number of distinct haplotypes is $hapcount. \n";
 print OUT "The number of haplotype pairs is $paircount. \n";
 print OUT "The number of ambiguous genotypes is $numgens\n";
 print OUT "The number that show up in the IP is $numgensinlp\n";

 print OUT "minimize \n";
 foreach $i (0...$hapcount -1) {
   print OUT "+Y$i "; 
   if (0 == $i % 8) {
     print OUT "\n";
   }
 }

 print OUT "\n such that \n";

# foreach $i (0...$hapcount -1) {
#   print OUT "+Y$i "; 
#   if (0 == $i % 8) {
#     print OUT "\n";
#   }
# }
#
# print OUT "\n";
#
# foreach $i (0...$paircount-1) {
#   print OUT "+X$i "; 
#   if (0 == $i % 8) {
#     print OUT "\n";
#   }
# }
# print OUT "= 159\n";

 foreach $cover (@covers) {
   print OUT "$cover \n";
 }

 foreach $pairconst (@pairconsts) {
   print OUT "$pairconst \n";
 }
 print OUT "binaries \n";
 foreach $i (0...$paircount -1) {
  print OUT "X$i\n";
 }
 foreach $i (0...$hapcount -1) {
  print OUT "Y$i\n";
 }
 print OUT "end";
