#!/pkg/bin/perl -w # Written by Dan Gusfield copyright 2006 # Permission to use this program comes with no guarantees. You may use it if you don't laugh at it and # don't expect the author to understand or explain the code in the future. # #use diagnostics # March 15, 2006 making small changes to better illustrate how this program differs from minthap. # # mindhap.pl # this converts minthap.pl to mindhap.pl by commenting out the lines marked DG from minthap # This takes in a list of genotypes and generates the integer # program whose solution resolves all the genotypes using the minimum # number of distinct haplotypes. It only counts the input haplotypes (homozygotes) if # they are used in some other solution (explanation of a genotype), hence it minimizes the # number of haplotypes used to explain the ambiguous genotypes, without consideration of the # known haplotypes. So in fact, if we remove the input haplotypes, we should get the same solution. # This version also reduces the # number of variables used by not generating a variable for a haplotype # unless it is used in more than one pair, or it is part of a pair where # the other haplotype is used more than once. Further, a pair variable # is only created if both of the haplotype variables are created as above. # One caution: The program implements the RTIP formulation (see the published LNCS paper), which means # that certain haplotypes can be ignored, and (less commonly) a genotype will be ignored if all the # possible phasings of that genotpe involve ignored haplotypes. Such a genotype can be phased arbitrarilly # in an optimal solution. The objective function only reflects the non-ignored genotypes, # and therefore it may have a lower value than the true total number of haplotypes used. One has to examine the result of # the ILP solution to see if any input genotypes were ignored, and then add two to the objective function # for each such ignored genotype. As a convenience, I may add this adjustment into a future version of the program. # # Also, the LNCS paper describes a faster way to generate the ILP (that never generates haplotypes that will later be # ignored), but that is not implemented in this program. The speed of the ILP solution is what was of interest, not # the speed of generating the ILP formulation itself. # #dg #5/24/01 # open YLIST, ">ylistd"; %index = (); %haps = (); @pairconsts = (); $hapcount = $tot = $numgens = 0; $input = $ARGV[0]; open IN, "$input"; $line1 = ; $line2 = ; @gens = ; # this is the genX file. foreach $line (@gens) { chomp $line; $line =~ tr/ //d; #DG comment out the following to create mindhap.pl # if ( not ($line =~ /^[01 ]+$/)) { #1 # print "$line"; # $numtwos = $line =~ tr/2/2/; # # print "$numtwos \n"; # $ptwo = 2 ** $numtwos; # $tot += $ptwo; # $numgens++; # } #1 # else { #1 # if (! defined $index{$line}) { # $haps{$line} = 1; # hash each haplotype in the sacred set and give it a count # of one. # $const = "Y$hapcount = 1"; # push (@pairconsts, $const); # $index{$line} = $hapcount++; # } # } #1 } #print "There are $numgens ambiguous genotype lines (identities not removed)\n"; #print "The total number of potential haplotypes from ambiguous genotypes is $tot\n"; #$numpairs = $tot/2; #print "The total number of such haplotype pairs is $numpairs\n"; #print "The number of distinct haplotypes in the input is $hapcount\n"; @covers = (); open OUT, ">$ARGV[1]"; ($ext, $rep) = ($line1 =~ /(\d+)/g); open TRAN, ">transfiled$ext.$rep"; print TRAN "ext, rep: $line1"; print TRAN "Each X-type LP variable is associated with a genotype vector in the input, and two haplotype vectors whose superposition explains that vector. The following gives first the X-LP variable subscript i, and the position in the input (gen file) list where the associated genotype vector is. This is also the position of the ID of that genotype in the ID list that is found at the end of the hap(p)sav input file. Then the genotype vector is given, followed by two haplotype vectors that can explain it.\n\n"; foreach $line (@gens) { #0 if ( not ($line =~ /^[01 ]+$/)) { #1 Generate all the haplotypes consistent # with that genotype and hash the haplotypes # and count the number of times each occurs. $numtwos = $line =~ tr/2/2/; # print "$numtwos\n"; $xcount = 0; $oline = $line; @lines = $line; @clines = $line; for ($i = 0; $i < $numtwos; $i++) { #2 @savlines = @lines; foreach $line (@lines) { #3 $line =~ s/2/0/; } #3 foreach $line (@savlines) { #3 $line =~ s/2/1/; } #3 push (@lines, @savlines); } #2 foreach $line (@lines) { # hash the haplotypes generated and update the count. if (defined $haps{$line}) { $haps{$line} += 1; } else { # print "haplotype $line is new\n"; $haps{$line} = 1; } # print "$line, XX $haps{$line} \n"; } } # 1 } # 0 # Now go back over each genotype and generate the haplotype pairs and # check whether the two haplotypes only occur in that genotype. If so # do not give an index to those two haplotypes, and do not create # a variable for that pair. But if the pair should be generated etc. # create the index. $paircount = 0; $positionnumber = 0; # positionnumber is the place in the list of genotypes # where this pair is derived from. The need for this is # due to the fact that a genotype will be skipped # if no pairs are generated for it. foreach $line (@gens) { #0 $pairsforthisgen = 0; if ( not ($line =~ /^[01 ]+$/)) { #1 $numtwos = $line =~ tr/2/2/; $oline = $line; @lines = $line; @clines = $line; for ($i = 0; $i < $numtwos; $i++) { #2 @savlines = @lines; foreach $line (@lines) { #3 $line =~ s/2/0/; } #3 foreach $line (@savlines) { #3 $line =~ s/2/1/; } #3 push (@lines, @savlines); } #2 for ($i = 0; $i < $numtwos; $i++) { #2 @savlines = @clines; foreach $line (@clines) { #3 $line =~ s/2/1/; } #3 foreach $line (@savlines) { #3 $line =~ s/2/0/; } #3 push (@clines, @savlines); } #2 $cover = ""; $ptwos = 2**$numtwos; foreach $i (0..$ptwos-1) { #2 $hap1 = $lines[$i]; $hap2 = $clines[$i]; # print "\n$hap1\n$hap2 \n"; if ($hap1 lt $hap2) { #3 # print "hap1 is lt hap2\n"; # $stopit = ; if (($haps{$hap1} > 1) || ($haps{$hap2} > 1)) { #4 if (! defined $index{$hap1}) { print YLIST "$hapcount: $hap1\n"; $index{$hap1} = $hapcount++; } if (! defined $index{$hap2}) { print YLIST "$hapcount: $hap2\n"; $index{$hap2} = $hapcount++; } # print "indices: $index{$hap1}, $index{$hap2}\n"; #print OUT "$paircount:\n"; #print OUT "$oline\n"; #print OUT "$hap1\n"; #print OUT "$hap2\n"; print TRAN "$paircount: $positionnumber\n"; #print TRAN "$paircount:\n"; $oline =~ tr/ //d; print TRAN "$oline\n"; # print the genotype line print TRAN "$hap1\n"; print TRAN "$hap2\n"; $cover .= '+X' . $paircount; # generate a variable for the pair. $xcount++; if (0 == $xcount % 8) { # new May 22 $cover .= "\n"; } foreach $hap ($hap1, $hap2) { #5 # print "paircount $paircount\n"; # debugging statement $pairconst = "X$paircount - Y$index{$hap} <= 0"; # print OUT "$pairconst \n"; push (@pairconsts, $pairconst); } #5 $paircount++; $pairsforthisgen++; } #4 } #3 } #2 # print OUT "End of case: $pairsforthisgen pairs produced. \n"; if ($pairsforthisgen > 0) { $numgensinlp++; # print OUT "$cover = 1\n"; # $stopit = ; push (@covers, "$cover = 1"); } } #1 $positionnumber++; } #0 print OUT "The number of distinct haplotypes is $hapcount. \n"; print OUT "The number of haplotype pairs is $paircount. \n"; print OUT "The number of ambiguous genotypes is $numgens\n"; print OUT "The number that show up in the IP is $numgensinlp\n"; print OUT "minimize \n"; foreach $i (0...$hapcount -1) { print OUT "+Y$i "; if (0 == $i % 8) { print OUT "\n"; } } print OUT "\n st \n"; foreach $cover (@covers) { print OUT "$cover \n"; } foreach $pairconst (@pairconsts) { print OUT "$pairconst \n"; } print OUT "binaries \n"; foreach $i (0...$paircount -1) { print OUT "X$i\n"; } foreach $i (0...$hapcount -1) { print OUT "Y$i\n"; } print OUT "end";