#!/pkg/bin/perl -w
# perfreduce.pl
# March 29, 2015
# Dan Gusfield copyright 2015 - 
# you may use this if you do not laugh at it or expect me to explain it or maintain it.
# however, if you incorporate it into your own programs, you should acknowledge my work.
#
# this program is modified from perftest.pl, to create a matrix where the perfectly compatible sites are removed. It also removes 
# any duplicated rows, although
# if the data has been passed through galledtree.pl, duplicated rows will already have been removed.
#
# modified March 12, 2015 for use with files generated by persistentpipe.pl
#
# modified October 23, 2004
# perftest.pl 
 use diagnostics
#This takes in a list of strings (DNA or binary), and
#checks to see that the data satisfies the requirement for
#an undirected perfect phylogeny, 
#i.e., that in each pair of columns, the number of
#distinct paired character states is less than four.
# It also computes the lower bound on the number of
# recurrent mutations needed that is stated in Steel's book
#
#
$infile = $ARGV[0];
open IN, "$infile";
open OUT, ">outcount";
open SAVIN, ">sav$ARGV[0]";
open STATS, ">>stats";
$set = <IN>; # There is one line before the haplotypes.
print SAVIN "$set";
#
@lines = <IN>;  
close (IN);
$line = $lines[1];
$line =~ tr/ //d;
#print "$line";
chomp $line;
$stringlen = length ($line);
$newstring = "0"x($stringlen);   # March, 2015. Add the all-zero string to the matrix, because the persistent phylogeny model assumes that
                                 # the ancestral sequence is the all-zero string, and this program is adapted from one that
                                 # applies the four-gamete rule rather than the three gamete rule. So, we want it included when
                                 # we look for conlficts, and since we will add a constant all-zero row to the extended
                                 # matrix, it will not do any harm to remain in - after it has been converted to all-? in 
                                 # the extended matrix, it can be set to the all-zero string (which is a constant string there),
                                 # so it can't do any harm by remaining in. That simplifies the programming.
push (@lines, $newstring);
# print "@lines\n";

#print "$stringlen ***\n";

   %count = ();

$left = 0;
$right = $stringlen;

   foreach $i ($left...$right -2) {
     foreach $j ($i+1...$right -1) {
     $ii = $i+1;
     $jj = $j+1;
     $key = $ii . ',' . $jj;
     %{$hash{$key}} = ();     
     $count{$key} = 0;
     }
   }
     
%distinctline = ();
foreach $line (@lines) {
 print SAVIN "$line";
 chomp $line;
 $line =~ tr/ //d;
   $distinctline{$line} = 1;

   @chars = split(//, $line);
   foreach $i ($left...$right -2) {
     foreach $j ($i+1...$right -1) {
     $ii = $i+1;
     $jj = $j+1;
     $key = $ii . ',' . $jj;
     $value  = $chars[$i] . ',' . $chars[$j];
     if (! defined $hash{$key}{$value}) {
       $hash{$key}{$value} = 1;
       $count{$key}++;
   # print OUT "Another $key; $value, $count{$key}\n";
     }
     }
  } 
}

 $count = $conflictcount = 0;
 foreach $ekey (sort keys %count) {
  if ($count{$ekey} == 4) {
  print OUT "$ekey  $count{$ekey} \n";
  $count++;
  ($i,$j) = split(/,/,$ekey);
    if (! defined $conflict{$i}) {
       $conflict{$i} = 1;
       $conflictcount++;
    }
    else {
       $conflict{$i}++;
    }
    if (! defined $conflict{$j}) {
       $conflict{$j} = 1;
       $conflictcount++;
    }
    else {
       $conflict{$j}++;
    }
  }
 }
 print OUT "$ARGV[0]: The number of non-perfect pairs is $count\n";
 print STATS "$ARGV[0]: The number of non-perfect pairs is $count\n";
if ($conflictcount > 0) {
 print OUT "The number of sites that are involved in a conflict is $conflictcount\n"; 
 print OUT "The conflicted sites are: \n";
}

 @conflicted =  (sort {$a <=> $b} keys %conflict); 

 foreach $i (@conflicted) {
  print OUT "$i $conflict{$i}\n";
 }

  print OUT "\n\n";
 $totval = 0;
 foreach $val (sort {$b <=> $a} values %conflict) {
  $totval += $val;
  print OUT "$val, $totval\n";
 }

%line_inventory = ();
open NEWOUT, ">$infile";
print NEWOUT "$set";
 foreach $line (@lines) {
    @chars = split (//, $line); 
    $newline = "";
    foreach $site (@conflicted) {
       $newline .= $chars[$site-1];
    }
    if (!defined $line_inventory{$newline}) {
    print NEWOUT "$newline\n";
    $line_inventory{$newline} = 1;
    }
 }

close (SAVIN);
close (OUT);
close (STATS);
close (NEWOUT);
