#makegraph2.pl
#
# Feb. 3, 2009 augment to collect stats
# Jan. 24, 2009 Add in the check for any pair of colors which has too many edges for a tree between
# them. This does not explicitly check for a cycle, but is more effective than allowing 2k-1 edges between
# them, since not all colors have k states.
# 
# October 24, add in the necessary check for number of edges for a PP.
# October 4, 2008 modify to allow missing data - only build the graph from known data.
# July 31, 2008 modify to print out the partition intersection graph with the char,state labels in the
# same order as when the nodes are given integers.
#

#input is a character-state data matrix and output is an adjacenty structure for the partition-
#intersection graph of the data, and a translation table.
open (DATA, $ARGV[0]); # input of the character state data matrix
open (GRAPH, ">$ARGV[1]"); #output of the adjacency lists for the graph with node number labels
open (GRAPH1, ">p$ARGV[1]"); #output of the adjacency lists for the graph with char-state node labels
open (TRANS, ">$ARGV[2]");# output of the translation table, translating between node numbers
                          #for the graph and the data matrix.
open (SUMMARY, '>>msummary');

$k = $ARGV[3]; # k is the maximum number of states allowed.

$ext = <DATA>;
$line = <DATA>;
($n, $m) = $line =~ /(\d+) +(\d+)/;
print "$n, $m\n";

$nodenumber = 1;
$maxstate[$m] = 0;
foreach $i (1..$m-1) {
   $maxstate[$i] = 0;
   foreach $j ($i+1..$m) {
     $pairededgecount{$i}{$j} = 0;
   }
}

while ($line = <DATA>) {
   chomp $line;
   @dataline = split (/ /, $line);
     for ($j = 0; $j < $m; $j++) {
	     if (($dataline[$j] ne '?') && ($dataline[$j] > $maxstate[$j+1])) {
	        $maxstate[$j+1] = $dataline[$j];
	     }
     }

     for ($i = 0; $i < $m-1; $i++) {
       for ($j = $i+1; $j < $m; $j++) {
#        if (($dataline[$i] ne "?") && ($dataline[$j] ne "?"))
         if (($dataline[$i] ne '?') && ($dataline[$j] ne '?'))
         {

         $ip1 = $i+1;
         $jp1 = $j+1;
         $keyi = "($ip1,$dataline[$i])";
         $keyj = "($jp1,$dataline[$j])";

     $edges{$keyi}{$keyj} = 1;
         $edges{$keyj}{$keyi} = 1;
         if (! defined $nodes{$keyi}) {
                $keycode{$nodenumber} = $keyi;
                $nodes{$keyi} = $nodenumber;
#                print "Node number set $nodes{$keyi}\n";
                print TRANS "$nodes{$keyi} $keyi\n";
#                print TRANS "$keycode{$nodenumber} $nodenumber\n";
                $column{$nodenumber} = $ip1;
                $nodenumber++;
         }
         if (! defined $nodes{$keyj}) {
                $keycode{$nodenumber} = $keyj;
                $nodes{$keyj} = $nodenumber;
                print TRANS "$nodes{$keyj} $keyj\n";
#                print TRANS "$keycode{$nodenumber} $nodenumber\n";
                $column{$nodenumber} = $jp1;
                $nodenumber++;
         }


         $nedges {$nodes{$keyi}}  {$nodes{$keyj}} = 1;
#     print "nedges set: $nodes{$keyi},  $nodes{$keyj}\n";
         $nedges {$nodes{$keyj}}  {$nodes{$keyi}} = 1;
#     print "nedges set: $nodes{$keyj},  $nodes{$keyi}\n";
       } # matching bracket for 'if ne ?'
      }
     }
}


# print out the translation table indexed by char-state labels
print TRANS "\n";
foreach $key (sort keys %nodes) {
  print TRANS "$key $nodes{$key}\n";
}


# output the graph with node numbers and also with char-state labels in the same order
$edgecount = 0;
foreach $node1 (sort {$a <=> $b} keys %nedges) {
 print GRAPH "$node1: ";
  print GRAPH1 "$keycode{$node1}: ";
   @neighbors = (sort {$a <=> $b} keys %{$nedges{$node1}});
    $first = shift(@neighbors);  # I don't know why it is done this way.
       print GRAPH "$first";
       print GRAPH1 "$keycode{$first}";
         $edgecount++;
           $pairededgecount{$column{$node1}}{$column{$first}}++;

         foreach $node2  (@neighbors) {
           print GRAPH ", $node2";
           print GRAPH1 ", $keycode{$node2}";
           $edgecount++;
              $pairededgecount{$column{$node1}}{$column{$node2}}++;
         }
    print GRAPH "\n";
    print GRAPH1 "\n";
}
    $edgecount = $edgecount/2;
    $nodenumber-- ;
    $permitededges = ($nodenumber - ($m/2)) * ($m-1);
    print "nodecount $nodenumber edgecount $edgecount max permited edges  $permitededges\n";
    print SUMMARY "nodecount $nodenumber edgecount $edgecount max permited edges  $permitededges\n";

    if ($edgecount > $permitededges) {
      print "No PP because the edge count is too high\n";
   }
   else {
$permitedcount = (2 * $k) -1;
foreach $i (1..$m-1) {
   foreach $j ($i+1..$m) {
   # print "$i, $j: $pairededgecount{$i}{$j}, $permitedcount, $maxstate[$i], $maxstate[$j]\n";
     if ($pairededgecount{$i}{$j} > $permitedcount) {
        print "No PP because the paired edgecount for characters $i and $j is
         $pairededgecount{$i}{$j},
        which is larger than the permitted pair count $permitedcount. \n";

        print SUMMARY "No PP because the paired edgecount for characters $i and $j is
         $pairededgecount{$i}{$j},
        which is larger than the permitted pair count $permitedcount. \n";
	exit 1;
     }

     $precisepermitedcount = $maxstate[$i] + $maxstate[$j] + 1;
     if ($pairededgecount{$i}{$j} > $maxstate[$i] + $maxstate[$j] + 1) {
        print "No PP because the paired edgecount for characters $i and $j is
         $pairededgecount{$i}{$j},
        which is larger than the precise permitted pair count $precisepermitedcount. \n\n";

        print SUMMARY "No PP because the paired edgecount for characters $i and $j is
         $pairededgecount{$i}{$j},
        which is larger than the precise permitted pair count $precisepermitedcount. \n\n";
	exit 1;
     }
   }
}
  }
#  close (SUMMARY);
         
