# This script takes a file of aligned sequences as input and converts the
# sequences into an array of character state pairs for use in multi-state PP.
# Gaps in the alignment ('.' or '-') are turned into question marks.
# The intended use is to see whether aligned sequence data fits the
# multi-state PP model.
# 
# Usage: perl convertcharnum.pl <file-base>
#
# Reads <file-base>.txt and writes the converted data to <file-base>.dat.
#
# Format of the <file-base>.txt file:
# <num-rows> <num-cols>
# <num-rows> lines of sequence characters, not separated by spaces

# use diagnostics

use strict;
use warnings;

# Main script: recode the aligned sequences in <file-base>.txt as per-column
# integer character states and write them to <file-base>.dat.
#
# Input:  $ARGV[0] is the file base name. The first line of <file-base>.txt
#         holds "<num-rows> <num-cols>"; the next <num-rows> lines hold one
#         sequence each.
# Output: <file-base>.dat gets a "Processed file from ..." line, the
#         "<num-rows> <num-cols>" header (echoed unchanged), and then one line
#         per sequence with every emitted state followed by a single space.
#         The header and each sequence (space-separated characters) are also
#         echoed to STDOUT.
# Dies:   on a missing argument, on files that cannot be opened or closed,
#         on a malformed header, and on fewer sequence rows than announced.

my $base = $ARGV[0];
die "Usage: perl $0 <file-base>\n"
    unless defined $base && length $base;

my $in_name  = "$base.txt";
my $out_name = "$base.dat";

# Three-argument open with lexical handles: the mode cannot be injected via
# the file name, and a failure is reported instead of being silently ignored.
open(my $in,  '<', $in_name)  or die "Cannot open $in_name for reading: $!\n";
open(my $out, '>', $out_name) or die "Cannot open $out_name for writing: $!\n";

my $header = <$in>;
die "$in_name is empty\n" unless defined $header;

# split ' ' drops leading whitespace and splits on any run of whitespace,
# so repeated blanks and a trailing "\r\n" do not corrupt the dimensions.
my ($m, $n) = split ' ', $header;
die "$in_name: first line must be '<num-rows> <num-cols>'\n"
    unless defined $m && defined $n && $m =~ /\A\d+\z/ && $n =~ /\A\d+\z/;

print "$m $n\n";
print {$out} "Processed file from $in_name\n";
print {$out} "$m $n\n";

# $state_of[$col]{$char} is the integer state assigned to $char in column
# $col. States are numbered from 0 in order of first appearance, per column.
my @state_of;

for my $row (1 .. $m) {
    my $line = <$in>;
    die "$in_name: expected $m sequence rows but found only "
        . ($row - 1) . "\n"
        unless defined $line;

    # Strip the line terminator, including the "\r" of CRLF files, which
    # would otherwise be counted as an extra character state.
    $line =~ s/\r?\n\z//;

    my @chars = split //, $line;
    print "@chars\n";

    my @states;
    for my $col (0 .. $#chars) {
        my $char = $chars[$col];

        # Digits and blanks are not sequence data: leave them out of the
        # output entirely (they get no state number and no '?').
        next if $char =~ /\d/ || $char eq ' ';

        # Alignment gaps become the "unknown" marker.
        if ($char eq '.' || $char eq '-') {
            push @states, '?';
            next;
        }

        my $seen = $state_of[$col] ||= {};
        if (!exists $seen->{$char}) {
            # The next free state number equals the number of distinct
            # characters already recorded for this column.
            my $next_state = scalar keys %{$seen};
            $seen->{$char} = $next_state;
        }
        push @states, $seen->{$char};
    }

    # Every state is followed by one space (including the last one),
    # matching the established .dat layout.
    print {$out} join('', map { "$_ " } @states), "\n";
}

close($in);
# Buffered write errors only surface at close time, so check it.
close($out) or die "Error closing $out_name: $!\n";
