#!/usr/bin/perl
# Convert colour-space FASTA sequences to nucleotide FASTA sequences.
# Written by Shaun Jackman <sjackman@bcgsc.ca>.
# Usage: abyss-cstont data.csfa >data.fa

use strict;
use Getopt::Long;
use Pod::Usage;

# Parse the command line. The only options recognised are --help and --man;
# each prints documentation taken from this file's POD and exits.
my %opt;
# GetOptions returns false after reporting an unrecognised or malformed
# option, so stop with a usage message rather than carrying on regardless.
GetOptions(\%opt, qw'help man') or pod2usage(2);
pod2usage(-verbose => 1) if $opt{help};
pod2usage(-verbose => 2) if $opt{man};

# Colour-space decoding table, indexed as $table{BASE}[COLOUR]: the
# nucleotide that follows BASE when the colour call between the two is
# COLOUR (0 to 3).
my %table = (
	'A' => ['A', 'C', 'G', 'T'],
	'C' => ['C', 'A', 'T', 'G'],
	'G' => ['G', 'T', 'A', 'C'],
	'T' => ['T', 'G', 'C', 'A']
);

# Decode a string of colour calls into a string of nucleotides.
#
# Arguments:
#   $seed - the nucleotide (a key of %table: A, C, G or T) that precedes
#           the first colour
#   $cs   - the colour calls, each one of the characters 0, 1, 2 or 3
#
# Returns a string with one nucleotide per colour in $cs. The seed itself
# is not part of the result, and an empty $cs gives an empty string.
#
# Dies if $seed is not a key of %table or if $cs contains a character other
# than 0-3. The checks are explicit because Perl would otherwise numify a
# non-digit to index 0 and return undef for an index above 3, yielding a
# wrong sequence without any diagnostic, and would autovivify a bogus
# %table entry for an unknown seed.
sub cs_to_nt
{
	my ($seed, $cs) = @_;
	$seed = '' unless defined $seed;
	die "error: invalid seed nucleotide `$seed'" unless exists $table{$seed};
	die "error: invalid colour `$1' in `$cs'" if $cs =~ /([^0-3])/;

	my $nt = '';
	for my $colour (split //, $cs) {
		# Each decoded base becomes the reference base for the next colour.
		$seed = $table{$seed}[$colour];
		$nt .= $seed;
	}
	return $nt;
}

# Stream the input one line at a time and print the converted records.
# Recognised lines, tested in this order:
#   [ACGT]...    sequence with a leading nucleotide; one record is printed
#   [0123]...    sequence without one; four records are printed
#   >ID COMMENT  header; remembered for the sequence lines that follow
#   #...         comment; copied to the output unchanged
# Any other line is a fatal error.
#
# $id holds the first word of the latest header (including the `>') and
# $comment holds the rest of that header with one leading space, or '' when
# the header has no comment.
my ($id, $comment) = ('', '');
while (my $line = <>) {
	chomp $line;
	if ($line =~ /^[ACGT]/) {
		# Split off the leading nucleotide; what remains is the colours.
		my $seed = substr $line, 0, 1, '';
		# The base decoded from the first colour is left out of the output.
		print "$id$comment\n",
			substr(cs_to_nt($seed, $line), 1),
			"\n";
	} elsif ($line =~ /^[0123]/) {
		# No starting nucleotide is given, so emit one record for each
		# possible start, tagging the identifier with the nucleotide used.
		for my $seed (qw'A C G T') {
			print "${id}_$seed$comment\n$seed",
				cs_to_nt($seed, $line), "\n";
		}
	} elsif ($line =~ /^>/) {
		($id, $comment) = split ' ', $line, 2;
		# Test for a non-empty string rather than for truth, so that a
		# comment of `0' keeps the space separating it from the identifier.
		$comment = (defined $comment && length $comment)
			? " $comment" : '';
	} elsif ($line =~ /^#/) {
		print "$line\n";
	} else {
		die "error: what is `$line'";
	}
}

=pod

=head1 NAME

abyss-cstont - convert colour-space FASTA sequences to nucleotide FASTA sequences

=head1 SYNOPSIS

B<abyss-cstont> F<data.csfa> >F<data.fa>

=head1 DESCRIPTION

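Either reads or contigs may be converted from colour-space sequences
to nucleotide sequences. If a sequence begins with a nucleotide
(A, C, G or T), that nucleotide seeds the conversion; the seed itself
and the base decoded from the first colour are omitted from the
output. If the first character of the input sequence is not a
nucleotide, each colour-space contig will be converted to four
nucleotide contigs, one for each possible starting nucleotide, and
the starting nucleotide is appended to the sequence identifier as
C<_A>, C<_C>, C<_G> or C<_T>.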

=head1 EXAMPLE

 $ printf '>1\nA0000' |abyss-cstont
 >1
 AAA

 $ printf '>1\n0000' |abyss-cstont
 >1_A
 AAAAA
 >1_C
 CCCCC
 >1_G
 GGGGG
 >1_T
 TTTTT

=head1 AUTHOR

Written by Shaun Jackman.

=head1 REPORTING BUGS

Report bugs to <abyss-users@bcgsc.ca>.

=head1 COPYRIGHT

Copyright 2009 Canada's Michael Smith Genome Science Centre

=head1 SEE ALSO

L<ABYSS(1)>

http://www.bcgsc.ca/platform/bioinfo/software/abyss
