#!/usr/bin/env perl
use warnings;
use strict;
use FindBin;
use lib "$FindBin::RealBin/../perl5";
###LINE_FOR_BREW_CONDA###
use Snippy::Version;
use List::Util qw(min max);
use File::Basename;

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Globals

my $VERSION = Snippy::Version->version;
my $EXE = $FindBin::RealScript;
my @CMDLINE = (basename($0), @ARGV); # before we process it

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Options

my(@Options, $debug, $fasta, $haploid, $info_tags, $format_tags, $set_filter);
setOptions();

my @keep_info = split m/,/, $info_tags;
my %keep_info = (map { $_ => 1 } @keep_info);

my @keep_format = split m/,/, $format_tags;
my %keep_format = (map { $_ => 1 } @keep_format);

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Main

my @hdr;
my @vcf;

while (<ARGV>) {
  if (m/^#/) {
    # VCF header lines
    if (m/^##fileformat=/) {
      push @hdr, $_;
      push @hdr, qq{##snippy="@CMDLINE"\n};
    }
    elsif (m/^##INFO=<ID=(\w+)/) {
      next unless $keep_info{$1};
    }
    elsif (m/^##FORMAT=<ID=(\w+)/) {
      next unless $keep_format{$1};
    }
    elsif (m/^#CHROM/) {
      push @hdr, qq{##INFO=<ID=OLDVAR,Number=R,Type=String,Description="Original REF,ALT before decomposition">\n};
    }
    push @hdr, $_;
  }
  else {
    chomp;
    # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 [SAMPLE2] ...
    my($chrom,$pos,$id,$ref,$alt,$qual,$filter,$info,$format,@sample) = split m/\t/;

    die "This script only works with single sample VCF files" if @sample > 1;
    
    my $itag = decode_info($info);
    my $ftag = decode_format($format, $sample[0]); # assume only 1 sample
  
    # only want substitutions
    my $type = $itag->{TYPE};
    next if $type =~ m/ins|del/i;
      
    # filter out heterozygous sites
    if ($haploid) {
      !exists $ftag->{GT} and die "No GT tag found in $format $sample[0]";
      next if $ftag->{GT} =~ m/0/ and $ftag->{GT} =~ /1/; # het
      $ftag->{GT} = substr($ftag->{GT}, -1, 1);
    }

    $filter = $set_filter if $set_filter;  # eg. "PASS"
    
    ($format,$sample[0]) = recode_format($ftag, @keep_format);

    if ($type eq 'complex' or $type eq 'mnp') {
      my $L = min( length($ref), length($alt) );
      $itag->{OLDVAR} = "$type,$pos,$ref,$alt";
      $itag->{TYPE} = 'snp';
      $info = recode_info($itag, @keep_info, 'OLDVAR');
      for my $i (0 .. $L-1) {
        my $r = substr($ref, $i, 1);
        my $a = substr($alt, $i, 1);
        if ($r ne $a) {
          push @vcf, [ $chrom,$pos+$i,$id,$r,$a,$qual,$filter,$info,$format,@sample ];
        }
      }
    }
    elsif ($type eq 'snp') {
      $info = recode_info($itag, @keep_info);
      push @vcf, [ $chrom,$pos,$id,$ref,$alt,$qual,$filter,$info,$format,@sample ];
    }
    else {
      print STDERR "[NOTICE] Skipping TYPE=$type @ $chrom:$pos\n$info\n";
    }
  }
}

# sort VCF as coordinates can overlap when decompsing key = chr(ALPHA),pos(NUMERIC)
@vcf = sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] } @vcf;

# output VCF
print @hdr;
for my $row (@vcf) {
  print join("\t", @$row),"\n";
}

#----------------------------------------------------------------------

sub recode_format {
  my($kv, @keep) = @_;
  @keep = keys %$kv unless @keep;
  my $format = join(':', @keep);
  my $sample = join(':', map { $kv->{$_} } @keep);
  return ($format, $sample);
}

#----------------------------------------------------------------------

sub decode_format {
  my($format, $sample) = @_;
  my @fmt = split m/:/, $format;
  my @sam = split m/:/, $sample;
  die "Mismatch between FORMAT=$format and SAMPLE=$sample" if @fmt != @sam;
  my $kv = { map { ( $fmt[$_] => $sam[$_] ) } (0 .. $#fmt) };
  return $kv;
}

#----------------------------------------------------------------------

sub recode_info {
  my($kv, @keep) = @_;
  @keep = keys %$kv unless @keep;
  my @tag;
  for my $k (@keep) {
    my $v = $kv->{$k};
    push @tag, "$k=$v" if defined $v;
  }
  return join(';', @tag);
  #Issue #221 when no value for key
  #return join(';', map { "$_=".$kv->{$_} } @keep);
}

#----------------------------------------------------------------------

sub decode_info {
  my($info) = @_;
  my $kv;
  for my $pair (split m{;}, $info) {
    my($k,$v) = split m{=}, $pair;
    $kv->{$k} = $v;
  }
  return $kv;
}

#----------------------------------------------------------------------

sub minimal_rep {
 my($pos, $ref, $alt) = @_;
 my @ref = split m//, $ref;
 my @alt = split m//, $alt;
 while (@ref > 1 and @alt > 1 and $ref[-1] eq $alt[-1]) {
   pop @ref;
   pop @alt;
   print STDERR "  RIGHT : ref=[@ref] alt=[@alt] pos=$pos\n" if $debug;
 }
 while (@ref > 1 and @alt > 1 and $ref[0] eq $alt[0]) {
   shift @ref;
   shift @alt;
   $pos++;
   print STDERR "   LEFT : ref=[@ref] alt=[@alt] pos=$pos\n" if $debug;
 }
 return ( $pos, join('', @ref), join('', @alt) );
}

#----------------------------------------------------------------------
sub show_version {
  print "$EXE $VERSION\n";
  exit(0);
}

#----------------------------------------------------------------------
# Option setting routines

sub setOptions {
  use Getopt::Long;

  @Options = (
    {OPT=>"help!",     VAR=>\&usage,                DESC=>"This help"},
    {OPT=>"version!",  VAR=>\&show_version,         DESC=>"Print version and exit"},
    {OPT=>"debug+",    VAR=>\$debug,   DEFAULT=>0,  DESC=>"Output verbose debug info"},
    {OPT=>"ref=s",     VAR=>\$fasta,   DEFAULT=>'', DESC=>"FASTA reference"},
    {OPT=>"haploid!",  VAR=>\$haploid, DEFAULT=>0,  DESC=>"Convert to haploid genotypes"},
    {OPT=>"info-tags=s",   VAR=>\$info_tags,   DEFAULT=>'TYPE,DP,RO,AO,AF',  DESC=>"Keep these INFO tags"},
    {OPT=>"format-tags=s", VAR=>\$format_tags, DEFAULT=>'GT,DP,RO,AO,QR,QA',  DESC=>"Keep these FORMAT tags"},
    {OPT=>"filter=s",      VAR=>\$set_filter,  DEFAULT=>'',    DESC=>"Set FILTER to this"},
  );

  (!@ARGV) && (usage());

  &GetOptions(map {$_->{OPT}, $_->{VAR}} @Options) || usage();

  # Now setup default values.
  foreach (@Options) {
    if (defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
      ${$_->{VAR}} = $_->{DEFAULT};
    }
  }
}

sub usage {
  print "SYNOPSIS\n  Convert MNP,COMPLEX into SNP and ignore INS,DEL\n";
  print "USAGE\n";
  print "  $EXE [options] in.vcf > out.vcf\n";
  print "OPTIONS\n";
  foreach (@Options) {
    printf "  --%-13s %s%s.\n",$_->{OPT},$_->{DESC},
           defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
  }
  exit(0);
}
 
#----------------------------------------------------------------------
