#!/usr/bin/perl

use strict;
use warnings;

#
# the dbSNP release 142 was downloaded by NCBI
#
# wget -m -np ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b142_GRCh37p13/VCF/
#

#
# TABIX PROGRAM
#
my $TABIX = '/kfs/knosys00/public/opt/tabix-0.2.6/tabix';

#my $TABIX = 'tabix';

#
# list of chromosome used by tabix
#
my @c = (
    "chr1",  "chr2",  "chr3",  "chr4",  "chr5",  "chr6",  "chr7",  "chr8",
    "chr9",  "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16",
    "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX",  "chrY",
    "chrM"
);

if ( !defined $ARGV[1] || !-f $ARGV[0] || !-f $ARGV[1] )
{
    print STDERR qq|SYNTAX:
> $0 <NCBI_dbSNP.vcf.gz> <genome.vcf.gz>

NOTE: dbSNP and genome.vcf have to be compressed by bgzip
and indexed by tabix.
|;
    exit;
}

my $dest = $ARGV[1];
$dest =~ s/.vcf.gz$/.evcf/;

open( FILE, ">$dest" ) || die;

print FILE `zcat $ARGV[1] | head -n 1000 | grep '^#'`;

#
# we process each chromosome separately
#
foreach my $ch (@c)
{
    # change the chromosome name to the NCBI dbSNP
    my $new = $ch;
    # !!!!!!!!!!!!!!!!!!!!NOTA!!!!!!!!!!!!!!!!1! (Alberto) ho commentato
    # le 2 righe successive perche' il VCF di riferimento creato per il 
    # progetto nestle' ha i cromosomi che iniziano con chr
    #$new =~ s/chr//;
    #$new =~ s/M/MT/;
    my %p = ();

    #
    # store each known variant for the current chromosome into a hash
    #
    foreach (`$TABIX $ARGV[0] $new`)
    {
        chomp;
        next if ( !m/^$new\t/ );
        my @a = split /\t/;

        my $ref    = $a[3];
        my $alt    = $a[4];
        my $fref   = '.';
        my $aref   = '.';
        my $rscode = $a[2];

        if ( $a[$#a] =~ m/^.+CAF=\[(\d*\.*\d+),(\d*\.*\d+)\].+$/ )
        {
            $fref = $1;
            $aref = $2;
        }

        $p{ $a[1] } = [ $ref, $alt, $fref, $aref, $rscode ];
    }

    foreach (`$TABIX $ARGV[1] $ch`)
    {
        chomp;
        my @a = split /\t/;

      # skip the non PASS variant / regions or without alele position in GT flag
        next if ( $a[6] ne 'PASS' || $a[9] !~ m/^\d/ );

        # get the GT fields
        my @GT = split ':', $a[9];

        if ( length( $GT[0] ) == 1 )
        {
            $GT[0] = $GT[0] . '/' . $GT[0];
            $a[9] = join ':', @GT;
        }

        #
        # if position is a variant then print
        #
        if ( $GT[0] !~ m/0$/ )
        {
            $a[2] = $p{ $a[1] }->[4] if ( exists $p{ $a[1] } );
            print FILE join( "\t", @a ), "\n";
        }
        #
        # if the position is non a variant then check for block
        #
        else
        {
            my $blockend = $a[1];
            $blockend = $1 if ( $a[7] =~ m/^END=(\d+);/ );

            for ( my $i = $a[1] ; $i <= $blockend ; $i++ )
            {
                if ( exists $p{$i} )
                {
                    my $deletionend = $i - 1 + length( $p{$i}->[0] );

                    # check for deletion
                    # the block must have the undeleted bases
                    # if the position is no a deletion then
                    # $deletionend = 1 -1 +1
                    if ( $deletionend <= $blockend )
                    {
                        $a[1] = $i;
                        $a[2] = $p{$i}->[4];
                        $a[3] = $p{$i}->[0];
                        $a[4] = $p{$i}->[1];

                        $a[7] = &getType( $a[3], $a[4] );
                        print FILE join( "\t", @a ), "\n";
                    }
                }
            }
        }
    }
    print STDERR qq|END chr $ch\n|;
}
close FILE;

sub getType
{
    my $ref = shift;
    my $alt = shift;

    my $refln = length($ref);
    my $altln = 0;

    if ( $alt =~ m/,/ )
    {
        foreach my $a ( split ',', $alt )
        {
            my $tt = length($a);
            $altln = $tt if ( $tt > $altln );
        }
    }
    else
    {
        $altln = length($alt);
    }

    if    ( $refln == $altln ) { return 'VARTYPE_SNV'; }
    elsif ( $refln > $altln )  { return 'VARTYPE_DEL'; }
    return 'VARTYPE_INS';
}