#!/usr/bin/perl use strict; use warnings; # # the dbSNP release 142 was downloaded by NCBI # # wget -m -np ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b142_GRCh37p13/VCF/ # # # TABIX PROGRAM # my $TABIX = '/kfs/knosys00/public/opt/tabix-0.2.6/tabix'; #my $TABIX = 'tabix'; # # list of chromosome used by tabix # my @c = ( "chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY", "chrM" ); if ( !defined $ARGV[1] || !-f $ARGV[0] || !-f $ARGV[1] ) { print STDERR qq|SYNTAX: > $0 NOTE: dbSNP and genome.vcf have to be compressed by bgzip and indexed by tabix. |; exit; } my $dest = $ARGV[1]; $dest =~ s/.vcf.gz$/.evcf/; open( FILE, ">$dest" ) || die; print FILE `zcat $ARGV[1] | head -n 1000 | grep '^#'`; # # we process each chromosome separately # foreach my $ch (@c) { # change the chromosome name to the NCBI dbSNP my $new = $ch; # !!!!!!!!!!!!!!!!!!!!NOTA!!!!!!!!!!!!!!!!1! (Alberto) ho commentato # le 2 righe successive perche' il VCF di riferimento creato per il # progetto nestle' ha i cromosomi che iniziano con chr #$new =~ s/chr//; #$new =~ s/M/MT/; my %p = (); # # store each known variant for the current chromosome into a hash # foreach (`$TABIX $ARGV[0] $new`) { chomp; next if ( !m/^$new\t/ ); my @a = split /\t/; my $ref = $a[3]; my $alt = $a[4]; my $fref = '.'; my $aref = '.'; my $rscode = $a[2]; if ( $a[$#a] =~ m/^.+CAF=\[(\d*\.*\d+),(\d*\.*\d+)\].+$/ ) { $fref = $1; $aref = $2; } $p{ $a[1] } = [ $ref, $alt, $fref, $aref, $rscode ]; } foreach (`$TABIX $ARGV[1] $ch`) { chomp; my @a = split /\t/; # skip the non PASS variant / regions or without alele position in GT flag next if ( $a[6] ne 'PASS' || $a[9] !~ m/^\d/ ); # get the GT fields my @GT = split ':', $a[9]; if ( length( $GT[0] ) == 1 ) { $GT[0] = $GT[0] . '/' . $GT[0]; $a[9] = join ':', @GT; } # # if position is a variant then print # if ( $GT[0] !~ m/0$/ ) { $a[2] = $p{ $a[1] }->[4] if ( exists $p{ $a[1] } ); print FILE join( "\t", @a ), "\n"; } # # if the position is non a variant then check for block # else { my $blockend = $a[1]; $blockend = $1 if ( $a[7] =~ m/^END=(\d+);/ ); for ( my $i = $a[1] ; $i <= $blockend ; $i++ ) { if ( exists $p{$i} ) { my $deletionend = $i - 1 + length( $p{$i}->[0] ); # check for deletion # the block must have the undeleted bases # if the position is no a deletion then # $deletionend = 1 -1 +1 if ( $deletionend <= $blockend ) { $a[1] = $i; $a[2] = $p{$i}->[4]; $a[3] = $p{$i}->[0]; $a[4] = $p{$i}->[1]; $a[7] = &getType( $a[3], $a[4] ); print FILE join( "\t", @a ), "\n"; } } } } } print STDERR qq|END chr $ch\n|; } close FILE; sub getType { my $ref = shift; my $alt = shift; my $refln = length($ref); my $altln = 0; if ( $alt =~ m/,/ ) { foreach my $a ( split ',', $alt ) { my $tt = length($a); $altln = $tt if ( $tt > $altln ); } } else { $altln = length($alt); } if ( $refln == $altln ) { return 'VARTYPE_SNV'; } elsif ( $refln > $altln ) { return 'VARTYPE_DEL'; } return 'VARTYPE_INS'; }