Skip to content

Commit

Permalink
fix bug in venn diagram plotting
Browse files Browse the repository at this point in the history
  • Loading branch information
yunfeiguo committed Aug 14, 2015
1 parent 073e8d9 commit a6076f1
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 50 deletions.
94 changes: 44 additions & 50 deletions bin/secondary/stats
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ use Getopt::Long qw/GetOptions/;
my $ARGCOUNT=100000; #number of regions to extract using command line, max number if restricted by kernel config
my $INDEL_EXTENSION = 10; #when determining indel overlapping, interalize indels by extending indel INDEL_EXTENSION bp towards both ends
my $TMPDIR = $ENV{TMPDIR} || "/tmp";
my %LEGAL_VAR_FILETYPE = (
'avinput' => 1,
'vcf' => 1,
'consensus' => 1,
'vcf.gz' => 1,
);
##################################################
my ($prefix,
$bam,$vcf,
Expand Down Expand Up @@ -111,18 +117,22 @@ if ($union_files)
&conv2anno(split /,/,$union_files) );
}

if ($consensus_files)
{
if ($consensus_files) {
&var_consensus(
&conv2anno(split /,/,$consensus_files)
);
}

if ($venn)
{
if ($venn) {
my @files=split /,/,$venn;
@files=&SeqMule::Utils::getNonEmptyVCF(@files);
&venn( \@files,&conv2anno(&normalize_vcf(@files)) );
&SeqMule::Utils::checkFileType(\%LEGAL_VAR_FILETYPE,1,@files);

if(&SeqMule::Utils::getSuffix($files[0]) =~ /^vcf$|^vcf\.gz$/) {
@files=&SeqMule::Utils::getNonEmptyVCF(@files);
&venn( \@files,&conv2anno(&normalize_vcf(@files)) );
} else {
&venn( \@files,&conv2anno(@files) );
}
}

if (defined $capture)
Expand Down Expand Up @@ -809,8 +819,7 @@ sub coverage_stat
}

#generate Venn Diagram
sub venn
{
sub venn {
warn "NOTICE: Do not support more than 5 files right now\n" and return undef unless (@_<=5);
warn "NOTICE: No need to plot for single file\n" and return undef if (@_==1);

Expand Down Expand Up @@ -838,16 +847,14 @@ sub venn
my $non_snp_index_pool;

#remove empty file
for (@avinput_pool)
{
for (@avinput_pool) {
$_=undef unless (-s $_ >0);
}
@avinput_pool=grep {defined $_} @avinput_pool;
warn "WARNING: failed to plot Venn diagram, number of nonempty files is more than 5 or smaller than 2.\n" and return unless @avinput_pool<=5 && @avinput_pool>=2;

#generate index matrix
for my $count(0..$#avinput_pool)
{
for my $count(0..$#avinput_pool) {
#*_index_matrix is array of array
#first layer is array of file index
#2nd layer is array of file name plus variants
Expand All @@ -857,18 +864,19 @@ sub venn
open IN,"<",$input or die "Cannot open $input:$!\n";
$snp_index_matrix[$count]=[];
$non_snp_index_matrix[$count]=[];
while (<IN>)
{
my $illegal_input_count = 0;
while (<IN>) {
#chr start end ref obs ...
#1 2 3 4 5
/^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t.*$/ or next;
unless(/^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t?.*$/) {
$illegal_input_count++;
next;
}

if ($2 == $3 and length($4) == 1 and length($5) == 1 ) #start == end means this is a SNP
{
if ($2 == $3 and length($4) == 1 and length($5) == 1 ) {#start == end means this is a SNP
my $key=[$1,$2,$3,$4,$5];
push @{$snp_index_matrix[$count]},$key and $snp_appear{$key}=1 unless $snp_appear{$key};
} else
{
} else {
#for indels, MVNs
#use intervalization of 20bp (10bp in each direction)
#as long as there's an overlap of interval
Expand All @@ -879,11 +887,11 @@ sub venn
}
}
close IN;
warn "WARNING: $input has $illegal_input_count illegal lines, expect >= 5 tab delimited fields\n" if $illegal_input_count > 0;
}

@non_snp_index_matrix = &gen_index_for_interval(\@non_snp_index_matrix);
for my $count(0..$#avinput_name)
{
for my $count(0..$#avinput_name) {
#add file name to head of array
my $input = $avinput_name[$count];
my ($name_prefix) = $input=~/^(.*)(\.avinput|\.tmp|\.vcf|\.consensus)$/i;
Expand All @@ -904,8 +912,7 @@ sub venn
&genPlot("SNV",$snp_index_pool,scalar @avinput_pool);
&genPlot("NON-SNV",$non_snp_index_pool,scalar @avinput_pool);
}
sub gen_index_for_interval
{
sub gen_index_for_interval {
#be careful here
#we are dealing with references
#so any change to the content may have unexpected side effects
Expand Down Expand Up @@ -992,14 +999,12 @@ sub gen_index_for_interval
#and there shouldn't be duplicates
return @new;
}
sub write_index_pool
{
sub write_index_pool {
my $max=0;
my $outfile = shift;
my @index_matrix = @_;

for my $i(0..$#index_matrix)
{
for my $i(0..$#index_matrix) {
$max = @{$index_matrix[$i]} if @{$index_matrix[$i]} > $max;
}

Expand All @@ -1009,9 +1014,8 @@ sub write_index_pool
my @line;
for my $j(0..$#index_matrix)
{
if(defined $index_matrix[$j][$i])
{
push @line,(join "+",@{$index_matrix[$j][$i]});
if(defined $index_matrix[$j][$i] and @{$index_matrix[$j][$i]}>0) {
push @line,(join "+",(grep {defined} @{$index_matrix[$j][$i]}));
} else
{
push @line,"NA";
Expand All @@ -1021,8 +1025,7 @@ sub write_index_pool
}
close OUT;
}
sub genPlot
{
sub genPlot {
my $category = shift; #SNP or NON-SNP
die "ERROR: category missing" unless defined $category;
my $variant_index_pool=shift;
Expand All @@ -1044,8 +1047,7 @@ sub genPlot
"y=list()\n".
"title='Overlapping of $category variants'\n";
#convert dataframe to list such that na=remove works
for my $no(1..$count)
{
for my $no(1..$count) {
$rscript.=
"y[[$no]]=x[[$no]]\nnames(y)[$no]=$no\n".
"tmp=paste($no,names(x)[$no],sep=':')\n".
Expand All @@ -1066,41 +1068,35 @@ sub genPlot
}\n";

#high resolution tiff output
if ($count==5)
{
if ($count==5) {
$rscript.="
venn.diagram(y,fill=rainbow(length(y)),filename='$tiff',na=\"remove\",cex=0.65,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9)
";
} elsif ($count==4)
{
} elsif ($count==4) {
$rscript.="
venn.diagram(y,fill=rainbow(length(y)),filename='$tiff',na=\"remove\",cex=0.8,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9)
";
} else
{
} else {
$rscript.="
venn.diagram(y,fill=rainbow(length(y)),filename='$tiff',na=\"remove\",cex=1,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=0,ext.length=0.9)
";
}
#jpeg output for html report
if ($count==5)
{
if ($count==5) {
$rscript.="
obj=venn.diagram(y,fill=rainbow(length(y)),filename=NULL,na=\"remove\",cex=0.65,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9)
jpeg('$jpg',width=5,height=5,units='in',res=500)
grid.draw(obj)
garbage <- dev.off()
";
} elsif ($count==4)
{
} elsif ($count==4) {
$rscript.="
obj=venn.diagram(y,fill=rainbow(length(y)),filename=NULL,na=\"remove\",cex=0.8,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=-0.1,ext.length=0.9)
jpeg('$jpg',width=5,height=5,units='in',res=500)
grid.draw(obj)
garbage <- dev.off()
";
} else
{
} else {
$rscript.="
obj=venn.diagram(y,fill=rainbow(length(y)),filename=NULL,na=\"remove\",cex=1,main=title,main.cex=0.7,main.pos=c(0.5,1.2),main.just=c(0,1),ext.dist=0,ext.length=0.9)
jpeg('$jpg',width=5,height=5,units='in',res=500)
Expand All @@ -1117,13 +1113,11 @@ sub genPlot
}

#convert files to annovar input format for easy handling
sub conv2anno
{
sub conv2anno {
my @files=@_;
my @return_files;
my $exe_conver=&getExe("convert2annovar.pl");
for (@files)
{
for (@files) {
if (/^(.*)\.vcf(\.gz)?$/i)
{
my $local_prefix=$1;
Expand Down
78 changes: 78 additions & 0 deletions doc/User Guide/Manuals/stats.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# NAME

SeqMule an automatic pipeline for next-generation sequencing data analysis

# SYNOPSIS

seqmule stats <options>

For details, please use 'seqmule stats -h':

Options:

--prefix,-p <STRING> output prefix. Mandatory for multiple input files.
--bam <BAM> a sorted BAM file (used with --capture, --aln)
--capture [BED] a BED file for capture regions (or any other regions of interest). Effective for --bam and --vcf.
--vcf <VCF> output variant stats for a VCF file. If a BED file is supplied, extract variants based on the BED file.
--aln output alignment stats for a BAM file
--consensus,-c <LIST> comma separated list of files for extracting consensus calls.
VCF4 and SOAPsnp *.consensus format or ANNOVAR *.avinput required.
--union,-u <LIST> comma separated list of files for pooling variants (same format as above).
--venn <LIST> comma separated list of files for Venn diagram plotting (same format as above).
--c-vcf <LIST> comma separated list of SORTED VCF files for extracting consensus calls. *.vcf or *.vcf.gz suffix required
--u-vcf <LIST> comma separated list of SORTED VCF files for extracting union calls. *.vcf or *.vcf.gz suffix required
--ref <FASTA> reference file in FASTA format. Effective for --c-vcf and --u-vcf.
-s,--sample <STRING> sample name for VCF file, used for -vcf, -u, -venn, -c options.
--plink convert VCF to PLINK format (PED,MAP). Only works with --vcf option.
--mendel-stat generate Mendelian error statistics
--paternal <STRING> sample ID for paternal ID (case-sensitive). Rest are either maternal or offspring. Only one family allowed.
--maternal <STRING> sample ID for maternal ID (case-sensitive). Rest are either paternal or offspring. Only one family allowed.
-N <INT> extract variants appearing in at least N input files. Currently only effective for --c-vcf option.
--jmem <STRING> max java memory. Only effective for --c-vcf and --u-vcf. Default: 1750m
--jexe <STRING> Java executable path. Default: java
-t <INT> number of threads. Only effective for --aln, --c-vcf and --u-vcf. Default: 1
--tmpdir <DIR> use DIR for storing large temporary files. Default: $TMPDIR(in your ENV variables) or /tmp
--nofilter If specified, consider all variants, otherwise, only unfiltered variants.
-h,--help help
--noclean do not clean temporary files
-v,--verbose verbose


EXAMPLE

#draw Venn Diagram to examine overlapping between different VCF files
seqmule stats -p gatk-soap-varscan -venn gatk.vcf,soap.avinput,varscan.vcf

#extract union of all variants, ouput in ANNOVAR format
seqmule stats -p gatk-soap-varscan -u gatk.vcf,soap.avinput,varscan.vcf

#extract consensus of all variants, output in ANNOVAR format
seqmule stats -p gatk_soap_varscan -c gatk.vcf,soap.avinput,varscan.vcf

#extract consensus of all variants, output in VCF format
seqmule stats -p gatk_soap_varscan -c-vcf gatk.vcf,soapsnp.vcf,varscan.vcf -ref hg19.fa

#extract union of all variants, output in VCF format
seqmule stats -p gatk_soap_varscan -u-vcf gatk.vcf,soapsnp.vcf,varscan.vcf -ref hg19.fa

#generate coverage statistics for specified region (region.bed)
seqmule stats -p sample -capture region.bed --bam sample.bam

#generate alignment statistics
seqmule stats -bam sample.bam -aln

#generate variant statistics
seqmule stats -vcf sample.vcf

#extract variants in specified region generate variant statistics
seqmule stats -vcf sample.vcf -capture region.bed

#generate Mendelian error statistics
#NOTE, sample.vcf contains 3 samples!
seqmule stats -vcf sample.vcf --plink --mendel-stat --paternal father --maternal mother

# OPTIONS

- **--capture**

SeqMule automatizes analysis of next-generation sequencing data by simplifying program installation, downloading of various databases, generation of analysis script, and customization of your pipeline.
27 changes: 27 additions & 0 deletions lib/SeqMule/Utils.pm
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,33 @@ no warnings 'File::Find';
use SeqMule::SeqUtils;


sub checkFileType {
#take accepted filetype (hash ref)
#and a list of files
#make sure file types are all accepted and are the same
my $ft = shift;
my $same_flag = shift; #require all files to be the same?
my @files = @_;
my $first = shift @files;
my $first_ft = &getSuffix($first);
unless ($ft->{$first_ft}) {
croak("ERROR: illegal file type ($first_ft), only accept ".(join(",",keys %$ft))."\n");
}
if($same_flag) {
for my $i(@files) {
my $i_ft = &getSuffix($i);
croak("ERROR: suffix $i_ft is not same as suffix $first_ft\n") unless $i_ft eq $first_ft;
}
}
}
sub getSuffix {
my $file = shift;
my ($suffix) = $file=~/\.([^\.]+)$/ or croak("ERROR: did not find suffix ($file)\n");
if($suffix eq 'gz') {
return &getSuffix($file=~/(.*?)\.$suffix$/).".$suffix";
}
return $suffix;
}
sub get_rank_by_mergingrule
{
#given index for sample, index for file, and merging rule
Expand Down

0 comments on commit a6076f1

Please sign in to comment.