<!-- vcf_filter.xml -->

<tool id="mimodd_vcf_filter" name="MiModD VCF Filter" version="@MIMODD_WRAPPER_VERSION@">
    <!-- One-line summary of the tool's purpose. -->
    <description>
    extracts lines from a vcf variant file based on field-specific filters
    </description>
    <!-- Import the shared macro definitions that the <expand> elements in this
         file refer to. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- The three macros expanded below are defined in macros.xml, which is not
         part of this file. Judging by their names they supply the tool's
         requirements, stdio and version_command sections; confirm against
         macros.xml before relying on that. -->
    <expand macro="requirements" />
    <expand macro="stdio" />
    <expand macro="version_command" />
    <command><![CDATA[
    ## Assemble the "mimodd vcf-filter" command line.
    ##
    ## Sample-specific options (-s, gt, dp, gq, af) are emitted as parallel
    ## lists: each option is followed by one value per "Sample-specific Filter"
    ## repeat, always in the order in which the repeats were defined.
    mimodd vcf-filter
        '$inputfile' -o '$outputfile'
      #if len($datasets):
        -s
        #for $i in $datasets
          '$i.sample'
        #end for
        --gt
        #for $i in $datasets
          ## remove whitespace from free-text input; an empty pattern is
          ## passed on as the literal ANY
          '#echo ("".join($i.GT.split()) or "ANY")#'
          #echo " "
        #end for
        --dp
        #for $i in $datasets
          $i.DP
        #end for
        --gq
        #for $i in $datasets
          $i.GQ
        #end for
        --af
        #for $i in $datasets
          ## an empty allelic fraction filter is passed on as "::"
          '#echo ($i.AF or "::")#'
        #end for
      #end if
      #if len($regions):
        -r
        #for $i in $regions
          ## a region without an end coordinate is written as chrom:start
          #if $i.stop:
            '$i.chrom:$i.start-$i.stop'
          #else:
            '$i.chrom:$i.start'
          #end if
        #end for
      #end if
      #if $vfilter:
        --vfilter
        ## Emit every comma-separated sample name as its own single-quoted
        ## argument. Joining the names inside one pair of single quotes would
        ## hand the tool a single argument containing literal quote characters
        ## as soon as more than one name is given. Whitespace around each name
        ## is dropped so that "a, b" behaves like "a,b".
        #for $name in $vfilter.split(',')
          '${name.strip()}'
        #end for
      #end if
      ## vartype is either empty or one of the indel-related flags
      $vartype
    ]]></command>
  
    <inputs>
        <!-- The VCF dataset to be filtered. -->
        <param name="inputfile"
               type="data"
               format="vcf"
               label="VCF input file"/>

        <!-- Zero or more per-sample filters; each instance contributes one
             value to every sample-specific option of the command line. -->
        <repeat name="datasets"
                title="Sample-specific Filter"
                default="0"
                min="0">
            <param name="sample"
                   type="text"
                   label="sample"
                   help="name of a sample as it appears in the VCF input file and that indicates the sample that this filter should be applied to.">
                <expand macro="lex_sam_header"
                        message="Non-ASCII characters are not valid in sample names."/>
            </param>
            <!-- Comma-separated genotype patterns; only digits and the
                 characters . / | are accepted inside a pattern. -->
            <param name="GT"
                   type="text"
                   label="genotype pattern(s) for the inclusion of variants"
                   help="keep only variants for which the genotype of the sample matches the specified pattern; format: x/x where x = 0 is wildtype and x = 1 is mutant. Multiple genotypes can be specified as a comma-separated list.">
                <validator type="expression" message="Malformed genotype pattern">not value or all(c.isdigit() or c in './|' for token in value.split(',') for c in token.strip(' '))</validator>
            </param>
            <param name="DP"
                   type="integer"
                   value="0"
                   label="depth of coverage for the sample at the variant site"
                   help="keep only variants with at least this sample-specific coverage at the variant site"/>
            <param name="GQ"
                   type="integer"
                   value="0"
                   label="genotype quality for the variant in the sample"
                   help="keep only variants for which the genotype prediction for the sample has at least this quality"/>
            <!-- Only digits, dots and colons are accepted in this field. -->
            <param name="AF"
                   type="text"
                   label="allelic fraction filter"
                   help="expected format: [allele number]:[minimal fraction]:[maximal fraction]; keep only variants for which the fraction of sample-specific reads supporting a given allele number is between minimal and maximal fraction; if allele number is omitted, the filter operates on the most frequent non-reference allele instead">
                <validator type="expression" message="Malformed allelic fraction filter">not value or all(c.isdigit() or c in '.:' for c in value)</validator>
            </param>
        </repeat>

        <!-- Zero or more genomic regions; start and end are free-text fields
             that must be empty or consist of digits only. -->
        <repeat name="regions"
                title="Region Filter"
                default="0"
                min="0"
                help="Filter variant sites by their position in the genome. If multiple Region Filters are specified, all variants that fall in ONE of the regions are reported.">
            <param name="chrom"
                   type="text"
                   label="Chromosome">
                <expand macro="lex_sam_header"
                        message="Non-ASCII characters are not valid in chromosome names."/>
            </param>
            <param name="start"
                   type="text"
                   label="Region Start">
                <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
            </param>
            <param name="stop"
                   type="text"
                   label="Region End">
                <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
            </param>
        </repeat>

        <!-- The selected option value is inserted verbatim at the end of the
             command line; the first option contributes nothing. -->
        <param name="vartype"
               type="select"
               label="Select the types of variants to include in the output">
            <option value="">all types of variants</option>
            <option value="--no-indels">exclude indels</option>
            <option value="--indels-only">only indels</option>
        </param>

        <!-- Comma-separated sample names selecting the sample columns to keep. -->
        <param name="vfilter"
               type="text"
               label="sample"
               help="Filter output by sample name; only the sample-specific columns with their sample name matching any of the comma separated filters will be retained in the output.">
            <expand macro="lex_sam_header"
                    message="Non-ASCII characters are not valid in sample names."/>
        </param>
    </inputs>
  
    <outputs>
        <!-- The filtered variants; the command writes them via its -o option. -->
        <data format="vcf" name="outputfile"/>
    </outputs>

    <tests>
        <!-- Sample-specific filter: restrict sample N2 to genotype 0/0 and
             check that only that genotype follows the FORMAT column. -->
        <test>
            <param name="inputfile" value="a.vcf"/>
            <repeat name="datasets">
                <param name="sample" value="N2"/>
                <param name="GT" value="0/0"/>
            </repeat>
            <output name="outputfile" ftype="vcf" compare="diff">
                <assert_contents>
                    <has_text text="GT:PL:DP:DPR:GQ&#009;0/0"/>
                    <not_has_text text="GT:PL:DP:DPR:GQ&#009;1/1"/>
                    <not_has_text text="GT:PL:DP:DPR:GQ&#009;0/1"/>
                </assert_contents>
            </output>
        </test>

        <!-- Region filter: give a chromosome name only (no start or end) and
             check that records of the other chromosomes are gone. -->
        <test>
            <param name="inputfile" value="a.vcf"/>
            <repeat name="regions">
                <param name="chrom" value="chrX"/>
            </repeat>
            <output name="outputfile" ftype="vcf">
                <assert_contents>
                    <has_text text="chrX&#009;"/>
                    <not_has_text text="chrI&#009;"/>
                    <not_has_text text="chrII&#009;"/>
                    <not_has_text text="chrIII&#009;"/>
                    <not_has_text text="chrIV&#009;"/>
                    <not_has_text text="chrV&#009;"/>
                </assert_contents>
            </output>
        </test>

        <!-- Variant type plus sample column filter: exclude indels, keep only
             the ot266 column and check the resulting header line. -->
        <test>
            <param name="inputfile" value="a.vcf"/>
            <param name="vartype" value="--no-indels"/>
            <param name="vfilter" value="ot266"/>
            <output name="outputfile" ftype="vcf">
                <assert_contents>
                    <not_has_text text="INDEL;"/>
                    <has_line line="#CHROM&#009;POS&#009;ID&#009;REF&#009;ALT&#009;QUAL&#009;FILTER&#009;INFO&#009;FORMAT&#009;ot266"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    
    <help><![CDATA[
.. class:: infomark

   **What it does**

The tool filters a variant file in VCF format to generate a new VCF file with only a subset of the original variants.

The following types of variant filters can be set up:

1) Sample-specific filters:
   
   Filter variants based on their characteristics in the sequenced reads of a specific sample. Multiple sample-specific filters are combined by logical AND, i.e., only variants that pass ALL sample-specific filters are kept.
   
2) Region filters:
   
   Filter variants based on the genomic region they affect. Multiple region filters are combined by logical OR, i.e., variants passing ANY region filter are kept.
   
3) Variant type filter:

   Filter variants by their type, i.e., whether they are single nucleotide variations (SNVs) or indels.
   
In addition, the *sample* filter can be used to reduce the samples encoded in a multi-sample VCF file to just those specified by the filter.
The *sample* filter is included mainly for compatibility reasons: if an external tool cannot deal with the multi-sample file format, but instead looks only at the first sample-specific column of the file, you can use the filter to turn the multi-sample file into a single-sample file. Besides, the filter can also be used to change the order of the samples since it will sort the samples in the order specified in the filter field.

**Examples of sample-specific filters:**

*Simple genotype pattern*

genotype pattern: 1/1 ==> keep all variants in the vcf input file for which the specified sample's genotype is homozygous mutant

*Complex genotype pattern*

genotype pattern: 0/1, 0/0 ==> keep all variants for which the sample's genotype is either heterozygous or homozygous wildtype

*Multiple sample-specific filters*

Filter 1: genotype pattern: 0/0, Filter 2: genotype pattern: 1/1
==> keep all variants for which the first sample's genotype is homozygous wildtype **and** the second sample's genotype is homozygous mutant

*Combining sample-specific filter criteria*

genotype pattern: 1/1, depth of coverage: 3, genotype quality: 9
==> keep variants for which the sample's genotype is homozygous mutant **and** for which this genotype assignment is corroborated by a genotype quality score of at least 9
**and** at least three reads from the sample cover the variant site

**TIP:**

As in the example above, genotype quality is typically most useful in combination with a genotype pattern.
It acts then, effectively, to make the genotype filter more stringent.

@HELP_FOOTER@
    ]]></help>
    <expand macro="citations" />
</tool>