-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_filter.xml
205 lines (179 loc) · 9.24 KB
/
vcf_filter.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
<tool id="mimodd_vcf_filter" name="MiModD VCF Filter" version="@MIMODD_WRAPPER_VERSION@">
<!-- Short summary of what the tool does. -->
<description>
extracts lines from a vcf variant file based on field-specific filters
</description>
<!-- Shared macro definitions are imported from macros.xml, which is not part of this file. -->
<macros>
<import>macros.xml</import>
</macros>
<!-- NOTE(review): the three macros expanded below are not defined in this file;
     presumably macros.xml provides them (tool requirements, stdio handling and
     the version command, judging by their names) - confirm against macros.xml. -->
<expand macro="requirements" />
<expand macro="stdio" />
<expand macro="version_command" />
<command><![CDATA[
    ## Cheetah template that assembles the "mimodd vcf-filter" command line.
    ## All free-text values are wrapped in single quotes so that the shell
    ## treats each of them as exactly one word.
    mimodd vcf-filter
    '$inputfile' -o '$outputfile'

    ## Sample-specific filters: every option below receives one value per
    ## "datasets" repeat instance, always in the same (repeat) order, so the
    ## n-th value of each option belongs to the n-th sample name given to -s.
    #if len($datasets):
        -s
        #for $i in $datasets
            '$i.sample'
        #end for
        --gt
        #for $i in $datasets
            ## remove whitespace from free-text input;
            ## an empty genotype field is passed on as ANY
            '#echo ("".join($i.GT.split()) or "ANY")#'
            #echo " "
        #end for
        --dp
        #for $i in $datasets
            $i.DP
        #end for
        --gq
        #for $i in $datasets
            $i.GQ
        #end for
        --af
        #for $i in $datasets
            ## an empty allelic fraction field is passed on as "::"
            '#echo ($i.AF or "::")#'
        #end for
    #end if

    ## Region filters: one CHROM:START[-STOP] word per "regions" repeat
    ## instance; the "-STOP" part is only emitted when a stop value was given.
    #if len($regions):
        -r
        #for $i in $regions
            #if $i.stop:
                '$i.chrom:$i.start-$i.stop'
            #else:
                '$i.chrom:$i.start'
            #end if
        #end for
    #end if

    #if $vfilter:
        --vfilter
        ## Turn the comma-separated list into separate single-quoted words:
        ## a,b  ==>  'a' 'b'
        ## The separator has to be quote-space-quote built from SINGLE quotes
        ## so that it closes and reopens the surrounding single-quoted word;
        ## joining with double quotes would leave literal '" "' characters
        ## inside one single word and merge all sample names into one argument.
        '#echo ("' '".join($vfilter.split(',')))#'
    #end if

    ## The selected option value is either empty or a complete flag.
    $vartype
]]></command>
<inputs>
    <!-- The VCF dataset to be filtered. -->
    <param name="inputfile" type="data" format="vcf" label="VCF input file" />

    <!-- Zero or more per-sample filters. Each instance contributes one sample
         name plus one genotype, depth, quality and allelic fraction value to
         the command line. -->
    <repeat name="datasets" title="Sample-specific Filter" min="0" default="0">
        <param name="sample"
               type="text"
               label="sample"
               help="name of a sample as it appears in the VCF input file and that indicates the sample that this filter should be applied to.">
            <!-- lex_sam_header is not defined in this file; presumably imported from macros.xml. -->
            <expand macro="lex_sam_header" message="Non-ASCII characters are not valid in sample names." />
        </param>

        <!-- Optional; the validator accepts an empty value or comma-separated
             tokens made up of digits, '.', '/' and '|' only. -->
        <param name="GT"
               type="text"
               label="genotype pattern(s) for the inclusion of variants"
               help="keep only variants for which the genotype of the sample matches the specified pattern; format: x/x where x = 0 is wildtype and x = 1 is mutant. Multiple genotypes can be specified as a comma-separated list.">
            <validator type="expression" message="Malformed genotype pattern">not value or all(c.isdigit() or c in './|' for token in value.split(',') for c in token.strip(' '))</validator>
        </param>

        <param name="DP"
               type="integer"
               value="0"
               label="depth of coverage for the sample at the variant site"
               help="keep only variants with at least this sample-specific coverage at the variant site" />

        <param name="GQ"
               type="integer"
               value="0"
               label="genotype quality for the variant in the sample"
               help="keep only variants for which the genotype prediction for the sample has at least this quality" />

        <!-- Optional; the validator accepts an empty value or a string made up
             of digits, '.' and ':' only. -->
        <param name="AF"
               type="text"
               label="allelic fraction filter"
               help="expected format: [allele number]:[minimal fraction]:[maximal fraction]; keep only variants for which the fraction of sample-specific reads supporting a given allele number is between minimal and maximal fraction; if allele number is omitted, the filter operates on the most frequent non-reference allele instead">
            <validator type="expression" message="Malformed allelic fraction filter">not value or all(c.isdigit() or c in '.:' for c in value)</validator>
        </param>
    </repeat>

    <!-- Zero or more genomic regions. Start and stop are text fields so that
         they may be left empty; when filled in they must consist of digits. -->
    <repeat name="regions"
            title="Region Filter"
            min="0"
            default="0"
            help="Filter variant sites by their position in the genome. If multiple Region Filters are specified, all variants that fall in ONE of the regions are reported.">
        <param name="chrom" type="text" label="Chromosome">
            <expand macro="lex_sam_header" message="Non-ASCII characters are not valid in chromosome names." />
        </param>
        <param name="start" type="text" label="Region Start">
            <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
        </param>
        <param name="stop" type="text" label="Region End">
            <validator type="expression" message="an integer number is required">not value or value.isdigit()</validator>
        </param>
    </repeat>

    <!-- The selected option value is inserted verbatim into the command line. -->
    <param name="vartype"
           type="select"
           label="Select the types of variants to include in the output">
        <option value="">all types of variants</option>
        <option value="--no-indels">exclude indels</option>
        <option value="--indels-only">only indels</option>
    </param>

    <!-- Comma-separated list of sample names whose columns should be kept. -->
    <param name="vfilter"
           type="text"
           label="sample"
           help="Filter output by sample name; only the sample-specific columns with their sample name matching any of the comma separated filters will be retained in the output.">
        <expand macro="lex_sam_header" message="Non-ASCII characters are not valid in sample names." />
    </param>
</inputs>
<outputs>
    <!-- The single result dataset of the tool, declared with the vcf format. -->
    <data format="vcf" name="outputfile" />
</outputs>
<tests>
    <!-- Tab characters inside the assertion attributes are written as the
         character reference &#9; on purpose: XML attribute-value normalization
         replaces a literal tab with a space, so an assertion containing a
         literal tab could never match the tab-delimited VCF output (and every
         not_has_text check would pass vacuously). -->

    <!-- Sample-specific genotype filter on sample N2 with pattern 0/0: the
         output must contain a FORMAT field followed by a 0/0 genotype and none
         followed by 1/1 or 0/1. -->
    <test>
        <param name="inputfile" value="a.vcf" />
        <repeat name="datasets">
            <param name="sample" value="N2" />
            <param name="GT" value="0/0" />
        </repeat>
        <output name="outputfile" ftype="vcf" compare="diff">
            <assert_contents>
                <has_text text="GT:PL:DP:DPR:GQ&#9;0/0" />
                <not_has_text text="GT:PL:DP:DPR:GQ&#9;1/1" />
                <not_has_text text="GT:PL:DP:DPR:GQ&#9;0/1" />
            </assert_contents>
        </output>
    </test>

    <!-- Region filter restricted to chrX (no start or stop given): records of
         chrX must be present, records of the other listed chromosomes must
         not. The trailing tab keeps, for example, chrI from matching chrII. -->
    <test>
        <param name="inputfile" value="a.vcf" />
        <repeat name="regions">
            <param name="chrom" value="chrX" />
        </repeat>
        <output name="outputfile" ftype="vcf">
            <assert_contents>
                <has_text text="chrX&#9;" />
                <not_has_text text="chrI&#9;" />
                <not_has_text text="chrII&#9;" />
                <not_has_text text="chrIII&#9;" />
                <not_has_text text="chrIV&#9;" />
                <not_has_text text="chrV&#9;" />
            </assert_contents>
        </output>
    </test>

    <!-- Variant type filter excluding indels, combined with the sample filter
         set to ot266: no INDEL records may remain and the column header line
         must end with ot266 as its only sample column. -->
    <test>
        <param name="inputfile" value="a.vcf" />
        <param name="vartype" value="--no-indels" />
        <param name="vfilter" value="ot266" />
        <output name="outputfile" ftype="vcf">
            <assert_contents>
                <not_has_text text="INDEL;" />
                <has_line line="#CHROM&#9;POS&#9;ID&#9;REF&#9;ALT&#9;QUAL&#9;FILTER&#9;INFO&#9;FORMAT&#9;ot266" />
            </assert_contents>
        </output>
    </test>
</tests>
<help><![CDATA[
.. class:: infomark
**What it does**
The tool filters a variant file in VCF format to generate a new VCF file with only a subset of the original variants.
The following types of variant filters can be set up:
1) Sample-specific filters:
Filter variants based on their characteristics in the sequenced reads of a specific sample. Multiple sample-specific filters are combined by logical AND, i.e., only variants that pass ALL sample-specific filters are kept.
2) Region filters:
Filter variants based on the genomic region they affect. Multiple region filters are combined by logical OR, i.e., variants passing ANY region filter are kept.
3) Variant type filter:
Filter variants by their type, i.e., whether they are single nucleotide variations (SNVs) or indels.
In addition, the *sample* filter can be used to reduce the samples encoded in a multi-sample VCF file to just those specified by the filter.
The *sample* filter is included mainly for compatibility reasons: if an external tool cannot deal with the multi-sample file format, but instead looks only at the first sample-specific column of the file, you can use the filter to turn the multi-sample file into a single-sample file. Besides, the filter can also be used to change the order of the samples since it will sort the samples in the order specified in the filter field.
**Examples of sample-specific filters:**
*Simple genotype pattern*
genotype pattern: 1/1 ==> keep all variants in the vcf input file for which the specified sample's genotype is homozygous mutant
*Complex genotype pattern*
genotype pattern: 0/1, 0/0 ==> keep all variants for which the sample's genotype is either heterozygous or homozygous wildtype
*Multiple sample-specific filters*
Filter 1: genotype pattern: 0/0, Filter 2: genotype pattern 1/1:
==> keep all variants for which the first sample's genotype is homozygous wildtype **and** the second sample's genotype is homozygous mutant
*Combining sample-specific filter criteria*
genotype pattern: 1/1, depth of coverage: 3, genotype quality: 9
==> keep variants for which the sample's genotype is homozygous mutant **and** for which this genotype assignment is corroborated by a genotype quality score of at least 9
**and** at least three reads from the sample cover the variant site
**TIP:**
As in the example above, genotype quality is typically most useful in combination with a genotype pattern.
It acts then, effectively, to make the genotype filter more stringent.
@HELP_FOOTER@
]]></help>
<expand macro="citations" />
</tool>