version 1.0
# Convert a group of paired fastq.gz files into an unmapped cram
# Uses the convention: READ_GROUP_NAME=~{sample_name}_~{flowcell_name}
#### STRUCT DEFINITIONS
struct FlowcellFastqs {
  String flowcell_name # name of flowcell
  Array[File] fastq_r1_locations # array of input R1 fastq file locations
  Array[File] fastq_r2_locations # array of input R2 fastq file locations
}

struct InputData {
  String dataset_id # unique ID to identify a particular dataset, even if sample_name is not unique
  String sample_name # sample name to insert into the read group header
  String library_name # library name to place into the LB attribute in the read group header
  String sequencing_center # location where the sample was sequenced
  Array[FlowcellFastqs] filepaths # array of flowcell fastq file locations
}
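
# The batch_info input is supplied as JSON matching the structs above. A minimal
# sketch for one sample sequenced on two flowcells (all names and paths below are
# hypothetical placeholders, not files that ship with this workflow):
# {
#   "PairedFastqsToUnmappedCram.batch_info": [
#     {
#       "dataset_id": "dataset01",
#       "sample_name": "sampleA",
#       "library_name": "libraryA",
#       "sequencing_center": "ExampleSeqCenter",
#       "filepaths": [
#         {
#           "flowcell_name": "FLOWCELL1",
#           "fastq_r1_locations": ["/path/to/sampleA_FLOWCELL1_R1.fastq.gz"],
#           "fastq_r2_locations": ["/path/to/sampleA_FLOWCELL1_R2.fastq.gz"]
#         },
#         {
#           "flowcell_name": "FLOWCELL2",
#           "fastq_r1_locations": ["/path/to/sampleA_FLOWCELL2_R1.fastq.gz"],
#           "fastq_r2_locations": ["/path/to/sampleA_FLOWCELL2_R2.fastq.gz"]
#         }
#       ]
#     }
#   ]
# }
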
#### WORKFLOW DEFINITION
workflow PairedFastqsToUnmappedCram {
  input {
    Array[InputData] batch_info
  }

  String gatk_docker = "ghcr.io/getwilds/gatk:4.3.0.0"
  String samtools_docker = "ghcr.io/getwilds/samtools:1.11"

  scatter (sample in batch_info) { # for every sample in your batch,
    String base_file_name = sample.sample_name + "_" + sample.dataset_id

    scatter (flowcell in sample.filepaths) { # and for every flowcell that sample's library was run on,
      call FastqToUnmappedBam { # take all the fastqs for that sample on that flowcell and make an unmapped bam
        input:
          r1_fastq = flowcell.fastq_r1_locations,
          r2_fastq = flowcell.fastq_r2_locations,
          base_file_name = base_file_name + "_" + flowcell.flowcell_name,
          sample_name = sample.sample_name,
          sequencing_center = sample.sequencing_center,
          library_name = sample.library_name,
          flowcell_name = flowcell.flowcell_name,
          docker = gatk_docker
      }
    } # End flowcell scatter

    call MergeBamsToCram { # then merge the per-flowcell unmapped bams into one unmapped cram for the library
      input:
        bams_to_merge = FastqToUnmappedBam.unmapped_bam,
        base_file_name = base_file_name,
        docker = samtools_docker,
        threads = 6
    }

    call ValidateCram { # then validate the cram to make sure it checks out
      input:
        unmapped_cram = MergeBamsToCram.cram,
        base_file_name = base_file_name,
        docker = gatk_docker
    }
  } # End sample scatter

  # Outputs that will be retained when execution is complete
  output {
    Array[File] unmapped_crams = MergeBamsToCram.cram
    Array[File] unmapped_cram_indexes = MergeBamsToCram.crai
    Array[File] validation = ValidateCram.validation
  }

  parameter_meta {
    batch_info: "array of InputData structs describing the relevant metadata for each sample"
    unmapped_crams: "array of unmapped cram files, one per sample"
    unmapped_cram_indexes: "array of index files, one per unmapped cram file"
    validation: "array of text files containing the validation statistics for each sample's cram"
  }
} # End workflow
#### TASK DEFINITIONS
# Converts fastq files to unaligned bams
task FastqToUnmappedBam {
  input {
    Array[File] r1_fastq
    Array[File] r2_fastq
    String base_file_name
    String sample_name
    String flowcell_name
    String library_name
    String sequencing_center
    String docker
  }

  command <<<
    set -eo pipefail
    gatk --java-options "-Dsamjdk.compression_level=5 -Xms4g" \
      FastqToSam \
      --FASTQ ~{sep=" " r1_fastq} \
      --FASTQ2 ~{sep=" " r2_fastq} \
      --OUTPUT "~{base_file_name}.unmapped.bam" \
      --READ_GROUP_NAME "~{sample_name}_~{flowcell_name}" \
      --SAMPLE_NAME "~{sample_name}" \
      --LIBRARY_NAME "~{library_name}" \
      --PLATFORM illumina \
      --SEQUENCING_CENTER "~{sequencing_center}"
  >>>

  output {
    File unmapped_bam = "~{base_file_name}.unmapped.bam"
  }

  runtime {
    cpu: 4
    memory: "8 GB"
    docker: docker
  }

  parameter_meta {
    r1_fastq: "array of R1 fastq files for the library in question"
    r2_fastq: "array of R2 fastq files for the library in question"
    base_file_name: "base file name to use in the read group and bam file names"
    sample_name: "name of the sample in question"
    flowcell_name: "name of the flowcell on which the sample was sequenced"
    library_name: "name of the library in question"
    sequencing_center: "location where the sample was sequenced"
    docker: "location of Docker image to use for this task"
    unmapped_bam: "final unmapped bam file containing the reads from the fastqs in question"
  }
}
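
# For illustration: with sample_name "sampleA", flowcell_name "FLOWCELL1",
# library_name "libraryA", and sequencing_center "ExampleSeqCenter" (placeholder
# values), the FastqToSam call above would produce an unmapped bam whose read
# group header looks roughly like:
#   @RG  ID:sampleA_FLOWCELL1  SM:sampleA  LB:libraryA  PL:illumina  CN:ExampleSeqCenter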
# Validates cram files for formatting issues.
task ValidateCram {
  input {
    File unmapped_cram
    String base_file_name
    String docker
  }

  command <<<
    set -eo pipefail
    gatk --java-options "-Dsamjdk.compression_level=5 -Xms2g" \
      ValidateSamFile \
      --INPUT "~{unmapped_cram}" \
      --MODE SUMMARY \
      --IGNORE_WARNINGS false > "~{base_file_name}.validation.txt"
  >>>

  output {
    File validation = "~{base_file_name}.validation.txt"
  }

  runtime {
    cpu: 2
    memory: "4 GB"
    docker: docker
  }

  parameter_meta {
    unmapped_cram: "unmapped cram file to validate"
    base_file_name: "base file name to use when saving the validation text file"
    docker: "location of Docker image to use for this task"
    validation: "text file containing all relevant validation statistics for the cram in question"
  }
}
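
# For reference: in SUMMARY mode, ValidateSamFile typically prints either
# "No errors found" or a count of each error/warning type to stdout, which the
# redirect above captures in the .validation.txt file.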
# Merges multiple bam files into a single cram file
task MergeBamsToCram {
  input {
    Array[File] bams_to_merge
    String base_file_name
    String docker
    Int threads
  }

  command <<<
    set -eo pipefail
    samtools merge -@ ~{threads - 1} \
      --write-index --output-fmt CRAM \
      "~{base_file_name}.merged.cram" ~{sep=" " bams_to_merge}
  >>>

  output {
    File cram = "~{base_file_name}.merged.cram"
    File crai = "~{base_file_name}.merged.cram.crai"
  }

  runtime {
    cpu: threads
    memory: "12 GB"
    docker: docker
  }

  parameter_meta {
    bams_to_merge: "array of bam files to merge into a single cram file"
    base_file_name: "base file name to use when saving the cram file"
    docker: "location of Docker image to use for this task"
    threads: "number of threads to use during the merging process"
    cram: "final cram file containing all reads"
    crai: "index file of final cram"
  }
}
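
# Example invocation (a sketch only; assumes a local Cromwell jar or miniwdl
# install and an inputs JSON named "inputs.json" shaped like the batch_info
# example near the top of this file):
#   java -jar cromwell.jar run ww-fastq-to-cram.wdl --inputs inputs.json
#   miniwdl run ww-fastq-to-cram.wdl -i inputs.json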