-
Notifications
You must be signed in to change notification settings - Fork 5
/
myco_sra.wdl
executable file
·386 lines (331 loc) · 17.2 KB
/
myco_sra.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
version 1.0
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.13.0/tasks/combined_decontamination.wdl" as clckwrk_combonation
import "https://raw.githubusercontent.com/aofarrel/clockwork-wdl/2.12.2/tasks/variant_call_one_sample.wdl" as clckwrk_var_call
import "https://raw.githubusercontent.com/aofarrel/SRANWRP/v1.1.24/tasks/pull_fastqs.wdl" as sranwrp_pull
import "https://raw.githubusercontent.com/aofarrel/SRANWRP/v1.1.24/tasks/processing_tasks.wdl" as sranwrp_processing
import "https://raw.githubusercontent.com/aofarrel/tree_nine/0.0.16/tree_nine.wdl" as build_treesWF
import "https://raw.githubusercontent.com/aofarrel/vcf_to_diff_wdl/0.0.3/vcf_to_diff.wdl" as diff
import "https://raw.githubusercontent.com/aofarrel/tb_profiler/0.2.5/tbprofiler_tasks.wdl" as profiler
import "https://raw.githubusercontent.com/aofarrel/tb_profiler/0.2.5/thiagen_tbprofiler.wdl" as tbprofilerFQ_WF # fka earlyQC
import "https://raw.githubusercontent.com/aofarrel/goleft-wdl/0.1.2/goleft_functions.wdl" as goleft
workflow myco {
input {
File biosample_accessions
Boolean clean_after_decontam = false
Int clean_average_q_score = 29
Boolean clean_before_decontam = true
Boolean covstatsQC_skip_entirely = true # false in myco_raw
Boolean decontam_use_CDC_varpipe_ref = false # true in myco_raw
File? mask_bedfile
Boolean TBProf_on_bams_not_fastqs = true # effectively false in myco_raw
# QC stuff
Float QC_max_pct_low_coverage_sites = 0.20
Int QC_max_pct_unmapped = 2
Int QC_min_mean_coverage = 10
Int QC_min_q30 = 90
Boolean QC_soft_pct_mapped = false
Int QC_this_is_low_coverage = 10
Int quick_tasks_disk_size = 10
Boolean guardrail_mode = true
# shrink large samples
Int subsample_cutoff = 450
Int subsample_seed = 1965
# phylogenetics
Boolean tree_decoration = false
File? tree_to_decorate
}
parameter_meta {
biosample_accessions: "File of BioSample accessions to pull, one accession per line"
TBProf_on_bams_not_fastqs: "If true, run TBProfiler on BAMs instead of fastqs"
covstatsQC_skip_entirely: "Should we skip covstats entirely?"
diffQC_mask_bedfile: "Bed file of regions to mask when making diff files (default: R00000039_repregions.bed)"
quick_tasks_disk_size: "Disk size in GB to use for quick file-processing tasks; increasing this might slightly speed up file localization"
subsample_cutoff: "If a fastq file is larger than than size in MB, subsample it with seqtk (set to -1 to disable)"
subsample_seed: "Seed used for subsampling with seqtk"
tree_decoration: "Should usher, taxonium, and NextStrain trees be generated?"
tree_to_decorate: "Base tree to use if tree_decoration = true"
}
String pass = "PASS" # used later... much later
Float QC_max_pct_low_coverage_sites_float = QC_max_pct_low_coverage_sites
call sranwrp_processing.extract_accessions_from_file as get_sample_IDs {
input:
accessions_file = biosample_accessions,
filter_na = true
}
scatter(biosample_accession in get_sample_IDs.accessions) {
call sranwrp_pull.pull_fq_from_biosample as pull {
input:
biosample_accession = biosample_accession,
fail_on_invalid = false,
subsample_cutoff = subsample_cutoff,
subsample_seed = subsample_seed,
tar_outputs = false
} # output: pull.fastqs
if(length(pull.fastqs)>1) {
Array[File] paired_fastqs=select_all(pull.fastqs)
}
}
call sranwrp_processing.cat_strings as merge_reports {
input:
strings = pull.results,
out = "pull_reports.txt",
disk_size = quick_tasks_disk_size
}
Array[Array[File]] pulled_fastqs = select_all(paired_fastqs)
scatter(pulled_fastq in pulled_fastqs) {
call clckwrk_combonation.clean_and_decontam_and_check as fastp_decontam_check {
input:
docker_image = if decontam_use_CDC_varpipe_ref then "ashedpotatoes/clockwork-plus:v0.11.3.11-CDC" else "ashedpotatoes/clockwork-plus:v0.11.3.11-CRyPTIC",
unsorted_sam = true,
reads_files = pulled_fastq,
fastp_clean_avg_qual = clean_average_q_score,
fastp_clean_before_decontam = clean_before_decontam,
fastp_clean_after_decontam = clean_after_decontam,
QC_min_q30 = QC_min_q30 / 100.0,
preliminary_min_q30 = if guardrail_mode then 0.2 else 0.0000001,
timeout_map_reads = if guardrail_mode then 120 else 0,
timeout_decontam = if guardrail_mode then 300 else 0
# no subsample cutoff here because that happens during the pull task
}
if(defined(fastp_decontam_check.decontaminated_fastq_1)) {
# This region only executes if decontaminated fastqs exist. We can use this to coerce File? into File by using
# select_first() where the first element is the File? we know must exist, and the second element is bogus.
File real_decontaminated_fastq_1=select_first([fastp_decontam_check.decontaminated_fastq_1, biosample_accessions])
File real_decontaminated_fastq_2=select_first([fastp_decontam_check.decontaminated_fastq_2, biosample_accessions])
if(!(TBProf_on_bams_not_fastqs)) {
call tbprofilerFQ_WF.ThiagenTBProfiler as thiagenTBprofilerFQ {
input:
fastq1 = real_decontaminated_fastq_1,
fastq2 = real_decontaminated_fastq_2,
soft_pct_mapped = QC_soft_pct_mapped,
soft_coverage = if guardrail_mode then false else true,
minimum_coverage = if guardrail_mode then 3 else 0,
minimum_pct_mapped = if guardrail_mode then 10 else 0, # unlike covstats, this is a MINIMUM of % MAPPED
sample = fastp_decontam_check.sample
}
}
String tbprofiler_fq_status_or_bogus = select_first([thiagenTBprofilerFQ.status_code, "bogus"]) # prevent "cannot compare String? to String" error
if(tbprofiler_fq_status_or_bogus == pass || TBProf_on_bams_not_fastqs) {
call clckwrk_var_call.variant_call_one_sample_ref_included as variant_calling {
input:
reads_files = [real_decontaminated_fastq_1, real_decontaminated_fastq_2],
tarball_bams_and_bais = false,
timeout = if guardrail_mode then 600 else 0
}
}
}
}
Array[File] minos_vcfs = flatten([select_all(variant_calling.adjudicated_vcf)])
Array[File] final_bams = flatten([select_all(variant_calling.bam)])
Array[File] final_bais = flatten([select_all(variant_calling.bai)])
Array[Array[File]] bams_and_bais = [final_bams, final_bais]
Array[Array[File]] bam_per_bai = transpose(bams_and_bais)
scatter(vcfs_and_bams in zip(bam_per_bai, minos_vcfs)) {
# scatter(vcfs_and_bams in zip(bam_per_bai, minos_vcfs)) is now sort of a three-way scatter:
# * bam file accessible via vcfs_and_bams.left[0]
# * bai file accessible via vcfs_and_bams.left[1]
# * vcf file accessible via vcfs_and_bams.right
# This relies on your WDL executor being consistent with how it orders arrays. That SHOULD always be the case per
# the spec, but if things break catastrophically, let me save you some debug time: As of 2.9.2, clockwork-wdl's
# ref-included version of the variant caller has an option to output the bams and bais as a tarball. You can use
# that to recreate the simplier scatter of version 4.4.1 or earlier of myco. You will need to modify some tasks to
# untar things, of course.
if(!covstatsQC_skip_entirely) {
# covstats to check coverage and percent mapped to reference
call goleft.covstats as covstats {
input:
inputBamOrCram = vcfs_and_bams.left[0],
allInputIndexes = [vcfs_and_bams.left[1]]
}
if((covstats.percentUnmapped < QC_max_pct_unmapped) || QC_soft_pct_mapped) {
if(covstats.coverage > QC_min_mean_coverage) {
# make diff files
call diff.make_mask_and_diff as make_mask_and_diff_after_covstats {
input:
bam = vcfs_and_bams.left[0],
vcf = vcfs_and_bams.right,
min_coverage_per_site = QC_this_is_low_coverage,
tbmf = mask_bedfile,
max_ratio_low_coverage_sites_per_sample = QC_max_pct_low_coverage_sites_float
}
}
}
}
if(covstatsQC_skip_entirely) {
# make diff files
call diff.make_mask_and_diff as make_mask_and_diff_no_covstats {
input:
bam = vcfs_and_bams.left[0],
vcf = vcfs_and_bams.right,
min_coverage_per_site = QC_this_is_low_coverage,
tbmf = mask_bedfile,
max_ratio_low_coverage_sites_per_sample = QC_max_pct_low_coverage_sites_float
}
}
# TBProfiler (will run even if fails covstats qc)
if(TBProf_on_bams_not_fastqs) {
call profiler.tb_profiler_bam as profile_bam {
input:
bam = vcfs_and_bams.left[0]
}
}
}
# even though diffs and reports are technically optional outputs, this does work, and will avoid nulls in the final output
Array[File] real_diffs = flatten([select_all(make_mask_and_diff_after_covstats.diff), select_all(make_mask_and_diff_no_covstats.diff)])
Array[File] real_reports = flatten([select_all(make_mask_and_diff_after_covstats.report), select_all(make_mask_and_diff_no_covstats.report)])
Array[File] real_masks = flatten([select_all(make_mask_and_diff_after_covstats.mask_file), select_all(make_mask_and_diff_no_covstats.mask_file)])
# pull TBProfiler information, if we ran TBProfiler on bams
# coerce optional types into required types (doesn't crash even if profile_bam didn't run)
Array[String] coerced_bam_strains=select_all(profile_bam.sample_and_strain)
Array[String] coerced_bam_resistances=select_all(profile_bam.sample_and_resistance)
Array[String] coerced_bam_depths=select_all(profile_bam.sample_and_median_depth)
# workaround for "defined(profile_bam.strain) is always true even if profile_bam didn't run" part of SOTHWO
if(!(length(coerced_bam_strains) == 0)) {
# if there is more than one sample, run some tasks to concatenate the outputs
if(length(pulled_fastqs) != 1) {
Array[String] bam_strains_with_header = flatten([["sample\tsublineage"], coerced_bam_strains])
Array[String] bam_resista_with_header = flatten([["sample\tresistance"], coerced_bam_resistances])
Array[String] bam_meddept_with_header = flatten([["sample\tmedn_depth"], coerced_bam_depths])
call sranwrp_processing.cat_strings as collate_bam_strains {
input:
strings = bam_strains_with_header,
out = "strain_reports.tsv",
disk_size = quick_tasks_disk_size
}
call sranwrp_processing.cat_strings as collate_bam_resistance {
input:
strings = bam_resista_with_header,
out = "resistance_reports.tsv",
disk_size = quick_tasks_disk_size
}
call sranwrp_processing.cat_strings as collate_bam_depth {
input:
strings = bam_meddept_with_header,
out = "depth_reports.tsv",
disk_size = quick_tasks_disk_size
}
}
# if there is only one sample, there's no need to run tasks
if(length(pulled_fastqs) == 1) {
String single_sample_tbprof_bam_depth = coerced_bam_depths[0]
String single_sample_tbprof_bam_resistance = coerced_bam_resistances[0]
String single_sample_tbprof_bam_strain = coerced_bam_strains[0]
}
}
# pull TBProfiler information, if we ran TBProfiler on fastqs
# coerce optional types into required types (doesn't crash if these are null)
Array[String] coerced_fq_strains=select_all(thiagenTBprofilerFQ.sample_and_strain)
Array[String] coerced_fq_resistances=select_all(thiagenTBprofilerFQ.sample_and_resistance)
Array[String] coerced_fq_depths=select_all(thiagenTBprofilerFQ.sample_and_coverage)
# workaround for "defined(qc_fastq.strains) is always true" part of SOTHWO
if(!(length(coerced_fq_strains) == 0)) {
# if there is more than one sample, run some tasks to concatenate the outputs
if(length(pulled_fastqs) != 1) {
Array[String] fq_strains_with_header = flatten([["sample\tsublineage"], coerced_fq_strains])
Array[String] fq_resista_with_header = flatten([["sample\tresistance"], coerced_fq_resistances])
Array[String] fq_meddept_with_header = flatten([["sample\tmedn_depth"], coerced_fq_depths])
call sranwrp_processing.cat_strings as collate_fq_strains {
input:
strings = fq_strains_with_header,
out = "strain_reports.tsv",
disk_size = quick_tasks_disk_size
}
call sranwrp_processing.cat_strings as collate_fq_resistance {
input:
strings = fq_resista_with_header,
out = "resistance_reports.tsv",
disk_size = quick_tasks_disk_size
}
call sranwrp_processing.cat_strings as collate_fq_depth {
input:
strings = fq_meddept_with_header,
out = "depth_reports.tsv",
disk_size = quick_tasks_disk_size
}
}
# if there is only one sample, there's no need to run tasks
if(length(pulled_fastqs) == 1) {
String single_sample_tbprof_fq_depth = coerced_fq_depths[0]
String single_sample_tbprof_fq_resistance = coerced_fq_resistances[0]
String single_sample_tbprof_fq_strain = coerced_fq_strains[0]
}
}
if(tree_decoration) {
if(length(real_diffs)>0) {
Array[File] coerced_diffs = select_all(real_diffs)
Array[File] coerced_reports = select_all(real_reports)
call build_treesWF.Tree_Nine as trees {
input:
diffs = coerced_diffs,
input_tree = tree_to_decorate,
coverage_reports = coerced_reports
}
}
}
# TODO: in later releases, the differences between the CDC and CRyPTIC pipelines will
# be handled in the decontamination task itself. This is a bandaid fix to avoid breaking
# the call cache for some previously processed data.
Array[String] CDC_columns = ["BioSample","raw_pct_above_q20","raw_pct_above_q30","raw_total_reads","post_cleaning_pct_above_q20","post_cleaning_pct_above_q30","post_decontam_pct_above_q20","post_decontam_pct_above_q30","post_decontam_total_reads","reads_is_contam","reads_reference","reads_unmapped","docker","status"]
Array[String] CRyPTIC_columns = ["BioSample","raw_pct_above_q20","raw_pct_above_q30","raw_total_reads","post_cleaning_pct_above_q20","post_cleaning_pct_above_q30","post_decontam_pct_above_q20","post_decontam_pct_above_q30","post_decontam_total_reads","reads_bacteria","reads_human","reads_NTM","docker","status"]
Array[String] todays_columns = if decontam_use_CDC_varpipe_ref then CDC_columns else CRyPTIC_columns
call sranwrp_processing.several_arrays_to_tsv as fastp_decont_report {
input:
row_keys = fastp_decontam_check.sample,
column_keys = todays_columns,
value1 = fastp_decontam_check.raw_pct_above_q20,
value2 = fastp_decontam_check.raw_pct_above_q30,
value3 = fastp_decontam_check.raw_total_reads,
value4 = fastp_decontam_check.cleaned_pct_above_q20,
value5 = fastp_decontam_check.cleaned_pct_above_q30,
# cleaned_total_reads purposely excluded; it's borked
value6 = fastp_decontam_check.dcntmd_pct_above_q20,
value7 = fastp_decontam_check.dcntmd_pct_above_q30,
value8 = fastp_decontam_check.dcntmd_total_reads,
value9 = fastp_decontam_check.reads_is_contam,
value10 = fastp_decontam_check.reads_reference,
value11 = fastp_decontam_check.reads_unmapped,
value12 = fastp_decontam_check.docker_used,
value13 = fastp_decontam_check.errorcode
}
output {
File download_report = merge_reports.outfile
File fastp_decont_report_tsv = fastp_decont_report.tsv
# raw files
Array[File] bais = final_bais
Array[File] bams = final_bams
Array[File] diffs = real_diffs
Array[File] masks = real_masks # bedgraph
Array[File] vcfs = minos_vcfs
# metadata
Array[File?] decontam_reports = fastp_decontam_check.counts_out_tsv
Array[File?] covstats_reports = covstats.covstatsOutfile
Array[File?] diff_reports = real_reports
Array[File?] tbprof_bam_jsons = profile_bam.tbprofiler_json
Array[File?] tbprof_bam_summaries = profile_bam.tbprofiler_txt
Array[File?] tbprof_fq_jsons = thiagenTBprofilerFQ.tbprofiler_json
Array[File?] tbprof_fq_looker = thiagenTBprofilerFQ.tbprofiler_looker_csv
Array[File?] tbprof_fq_laboratorian = thiagenTBprofilerFQ.tbprofiler_laboratorian_report_csv
Array[File?] tbprof_fq_lims = thiagenTBprofilerFQ.tbprofiler_lims_report_csv
# these outputs only exist if there are multiple samples
File? tbprof_bam_all_depths = collate_bam_depth.outfile
File? tbprof_bam_all_strains = collate_bam_strains.outfile
File? tbprof_bam_all_resistances = collate_bam_resistance.outfile
File? tbprof_fq_all_depths = collate_fq_depth.outfile
File? tbprof_fq_all_strains = collate_fq_strains.outfile
File? tbprof_fq_all_resistances = collate_fq_resistance.outfile
# these outputs only exist if we ran on a single sample
String? tbprof_bam_this_depth = single_sample_tbprof_bam_depth
String? tbprof_bam_this_strain = single_sample_tbprof_bam_strain
String? tbprof_bam_this_resistance = single_sample_tbprof_bam_resistance
String? tbprof_fq_this_depth = single_sample_tbprof_fq_depth
String? tbprof_fq_this_strain = single_sample_tbprof_fq_strain
String? tbprof_fq_this_resistance = single_sample_tbprof_fq_resistance
# tree nine
File? tree_nwk = trees.tree_nwk
File? tree_usher = trees.tree_usher
File? tree_taxonium = trees.tree_taxonium
File? tree_nextstrain = trees.tree_nextstrain
Array[File]? trees_nextstrain = trees.subtrees_nextstrain
Array[File]? distance_matrices = trees.max_distance_matrix
}
}