version 1.0
## Copyright Broad Institute, 2017
## This script converts a CRAM to SAM to BAM and outputs a BAM, BAM index, and validation report to a Google bucket.
## To run it on multiple CRAMs, create a sample set in the Data tab.
## This approach was chosen over converting CRAM to BAM directly with Samtools because Samtools 1.3 produces incorrect
## bins due to an old version of htslib included in the package, and Samtools 1.4 and 1.5 have an NM issue that prevents
## the output from validating with Picard.
##
## TESTED: Tested with the Genomes in the Cloud Docker image version 2.3.1-1500064817.
## Versions of other tools on this image at the time of testing:
## PICARD_VER=1.1150
## GATK34_VER=3.4-g3c929b0
## GATK35_VER=3.5-0-g36282e4
## GATK36_VER=3.6-44-ge7d1cd2
## GATK4_VER=4.beta.1
## SAMTOOLS_VER=1.3.1
## BWA_VER=0.7.15.r1140
## TABIX_VER=0.2.5_r1005
## BGZIP_VER=1.3
## SVTOOLKIT_VER=2.00-1650
## Tested by pulling the HG38 reference FASTA and FAI.
## Successfully tested on Cromwell version 47. Does not work on versions < v23 due to output syntax.
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
##
## LICENSING: This script is released under the WDL source code license (BSD-3) (see LICENSE in https://github.com/broadinstitute/wdl).
## Note however that the programs it calls may be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker image for detailed licensing information
## pertaining to the included programs.
##
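## EXAMPLE USAGE: a minimal sketch; the bucket paths, file names, and cromwell.jar
## location below are hypothetical placeholders, not part of this repository.
## inputs.json:
##   {
##     "CramToBamFlow.input_cram":      "gs://my-bucket/NA12878.cram",
##     "CramToBamFlow.sample_name":     "NA12878",
##     "CramToBamFlow.ref_fasta":       "gs://my-bucket/Homo_sapiens_assembly38.fasta",
##     "CramToBamFlow.ref_fasta_index": "gs://my-bucket/Homo_sapiens_assembly38.fasta.fai",
##     "CramToBamFlow.ref_dict":        "gs://my-bucket/Homo_sapiens_assembly38.dict"
##   }
## gotc_docker and preemptible_tries are optional and default as declared below.
## Run with: java -jar cromwell.jar run cram-to-bam.wdl --inputs inputs.json
##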
#WORKFLOW DEFINITION
workflow CramToBamFlow {
  input {
    File ref_fasta
    File ref_fasta_index
    File ref_dict
    File input_cram
    String sample_name
    String gotc_docker = "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"
    Int preemptible_tries = 3
  }
  #converts CRAM to SAM to BAM and makes BAI
  call CramToBamTask {
    input:
      ref_fasta = ref_fasta,
      ref_fasta_index = ref_fasta_index,
      ref_dict = ref_dict,
      input_cram = input_cram,
      sample_name = sample_name,
      docker_image = gotc_docker,
      preemptible_tries = preemptible_tries
  }

  #validates Bam
  call ValidateSamFile {
    input:
      input_bam = CramToBamTask.outputBam,
      docker_image = gotc_docker,
      preemptible_tries = preemptible_tries
  }

  #Outputs Bam, Bai, and validation report to the FireCloud data model
  output {
    File outputBam = CramToBamTask.outputBam
    File outputBai = CramToBamTask.outputBai
    File validation_report = ValidateSamFile.report
  }
}
#Task Definitions
task CramToBamTask {
  input {
    # Command parameters
    File ref_fasta
    File ref_fasta_index
    File ref_dict
    File input_cram
    String sample_name

    # Runtime parameters
    Int additional_disk_size = 20
    Int machine_mem_size = 15
    String docker_image
    Int preemptible_tries
  }
  Float output_bam_size = size(input_cram, "GB") / 0.60
  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + additional_disk_size
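  # Worked example (hypothetical sizes): the division by 0.60 assumes a CRAM is
  # ~60% the size of the equivalent BAM, so a 15 GB CRAM yields an estimated
  # 15 / 0.60 = 25 GB BAM; with ~3 GB of reference files, the request is
  # ceil(15 + 25 + 3) + 20 = 63 GB of disk.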
  #Calls samtools view to do the conversion
  command {
    set -eo pipefail
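    # Stream CRAM -> SAM -> BAM in a single pipe, avoiding an intermediate SAM
    # on disk: the first view decodes against the reference (-h keeps the
    # header), the second re-encodes the stream as BAM (-b)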
    samtools view -h -T ~{ref_fasta} ~{input_cram} |
    samtools view -b -o ~{sample_name}.bam -
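    # Index the BAM, then rename the default .bam.bai so the file matches the
    # .bai name declared in this task's outputs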
    samtools index -b ~{sample_name}.bam
    mv ~{sample_name}.bam.bai ~{sample_name}.bai
  }
  #Run time attributes:
  #Use a docker image with samtools. Set this up as a workspace attribute.
  #One CPU suffices because nothing here is multi-threaded; one is also the default, so cpu is not specified.
  #disk_size should equal input size + output size + buffer
  runtime {
    docker: docker_image
    memory: machine_mem_size + " GB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: preemptible_tries
  }

  #Outputs a BAM and BAI with the same sample name
  output {
    File outputBam = "~{sample_name}.bam"
    File outputBai = "~{sample_name}.bai"
  }
}
#Validates BAM output to ensure it wasn't corrupted during the file conversion
task ValidateSamFile {
  input {
    File input_bam
    Int additional_disk_size = 10
    Int machine_mem_size = 4
    String docker_image
    Int preemptible_tries
  }
  String output_name = basename(input_bam, ".bam") + ".validation_report"
  Int disk_size = ceil(size(input_bam, "GB")) + additional_disk_size
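  # Give the JVM heap 1 GB less than the machine so the OS and off-heap
  # allocations have headroom (a common JVM sizing convention)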
  Int command_mem_size = machine_mem_size - 1
  command {
    java -Xmx~{command_mem_size}G -jar /usr/gitc/picard.jar \
      ValidateSamFile \
      INPUT=~{input_bam} \
      OUTPUT=~{output_name} \
      MODE=SUMMARY \
      IS_BISULFITE_SEQUENCED=false
  }
  #Run time attributes:
  #Use a docker image with picard.jar. Set this up as a workspace attribute.
  #ValidateSamFile exits with a non-zero code when it finds validation problems, so return code 1 is
  #accepted to let the workflow continue and still deliver the report.
  #Read more about return codes here: https://github.com/broadinstitute/cromwell#continueonreturncode
  runtime {
    docker: docker_image
    memory: machine_mem_size + " GB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: preemptible_tries
    continueOnReturnCode: [0, 1]
  }

  #A text file is generated that lists any errors or warnings found.
  output {
    File report = "~{output_name}"
  }
}