-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTypeREF_save_bams.nf
347 lines (276 loc) · 12.6 KB
/
TypeREF_save_bams.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#!/usr/bin/env nextflow
// GLOBAL PARAMETERS
date = new Date().format( 'yyyyMMdd' )
// USER INPUT PARAMETERS
params.meltvcf = null
params.bed = null
params.RM_track = null // Default RM track for hg19 and hg38 are available in the "Ressources" folder
params.TE = "Alu" // SHOULD DISAPEAR TO ALLOW ALL TE AT THE SAME TIME
params.outdir = "TypeREF-${date}"
params.help = null
params.ref = null
params.aln_path = null
params.aln_samples = null
params.maxr = 10
params.exportBams = null
// params.cpu = 1
params.version = "0.0-dev"
// SAY HELLO
log.info """
████████╗██╗ ██╗██████╗ ███████╗ ██████╗ ███████╗███████╗
╚══██╔══╝╚██╗ ██╔╝██╔══██╗██╔════╝ ██╔══██╗██╔════╝██╔════╝
██║ ╚████╔╝ ██████╔╝█████╗█████╗██████╔╝█████╗ █████╗
██║ ╚██╔╝ ██╔═══╝ ██╔══╝╚════╝██╔══██╗██╔══╝ ██╔══╝
██║ ██║ ██║ ███████╗ ██║ ██║███████╗██║
╚═╝ ╚═╝ ╚═╝ ╚══════╝ ╚═╝ ╚═╝╚══════╝╚═╝
V . ${params.version}
Genotyping mobile elements insertions included in a reference genome
"""
//print usage
if (params.help) {
log.info """
------------------------------------------------
-|- Type-REF v${params.version}. help -|-
------------------------------------------------
Usage:
./nextflow run TypeREF.nf [--meltvcf melt.del.vcf(.gz)/--bed RefTE.coordinates.bed] --ref reference.genome.fasta --RM_track reference.TE.bed --aln_path path.to.bam.cram.dir --aln_samples samples.ID.filename.table [options]
Input:
--meltvcf vcf (/vcf.gz) file preduced by MELT-DEL pipeline (Deletion-Merge command)
or
--bed bed file with coordinates of Reference TE to genotype
--ref reference genome used with MELT (.fasta)
--RM_track RepeatMasker track for reference MEI (.bed) note: RM track for hg19 and hg38 are available in the "Ressources" folder
--aln_path path to bam/cram directory (all samples need to be in the same directory however,
the files to analyse will be only those in the aln_samples table)
--aln_samples two columns (tab delimited) with header and samples ID in column 1 and associated samples file names (.bam/.cram) in column 2.
ex: sampleId fileId
NA12878 NA12878.project.blah.bam
NA12831 NA12831.project.blah.bam
--maxr maximum number of fragments mapping to each allele (cap to avoid ref allele inflation)
Output:
--outdir output directory to store results
--exportBams export re-alignments for each sample in a .tar.gz archive
Options:
--TE TE type to be genotyped: Alu | L1 | SVA (default: Alu)
--help this message
"""
exit 1
}
// VALIDATE INPUT
// if ( params.meltvcf == null ) exit 1, "missing input (--meltvcf *vcf/vcf.gz)"
if ( params.RM_track == null ) exit 1, "Error: missing Repeat Masker track (--RM_track *.bed)"
if ( params.TE == null ) exit 1, "Error: missing TE type (any: \"Alu (default)\", \"L1\", \"SVA\")"
if ( !( params.TE == "Alu" || params.TE == "L1" || params.TE == "SVA" )) exit 1, "Error: incorrect TE type entered (use any: \"Alu (default)\", \"L1\", \"SVA\")"
if ( params.ref == null ) exit 1, "Error: missing reference genome (--ref *.fasta)"
if ( params.meltvcf != null && params.bed != null ) exit 1, "Error: --meltvcf and --bed are exclusive"
if ( params.meltvcf == null && params.bed == null ) exit 1, "Error: no TE breakpoints provided (--meltvcf or --bed)"
// ASSIGN INPUT CHANNELS WITH USER-DEFINED FILE PATH
RMtrack_in = Channel.fromPath(params.RM_track)
RMtrack_ch = Channel.fromPath(params.RM_track)
alignPath_ch = Channel.fromPath(params.aln_path)
alignSamples_ch = Channel
.fromPath(params.aln_samples)
.splitCsv(sep: '\t', header:true)
.map { row -> tuple(row.sampleId, file(row.fileId)) }
alignSamples_ch2 = Channel
.fromPath(params.aln_samples)
.splitCsv(sep: '\t', header:true)
.map { row -> row.sampleId }
if ( params.meltvcf != null )
in_ch = Channel.fromPath(params.meltvcf)
else
in_ch = Channel.fromPath(params.bed)
// split the ref genome into independent input channels for each process
ref_TSD = Channel.fromPath(params.ref)
ref_genoinput = Channel.fromPath(params.ref)
ref_geno_gen_ch = Channel.fromPath(params.ref)
ref_vcf_ch = Channel.fromPath(params.ref)
// load insertion-genotype submodules into dedicated channels
insgen_prep_ch = Channel.fromPath( './bin/insertion-genotype/' )
insgen_gen_ch = Channel.fromPath( './bin/insertion-genotype/' )
// put maxreads in its channel
insgen_maxr = Channel.value(params.maxr)
// ----------------------------------------
// STEP 1
// EXTRACT LOCI LIST AND INFO FROM MELT VCF
// ----------------------------------------
process parse_input {
publishDir "${params.outdir}/", mode: 'copy', glob: 'input_loci_correspondance'
input:
file inputfile from in_ch // takes the file from the path in the channel
file RM_track from RMtrack_in // takes the TE type
output:
file "infile" into TypeDEL_in // will output in new channel TypeDEL_in
file "infile" into rescue_in // copy for process 3 (TSD) rescue (those who are not processed by step 3)
file "input_loci_correspondance" into cor_ch
file "input_loci_correspondance" into rescue_cor
script:
"""
parse_input.sh $inputfile $RM_track > infile
"""
}
// ----------------------------------------------
// STEP 2
// MATCH MELT LOCI WITH REPEAT MASKER COORDINATES
// ----------------------------------------------
process matchRMloci {
input:
file infile from TypeDEL_in // takes the imput from precedent step (TypeTE specific input)
file RM_track from RMtrack_ch // takes the TE type
//string TE from TE_ch
output:
file "file.correspondingRepeatMaskerTEs.txt" into RM_refTSD // RM channel to host the output file for next process
file "file.correspondingRepeatMaskerTEs.txt" into RM_inGeno // RM channel to host the output file for geno process
// if 'chr' in the vcf keep like that, otherwise, remove the 'chr' from the RM track for the bedtool intersct!
script:
"""
if grep -q "chr" $infile
then
01_DelP_findcorrespondinginsertion_v3.3.pl -t $RM_track -f $infile -p . -te ${params.TE}
else
sed -E 's/chr//g' $RM_track > RM_track.alt
01_DelP_findcorrespondinginsertion_v3.3.pl -t RM_track.alt -f $infile -p . -te ${params.TE}
fi
"""
}
// -------------
// STEP 3
// REFINE TSDs
// -------------
process findTSDs {
input:
file "file.correspondingRepeatMaskerTEs.txt" from RM_refTSD
file ref from ref_TSD
//file "infile" from rescue_in
//file "input_loci_correspondance" from rescue_cor
output:
file "output_TSD_Intervals.out" into TSD // TSD channel to host the output file
script:
"""
03_DelP_findTSD_forRMTEcordinates_v3.4.pl -t file.correspondingRepeatMaskerTEs.txt -p output_TSD_Intervals.out -g $ref
"""
}
// -------------------------------------
// STEP 4
// CREATE INPUTS FOR insertion-genotype
// -------------------------------------
process createAlleles {
publishDir "${params.outdir}/", mode: 'copy', glob: 'TypeREF.allele'
input:
file "output_TSD_Intervals.out" from TSD
file "file.correspondingRepeatMaskerTEs.txt" from RM_inGeno
file ref from ref_genoinput
file "infile" from rescue_in
file "input_loci_correspondance" from rescue_cor
output:
file "RM_insertions_TSD_strands" into interm_ch
file "RM_insertions_TSD_strands" into RM_ins_vcf_channel
file "TypeREF.allele" into input_Geno_ch_1
file "TypeREF.allele" into input_Geno_ch_2
file "*.fai" into index_ch
file "exclusion.bed" into exclusion_ch
shell:
"""
join -11 -21 <(sort -k1,1 output_TSD_Intervals.out/TEcordinates_with_bothtsd_cordinates.v.3.4.txt) <(sort -k1,1 file.correspondingRepeatMaskerTEs.txt) | sed 's/ /\t/g' | awk '{print \$1"\t"\$2"\t"\$3"\t"\$4"\t"\$10"\t"\$8}' > RM_insertions_TSD_strands
grep -wf <(grep -vwf <(cut -f 1 RM_insertions_TSD_strands) infile | cut -f 1) <(awk '{print \$7"_"\$8"\t"\$0}' input_loci_correspondance) | awk '{print \$1"\t"\$8":"\$9"-"\$10"\t"\$8":"\$9"-"\$10"\tnoTSDs\t"\$13"\t"\$11}' >> RM_insertions_TSD_strands
samtools faidx $ref
deletion_create_input.sh RM_insertions_TSD_strands $ref > TypeREF.allele
awk '{print \$1}' TypeREF.allele | sed 's/:/\t/g;s/-/\t/g' | awk '{print \$1":"\$2"-"\$3"_genome\t500\t"500+(\$3-\$2)}' > exclusion.bed
"""
}
// --------------------------------------
// STEP 5 - (sub: INSERTION-GENOTYPE 1/2)
// CREATE alleles and index with bwa
// --------------------------------------
process insgen_indexAlleles {
input:
file "TypeREF.allele" from input_Geno_ch_1.splitText( by: 8 )
output:
file "genotyping" into allelebase_ch
script:
"""
mkdir genotyping
python2.7 $workflow.projectDir/bin/insertion-genotype/create-alternative-alleles.py --allelefile TypeREF.allele --allelebase genotyping --bwa bwa
"""
}
//----------------------------------
// STEP 6
// Gather loci in a genotype folder
//----------------------------------
process insgen_gatherloc {
input:
file "genotyping" from allelebase_ch.collect()
output:
// file "genotyping/samples/${sampleId}/*.vcf.gz" into indexed_vcfs
file "allelebase" into gathered_loc
script:
// check if we have multiple genotyping folder by searching for "genotyping1" -- otherwise folder is called "genotyping"
// if multiple genotyping folders, make a new "genotyping" folder and copy the content of the different folders into it
"""
mkdir -p allelebase
if [[ -d ./genotyping1 ]]; then
cp -r ./genotyping[0-9]*/* ./allelebase/
else
cp -r ./genotyping/* ./allelebase/
fi
"""
}
//--------------------------------------
// STEP 7 - (sub: INSERTION-GENOTYPE 2/2)
// GENOTYPE!!!!!!!
//--------------------------------------
process insgen_genotype {
input:
set sampleId, file(fileId) from alignSamples_ch
file "TypeREF.allele" from input_Geno_ch_2.toList()
file "allelebase" from gathered_loc.toList()
file "alignments" from alignPath_ch.toList()
file "ref" from ref_geno_gen_ch.toList()
file "*.fai" from index_ch.toList()
file "exclusion.bed" from exclusion_ch.toList()
//val maxr from insgen_maxr.toList()
output:
// file "genotyping/samples/${sampleId}/*.vcf.gz" into indexed_vcfs
file "*.vcf.gz*" into indexed_vcfs
file "allelebase" into export_ch
script:
"""
python2.7 $workflow.projectDir/bin/insertion-genotype/process-sample.py --allelefile TypeREF.allele --allelebase allelebase --samplename ${sampleId} --bwa bwa --bam alignments/${fileId} --reference ref --excludefile exclusion.bed --maxreads ${params.maxr}
bgzip -c allelebase/samples/${sampleId}/${sampleId}.vcf > ${sampleId}.vcf.gz
tabix -p vcf ${sampleId}.vcf.gz
"""
}
process exportBams {
publishDir "." ,mode: 'copy' , pattern: "*.tar.gz"
input:
set sampleId from alignSamples_ch2
file "allelebase" from export_ch
// create a channel to duplicate sampleID from alignSamples_ch
// create a channel to export the allelebase folder to a new channel + use .toList() to have it for each individual
output:
file "*.bams.tar.gz" into TR_bams_ch
when:
params.exportBams // add params.exportBams on top
"""
tar -czhvf ${sampleId}.bams.tar.gz allelebase/samples/${sampleId}/mapping
"""
}
//--------------------------------------
// STEP 8
// Merge vcfs
//--------------------------------------
process mergeVcfs {
publishDir "${params.outdir}/", mode: 'copy'
input:
file vcfFile from indexed_vcfs.collect()
file ref from ref_vcf_ch
file "RM_insertions_TSD_strands" from RM_ins_vcf_channel
output:
file "TypeREF.merged.genotypes.vcf" into typeref_outputs
file "TypeREF.final.genotypes.vcf" into typeref_outputs_2
script:
"""
vcf-merge *.vcf.gz > TypeREF.merged.genotypes.vcf
makevcf.sh $ref ${params.version} > TypeREF.final.genotypes.vcf
"""
}