diff --git a/conf/containers.config b/conf/containers.config index 34333ebb..23ff9114 100755 --- a/conf/containers.config +++ b/conf/containers.config @@ -27,10 +27,10 @@ withName:SomaticDellyCall { container = "cmopipeline/delly-bcftools:0.0.1" } - withName:RunMutect2 { + withName:".*RunMutect2" { container = "broadinstitute/gatk:4.1.0.0" } - withName:SomaticCombineMutect2Vcf { + withName:".*CombineMutect2Vcf" { container = "cmopipeline/bcftools-vt:1.2.0" } withName:SomaticRunManta { @@ -46,7 +46,7 @@ container = "cmopipeline/bcftools-vt:1.2.3" } withName:SomaticAnnotateMaf { - container = "cmopipeline/vcf2maf:vep88_1.2.4" + container = "cmopipeline/vcf2maf:vep88_1.2.5" } withName:DoFacets { container = "cmopipeline/facets-suite-preview-htstools:0.0.1" @@ -98,7 +98,7 @@ container = "cmopipeline/bcftools-vt:1.2.2" } withName:GermlineAnnotateMaf { - container = "cmopipeline/vcf2maf:vep88_1.2.4" + container = "cmopipeline/vcf2maf:vep88_1.2.5" } withName:GermlineFacetsAnnotation { container = "cmopipeline/facets-suite-preview-htstools:0.0.1" @@ -106,6 +106,12 @@ withName:GermlineMergeDellyAndManta { container = "cmopipeline/bcftools-vt:1.1.1" } + withName:FilterGermlineStrelka2 { + container = "cmopipeline/bcftools-vt:1.1.1" + } + withName:generatePoN { + container = "cmopipeline/bcftools-vt:1.1.1" + } diff --git a/containers/vcf2maf/filter-germline-pon-maf.R b/containers/vcf2maf/filter-germline-pon-maf.R new file mode 100755 index 00000000..ec10934d --- /dev/null +++ b/containers/vcf2maf/filter-germline-pon-maf.R @@ -0,0 +1,97 @@ +#!/usr/bin/env Rscript + +# __author__ = "Philip Jonsson" +# __email__ = "jonssonp@mskcc.org" +# __version__ = "0.2.0" +# __status__ = "Dev" + +suppressPackageStartupMessages({ + library(data.table) + library(annotateMaf) + library(argparse) +}) + +Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 3) +args = commandArgs(TRUE) + +if (is.null(args) | length(args)<1) { + message('Run filter-germline-maf.R --help for list of input arguments.') + quit() +} + +parser = ArgumentParser(description = 'Flag and filter somatic variants in input MAF file, output is a filtered and unfiltered but filter-tagged MAF file.') +parser$add_argument('-m', '--maf-file', type = 'character', required = TRUE, + help = 'VEP-annotated MAF file') +parser$add_argument('-o', '--output-prefix', type = 'character', required = TRUE, + help = 'Prefix of output files') +parser$add_argument('-nd', '--normal-depth', type = 'integer', required = FALSE, + default = 20, help = 'Normal variant loci total depth cut-off [default %(default)s]') +parser$add_argument('-nv', '--normal-vaf', type = 'double', required = FALSE, + default = 0.35, help = 'Normal variant allele frequency cut-off [default %(default)s]') +parser$add_argument('-sv', '--somatic-vaf', type = 'double', required = FALSE, + default = 0.05, help = 'Somatic variant allele frequency cut-off [default %(default)s]') + +# Get inputs +args = parser$parse_args() +maf = args$maf_file +output_prefix = args$output_prefix +normal_depth_cutoff = args$normal_depth +normal_vaf_cutoff = args$normal_vaf +somatic_vaf_cutoff = args$somatic_vaf + +add_tag = function(filter, tag) { + ifelse(filter == 'PASS', + tag, + paste(filter, tag, sep = ';')) +} + +format_tag = function(pretag){ + gsub("-","_",gsub(" ", "_", tolower(pretag))) +} + +ch_genes = c("ASXL1", "ATM", "BCOR", "CALR", "CBL", "CEBPA", "CREBBP", "DNMT3A", "ETV6", "EZH2", "FLT3", "GNAS", + "IDH1", "IDH2", "JAK2", "KIT", "KRAS", "MPL", "MYD88", "NF1", "NPM1", "NRAS", "PPM1D", "RAD21", "RUNX1", "SETD2", "SF3B1", "SH2B3", "SRSF2", "STAG2", "STAT3", "TET2", "TP53", "U2AF1", "WT1", "ZRSR2") + +maf = fread(maf, data.table = TRUE) + +# Tag input MAF with filters -------------------------------------------------------------------------------------- +#maf[, `:=` (t_var_freq = t_alt_count/(t_alt_count+t_ref_count), +maf[, `:=` (n_var_freq = n_alt_count/(n_alt_count+n_ref_count), + EncodeDacMapability = ifelse(is.na(EncodeDacMapability), '', EncodeDacMapability), + RepeatMasker = ifelse(is.na(RepeatMasker), '', RepeatMasker), + gnomAD_FILTER = ifelse(is.na(gnomAD_FILTER), 0, 1), + ch_gene = Hugo_Symbol %in% ch_genes, + mutation_effect = ifelse(is.na(mutation_effect), '', mutation_effect), + oncogenic = ifelse(is.na(oncogenic), '', oncogenic) +)] + +maf[n_depth < normal_depth_cutoff, FILTER := add_tag(FILTER, 'low_n_depth')] +maf[n_var_freq < somatic_vaf_cutoff, FILTER := add_tag(FILTER, 'low_vaf')] +#maf[!ch_gene & n_var_freq < normal_vaf_cutoff & !Variant_Classification %in% c('INS', 'DEL'), FILTER := add_tag(FILTER, 'low_n_vaf')] +#maf[!ch_gene & n_var_freq < (normal_vaf_cutoff - 0.10) & Variant_Classification %in% c('INS', 'DEL'), FILTER := add_tag(FILTER, 'low_n_vaf')] +#maf[ch_gene & n_var_freq < normal_vaf_cutoff & t_var_freq < .25, FILTER := add_tag(FILTER, 'ch_mutation')] +maf[ch_gene & n_var_freq < normal_vaf_cutoff, FILTER := add_tag(FILTER, 'ch_mutation')] +#maf[t_var_freq > 3 * n_var_freq, FILTER := add_tag(FILTER, 't_in_n_contamination')] +maf[EncodeDacMapability != '', FILTER := add_tag(FILTER, 'mappability')] +maf[RepeatMasker != '', FILTER := add_tag(FILTER, 'repeatmasker')] +maf[gnomAD_FILTER == 1, FILTER := add_tag(FILTER, 'gnomad_filter')] +maf[mutation_effect != '', FILTER := add_tag(FILTER,format_tag(mutation_effect))] +maf[oncogenic != '', FILTER := add_tag(FILTER,format_tag(oncogenic))] + + +# Add BRCA annotation --------------------------------------------------------------------------------------------- +maf = brca_annotate_maf(maf) +maf = hotspot_annotate_maf(maf) + +# Write filtered and tagged input MAF ----------------------------------------------------------------------------- +maf = as.data.table(maf) # necessary because of the class of output from previous call +#maf[Hotspot == TRUE & t_var_freq >= 0.02 & FILTER == 'low_vaf', FILTER := 'PASS'] # note: variants flagged by other filters will not be rescued by this +#maf[Hotspot == TRUE & FILTER == 'low_mapping_quality', FILTER := 'PASS'] +#maf[Hotspot == TRUE & FILTER == 'low_t_depth', FILTER := 'PASS'] +#maf[Hotspot == TRUE & FILTER == 'strand_bias', FILTER := 'PASS'] +maf[Hotspot == TRUE, FILTER := add_tag(FILTER, 'hotspot')] + +filter_maf = maf[FILTER == 'PASS'] + +fwrite(maf, paste0(output_prefix, '.unfiltered.maf'), sep = '\t') +fwrite(filter_maf, paste0(output_prefix, '.maf'), sep = '\t') diff --git a/pon.nf b/pon.nf new file mode 100644 index 00000000..2389c878 --- /dev/null +++ b/pon.nf @@ -0,0 +1,605 @@ +referenceMap = defineReferenceMap() +outDir = file(params.outDir).toAbsolutePath() +mappingFile = file(params.bamMapping, checkIfExists: true) +tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase()} : [] + +TempoUtils.extractBAM(mappingFile, params.assayType) + .into{bams4MutectGermline; bams4MantaGermline; bamsForStrelkaGermline; bams4QC} + +process GermlineRunManta { + tag {idNormal} + + input: + set idNormal, target, file(bamNormal), file(baiNormal) from bams4MantaGermline + set file(genomeFile), file(genomeIndex) from Channel.value([ + referenceMap.genomeFile, referenceMap.genomeIndex + ]) + set file(svCallingIncludeRegions), file(svCallingIncludeRegionsIndex) from Channel.value([ + referenceMap.svCallingIncludeRegions, referenceMap.svCallingIncludeRegionsIndex + ]) + + output: + set idNormal, target, file("*.candidateSmallIndels.vcf.gz"), file("*.candidateSmallIndels.vcf.gz.tbi") into mantaOutput + + script: + outputPrefix = "${idNormal}" + options = "" + if (params.assayType == "exome") options = "--exome" + """ + configManta.py \\ + ${options} \\ + --callRegions ${svCallingIncludeRegions} \\ + --reference ${genomeFile} \\ + --bam ${bamNormal} \\ + --runDir Manta + python Manta/runWorkflow.py \\ + --mode local \\ + --jobs ${task.cpus} + + mv Manta/results/variants/candidateSmallIndels.vcf.gz \\ + Manta_${outputPrefix}.candidateSmallIndels.vcf.gz + mv Manta/results/variants/candidateSmallIndels.vcf.gz.tbi \\ + Manta_${outputPrefix}.candidateSmallIndels.vcf.gz.tbi + """ +} + +bamsForStrelkaGermline.join(mantaOutput, by:[0,1]) + .set{bamsForStrelkaGermline} + +process GermlineRunStrelka2 { + tag {idNormal} + + //publishDir "${outDir}/pon/${idNormal}/strelka2", mode: params.publishDirMode + + input: + set idNormal, target, file(bamNormal), file(baiNormal), file(mantaCSI), file(mantaCSI_idx) from bamsForStrelkaGermline + set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ + referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict + ]) + set file(idtTargets), file(idtTargetsIndex) from Channel.value([referenceMap.idtTargets, referenceMap.idtTargetsIndex]) + set file(idtv2Targets), file(idtv2TargetsIndex) from Channel.value([referenceMap.idtv2Targets, referenceMap.idtv2TargetsIndex]) + set file(agilentTargets), file(agilentTargetsIndex) from Channel.value([referenceMap.agilentTargets, referenceMap.agilentTargetsIndex]) + set file(wgsIntervals), file(wgsIntervalssIndex) from Channel.value([referenceMap.wgsTargets, referenceMap.wgsTargetsIndex]) + + output: + set idNormal, target, file("variants.vcf.gz"), file("variants.vcf.gz.tbi") into StrelkaGermline_prefilter + + script: + options = "" + intervals = wgsIntervals + if (params.assayType == "exome") { + options = "--exome" + if (target == 'agilent') intervals = agilentTargets + if (target == 'idt') intervals = idtTargets + if (target == 'idt_v2') intervals = idtTargets + } +""" +configureStrelkaGermlineWorkflow.py \\ + ${options} \\ + --callRegions ${intervals} \\ + --referenceFasta ${genomeFile} \\ + --bam ${bamNormal} \\ + --indelCandidates ${mantaCSI} \\ + --runDir Strelka + +python Strelka/runWorkflow.py \\ + --mode local \\ + --jobs ${task.cpus} + +mv Strelka/results/variants/variants.vcf.gz ./variants.vcf.gz +mv Strelka/results/variants/variants.vcf.gz.tbi ./variants.vcf.gz.tbi + +""" +} + +process FilterGermlineStrelka2 { + tag {idNormal} + + publishDir "${outDir}/pon/${idNormal}/strelka2", mode: params.publishDirMode + + input: + set idNormal, target, file("variants.vcf.gz"), file("variants.vcf.gz.tbi") from StrelkaGermline_prefilter + set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ + referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict + ]) + + output: + set idNormal, target, file("${idNormal}.strelka2.vcf.gz"), file("${idNormal}.strelka2.vcf.gz.tbi") into StrelkaGermline_out + + script: + """ + bcftools filter \\ + --include 'FORMAT/AD[0:1]>1' \\ + --output-type v \\ + --output intermediate.vcf \\ + variants.vcf.gz + + sed -i -e 's/ID=RU,Number=1/ID=RU,Number=A/' \\ + -e 's/ID=AD,Number=./ID=AD,Number=R/' \\ + -e 's/ADR,Number=./ADR,Number=R/g' \\ + -e 's/ADF,Number=./ADF,Number=R/g' \\ + -e 's/PL,Number=R/PL,Number=G/g' intermediate.vcf + + bgzip -c intermediate.vcf > intermediate.vcf.gz + tabix -p vcf intermediate.vcf.gz + + bcftools norm \\ + --fasta-ref ${genomeFile} \\ + --check-ref s \\ + --multiallelics -both \\ + --output-type z \\ + --output ${idNormal}.strelka2.vcf.gz \\ + intermediate.vcf.gz + + tabix --preset vcf ${idNormal}.strelka2.vcf.gz + """ + +} + + +process CreateScatteredIntervals { + input: + set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ + referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict + ]) + set file(idtTargets), file(idtv2Targets), file(agilentTargets), file(wgsTargets), + file(idtTargetsIndex), file(idtv2TargetsIndex), file(agilentTargetsIndex), file(wgsTargetsIndex) from Channel.value([ + referenceMap.idtTargets, referenceMap.idtv2Targets, referenceMap.agilentTargets, referenceMap.wgsTargets, + referenceMap.idtTargetsIndex, referenceMap.idtv2TargetsIndex, referenceMap.agilentTargetsIndex, referenceMap.wgsTargetsIndex + ]) + + output: + set file("agilent*.interval_list"), val("agilent"), val("agilent") into agilentIList + set file("idt*.interval_list"), val("idt"), val("idt") into idtIList + set file("wgs*.interval_list"), val("wgs"), val("wgs") into wgsIList + set file("idt*.interval_list"), val("idt_v2"), val("idt_v2") into idtv2IList + + script: + scatterCount = params.scatterCount + """ + gatk SplitIntervals \\ + --reference ${genomeFile} \\ + --intervals ${agilentTargets} \\ + --scatter-count ${scatterCount} \\ + --subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \\ + --output agilent + for i in agilent/*.interval_list; + do + BASENAME=`basename \$i` + mv \$i agilent-\$BASENAME + done + gatk SplitIntervals \\ + --reference ${genomeFile} \\ + --intervals ${idtTargets} \\ + --scatter-count ${scatterCount} \\ + --subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \\ + --output idt + for i in idt/*.interval_list; + do + BASENAME=`basename \$i` + mv \$i idt-\$BASENAME + done + gatk SplitIntervals \\ + --reference ${genomeFile} \\ + --intervals ${idtv2Targets} \\ + --scatter-count ${scatterCount} \\ + --subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW \ + --output idt_v2 + for i in idt_v2/*.interval_list; + do + BASENAME=`basename \$i` + mv \$i idt_v2-\$BASENAME + done + gatk SplitIntervals \\ + --reference ${genomeFile} \\ + --intervals ${wgsTargets} \\ + --scatter-count ${scatterCount} \\ + --subdivision-mode INTERVAL_SUBDIVISION \\ + --output wgs + for i in wgs/*.interval_list; + do + BASENAME=`basename \$i` + mv \$i wgs-\$BASENAME + done + """ +} + +agilentIList.mix(idtIList, wgsIList, idtv2IList).set{mergedIList4N} + +bams4MutectGermline.combine(mergedIList4N, by:1) +.map{ item -> + def idNormal = item[1] + def target = item[0] + def normalBam = item[2] + def normalBai = item[3] + def intervalBed = item[4] + def key = "${idNormal}@${target}" // adding one unique key + + return [ key, idNormal, target, normalBam, normalBai, intervalBed ] +}.map{ + key, idNormal, target, normalBam, normalBai, intervalBed -> + tuple ( + groupKey(key, intervalBed.size()), // adding numbers so that each sample only wait for it's own children processes + idNormal, target, normalBam, normalBai, intervalBed + ) +} +.transpose() +.set{ mergedChannelGermline } + +process GermlineRunMutect2 { + tag {idNormal} + + input: + set key, idNormal, target, file(bamNormal), file(baiNormal), file(intervalBed) from mergedChannelGermline + set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ + referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict + ]) + + output: + // set key, idNormal, target, file("*filtered.vcf.gz"), file("*filtered.vcf.gz.tbi"), file("*Mutect2FilteringStats.tsv") into forGermlineMutect2Combine + set key, idNormal, target, file("${mutect2Vcf}"), file("${mutect2Vcf}.tbi") into forGermlineMutect2Combine + + // definitions for terms in FilterMutectCalls here: https://support.sentieon.com/appnotes/out_fields/ + // Removed FilterMutectCalls because at the end all uncommon variants will be assumed as + script: + mutect2Vcf = "${idNormal}_${intervalBed.baseName}.vcf.gz" + prefix = "${mutect2Vcf}".replaceFirst(".vcf.gz", "") + """ + gatk --java-options -Xmx8g Mutect2 \\ + --reference ${genomeFile} \\ + --intervals ${intervalBed} \\ + --input ${bamNormal} \\ + --tumor ${idNormal} \\ + --output ${mutect2Vcf} + + gatk --java-options -Xmx8g FilterMutectCalls \\ + --variant ${mutect2Vcf} \\ + --stats ${prefix}.Mutect2FilteringStats.tsv \\ + --output ${prefix}.gatk-filtered.vcf.gz + + find . -type f > listoffiles.txt + """ +} + +forGermlineMutect2Combine.groupTuple(by:[0,1,2]).set{ forGermlineMutect2Combine } + +process GermlineCombineMutect2Vcf { + tag {idNormal} + + publishDir "${outDir}/pon/${idNormal}/mutect2", mode: params.publishDirMode + + input: + set id, idNormal, target, file(mutect2Vcf), file(mutect2VcfIndex) from forGermlineMutect2Combine + set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ + referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict + ]) + + output: + set idNormal, target, file("${idNormal}.mutect2.vcf.gz"), file("${idNormal}.mutect2.vcf.gz.tbi") into mutect2CombinedVcf4Combine + + script: + //def idNormal = id.toString().split("@")[0].split("__")[1] + //def target = id.toString().split("@")[1] + def outfile = "${idNormal}.mutect2.vcf.gz" + """ + bcftools concat \\ + --allow-overlaps \\ + ${mutect2Vcf} | \\ + bcftools sort | \\ + bcftools norm \\ + --fasta-ref ${genomeFile} \\ + --check-ref s \\ + --multiallelics -both | \\ + bcftools norm --rm-dup all | \\ + bcftools view \\ + --samples ${idNormal} \\ + --output-type z | \\ + bcftools filter \\ + --include 'FORMAT/AD[0:1]>1' | \\ + sed -e 's/ID=RU,Number=1/ID=RU,Number=A/' -e 's/ID=AD,Number=R/ID=AD,Number=./' | \\ + bcftools norm \\ + --fasta-ref ${genomeFile} \\ + --check-ref s \\ + --multiallelics -both \\ + --output-type z \\ + --output ${outfile} + + tabix --preset vcf ${outfile} + """ +} + +mutect2CombinedVcf4Combine + .combine(StrelkaGermline_out, by:[0,1]) + .set{combineMutect_and_Strelka} + +process GermlineCombineChannel { + tag {idNormal} + + container = "cmopipeline/bcftools-vt:1.2.2" + + publishDir "${outDir}/pon/${idNormal}", mode: params.publishDirMode + + input: + set idNormal, target, file(mutectFile), file(mutectFileTbi), file(strelkaFile), file(strelkaFileTbi) from combineMutect_and_Strelka + set file(gnomadWesVcf), file(gnomadWesVcfIndex), file(gnomadWgsVcf), file(gnomadWgsVcfIndex) from Channel.value([ + referenceMap.gnomadWesVcf, referenceMap.gnomadWesVcfIndex, + referenceMap.gnomadWgsVcf, referenceMap.gnomadWgsVcfIndex + ]) + set file(genomeFile), file(genomeIndex), file(genomeDict) from Channel.value([ + referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict + ]) + set file(repeatMasker), file(repeatMaskerIndex), file(mapabilityBlacklist), file(mapabilityBlacklistIndex) from Channel.value([ + referenceMap.repeatMasker, referenceMap.repeatMaskerIndex, + referenceMap.mapabilityBlacklist, referenceMap.mapabilityBlacklistIndex + ]) + + output: + set idNormal, target, file("${idNormal}.combined.annot.vcf.gz"), file("${idNormal}.combined.annot.vcf.gz.tbi") into combinedVcf, combinedVcf4Annot, combinedVcf4Subtract + + script: + gnomad = gnomadWgsVcf + if (params.assayType == 'genome') { + gnomad = gnomadWgsVcf + } + else if (params.assayType == 'exome') { + gnomad = gnomadWesVcf + } + """ + echo -e '##INFO=' > vcf.map.header + echo -e "##INFO=" > vcf.rm.header + + bcftools isec \\ + --output-type z \\ + --prefix isecDir \\ + ${mutectFile} ${strelkaFile} + + bcftools concat \\ + --allow-overlaps \\ + --rm-dups all \\ + isecDir/0000.vcf.gz isecDir/0001.vcf.gz isecDir/0002.vcf.gz | \\ + bcftools sort \\ + --output-type z \\ + --output-file concat.vcf.gz + + tabix -p vcf concat.vcf.gz + + bcftools annotate \\ + --annotations ${gnomad} \\ + --columns INFO \\ + concat.vcf.gz | \\ + bcftools annotate \\ + --header-lines vcf.map.header \\ + --annotations ${mapabilityBlacklist} \\ + --columns CHROM,FROM,TO,EncodeDacMapability | \\ + bcftools annotate \\ + --header-lines vcf.rm.header \\ + --annotations ${repeatMasker} \\ + --columns CHROM,FROM,TO,RepeatMasker | \\ + bcftools filter \\ + --exclude \"${params.germlineVariant.gnomadAf}\" \\ + --output-type z \\ + --output ${idNormal}.combined.annot.vcf.gz + + tabix -p vcf ${idNormal}.combined.annot.vcf.gz + + bcftools filter \\ + --include 'FILTER=\"PASS\"' \\ + --output-type z \\ + --output ${idNormal}.combined.annot.pass.vcf.gz \\ + ${idNormal}.combined.annot.vcf.gz + + tabix -p vcf ${idNormal}.combined.annot.pass.vcf.gz + + """ +} + +process GermlineAnnotateMaf { + tag {idNormal} + + cpus = 2 + + input: + set idNormal, target, file(vcfGz), file(vcfGzTbi) from combinedVcf4Annot + set file(genomeFile), file(genomeIndex), file(genomeDict), file(vepCache), file(isoforms) from Channel.value([ + referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict, + referenceMap.vepCache, referenceMap.isoforms + ]) + file(filter_script) from Channel.value([workflow.projectDir + "/containers/vcf2maf/filter-germline-pon-maf.R"]) + + output: + set idNormal, target, file("${idNormal}.oncokb.custom-filter.vcf.gz"), file("${idNormal}.oncokb.custom-filter.vcf.gz.tbi") into annotatedNormalVcf + set idNormal, target, file("${idNormal}.oncokb.custom-filter.maf"), file("${idNormal}.oncokb.custom-filter.unfiltered.maf") into annotatedNormalMaf + + + script: + if (target == 'wgs') { + infoCols = "MuTect2,EncodeDacMapability,RepeatMasker,gnomAD_FILTER,AC,AF,AC_nfe_seu,AF_nfe_seu,AC_afr,AF_afr,AC_nfe_onf,AF_nfe_onf,AC_amr,AF_amr,AC_eas,AF_eas,AC_nfe_nwe,AF_nfe_nwe,AC_nfe_est,AF_nfe_est,AC_nfe,AF_nfe,AC_fin,AF_fin,AC_asj,AF_asj,AC_oth,AF_oth,AC_popmax,AN_popmax,AF_popmax" + } + else { + infoCols = "MuTect2,EncodeDacMapability,RepeatMasker,gnomAD_FILTER,non_cancer_AC_nfe_onf,non_cancer_AF_nfe_onf,non_cancer_AC_nfe_seu,non_cancer_AF_nfe_seu,non_cancer_AC_eas,non_cancer_AF_eas,non_cancer_AC_asj,non_cancer_AF_asj,non_cancer_AC_afr,non_cancer_AF_afr,non_cancer_AC_amr,non_cancer_AF_amr,non_cancer_AC_nfe_nwe,non_cancer_AF_nfe_nwe,non_cancer_AC_nfe,non_cancer_AF_nfe,non_cancer_AC_nfe_swe,non_cancer_AF_nfe_swe,non_cancer_AC,non_cancer_AF,non_cancer_AC_fin,non_cancer_AF_fin,non_cancer_AC_eas_oea,non_cancer_AF_eas_oea,non_cancer_AC_raw,non_cancer_AF_raw,non_cancer_AC_sas,non_cancer_AF_sas,non_cancer_AC_eas_kor,non_cancer_AF_eas_kor,non_cancer_AC_popmax,non_cancer_AF_popmax" + } + """ + zcat ${vcfGz} > combined.vcf + perl /opt/vcf2maf.pl \\ + --maf-center MSKCC-CMO \\ + --vep-path /usr/bin/vep \\ + --vep-data ${vepCache} \\ + --vep-forks 4 \\ + --ref-fasta ${genomeFile} \\ + --custom-enst ${isoforms} \\ + --filter-vcf 0 \\ + --retain-info ${infoCols} \\ + --input-vcf combined.vcf \\ + --output-maf ${idNormal}.maf \\ + --normal-id ${idNormal} + + python /usr/bin/oncokb_annotator/MafAnnotator.py \\ + -i ${idNormal}.maf \\ + -o ${idNormal}.oncokb.maf + + Rscript --no-init-file ${filter_script} \\ + --normal-depth ${params.germlineVariant.normalDepth} \\ + --normal-vaf ${params.germlineVariant.normalVaf} \\ + --maf-file ${idNormal}.oncokb.maf \\ + --output-prefix ${idNormal}.oncokb.custom-filter + + perl /opt/maf2vcf.pl \\ + --input-maf ${idNormal}.oncokb.custom-filter.maf \\ + --ref-fasta ${genomeFile} \\ + --output-vcf ${idNormal}.oncokb.custom-filter.vcf \\ + --output-dir dummy + + bgzip ${idNormal}.oncokb.custom-filter.vcf + tabix -p vcf ${idNormal}.oncokb.custom-filter.vcf.gz + + """ + +} + +combinedVcf4Subtract.join(annotatedNormalVcf, by:[0,1]).set{combinedVcf4Subtract} + +process PareVariants { + tag {idNormal} + container = "cmopipeline/bcftools-vt:1.2.2" + + input: + set idNormal, target, file(vcfGz), file(vcfGzTbi), file(filteredVcfGz), file(filteredVcfGzTbi) from combinedVcf4Subtract + + output: + set idNormal, target, file("${idNormal}.filtered.vcf.gz"), file("${idNormal}.filtered.vcf.gz.tbi") into combinedFilteredVcf + + script: + """ + bcftools isec \\ + --output-type z \\ + -n ~11 \\ + -p isecDir \\ + ${vcfGz} ${filteredVcfGz} + + cp isecDir/0000.vcf.gz ${idNormal}.filtered.vcf.gz + cp isecDir/0000.vcf.gz.tbi ${idNormal}.filtered.vcf.gz.tbi + + """ +} + +// combinedVcf +combinedFilteredVcf + .groupTuple(by:[1]) + .view() + .set{ponInputs} + +process generatePoN { + tag {target} + + input: + set idNormal, target, file(vcfFiltered), file(vcfFilteredTbi) from ponInputs + + output: + set target, file("pon_out/pon.vcf.gz"), file("pon_out/pon.vcf.gz.tbi") into ponOutputVcf + + script: + // BCFTOOLS_PLUGINS can be found where bcftools is installed under /libexec/bcftools + minSamples = idNormal.size() > 3 ? 3 : idNormal.size() + """ + export BCFTOOLS_PLUGINS=/opt/hall-lab/bcftools-1.9/libexec/bcftools + mkdir -p isecDir pon_out + bcftools isec \\ + --output-type z \\ + -n +${minSamples} \\ + -p isecDir \\ + ${vcfFiltered} + bcftools merge \\ + --merge both \\ + --output-type z \\ + --output pon_out/pon_deprecated.vcf.gz \\ + isecDir/*vcf.gz + + bcftools merge \\ + --merge both \\ + ${vcfFiltered} | \\ + bcftools +fill-tags \\ + --output-type z \\ + --output pon_out/pon.vcf.gz # \\ + #-- -t all + + tabix --preset vcf pon_out/pon.vcf.gz + tabix --preset vcf pon_out/pon_deprecated.vcf.gz + """ + +} + +def checkParamReturnFile(item) { + params."${item}" = params.genomes[params.genome]."${item}" + if(params."${item}" == null){println "${item} is not found in reference map"; exit 1} + if(file(params."${item}", checkIfExists: false) == []){println "${item} is not found; glob pattern produces empty list"; exit 1} + return file(params."${item}", checkIfExists: true) +} + +def defineReferenceMap() { + if (!(params.genome in params.genomes)) exit 1, "Genome ${params.genome} not found in configuration" + result_array = [ + // genome reference dictionary + 'genomeDict' : checkParamReturnFile("genomeDict"), + // FASTA genome reference + 'genomeFile' : checkParamReturnFile("genomeFile"), + // genome .fai file + 'genomeIndex' : checkParamReturnFile("genomeIndex"), + // VCFs with known indels (such as 1000 Genomes, Mill’s gold standard) + 'svCallingExcludeRegions' : checkParamReturnFile("svCallingExcludeRegions"), + 'svCallingIncludeRegions' : checkParamReturnFile("svCallingIncludeRegions"), + 'svCallingIncludeRegionsIndex' : checkParamReturnFile("svCallingIncludeRegionsIndex"), + // Target and Bait BED files + 'idtTargets' : checkParamReturnFile("idtTargets"), + //'idtTargetsUnzipped' : checkParamReturnFile("idtTargetsUnzipped"), + 'idtTargetsIndex' : checkParamReturnFile("idtTargetsIndex"), + 'idtTargetsList' : checkParamReturnFile("idtTargetsList"), + 'idtBaitsList' : checkParamReturnFile("idtBaitsList"), + 'idtv2Targets' : checkParamReturnFile("idtv2Targets"), + 'idtv2TargetsIndex' : checkParamReturnFile("idtv2TargetsIndex"), + 'idtv2TargetsList' : checkParamReturnFile("idtv2TargetsList"), + 'idtv2BaitsList' : checkParamReturnFile("idtv2BaitsList"), + 'agilentTargets' : checkParamReturnFile("agilentTargets"), + //'agilentTargetsUnzipped' : checkParamReturnFile("agilentTargetsUnzipped"), + 'agilentTargetsIndex' : checkParamReturnFile("agilentTargetsIndex"), + 'agilentTargetsList' : checkParamReturnFile("agilentTargetsList"), + 'agilentBaitsList' : checkParamReturnFile("agilentBaitsList"), + 'wgsTargets' : checkParamReturnFile("wgsTargets"), + //'wgsTargetsUnzipped' : checkParamReturnFile("wgsTargetsUnzipped"), + 'wgsTargetsIndex' : checkParamReturnFile("wgsTargetsIndex") + ] + + if (workflow.profile != "test") { + result_array << ['vepCache' : checkParamReturnFile("vepCache")] + // intervals file for spread-and-gather processes + result_array << ['intervals' : checkParamReturnFile("intervals")] + // files for CombineChannel, needed by bcftools annotate + result_array << ['repeatMasker' : checkParamReturnFile("repeatMasker")] + result_array << ['repeatMaskerIndex' : checkParamReturnFile("repeatMaskerIndex")] + result_array << ['mapabilityBlacklist' : checkParamReturnFile("mapabilityBlacklist")] + result_array << ['mapabilityBlacklistIndex' : checkParamReturnFile("mapabilityBlacklistIndex")] + // isoforms needed by vcf2maf + result_array << ['isoforms' : checkParamReturnFile("isoforms")] + // PON files + result_array << ['exomePoN' : checkParamReturnFile("exomePoN")] + result_array << ['exomePoNIndex' : checkParamReturnFile("exomePoNIndex")] + result_array << ['wgsPoN' : checkParamReturnFile("wgsPoN")] + result_array << ['wgsPoNIndex' : checkParamReturnFile("wgsPoNIndex")] + // gnomAD resources + result_array << ['gnomadWesVcf' : checkParamReturnFile("gnomadWesVcf")] + result_array << ['gnomadWesVcfIndex' : checkParamReturnFile("gnomadWesVcfIndex")] + result_array << ['gnomadWgsVcf' : checkParamReturnFile("gnomadWgsVcf")] + result_array << ['gnomadWgsVcfIndex' : checkParamReturnFile("gnomadWgsVcfIndex")] + // HLA FASTA and *dat for LOHHLA + result_array << ['hlaFasta' : checkParamReturnFile("hlaFasta")] + result_array << ['hlaDat' : checkParamReturnFile("hlaDat")] + // files for neoantigen & NetMHC + result_array << ['neoantigenCDNA' : checkParamReturnFile("neoantigenCDNA")] + result_array << ['neoantigenCDS' : checkParamReturnFile("neoantigenCDS")] + // coding region BED files for calculating TMB + result_array << ['idtCodingBed' : checkParamReturnFile("idtCodingBed")] + result_array << ['idtv2CodingBed' : checkParamReturnFile("idtv2CodingBed")] + result_array << ['agilentCodingBed' : checkParamReturnFile("agilentCodingBed")] + result_array << ['wgsCodingBed' : checkParamReturnFile("wgsCodingBed")] + } + return result_array +}