updated get indels, added transgene annotations, updated channels
nidhidav committed Sep 5, 2024
1 parent 10cde29 commit 83554db
Showing 10 changed files with 1,240 additions and 265 deletions.
649 changes: 649 additions & 0 deletions bin/extract_variant_reads.py

Large diffs are not rendered by default.

737 changes: 502 additions & 235 deletions bin/getIndelsFromBam.py

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions bin/make_hotspot_file.py
@@ -42,10 +42,10 @@ def main():
parser.add_argument('--bed', type=checkfile, help="bed hotspot file")
parser.add_argument('--csv', type=checkfile, help="csv hotspot file")
parser.add_argument('--window', type=int, default=200, help="window")
parser.add_argument('--id', help="meta id")
args = parser.parse_args()

window = args.window
# assumes csv_file ranges don't overlap
data_df = pd.DataFrame()
if (args.bed):
bed_file = os.path.realpath(args.bed)
@@ -69,12 +69,13 @@ def main():
row_df['Chromosome'] = rows['Chromosome']
vcf_df = pd.concat([vcf_df, row_df], ignore_index=True)
result_vcf = dataframe_to_vcf(vcf_df)
with open(f"hotspot.vcf", "w") as f:
id = args.id
with open(f"{id}.hotspot.vcf", "w") as f:
f.write(result_vcf)

if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python script.py --bed <bed_file> --csv <csv_file> --window [window]")
print("Usage: python script.py --bed <bed_file> --csv <csv_file> --window [window] --id <id>")
sys.exit(1)
else:
main()
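
With the new `--id` argument the output VCF is named per sample instead of the fixed `hotspot.vcf`. A hypothetical invocation matching the updated usage string (the file names and the `SAMPLE1` id are illustrative only):

    python make_hotspot_file.py --bed hotspots.bed --csv hotspots.csv --window 200 --id SAMPLE1
    # expected to write SAMPLE1.hotspot.vcf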
25 changes: 25 additions & 0 deletions modules/local/annotate_transgene.nf
@@ -0,0 +1,25 @@
process ANNOTATE_TRANSGENE_VARIANTS {
tag "$meta.id"
label 'process_low'
label 'final_output'
container "ghcr.io/dhslab/docker-vep:release_105"

input:
tuple val(meta), path(files), path("${meta.id}.transgene_out.tsv")

output:
tuple val(meta), path("${meta.id}.transgene.annotated.tsv"), emit: annotated_transgenes
path "versions.yml", emit: versions

script:
"""
awk 'BEGIN {FS=OFS="\\t"} NR > 1 {sub(/^chr/, "", \$1); for (i = 1; i < NF; i++) {printf "%s%s", \$i, (i == NF-1 ? "\\n" : OFS)}}' ${meta.id}.transgene_out.tsv > ${meta.id}.edited.transgene.tsv && \\
/opt/vep/src/ensembl-vep/vep --cache --dir /storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/VEP_cache --symbol --per_gene -i ${meta.id}.edited.transgene.tsv -o ${meta.id}.transgene.annotated.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
vep: \$(/opt/vep/src/ensembl-vep/vep 2>&1 | grep ensembl-vep | cut -d ':' -f 2 | sed 's/\s*//g')
END_VERSIONS
"""

}
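
The awk preprocessing step skips the header row (`NR > 1`), strips a leading `chr` from the first column, and drops the last column, leaving a tab-separated file that VEP can read. On a made-up input row (the column layout is illustrative, not taken from the source):

    chr2    60000000    60000001    -/AAV    +    junction_read_count

the output would be

    2    60000000    60000001    -/AAV    +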
8 changes: 4 additions & 4 deletions modules/local/get_indels.nf
@@ -2,19 +2,19 @@ process GET_INDELS {
tag "$meta.id"
label 'process_low'
label 'final_output'
container "ghcr.io/dhslab/docker-cleutils"
container "ghcr.io/dhslab/docker-baseimage:latest"
errorStrategy 'ignore'

input:
tuple val(meta), path(files)
path(targetfile)
tuple val(meta), path(files), path(hotspot_file)

output:
tuple val(meta), path("${meta.id}.indels.txt")
path "versions.yml", emit: versions

script:
"""
getIndelsFromBam.py ${targetfile} ${meta.id}_tumor.cram ${meta.id}.cram > ${meta.id}.indels.txt
extract_variant_reads.py ${hotspot_file} ${meta.id}_tumor.cram ${meta.id}.cram > ${meta.id}.indels.txt
cat <<-END_VERSIONS > versions.yml
"${task.process}":
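
GET_INDELS now receives each sample's hotspot file inside the sample tuple rather than as a single shared `targetfile`, and calls the new `extract_variant_reads.py` in place of `getIndelsFromBam.py` with the same argument shape. A well-formed input element would look roughly like this (paths hypothetical):

    [ [id: 'SAMPLE1'], ['SAMPLE1_tumor.cram', 'SAMPLE1.cram'], 'SAMPLE1.hotspot.vcf' ]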
2 changes: 1 addition & 1 deletion modules/local/get_transgene_junctions.nf
@@ -8,7 +8,7 @@ process GET_TRANSGENE_JUNCTIONS {
tuple val(meta), path(files)

output:
tuple val(meta), path("${meta.id}.transgene_out.tsv")
tuple val(meta), path("${meta.id}.transgene_out.tsv"), emit: transgene_file
path "versions.yml" , emit: versions

script:
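
Naming the emit lets downstream wiring refer to this output channel by name rather than by position; the new join in workflows/scge.nf (further down in this commit) does exactly that:

    annotate_transgene_input = ch_dragen_outputs.join(GET_TRANSGENE_JUNCTIONS.out.transgene_file)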
2 changes: 1 addition & 1 deletion modules/local/make_hotspot_file.nf
@@ -25,7 +25,7 @@ process MAKE_HOTSPOT_FILE {
def bed = bed_file.name != 'NO_FILE.bed' ? "--bed $bed_file" : ''
def csv = csv_file.name != 'NO_FILE.csv' ? "--csv $csv_file" : ''
"""
make_hotspot_file.py $bed $csv --window $params.window
make_hotspot_file.py $bed $csv --window $params.window --id ${info[0].id}
cat <<-END_VERSIONS > versions.yml
"${task.process}":
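
Because make_hotspot_file.py now writes `<id>.hotspot.vcf`, the module's output declaration (outside this diff hunk) presumably captures the id-based name, along these lines, matching the `hotspot_vcf` emit consumed in workflows/scge.nf:

    output:
    tuple val(info), path("${info[0].id}.hotspot.vcf"), emit: hotspot_vcf  // assumed; declaration not shown in the diff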
6 changes: 3 additions & 3 deletions nextflow.config
@@ -52,9 +52,9 @@ params {

// Dragen inputs
dragen_inputs {
dragen_hash = "/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/hg38_PLVM_CD19_CARv4_cd34"
snv_noisefile = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dragen_v1.0_systematic_noise.nextera_wgs.120920.bed.gz"
sv_noisefile = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/WGS_v2.0.0_hg38_sv_systematic_noise.bedpe.gz"
dragen_hash = "/storage1/fs1/dspencer/Active/clinseq/projects/scge/data/refdata/singh_v4.3.6"
snv_noisefile = "/storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/dragenfiles/IDPF_WGS_hg38_v.2.0.0_systematic_noise.snv.bed.gz"
sv_noisefile = "/storage1/fs1/dspencer/Active/spencerlab/refdata/hg38/dragenfiles/IDPF_WGS_hg38_v3.0.0_systematic_noise.sv.bedpe.gz"
dbsnp = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dbsnp.vcf.gz"
dbsnp_index = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/dbsnp.vcf.gz.tbi"
pop_af_vcf = "/storage1/fs1/dspencer/Active/clinseq/projects/chromoseq/refdata/dragen_align_inputs/hg38/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
21 changes: 19 additions & 2 deletions subworkflows/local/somatic_input_check.nf
@@ -156,8 +156,26 @@ workflow SOMATIC_INPUT_CHECK {

ch_input_data = ch_input_data.mix(ch_bam)

ch_mastersheet
.map { meta ->
if (meta.dragen_path != null){
def new_meta = [:]
new_meta['id'] = meta.id
return [new_meta]
}
}
.set{ch_ids}
ch_input_data = ch_input_data.mix(ch_ids)

ch_hotspots = ch_mastersheet
.map{ row -> [row.uid, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]}
.map{ row ->
if (row.uid != null) {
return [row.uid, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]
}
else {
return [row.id, row.hotspot_file ?: "$projectDir/assets/NO_FILE.csv"]
}
}
.unique()

ch_input_data = ch_input_data
@@ -175,7 +193,6 @@
}
}
.set { ch_dragen_outputs }
ch_dragen_outputs.dump(tag: 'dragen_output')

emit:
dragen_outputs = ch_dragen_outputs
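
With these changes, `ch_hotspots` emits one `[key, file]` pair per mastersheet row, keyed on `uid` when the sheet provides one and falling back to `id` otherwise, with the `NO_FILE.csv` placeholder standing in when no hotspot file is listed. A hypothetical result (values illustrative):

    ['case_1_uid', '/data/case_1.hotspots.csv']
    ['SAMPLE2', "$projectDir/assets/NO_FILE.csv"]  // row had no uid and no hotspot_file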
48 changes: 32 additions & 16 deletions workflows/scge.nf
Expand Up @@ -50,6 +50,7 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft
include { MAKE_HOTSPOT_FILE } from '../modules/local/make_hotspot_file.nf'
include { DRAGEN_SCGE } from '../modules/local/dragen_scge.nf'
include { ANNOTATE_VARIANTS } from '../modules/local/annotate_variants.nf'
include { ANNOTATE_TRANSGENE_VARIANTS } from '../modules/local/annotate_transgene.nf'
include { GET_INDELS } from '../modules/local/get_indels.nf'
include { GET_TRANSGENE_JUNCTIONS } from '../modules/local/get_transgene_junctions.nf'
include { REFORMAT_CNV_DATA } from '../modules/local/reformat_cnv_data.nf'
@@ -103,36 +104,51 @@ workflow SCGE {
SOMATIC_INPUT_CHECK(Channel.fromPath(mastersheet), data_path)

ch_input_data = SOMATIC_INPUT_CHECK.out.input_data
hotspot_input = ch_input_data.combine(hotspot_bed)
MAKE_HOTSPOT_FILE(hotspot_input)
ch_input_data = MAKE_HOTSPOT_FILE.out.hotspot_vcf
.map{ info, hotspot_vcf ->
def newinfo = []
newinfo = info + [hotspot_vcf]
newinfo
}

ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs)
ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs))
ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs))
ch_hotspots = ch_input_data.map{ meta, file -> return [meta[0]['id'], file] }

if (params.run_dragen == true) {

hotspot_input = ch_input_data.combine(hotspot_bed)
MAKE_HOTSPOT_FILE(hotspot_input)
ch_input_data = MAKE_HOTSPOT_FILE.out.hotspot_vcf
.map{ info, hotspot_vcf ->
def newinfo = []
newinfo = info + [hotspot_vcf]
newinfo
}

ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs)
ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs))
ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs))

DRAGEN_SCGE (ch_input_data, ch_dragen_inputs)
ch_versions = ch_versions.mix(DRAGEN_SCGE.out.versions)
ch_dragen_outputs = ch_dragen_outputs.mix(DRAGEN_SCGE.out.dragen_output)
} else {
ch_dragen_outputs = ch_dragen_outputs.mix(SOMATIC_INPUT_CHECK.out.dragen_outputs)
ch_dragen_inputs = Channel.value(stageFileset(params.dragen_inputs))
ch_assay_inputs = Channel.value(stageFileset(params.assay_inputs))
}

if (params.run_analysis == true) {

ANNOTATE_VARIANTS (ch_dragen_outputs)
ANNOTATE_VARIANTS (ch_dragen_outputs, ch_assay_inputs, params.fasta)
ch_versions = ch_versions.mix(ANNOTATE_VARIANTS.out.versions)

GET_INDELS (ch_dragen_outputs)

get_indels_input = ch_dragen_outputs
.map{ meta -> [meta[0]['id'], meta] }
.combine(ch_hotspots, by: 0)
.map{id, meta, hotspot_file -> [meta[0], meta[1], hotspot_file]}
GET_INDELS (get_indels_input)
ch_versions = ch_versions.mix(GET_INDELS.out.versions)

GET_TRANSGENE_JUNCTIONS (ch_dragen_outputs)
ch_versions = ch_versions.mix(GET_TRANSGENE_JUNCTIONS.out.versions)

annotate_transgene_input = ch_dragen_outputs.join(GET_TRANSGENE_JUNCTIONS.out.transgene_file)
ANNOTATE_TRANSGENE_VARIANTS (annotate_transgene_input)
ch_versions = ch_versions.mix(ANNOTATE_TRANSGENE_VARIANTS.out.versions)

REFORMAT_CNV_DATA (ch_dragen_outputs)
ch_versions = ch_versions.mix(REFORMAT_CNV_DATA.out.versions)
}
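
The `get_indels_input` wiring above is a standard key-join: tag each dragen output with its sample id, `combine(..., by: 0)` to pair it with the matching hotspot file, then drop the key. A minimal self-contained sketch of the same pattern, with toy values rather than pipeline data:

    workflow {
        ch_outputs  = Channel.of([[id: 'S1'], ['S1.cram']], [[id: 'S2'], ['S2.cram']])
        ch_hotspots = Channel.of(['S1', 'S1.hotspot.vcf'], ['S2', 'S2.hotspot.vcf'])

        ch_outputs
            .map { meta, files -> [meta.id, meta, files] }               // key on sample id
            .combine(ch_hotspots, by: 0)                                 // pair with matching hotspot
            .map { id, meta, files, hotspot -> [meta, files, hotspot] }  // drop the key
            .view()
    }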
