Merge pull request #1003 from maxplanck-ie/allelic_frombam

Allelic frombam
maxplanck-ie · Apr 10, 2024 · 316cc86 · 316cc86
2 parents 89c13f2 + 1280216
commit 316cc86
Show file tree

Hide file tree

Showing 12 changed files with 168 additions and 114 deletions.
diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh
@@ -101,6 +101,30 @@ touch allelic_BAM_input/allelic_bams/sample1.genome1.sorted.bam \
       allelic_BAM_input/allelic_bams/sample5.genome2.sorted.bam.bai \
       allelic_BAM_input/allelic_bams/sample6.genome1.sorted.bam.bai \
       allelic_BAM_input/allelic_bams/sample6.genome2.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam \
+      allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam.bai \
+      allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam.bai \
       allelic_BAM_input/deepTools_qc/bamPEFragmentSize/fragmentSize.metric.tsv \
       allelic_BAM_input/filtered_bam/sample1.filtered.bam \
       allelic_BAM_input/filtered_bam/sample2.filtered.bam \
@@ -314,6 +338,10 @@ WC=`mRNA-seq -m allelic-mapping,deepTools_qc -i PE_input -o output --sampleSheet
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2539 ]; then exit 1 ; fi
 WC=`mRNA-seq -m allelic-mapping,deepTools_qc,alignment-free -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 3294 ]; then exit 1 ; fi
+WC=`mRNA-seq -m allelic-counting -i allelic_BAM_input/allelic_bams --fromBAM --bamExt '.sorted.bam' -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp"  .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
+if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 659 ]; then exit 1 ; fi
+
+
 
 WC=`noncoding-RNA-seq -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1370 ]; then exit 1 ; fi

diff --git a/docs/content/News.rst b/docs/content/News.rst
@@ -6,6 +6,7 @@ ________________
 
 * added SEACR peaks qc
 * added external bed functionality for differential binding in ChIP-seq and ATAC-seq workflows
+* added "allelic-counting" mode to mRNA-seq, allowing to count reads and run DGE from allelic bam files split e.g. by whatshap
 * fixed copyfile command for sampleSheet
 * removed deprecated --force argument from mamba commands
 * fixes #998

diff --git a/docs/content/workflows/mRNA-seq.rst b/docs/content/workflows/mRNA-seq.rst
@@ -194,6 +194,11 @@ Allele-specific, gene-level differential expression analysis is then performed u
 
 .. note:: **allelic-mapping** mode is mutually exclusive with **mapping** mode
 
+"allelic-counting"
+~~~~~~~~~~~~~~~~~~
+
+**allelic-counting** mode requires the user to input, per sample, 4 bam files, corresponding to haplotype1, haplotype2, unassigned and haplotagged , e.g. as generated by whatshap. The respective suffixes ".genome1", ".genome2", ".unassigned", ".alelle_flagged" are required to be followed by the bam extention ".sorted.bam". This mode is mutually exclusive with "deepTools_qc". Only the allelic version of deepTools qc will be run, by default. Allelic version of featureCounts will be run by default. If sample sheet is provided, allelic DESeq2- or allelic Salmon-based differential gene expression analysis will be run. 
+
 "alignment-free"
 ~~~~~~~~~~~~~~~~
 

diff --git a/snakePipes/common_functions.py b/snakePipes/common_functions.py
@@ -173,6 +173,20 @@ def get_sample_names_bam(infiles, bamExt):
     return sorted(list(set(s)))
 
 
+def get_sample_names_suffix_bam(infiles, bamExt):
+    """
+    Get sample names without file extensions
+    """
+    bamSuff = [x + bamExt for x in [".genome1", ".genome2", ".unassigned", ".allele_flagged"]]
+    s = []
+    for x in infiles:
+        for y in bamSuff:
+            if y in os.path.basename(x):
+                x = os.path.basename(x).replace(y, "")
+                s.append(x)
+    return sorted(list(set(s)))
+
+
 def is_paired(infiles, ext, reads):
     """
     Check for paired-end input files

diff --git a/snakePipes/shared/rscripts/DESeq2.R b/snakePipes/shared/rscripts/DESeq2.R
@@ -29,7 +29,7 @@
 sampleInfoFilePath <- snakemake@params[["sampleSheet"]]
 countFilePath <- snakemake@params[["counts_table"]]
 fdr <- as.numeric(snakemake@params[["fdr"]])
-geneNamesFilePath <- snakemake@input[["symbol_file"]]
+geneNamesFilePath <- snakemake@params[["symbol_file"]]
 importfunc <- snakemake@params[["importfunc"]]
 allelic_info <- as.logical(snakemake@params[["allele_info"]])
 tx2gene_file <- snakemake@params[["tx2gene_file"]]

diff --git a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile
@@ -36,7 +36,8 @@ rule DESeq2:
         tx2gene_file = 'NA',
         rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
         formula = config["formula"],
-        counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table)
+        counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
+        symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
     log:
         out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")),
         err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
@@ -64,20 +65,10 @@ rule DESeq2_Salmon_basic:
         fdr = fdr,
         importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
         allele_info = 'FALSE',
-        tx2gene_file = "Annotation/genes.filtered.t2g",
+        tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"),
         rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
-        formula = config["formula"]
+        formula = config["formula"],
+        counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
+        symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
     conda: CONDA_RNASEQ_ENV
-    shell:
-        "cd {params.outdir} && "
-        "Rscript {params.script} "
-        "{params.sampleSheet} " # 1
-        "../{input.counts_table} " # 2
-        "{params.fdr} " # 3
-        "../{input.symbol_file} " # 4
-        "{params.importfunc} " # 5
-        "{params.allele_info} " # 6
-        "../{input.tx2gene_file} " # 7
-        "{params.rmdTemplate} " # 8
-        "{params.formula} "
-        " > ../{log.out} 2> ../{log.err}"
+    script: "{params.script}"
diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile
@@ -7,40 +7,30 @@ def get_outdir(folder_name,sampleSheet):
 ## DESeq2 (on featureCounts)
 rule DESeq2:
     input:
-        counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode else "featureCounts/counts.tsv",
+        counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode or "allelic-counting" in mode else "featureCounts/counts.tsv",
         sampleSheet = sampleSheet,
         symbol_file = "Annotation/genes.filtered.symbol" #get_symbol_file
     output:
         "{}/DESeq2.session_info.txt".format(get_outdir("DESeq2",sampleSheet))
     benchmark:
         "{}/.benchmark/DESeq2.featureCounts.benchmark".format(get_outdir("DESeq2",sampleSheet))
     params:
-        script=os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
+        script = os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
+        sampleSheet = lambda wildcards,input: input.sampleSheet,
         outdir = get_outdir("DESeq2",sampleSheet),
         fdr = fdr,
         importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
-        allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE',
+        allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode or "allelic-counting" in mode else 'FALSE',
         tx2gene_file = 'NA',
         rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
-        formula = config["formula"]
+        formula = config["formula"],
+        counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
+        symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
     log:
         out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",sampleSheet)),
         err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",sampleSheet))
     conda: CONDA_RNASEQ_ENV
-    shell:
-        "cd {params.outdir} && "
-        "Rscript {params.script} "
-        "{input.sampleSheet} " # 1
-        "../{input.counts_table} " # 2
-        "{params.fdr} " # 3
-        "../{input.symbol_file} " # 4
-        "{params.importfunc} " # 5
-        "{params.allele_info} " # 6
-        "{params.tx2gene_file} " # 7
-        "{params.rmdTemplate} " # 8
-        "{params.formula} " #9
-        " > ../{log.out} 2> ../{log.err}"
-
+    script: "{params.script}"
 
 ## DESeq2 (on Salmon)
 rule DESeq2_Salmon_basic:
@@ -62,23 +52,14 @@ rule DESeq2_Salmon_basic:
         fdr = fdr,
         importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
         allele_info = 'FALSE',
-        tx2gene_file = "Annotation/genes.filtered.t2g",
+        tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"),
         rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
-        formula = config["formula"]
+        formula = config["formula"],
+        counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
+        symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
     conda: CONDA_RNASEQ_ENV
-    shell:
-        "cd {params.outdir} && "
-        "Rscript {params.script} "
-        "{input.sampleSheet} " # 1
-        "../{input.counts_table} " # 2
-        "{params.fdr} " # 3
-        "../{input.symbol_file} " # 4
-        "{params.importfunc} " # 5
-        "{params.allele_info} " # 6
-        "../{input.tx2gene_file} " # 7
-        "{params.rmdTemplate} " # 8
-        "{params.formula} " # 9
-        " > ../{log.out} 2> ../{log.err}"
+    script: "{params.script}"
+
 
 rule DESeq2_Salmon_allelic:
     input:
@@ -99,18 +80,10 @@ rule DESeq2_Salmon_allelic:
         fdr = fdr,
         importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
         allele_info = 'TRUE',
-        tx2gene_file = "Annotation/genes.filtered.t2g",
-        rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd")
+        tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"),
+        rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
+        formula = config["formula"],
+        counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
+        symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
     conda: CONDA_RNASEQ_ENV
-    shell:
-        "cd {params.outdir} && "
-        "Rscript {params.script} "
-        "{input.sampleSheet} " # 1
-        "../{input.counts_table} " # 2
-        "{params.fdr} " # 3
-        "../{input.symbol_file} " # 4
-        "{params.importfunc} " # 5
-        "{params.allele_info} " # 6
-        "../{input.tx2gene_file} " # 7
-        "{params.rmdTemplate} " # 8
-        " > ../{log.out} 2> ../{log.err}"
+    script: "{params.script}"
diff --git a/snakePipes/shared/rules/LinkBam.snakefile b/snakePipes/shared/rules/LinkBam.snakefile
@@ -1,49 +1,74 @@
 import os
 
-rule link_bam:
-    input:
-        indir + "/{sample}" + bamExt
-    output:
-        aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam"
-    params:
-        input_bai = indir + "/{sample}" + bamExt + ".bai",
-        output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai"
-    run:
-        if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
-            os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
-        if not os.path.exists(os.path.join(outdir,output[0])):
-            os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))
-
-if not pipeline=="noncoding-rna-seq":
+if pipeline=="rna-seq" and "allelic-counting" in mode:
+    rule link_bam:
+        input:
+            indir + "/{sample}.{suffix}" + bamExt
+        output:
+            "allelic_bams/{sample}.{suffix}" + bamExt
+        params:
+            input_bai = indir + "/{sample}.{suffix}" + bamExt + ".bai",
+            output_bai = "allelic_bams/{sample}.{suffix}" + bamExt + ".bai"
+        run:
+            if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
+                os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
+            if not os.path.exists(os.path.join(outdir,output[0])):
+                os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))
+
     rule samtools_index_external:
         input:
-            aligner + "/{sample}.bam"
+            "allelic_bams/{sample}.{suffix}" + bamExt
         output:
-            aligner + "/{sample}.bam.bai"
+            "allelic_bams/{sample}.{suffix}" + bamExt + ".bai"
         conda: CONDA_SHARED_ENV
         shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi"
 
-    if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC:
-        rule link_bam_bai_external:
+
+else:
+    rule link_bam:
+        input:
+            indir + "/{sample}" + bamExt
+        output:
+            aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam"
+        params:
+            input_bai = indir + "/{sample}" + bamExt + ".bai",
+            output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai"
+        run:
+            if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
+                os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
+            if not os.path.exists(os.path.join(outdir,output[0])):
+                os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))
+
+    if not pipeline=="noncoding-rna-seq":
+        rule samtools_index_external:
             input:
-                bam = aligner + "/{sample}.bam",
-                bai = aligner + "/{sample}.bam.bai"
+                aligner + "/{sample}.bam"
             output:
-                bam_out = "filtered_bam/{sample}.filtered.bam",
-                bai_out = "filtered_bam/{sample}.filtered.bam.bai",
-            shell: """
-                ln -s ../{input.bam} {output.bam_out};
-                ln -s ../{input.bai} {output.bai_out}
-            """
-
-
-    rule sambamba_flagstat:
-       input:
-           aligner + "/{sample}.bam"
-       output:
-           "Sambamba/{sample}.markdup.txt"
-       log: "Sambamba/logs/{sample}.flagstat.log"
-       conda: CONDA_SAMBAMBA_ENV
-       shell: """
-           sambamba flagstat -p {input} > {output} 2> {log}
-           """
+                aligner + "/{sample}.bam.bai"
+            conda: CONDA_SHARED_ENV
+            shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi"
+
+        if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC:
+            rule link_bam_bai_external:
+                input:
+                    bam = aligner + "/{sample}.bam",
+                    bai = aligner + "/{sample}.bam.bai"
+                output:
+                    bam_out = "filtered_bam/{sample}.filtered.bam",
+                    bai_out = "filtered_bam/{sample}.filtered.bam.bai",
+                shell: """
+                    ln -s ../{input.bam} {output.bam_out};
+                    ln -s ../{input.bai} {output.bai_out}
+                """
+
+
+        rule sambamba_flagstat:
+           input:
+               aligner + "/{sample}.bam"
+           output:
+               "Sambamba/{sample}.markdup.txt"
+           log: "Sambamba/logs/{sample}.flagstat.log"
+           conda: CONDA_SAMBAMBA_ENV
+           shell: """
+               sambamba flagstat -p {input} > {output} 2> {log}
+               """
diff --git a/snakePipes/shared/rules/multiQC.snakefile b/snakePipes/shared/rules/multiQC.snakefile
@@ -66,17 +66,18 @@ def multiqc_input_check(return_value):
             indir += "allelic_bams"
     elif pipeline=="rna-seq":
         # must be RNA-mapping, add files as per the mode
-        if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode:
+        if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode and not "allelic-counting" in mode:
             infiles.append( expand(aligner+"/{sample}.bam", sample = samples) +
                     expand("Sambamba/{sample}.markdup.txt", sample = samples) +
                     expand("deepTools_qc/estimateReadFiltering/{sample}_filtering_estimation.txt",sample=samples)+
                     expand("featureCounts/{sample}.counts.txt", sample = samples))
             indir += aligner + " featureCounts "
             indir += " Sambamba "
             indir += " deepTools_qc "
-        if "allelic-mapping" in mode:
+        if "allelic-mapping" in mode or "allelic-counting" in mode:
             infiles.append( expand("featureCounts/{sample}.allelic_counts.txt", sample = samples) )
             indir += aligner + " featureCounts "
+        if "allelic-mapping" in mode:
             infiles.append( expand("allelic_bams/{sample}.SNPsplit_report.yaml", sample = samples) )
             infiles.append( expand("allelic_bams/{sample}.SNPsplit_sort.yaml", sample = samples) )
             indir += "allelic_bams"