Skip to content

Commit

Permalink
Merge pull request #1003 from maxplanck-ie/allelic_frombam
Browse files Browse the repository at this point in the history
Allelic frombam
  • Loading branch information
katsikora authored Apr 10, 2024
2 parents 89c13f2 + 1280216 commit 316cc86
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 114 deletions.
28 changes: 28 additions & 0 deletions .ci_stuff/test_dag.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,30 @@ touch allelic_BAM_input/allelic_bams/sample1.genome1.sorted.bam \
allelic_BAM_input/allelic_bams/sample5.genome2.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample6.genome1.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample6.genome2.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam.bai \
allelic_BAM_input/deepTools_qc/bamPEFragmentSize/fragmentSize.metric.tsv \
allelic_BAM_input/filtered_bam/sample1.filtered.bam \
allelic_BAM_input/filtered_bam/sample2.filtered.bam \
Expand Down Expand Up @@ -314,6 +338,10 @@ WC=`mRNA-seq -m allelic-mapping,deepTools_qc -i PE_input -o output --sampleSheet
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2539 ]; then exit 1 ; fi
WC=`mRNA-seq -m allelic-mapping,deepTools_qc,alignment-free -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 3294 ]; then exit 1 ; fi
WC=`mRNA-seq -m allelic-counting -i allelic_BAM_input/allelic_bams --fromBAM --bamExt '.sorted.bam' -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 659 ]; then exit 1 ; fi



WC=`noncoding-RNA-seq -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1370 ]; then exit 1 ; fi
Expand Down
1 change: 1 addition & 0 deletions docs/content/News.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ________________

* added SEACR peaks qc
* added external bed functionality for differential binding in ChIP-seq and ATAC-seq workflows
* added "allelic-counting" mode to mRNA-seq, allowing to count reads and run DGE from allelic bam files split e.g. by whatshap
* fixed copyfile command for sampleSheet
* removed deprecated --force argument from mamba commands
* fixes #998
Expand Down
5 changes: 5 additions & 0 deletions docs/content/workflows/mRNA-seq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@ Allele-specific, gene-level differential expression analysis is then performed u

.. note:: **allelic-mapping** mode is mutually exclusive with **mapping** mode

"allelic-counting"
~~~~~~~~~~~~~~~~~~

**allelic-counting** mode requires the user to input, per sample, 4 bam files, corresponding to haplotype1, haplotype2, unassigned and haplotagged , e.g. as generated by whatshap. The respective suffixes ".genome1", ".genome2", ".unassigned", ".alelle_flagged" are required to be followed by the bam extention ".sorted.bam". This mode is mutually exclusive with "deepTools_qc". Only the allelic version of deepTools qc will be run, by default. Allelic version of featureCounts will be run by default. If sample sheet is provided, allelic DESeq2- or allelic Salmon-based differential gene expression analysis will be run.

"alignment-free"
~~~~~~~~~~~~~~~~

Expand Down
14 changes: 14 additions & 0 deletions snakePipes/common_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,20 @@ def get_sample_names_bam(infiles, bamExt):
return sorted(list(set(s)))


def get_sample_names_suffix_bam(infiles, bamExt):
"""
Get sample names without file extensions
"""
bamSuff = [x + bamExt for x in [".genome1", ".genome2", ".unassigned", ".allele_flagged"]]
s = []
for x in infiles:
for y in bamSuff:
if y in os.path.basename(x):
x = os.path.basename(x).replace(y, "")
s.append(x)
return sorted(list(set(s)))


def is_paired(infiles, ext, reads):
"""
Check for paired-end input files
Expand Down
2 changes: 1 addition & 1 deletion snakePipes/shared/rscripts/DESeq2.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
sampleInfoFilePath <- snakemake@params[["sampleSheet"]]
countFilePath <- snakemake@params[["counts_table"]]
fdr <- as.numeric(snakemake@params[["fdr"]])
geneNamesFilePath <- snakemake@input[["symbol_file"]]
geneNamesFilePath <- snakemake@params[["symbol_file"]]
importfunc <- snakemake@params[["importfunc"]]
allelic_info <- as.logical(snakemake@params[["allele_info"]])
tx2gene_file <- snakemake@params[["tx2gene_file"]]
Expand Down
23 changes: 7 additions & 16 deletions snakePipes/shared/rules/DESeq2.multipleComp.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ rule DESeq2:
tx2gene_file = 'NA',
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table)
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
log:
out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")),
err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
Expand Down Expand Up @@ -64,20 +65,10 @@ rule DESeq2_Salmon_basic:
fdr = fdr,
importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
allele_info = 'FALSE',
tx2gene_file = "Annotation/genes.filtered.t2g",
tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"),
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"]
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
conda: CONDA_RNASEQ_ENV
shell:
"cd {params.outdir} && "
"Rscript {params.script} "
"{params.sampleSheet} " # 1
"../{input.counts_table} " # 2
"{params.fdr} " # 3
"../{input.symbol_file} " # 4
"{params.importfunc} " # 5
"{params.allele_info} " # 6
"../{input.tx2gene_file} " # 7
"{params.rmdTemplate} " # 8
"{params.formula} "
" > ../{log.out} 2> ../{log.err}"
script: "{params.script}"
67 changes: 20 additions & 47 deletions snakePipes/shared/rules/DESeq2.singleComp.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,30 @@ def get_outdir(folder_name,sampleSheet):
## DESeq2 (on featureCounts)
rule DESeq2:
input:
counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode else "featureCounts/counts.tsv",
counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode or "allelic-counting" in mode else "featureCounts/counts.tsv",
sampleSheet = sampleSheet,
symbol_file = "Annotation/genes.filtered.symbol" #get_symbol_file
output:
"{}/DESeq2.session_info.txt".format(get_outdir("DESeq2",sampleSheet))
benchmark:
"{}/.benchmark/DESeq2.featureCounts.benchmark".format(get_outdir("DESeq2",sampleSheet))
params:
script=os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
script = os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
sampleSheet = lambda wildcards,input: input.sampleSheet,
outdir = get_outdir("DESeq2",sampleSheet),
fdr = fdr,
importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE',
allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode or "allelic-counting" in mode else 'FALSE',
tx2gene_file = 'NA',
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"]
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
log:
out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",sampleSheet)),
err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",sampleSheet))
conda: CONDA_RNASEQ_ENV
shell:
"cd {params.outdir} && "
"Rscript {params.script} "
"{input.sampleSheet} " # 1
"../{input.counts_table} " # 2
"{params.fdr} " # 3
"../{input.symbol_file} " # 4
"{params.importfunc} " # 5
"{params.allele_info} " # 6
"{params.tx2gene_file} " # 7
"{params.rmdTemplate} " # 8
"{params.formula} " #9
" > ../{log.out} 2> ../{log.err}"

script: "{params.script}"

## DESeq2 (on Salmon)
rule DESeq2_Salmon_basic:
Expand All @@ -62,23 +52,14 @@ rule DESeq2_Salmon_basic:
fdr = fdr,
importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
allele_info = 'FALSE',
tx2gene_file = "Annotation/genes.filtered.t2g",
tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"),
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"]
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
conda: CONDA_RNASEQ_ENV
shell:
"cd {params.outdir} && "
"Rscript {params.script} "
"{input.sampleSheet} " # 1
"../{input.counts_table} " # 2
"{params.fdr} " # 3
"../{input.symbol_file} " # 4
"{params.importfunc} " # 5
"{params.allele_info} " # 6
"../{input.tx2gene_file} " # 7
"{params.rmdTemplate} " # 8
"{params.formula} " # 9
" > ../{log.out} 2> ../{log.err}"
script: "{params.script}"


rule DESeq2_Salmon_allelic:
input:
Expand All @@ -99,18 +80,10 @@ rule DESeq2_Salmon_allelic:
fdr = fdr,
importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
allele_info = 'TRUE',
tx2gene_file = "Annotation/genes.filtered.t2g",
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd")
tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"),
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
conda: CONDA_RNASEQ_ENV
shell:
"cd {params.outdir} && "
"Rscript {params.script} "
"{input.sampleSheet} " # 1
"../{input.counts_table} " # 2
"{params.fdr} " # 3
"../{input.symbol_file} " # 4
"{params.importfunc} " # 5
"{params.allele_info} " # 6
"../{input.tx2gene_file} " # 7
"{params.rmdTemplate} " # 8
" > ../{log.out} 2> ../{log.err}"
script: "{params.script}"
103 changes: 64 additions & 39 deletions snakePipes/shared/rules/LinkBam.snakefile
Original file line number Diff line number Diff line change
@@ -1,49 +1,74 @@
import os

rule link_bam:
input:
indir + "/{sample}" + bamExt
output:
aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam"
params:
input_bai = indir + "/{sample}" + bamExt + ".bai",
output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai"
run:
if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
if not os.path.exists(os.path.join(outdir,output[0])):
os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))

if not pipeline=="noncoding-rna-seq":
if pipeline=="rna-seq" and "allelic-counting" in mode:
rule link_bam:
input:
indir + "/{sample}.{suffix}" + bamExt
output:
"allelic_bams/{sample}.{suffix}" + bamExt
params:
input_bai = indir + "/{sample}.{suffix}" + bamExt + ".bai",
output_bai = "allelic_bams/{sample}.{suffix}" + bamExt + ".bai"
run:
if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
if not os.path.exists(os.path.join(outdir,output[0])):
os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))

rule samtools_index_external:
input:
aligner + "/{sample}.bam"
"allelic_bams/{sample}.{suffix}" + bamExt
output:
aligner + "/{sample}.bam.bai"
"allelic_bams/{sample}.{suffix}" + bamExt + ".bai"
conda: CONDA_SHARED_ENV
shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi"

if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC:
rule link_bam_bai_external:

else:
rule link_bam:
input:
indir + "/{sample}" + bamExt
output:
aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam"
params:
input_bai = indir + "/{sample}" + bamExt + ".bai",
output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai"
run:
if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
if not os.path.exists(os.path.join(outdir,output[0])):
os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))

if not pipeline=="noncoding-rna-seq":
rule samtools_index_external:
input:
bam = aligner + "/{sample}.bam",
bai = aligner + "/{sample}.bam.bai"
aligner + "/{sample}.bam"
output:
bam_out = "filtered_bam/{sample}.filtered.bam",
bai_out = "filtered_bam/{sample}.filtered.bam.bai",
shell: """
ln -s ../{input.bam} {output.bam_out};
ln -s ../{input.bai} {output.bai_out}
"""


rule sambamba_flagstat:
input:
aligner + "/{sample}.bam"
output:
"Sambamba/{sample}.markdup.txt"
log: "Sambamba/logs/{sample}.flagstat.log"
conda: CONDA_SAMBAMBA_ENV
shell: """
sambamba flagstat -p {input} > {output} 2> {log}
"""
aligner + "/{sample}.bam.bai"
conda: CONDA_SHARED_ENV
shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi"

if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC:
rule link_bam_bai_external:
input:
bam = aligner + "/{sample}.bam",
bai = aligner + "/{sample}.bam.bai"
output:
bam_out = "filtered_bam/{sample}.filtered.bam",
bai_out = "filtered_bam/{sample}.filtered.bam.bai",
shell: """
ln -s ../{input.bam} {output.bam_out};
ln -s ../{input.bai} {output.bai_out}
"""


rule sambamba_flagstat:
input:
aligner + "/{sample}.bam"
output:
"Sambamba/{sample}.markdup.txt"
log: "Sambamba/logs/{sample}.flagstat.log"
conda: CONDA_SAMBAMBA_ENV
shell: """
sambamba flagstat -p {input} > {output} 2> {log}
"""
5 changes: 3 additions & 2 deletions snakePipes/shared/rules/multiQC.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,18 @@ def multiqc_input_check(return_value):
indir += "allelic_bams"
elif pipeline=="rna-seq":
# must be RNA-mapping, add files as per the mode
if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode:
if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode and not "allelic-counting" in mode:
infiles.append( expand(aligner+"/{sample}.bam", sample = samples) +
expand("Sambamba/{sample}.markdup.txt", sample = samples) +
expand("deepTools_qc/estimateReadFiltering/{sample}_filtering_estimation.txt",sample=samples)+
expand("featureCounts/{sample}.counts.txt", sample = samples))
indir += aligner + " featureCounts "
indir += " Sambamba "
indir += " deepTools_qc "
if "allelic-mapping" in mode:
if "allelic-mapping" in mode or "allelic-counting" in mode:
infiles.append( expand("featureCounts/{sample}.allelic_counts.txt", sample = samples) )
indir += aligner + " featureCounts "
if "allelic-mapping" in mode:
infiles.append( expand("allelic_bams/{sample}.SNPsplit_report.yaml", sample = samples) )
infiles.append( expand("allelic_bams/{sample}.SNPsplit_sort.yaml", sample = samples) )
indir += "allelic_bams"
Expand Down
Loading

0 comments on commit 316cc86

Please sign in to comment.