Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allelic frombam #1003

Merged
merged 24 commits into from
Apr 10, 2024
28 changes: 28 additions & 0 deletions .ci_stuff/test_dag.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,30 @@ touch allelic_BAM_input/allelic_bams/sample1.genome1.sorted.bam \
allelic_BAM_input/allelic_bams/sample5.genome2.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample6.genome1.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample6.genome2.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam \
allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam \
allelic_BAM_input/allelic_bams/sample1.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample1.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample2.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample2.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample3.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample3.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample4.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample4.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample5.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample5.unassigned.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample6.allele_flagged.sorted.bam.bai \
allelic_BAM_input/allelic_bams/sample6.unassigned.sorted.bam.bai \
allelic_BAM_input/deepTools_qc/bamPEFragmentSize/fragmentSize.metric.tsv \
allelic_BAM_input/filtered_bam/sample1.filtered.bam \
allelic_BAM_input/filtered_bam/sample2.filtered.bam \
Expand Down Expand Up @@ -314,6 +338,10 @@ WC=`mRNA-seq -m allelic-mapping,deepTools_qc -i PE_input -o output --sampleSheet
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 2539 ]; then exit 1 ; fi
WC=`mRNA-seq -m allelic-mapping,deepTools_qc,alignment-free -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 3294 ]; then exit 1 ; fi
WC=`mRNA-seq -m allelic-counting -i allelic_BAM_input/allelic_bams --fromBAM --bamExt '.sorted.bam' -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 659 ]; then exit 1 ; fi



WC=`noncoding-RNA-seq -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "conda installation\|Conda environment" | sed '/^\s*$/d' | wc -l`
if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1370 ]; then exit 1 ; fi
Expand Down
1 change: 1 addition & 0 deletions docs/content/News.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ ________________

* added SEACR peaks qc
* added external bed functionality for differential binding in ChIP-seq and ATAC-seq workflows
* added "allelic-counting" mode to mRNA-seq, allowing to count reads and run DGE from allelic bam files split e.g. by whatshap
* fixed copyfile command for sampleSheet
* removed deprecated --force argument from mamba commands
* fixes #998
Expand Down
5 changes: 5 additions & 0 deletions docs/content/workflows/mRNA-seq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@ Allele-specific, gene-level differential expression analysis is then performed u

.. note:: **allelic-mapping** mode is mutually exclusive with **mapping** mode

"allelic-counting"
~~~~~~~~~~~~~~~~~~

**allelic-counting** mode requires the user to input, per sample, 4 bam files, corresponding to haplotype1, haplotype2, unassigned and haplotagged , e.g. as generated by whatshap. The respective suffixes ".genome1", ".genome2", ".unassigned", ".alelle_flagged" are required to be followed by the bam extention ".sorted.bam". This mode is mutually exclusive with "deepTools_qc". Only the allelic version of deepTools qc will be run, by default. Allelic version of featureCounts will be run by default. If sample sheet is provided, allelic DESeq2- or allelic Salmon-based differential gene expression analysis will be run.

"alignment-free"
~~~~~~~~~~~~~~~~

Expand Down
14 changes: 14 additions & 0 deletions snakePipes/common_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,20 @@ def get_sample_names_bam(infiles, bamExt):
return sorted(list(set(s)))


def get_sample_names_suffix_bam(infiles, bamExt):
"""
Get sample names without file extensions
"""
bamSuff = [x + bamExt for x in [".genome1", ".genome2", ".unassigned", ".allele_flagged"]]
s = []
for x in infiles:
for y in bamSuff:
if y in os.path.basename(x):
x = os.path.basename(x).replace(y, "")
s.append(x)
return sorted(list(set(s)))


def is_paired(infiles, ext, reads):
"""
Check for paired-end input files
Expand Down
2 changes: 1 addition & 1 deletion snakePipes/shared/rscripts/DESeq2.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
sampleInfoFilePath <- snakemake@params[["sampleSheet"]]
countFilePath <- snakemake@params[["counts_table"]]
fdr <- as.numeric(snakemake@params[["fdr"]])
geneNamesFilePath <- snakemake@input[["symbol_file"]]
geneNamesFilePath <- snakemake@params[["symbol_file"]]
importfunc <- snakemake@params[["importfunc"]]
allelic_info <- as.logical(snakemake@params[["allele_info"]])
tx2gene_file <- snakemake@params[["tx2gene_file"]]
Expand Down
23 changes: 7 additions & 16 deletions snakePipes/shared/rules/DESeq2.multipleComp.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ rule DESeq2:
tx2gene_file = 'NA',
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table)
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
log:
out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")),
err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
Expand Down Expand Up @@ -64,20 +65,10 @@ rule DESeq2_Salmon_basic:
fdr = fdr,
importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
allele_info = 'FALSE',
tx2gene_file = "Annotation/genes.filtered.t2g",
tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"),
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"]
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
conda: CONDA_RNASEQ_ENV
shell:
"cd {params.outdir} && "
"Rscript {params.script} "
"{params.sampleSheet} " # 1
"../{input.counts_table} " # 2
"{params.fdr} " # 3
"../{input.symbol_file} " # 4
"{params.importfunc} " # 5
"{params.allele_info} " # 6
"../{input.tx2gene_file} " # 7
"{params.rmdTemplate} " # 8
"{params.formula} "
" > ../{log.out} 2> ../{log.err}"
script: "{params.script}"
67 changes: 20 additions & 47 deletions snakePipes/shared/rules/DESeq2.singleComp.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,30 @@ def get_outdir(folder_name,sampleSheet):
## DESeq2 (on featureCounts)
rule DESeq2:
input:
counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode else "featureCounts/counts.tsv",
counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode or "allelic-counting" in mode else "featureCounts/counts.tsv",
sampleSheet = sampleSheet,
symbol_file = "Annotation/genes.filtered.symbol" #get_symbol_file
output:
"{}/DESeq2.session_info.txt".format(get_outdir("DESeq2",sampleSheet))
benchmark:
"{}/.benchmark/DESeq2.featureCounts.benchmark".format(get_outdir("DESeq2",sampleSheet))
params:
script=os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
script = os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
sampleSheet = lambda wildcards,input: input.sampleSheet,
outdir = get_outdir("DESeq2",sampleSheet),
fdr = fdr,
importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE',
allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode or "allelic-counting" in mode else 'FALSE',
tx2gene_file = 'NA',
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"]
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
log:
out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",sampleSheet)),
err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",sampleSheet))
conda: CONDA_RNASEQ_ENV
shell:
"cd {params.outdir} && "
"Rscript {params.script} "
"{input.sampleSheet} " # 1
"../{input.counts_table} " # 2
"{params.fdr} " # 3
"../{input.symbol_file} " # 4
"{params.importfunc} " # 5
"{params.allele_info} " # 6
"{params.tx2gene_file} " # 7
"{params.rmdTemplate} " # 8
"{params.formula} " #9
" > ../{log.out} 2> ../{log.err}"

script: "{params.script}"

## DESeq2 (on Salmon)
rule DESeq2_Salmon_basic:
Expand All @@ -62,23 +52,14 @@ rule DESeq2_Salmon_basic:
fdr = fdr,
importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
allele_info = 'FALSE',
tx2gene_file = "Annotation/genes.filtered.t2g",
tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"),
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"]
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
conda: CONDA_RNASEQ_ENV
shell:
"cd {params.outdir} && "
"Rscript {params.script} "
"{input.sampleSheet} " # 1
"../{input.counts_table} " # 2
"{params.fdr} " # 3
"../{input.symbol_file} " # 4
"{params.importfunc} " # 5
"{params.allele_info} " # 6
"../{input.tx2gene_file} " # 7
"{params.rmdTemplate} " # 8
"{params.formula} " # 9
" > ../{log.out} 2> ../{log.err}"
script: "{params.script}"


rule DESeq2_Salmon_allelic:
input:
Expand All @@ -99,18 +80,10 @@ rule DESeq2_Salmon_allelic:
fdr = fdr,
importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
allele_info = 'TRUE',
tx2gene_file = "Annotation/genes.filtered.t2g",
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd")
tx2gene_file = os.path.join(outdir,"Annotation/genes.filtered.t2g"),
rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd"),
formula = config["formula"],
counts_table = lambda wildcards,input: os.path.join(outdir,input.counts_table),
symbol_file = lambda wildcards,input: os.path.join(outdir,input.symbol_file)
conda: CONDA_RNASEQ_ENV
shell:
"cd {params.outdir} && "
"Rscript {params.script} "
"{input.sampleSheet} " # 1
"../{input.counts_table} " # 2
"{params.fdr} " # 3
"../{input.symbol_file} " # 4
"{params.importfunc} " # 5
"{params.allele_info} " # 6
"../{input.tx2gene_file} " # 7
"{params.rmdTemplate} " # 8
" > ../{log.out} 2> ../{log.err}"
script: "{params.script}"
103 changes: 64 additions & 39 deletions snakePipes/shared/rules/LinkBam.snakefile
Original file line number Diff line number Diff line change
@@ -1,49 +1,74 @@
import os

rule link_bam:
input:
indir + "/{sample}" + bamExt
output:
aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam"
params:
input_bai = indir + "/{sample}" + bamExt + ".bai",
output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai"
run:
if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
if not os.path.exists(os.path.join(outdir,output[0])):
os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))

if not pipeline=="noncoding-rna-seq":
if pipeline=="rna-seq" and "allelic-counting" in mode:
rule link_bam:
input:
indir + "/{sample}.{suffix}" + bamExt
output:
"allelic_bams/{sample}.{suffix}" + bamExt
params:
input_bai = indir + "/{sample}.{suffix}" + bamExt + ".bai",
output_bai = "allelic_bams/{sample}.{suffix}" + bamExt + ".bai"
run:
if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
if not os.path.exists(os.path.join(outdir,output[0])):
os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))

rule samtools_index_external:
input:
aligner + "/{sample}.bam"
"allelic_bams/{sample}.{suffix}" + bamExt
output:
aligner + "/{sample}.bam.bai"
"allelic_bams/{sample}.{suffix}" + bamExt + ".bai"
conda: CONDA_SHARED_ENV
shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi"

if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC:
rule link_bam_bai_external:

else:
rule link_bam:
input:
indir + "/{sample}" + bamExt
output:
aligner + "/{sample}.unsorted.bam" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam"
params:
input_bai = indir + "/{sample}" + bamExt + ".bai",
output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="noncoding-rna-seq" else aligner + "/{sample}.bam.bai"
run:
if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
if not os.path.exists(os.path.join(outdir,output[0])):
os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))

if not pipeline=="noncoding-rna-seq":
rule samtools_index_external:
input:
bam = aligner + "/{sample}.bam",
bai = aligner + "/{sample}.bam.bai"
aligner + "/{sample}.bam"
output:
bam_out = "filtered_bam/{sample}.filtered.bam",
bai_out = "filtered_bam/{sample}.filtered.bam.bai",
shell: """
ln -s ../{input.bam} {output.bam_out};
ln -s ../{input.bai} {output.bai_out}
"""


rule sambamba_flagstat:
input:
aligner + "/{sample}.bam"
output:
"Sambamba/{sample}.markdup.txt"
log: "Sambamba/logs/{sample}.flagstat.log"
conda: CONDA_SAMBAMBA_ENV
shell: """
sambamba flagstat -p {input} > {output} 2> {log}
"""
aligner + "/{sample}.bam.bai"
conda: CONDA_SHARED_ENV
shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi"

if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC:
rule link_bam_bai_external:
input:
bam = aligner + "/{sample}.bam",
bai = aligner + "/{sample}.bam.bai"
output:
bam_out = "filtered_bam/{sample}.filtered.bam",
bai_out = "filtered_bam/{sample}.filtered.bam.bai",
shell: """
ln -s ../{input.bam} {output.bam_out};
ln -s ../{input.bai} {output.bai_out}
"""


rule sambamba_flagstat:
input:
aligner + "/{sample}.bam"
output:
"Sambamba/{sample}.markdup.txt"
log: "Sambamba/logs/{sample}.flagstat.log"
conda: CONDA_SAMBAMBA_ENV
shell: """
sambamba flagstat -p {input} > {output} 2> {log}
"""
5 changes: 3 additions & 2 deletions snakePipes/shared/rules/multiQC.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,18 @@ def multiqc_input_check(return_value):
indir += "allelic_bams"
elif pipeline=="rna-seq":
# must be RNA-mapping, add files as per the mode
if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode:
if ( "alignment" in mode or "deepTools_qc" in mode or "three-prime-seq" in mode ) and not "allelic-mapping" in mode and not "allelic-counting" in mode:
infiles.append( expand(aligner+"/{sample}.bam", sample = samples) +
expand("Sambamba/{sample}.markdup.txt", sample = samples) +
expand("deepTools_qc/estimateReadFiltering/{sample}_filtering_estimation.txt",sample=samples)+
expand("featureCounts/{sample}.counts.txt", sample = samples))
indir += aligner + " featureCounts "
indir += " Sambamba "
indir += " deepTools_qc "
if "allelic-mapping" in mode:
if "allelic-mapping" in mode or "allelic-counting" in mode:
infiles.append( expand("featureCounts/{sample}.allelic_counts.txt", sample = samples) )
indir += aligner + " featureCounts "
if "allelic-mapping" in mode:
infiles.append( expand("allelic_bams/{sample}.SNPsplit_report.yaml", sample = samples) )
infiles.append( expand("allelic_bams/{sample}.SNPsplit_sort.yaml", sample = samples) )
indir += "allelic_bams"
Expand Down
Loading
Loading