maxplanck-ie · katsikora · Dec 20, 2024 · Sep 12, 2024 · Nov 15, 2024 · Nov 15, 2024
diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh
diff --git a/snakePipes/shared/profiles/local/config.yaml b/snakePipes/shared/profiles/local/config.yaml
@@ -205,4 +205,4 @@ set-resources:
   velo_to_sce:
     mem: 30G
   velocyto:
-    mem: 20G
+    mem: 20G
diff --git a/snakePipes/shared/rscripts/DB_functions.R b/snakePipes/shared/rscripts/DB_functions.R
@@ -343,6 +343,8 @@ writeOutput_chip <- function(chipResultObject, outfile_prefix, fdrcutoff,lfccuto
     full_res[,2]<-full_res[,2]-1
     full_res[,2]<-format(full_res[,2], scientific = FALSE,trim=TRUE)
     full_res[,3]<-format(full_res[,3], scientific = FALSE,trim=TRUE)
+    #write full results
+    write.table(full_res,file="Full.results.bed",row.names=FALSE,col.names=FALSE,sep="\t",quote=FALSE)
     ##filter full result for FDR and LFC and write to output
     full_res.filt<-subset(full_res,(FDR<=fdrcutoff)&(abs(best.logFC)>=lfccutoff))
     if(nrow(full_res.filt)>0){

diff --git a/snakePipes/shared/rscripts/sleuth.R b/snakePipes/shared/rscripts/sleuth.R
@@ -24,17 +24,43 @@ sample_info = read.table(sample_info_file, header=T)#[,1:2]
 cnames.sub<-unique(colnames(sample_info)[2:which(colnames(sample_info) %in% "condition")])
 d<-as.formula(noquote(paste0("~",paste(cnames.sub,collapse="+"))))
 colnames(sample_info)[colnames(sample_info) %in% "name"] ="sample"
+
+#check if sample names are pure numbers
+numbers_only <- function(x) !grepl("\\D", x)
+if(any(numbers_only(sample_info$sample))){sample_info$sample[numbers_only(sample_info$sample)]<-paste0("X",sample_info$sample[numbers_only(sample_info$sample)])}
+rownames(sample_info)<-sample_info$sample
+
 print(sample_info)
 sample_info$sample
 
-sample_id = list.dirs(file.path(indir), recursive=F, full.names=F)
-sample_id = sort(sample_id[grep('[^benchmark][^SalmonIndex]', sample_id, invert=F)])
+
+###MODIFY THIS PART
+#sample_id = list.dirs(file.path(indir), recursive=F, full.names=F)
+#sample_id = sort(sample_id[grep('[^benchmark][^SalmonIndex]', sample_id, invert=F)])
+#if(any(numbers_only(sample_id))){sample_id[numbers_only(sample_id)]<-paste0("X",sample_id[numbers_only(sample_id)])}
 #sample_id = intersect(sample_info$sample, sample_id) # get only those sample that are defined in the sampleInfo!
-sample_id<-sample_id[match(sample_info$sample,sample_id)]
+#sample_id<-sample_id[match(sample_info$sample,sample_id)]
+#print(sample_id)
+#if(any(is.na(sample_id))){stop("Sample names from sample sheet and from Salmon output are not matching each other.")}
+
+#salmon_dirs = sapply(sample_id, function(id) file.path(indir, id))
+#print(salmon_dirs)
+######
+sample_id<- dir(file.path(indir),recursive=FALSE,full.names=TRUE)
+sample_id<-sample_id[grep("*quant.sf",sample_id)]
+sample_id<-sub(".quant.sf","",sample_id)
+names(sample_id)<-basename(sample_id)
+print(sample_id)
+
+if(any(numbers_only(names(sample_id)))){names(sample_id)[numbers_only(names(sample_id))]<-paste0("X",names(sample_id)[numbers_only(names(sample_id))])}
+
+sample_id<-sample_id[match(sample_info$sample,names(sample_id))]
 print(sample_id)
+if(any(is.na(names(sample_id)))){stop("Sample names from sample sheet and from Salmon output are not matching each other.")}
 
-salmon_dirs = sapply(sample_id, function(id) file.path(indir, id))
+salmon_dirs = sample_id
 print(salmon_dirs)
+##########
 
 s2c = mutate(sample_info, path=salmon_dirs)
 ## reorder conditions (for Wald test later on: order of comparison important for fold change)

diff --git a/snakePipes/shared/rules/CSAW.multiComp.snakefile b/snakePipes/shared/rules/CSAW.multiComp.snakefile
@@ -87,7 +87,8 @@ rule CSAW:
     output:
         "{}/CSAW.session_info.txt".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")),
         "{}/DiffBinding_analysis.Rdata".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")),
-        expand("{}".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{{compGroup}}.tsv")) + "/Filtered.results.{change_dir}.bed", change_dir=change_direction)
+        expand("{}".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{{compGroup}}.tsv")) + "/Filtered.results.{change_dir}.bed", change_dir=change_direction),
+        "{}".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")) + "/Full.results.bed"
     benchmark:
         "{}/.benchmark/CSAW.benchmark".format(get_outdir(peakCaller,os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
     params:

diff --git a/snakePipes/shared/rules/CSAW.singleComp.snakefile b/snakePipes/shared/rules/CSAW.singleComp.snakefile
@@ -71,7 +71,8 @@ rule CSAW:
     output:
         "CSAW_{}_{}/CSAW.session_info.txt".format(peakCaller, sample_name),
         "CSAW_{}_{}/DiffBinding_analysis.Rdata".format(peakCaller, sample_name),
-        expand("CSAW_{}_{}".format(peakCaller, sample_name) + "/Filtered.results.{change_dir}.bed", change_dir=change_direction)
+        expand("CSAW_{}_{}".format(peakCaller, sample_name) + "/Filtered.results.{change_dir}.bed", change_dir=change_direction),
+        "CSAW_{}_{}".format(peakCaller, sample_name) + "/Full.results.bed"
     benchmark:
         "CSAW_{}_{}/.benchmark/CSAW.benchmark".format(peakCaller, sample_name)
     params:

diff --git a/snakePipes/shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.singleComp.snakefile
@@ -42,6 +42,7 @@ rule DESeq2_Salmon_basic:
         "{}/.benchmark/DESeq2.Salmon.benchmark".format(get_outdir("DESeq2_Salmon",sampleSheet))
     params:
         script=os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
+        sampleSheet = lambda wildcards,input: input.sampleSheet,
         outdir = get_outdir("DESeq2_Salmon",sampleSheet),
         fdr = fdr,
         importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
@@ -67,6 +68,7 @@ rule DESeq2_Salmon_allelic:
         "{}/.benchmark/DESeq2.SalmonAllelic.benchmark".format(get_outdir("DESeq2_SalmonAllelic",sampleSheet))
     params:
         script=os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
+        sampleSheet = lambda wildcards,input: input.sampleSheet,
         outdir = get_outdir("DESeq2_SalmonAllelic",sampleSheet),
         fdr = fdr,
         importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),

diff --git a/snakePipes/shared/rules/LinkBam.snakefile b/snakePipes/shared/rules/LinkBam.snakefile
@@ -29,42 +29,43 @@ else:
         input:
             indir + "/{sample}" + bamExt
         output:
-            aligner + "/{sample}.unsorted.bam" if pipeline=="ncRNAseq" else aligner + "/{sample}.bam"
+            aligner + "/{sample}.unsorted.bam" if pipeline=="ncRNAseq" else aligner + "/{sample}.markdup.bam"
         params:
             input_bai = indir + "/{sample}" + bamExt + ".bai",
-            output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="ncRNAseq" else aligner + "/{sample}.bam.bai"
+            output_bai = aligner + "/{sample}.unsorted.bam.bai" if pipeline=="ncRNAseq" else aligner + "/{sample}.markdup.bam.bai"
         run:
             if os.path.exists(params.input_bai) and not os.path.exists(os.path.join(outdir,params.output_bai)):
                 os.symlink(params.input_bai,os.path.join(outdir,params.output_bai))
             if not os.path.exists(os.path.join(outdir,output[0])):
                 os.symlink(os.path.join(outdir,input[0]),os.path.join(outdir,output[0]))
 
+
     if not pipeline=="ncRNAseq":
         rule samtools_index_external:
             input:
-                aligner + "/{sample}.bam"
+                aligner + "/{sample}.markdup.bam"
             output:
-                aligner + "/{sample}.bam.bai"
+                aligner + "/{sample}.markdup.bam.bai"
             conda: CONDA_SHARED_ENV
             shell: "if [[ ! -f {output[0]} ]]; then samtools index {input[0]}; fi"
 
-        if not pipeline=="WGBS" or pipeline=="WGBS" and skipBamQC:
-            rule link_bam_bai_external:
-                input:
-                    bam = aligner + "/{sample}.bam",
-                    bai = aligner + "/{sample}.bam.bai"
-                output:
-                    bam_out = "filtered_bam/{sample}.filtered.bam",
-                    bai_out = "filtered_bam/{sample}.filtered.bam.bai",
-                shell: """
-                    ln -s ../{input.bam} {output.bam_out};
-                    ln -s ../{input.bai} {output.bai_out}
+
+        rule link_bam_bai_external:
+            input:
+                bam = aligner + "/{sample}.markdup.bam",
+                bai = aligner + "/{sample}.markdup.bam.bai"
+            output:
+                bam_out = "filtered_bam/{sample}.filtered.bam",
+                bai_out = "filtered_bam/{sample}.filtered.bam.bai",
+            shell: """
+                ln -s ../{input.bam} {output.bam_out};
+                ln -s ../{input.bai} {output.bai_out}
                 """
 
 
         rule sambamba_flagstat:
            input:
-               aligner + "/{sample}.bam"
+               aligner + "/{sample}.markdup.bam"
            output:
                "Sambamba/{sample}.markdup.txt"
            conda: CONDA_SAMBAMBA_ENV

diff --git a/snakePipes/shared/rules/SNPsplit.snakefile b/snakePipes/shared/rules/SNPsplit.snakefile
@@ -22,12 +22,12 @@ elif aligner == "STAR" or aligner == "EXTERNAL_BAM":
     rule snp_split:
         input:
             snp = SNPFile,
-            bam = aligner+"/{sample}.bam"
+            bam = aligner+"/{sample}.markdup.bam"
         output:
-            targetbam = expand("allelic_bams/{{sample}}.{suffix}.bam", suffix = ['allele_flagged', 'genome1', 'genome2', 'unassigned']),
+            targetbam = expand("allelic_bams/{{sample}}.markdup.{suffix}.bam", suffix = ['allele_flagged', 'genome1', 'genome2', 'unassigned']),
             #tempbam = temp(aligner+"/{sample}.sortedByName.bam"),
-            rep1 = "allelic_bams/{sample}.SNPsplit_report.yaml",
-            rep2 = "allelic_bams/{sample}.SNPsplit_sort.yaml"
+            rep1 = "allelic_bams/{sample}.markdup.SNPsplit_report.yaml",
+            rep2 = "allelic_bams/{sample}.markdup.SNPsplit_sort.yaml"
         params:
             pairedEnd = '--paired' if pairedEnd else '',
             outdir = "allelic_bams"
@@ -62,7 +62,7 @@ elif aligner == "STAR" or aligner == "EXTERNAL_BAM":
 
 # sort them
 rule BAMsort_allelic:
-    input: "allelic_bams/{sample}.filtered.{suffix}.bam" if aligner == "Bowtie2" else "allelic_bams/{sample}.{suffix}.bam"
+    input: "allelic_bams/{sample}.filtered.{suffix}.bam" if aligner == "Bowtie2" else "allelic_bams/{sample}.markdup.{suffix}.bam"
     output:
         "allelic_bams/{sample}.{suffix}.sorted.bam"
     threads:

diff --git a/snakePipes/shared/rules/Salmon.snakefile b/snakePipes/shared/rules/Salmon.snakefile
@@ -57,7 +57,7 @@ if pairedEnd:
             gtf = genes_gtf,
             lib_type = getSalmon_libtype(pairedEnd, libraryType)
         threads: 8
-        conda: CONDA_RNASEQ_ENV
+        conda: CONDA_SALMON_ENV
         shell: """
             salmon quant -p {threads} --softclipOverhangs --validateMappings --numBootstraps 50 -i {params.index} -l {params.lib_type} -1 {input.r1} -2 {input.r2} -o {params.outdir}
             """

diff --git a/snakePipes/shared/rules/WGBS.snakefile b/snakePipes/shared/rules/WGBS.snakefile
@@ -24,7 +24,7 @@ if pairedEnd and not fromBAM:
             r1=fastq_dir + "/{sample}" + reads[0] + ".fastq.gz",
             r2=fastq_dir + "/{sample}" + reads[1] + ".fastq.gz"
         output:
-            sbam=temp(aligner+"/{sample}.bam")
+            sbam=temp(aligner+"/{sample}.sorted.bam")
         params:
             bwameth_index=bwameth_index if aligner=="bwameth" else bwameth2_index,
             tempDir = tempDir
@@ -43,7 +43,7 @@ elif not pairedEnd and not fromBAM:
         input:
             r1=fastq_dir + "/{sample}" + reads[0] + ".fastq.gz",
         output:
-            sbam=temp(aligner+"/{sample}.bam")
+            sbam=temp(aligner+"/{sample}.sorted.bam")
         params:
             bwameth_index=bwameth_index if aligner=="bwameth" else bwameth2_index,
             tempDir = tempDir
@@ -57,59 +57,59 @@ elif not pairedEnd and not fromBAM:
             rm -rf "$MYTEMP"
             """
 
-if not fromBAM:
-    rule index_bam:
-        input:
-            aligner+"/{sample}.bam"
-        output:
-            temp(aligner+"/{sample}.bam.bai")
-        conda: CONDA_SHARED_ENV
-        shell: """
-            samtools index "{input}"
-            """
-
-if not skipBamQC:
-    rule markDupes:
-        input:
-            aligner+"/{sample}.bam",
-            aligner+"/{sample}.bam.bai"
-        output:
-            "Sambamba/{sample}.markdup.bam"
-        threads: lambda wildcards: 10 if 10<max_thread else max_thread
-        params:
-            tempDir = tempDir
-        conda: CONDA_SAMBAMBA_ENV
-        shell: """
-            TMPDIR={params.tempDir}
-            MYTEMP=$(mktemp -d "${{TMPDIR:-/tmp}}"/snakepipes.XXXXXXXXXX)
-            sambamba markdup --overflow-list-size 600000 -t {threads} --tmpdir "$MYTEMP/{wildcards.sample}" "{input[0]}" "{output}"
-            rm -rf "$MYTEMP"
-            """
-
-
-    rule indexMarkDupes:
-        input:
-            "Sambamba/{sample}.markdup.bam"
-        output:
-            "Sambamba/{sample}.markdup.bam.bai"
-        params:
-        threads: 1
-        conda: CONDA_SHARED_ENV
-        shell: """
-            samtools index "{input}"
-            """
-
-    rule link_deduped_bam:
-        input:
-            bam="Sambamba/{sample}.markdup.bam",
-            bai="Sambamba/{sample}.markdup.bam.bai"
-        output:
-            bam = "filtered_bam/{sample}.filtered.bam",
-            bai = "filtered_bam/{sample}.filtered.bam.bai"
-        shell: """
-            ln -s ../{input.bam} {output.bam}
-            ln -s ../{input.bai} {output.bai}
-        """
+#if not fromBAM:
+#    rule index_bam:
+#        input:
+#            aligner+"/{sample}.sorted.bam"
+#        output:
+#            temp(aligner+"/{sample}.sorted.bam.bai")
+#        conda: CONDA_SHARED_ENV
+#        shell: """
+#            samtools index "{input}"
+#            """
+
+#if not skipBamQC:
+#    rule markDupes:
+#        input:
+#            aligner+"/{sample}.sorted.bam",
+#            aligner+"/{sample}.sorted.bam.bai"
+#        output:
+#            "Sambamba/{sample}.markdup.bam"
+#        threads: lambda wildcards: 10 if 10<max_thread else max_thread
+#        params:
+#            tempDir = tempDir
+#        conda: CONDA_SAMBAMBA_ENV
+#        shell: """
+#            TMPDIR={params.tempDir}
+#            MYTEMP=$(mktemp -d "${{TMPDIR:-/tmp}}"/snakepipes.XXXXXXXXXX)
+#            sambamba markdup --overflow-list-size 600000 -t {threads} --tmpdir "$MYTEMP/{wildcards.sample}" "{input[0]}" "{output}"
+#            rm -rf "$MYTEMP"
+#            """
+
+
+#    rule indexMarkDupes:
+#        input:
+#            "Sambamba/{sample}.markdup.bam"
+#        output:
+#            "Sambamba/{sample}.markdup.bam.bai"
+#        params:
+#        threads: 1
+#        conda: CONDA_SHARED_ENV
+#        shell: """
+#            samtools index "{input}"
+#            """
+
+#    rule link_deduped_bam:
+#        input:
+#            bam="Sambamba/{sample}.markdup.bam",
+#            bai="Sambamba/{sample}.markdup.bam.bai"
+#        output:
+#            bam = "filtered_bam/{sample}.filtered.bam",
+#            bai = "filtered_bam/{sample}.filtered.bam.bai"
+#        shell: """
+#            ln -s ../{input.bam} {output.bam}
+#            ln -s ../{input.bai} {output.bai}
+#        """
 
 
 rule getRandomCpGs:
@@ -174,7 +174,7 @@ rule calc_Mbias:
     threads: lambda wildcards: 10 if 10<max_thread else max_thread
     conda: CONDA_WGBS_ENV
     shell: """
-        MethylDackel mbias -@ {threads} {params.genome} {input[0]} QC_metrics/{wildcards.sample}
+        MethylDackel mbias -@ {threads} {params.genome} {input[0]} QC_metrics/{wildcards.sample} 2> {output}
         """
 
 
@@ -189,7 +189,7 @@ rule calcCHHbias:
     threads: lambda wildcards: 10 if 10<max_thread else max_thread
     conda: CONDA_WGBS_ENV
     shell: """
-        MethylDackel mbias -@ {threads} --CHH --noCpG --noSVG {params.genome} {input[0]} QC_metrics/{wildcards.sample}
+        MethylDackel mbias -@ {threads} --CHH --noCpG --noSVG {params.genome} {input[0]} QC_metrics/{wildcards.sample} 2> {output}
         """
 
 

diff --git a/snakePipes/shared/rules/bam_filtering.snakefile b/snakePipes/shared/rules/bam_filtering.snakefile
@@ -7,7 +7,7 @@ bam_filter_string = "{} {} {}".format("-F 1024" if dedup else "", "-f 2" if prop
 
 rule samtools_filter:
     input:
-        aligner+"/{sample}.bam"
+        aligner+"/{sample}.markdup.bam"
     output:
         bam = temp("filtered_bam/{sample}.filtered.tmp.bam")
     params:

diff --git a/snakePipes/shared/rules/deepTools_qc.snakefile b/snakePipes/shared/rules/deepTools_qc.snakefile
@@ -3,8 +3,8 @@
 
 rule bamCoverage:
     input:
-        bam = aligner+"/{sample}.bam",
-        bai = aligner+"/{sample}.bam.bai"
+        bam = aligner+"/{sample}.markdup.bam",
+        bai = aligner+"/{sample}.markdup.bam.bai"
     output:
         "bamCoverage/{sample}.seq_depth_norm.bw"
     params:
@@ -143,8 +143,8 @@ rule plotPCA:
 
 rule estimate_read_filtering:
     input:
-        bam = aligner+"/{sample}.bam",
-        bai = aligner+"/{sample}.bam.bai"
+        bam = aligner+"/{sample}.markdup.bam",
+        bai = aligner+"/{sample}.markdup.bam.bai"
     output:
         "deepTools_qc/estimateReadFiltering/{sample}_filtering_estimation.txt"
     conda: CONDA_SHARED_ENV

diff --git a/snakePipes/shared/rules/envs/sleuth.yaml b/snakePipes/shared/rules/envs/sleuth.yaml
@@ -6,3 +6,4 @@ dependencies:
  - r-base = 4.2.0
  - r-sleuth = 0.30.1
  - r-dplyr
+ - r-gtools