-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
120 lines (111 loc) · 6.09 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pandas as pd
samples_df = pd.read_table('data/runs/primeira/samples.tsv').set_index("samples", drop=False)
SAMPLES = list(samples_df['samples'])
SOURCE = list(samples_df['source'])
DESTINY = list(samples_df['destiny'])
CELL = list(samples_df['cell'])
HEAD = list(samples_df['read'])
EMAIL = list(samples_df['email'])
SERVER = list(samples_df['server'])
PBC_df = pd.read_table('data/barcodes/T/PBC.txt', header=None)
PBC = list(PBC_df[0])
rule all:
input:
expand("{destiny}{sample}.zip", destiny=DESTINY, sample=SAMPLES)
#run fastqc for quality control
#trim 2nt in the begining of each read
rule trim_reads:
input:
expand("{source}{sample}_S1_R1_001.fastq.gz", source=SOURCE, sample=SAMPLES),
expand("{source}{sample}_S1_R2_001.fastq.gz", source=SOURCE, sample=SAMPLES)
output:
expand("{destiny}{sample}_S1_R1_001.fastq", sample=SAMPLES, destiny=DESTINY),
expand("{destiny}{sample}_S1_R2_001.fastq", sample=SAMPLES, destiny=DESTINY)
params:
QCinput=expand("{source}", source=SOURCE),
QCfolder=expand("{destiny}", destiny=DESTINY)
conda:
"envs/fastx.yaml"
shell:"""
zcat {input[0]} | fastx_trimmer -Q33 -f 3 -o {output[0]} && zcat {input[1]} | fastx_trimmer -Q33 -f 3 -o {output[1]}
fastqc {input[0]}
fastqc {input[1]}
mkdir {params.QCfolder}QC
mv {params.QCinput}*zip {params.QCinput}*html {params.QCfolder}QC/
"""
#run pandaseq
rule pandaseq:
input:
expand("{source}{sample}_S1_R1_001.fastq", source=DESTINY, sample=SAMPLES),
expand("{source}{sample}_S1_R2_001.fastq", source=DESTINY, sample=SAMPLES)
output:
expand("{destiny}{sample}.fasta", sample=SAMPLES, destiny=DESTINY)
conda:
"envs/pandaseq.yaml"
shell:
"pandaseq -f {input[0]} -r {input[1]} -w {output}"
#Split fasta files into plates by each barcodes
rule genfasta:
input:
expand("{source}{sample}.fasta", source=DESTINY, sample=SAMPLES),
expand("data/barcodes/{cell}PBC.txt", cell=CELL, sample=SAMPLES),
expand("data/barcodes/{cell}BC.txt", cell=CELL, sample=SAMPLES)
conda:
"envs/fastx.yaml"
params:
fo1=expand("{destiny}split1/", destiny=DESTINY),
fo2=expand("{destiny}split1/split2/", destiny=DESTINY),
fo3=expand("{destiny}split1/split2/collapsed/", destiny=DESTINY),
head=expand("{head}", head=HEAD),
fo4=expand("{destiny}split1/split2/collapsed/head/", destiny=DESTINY),
fo5=expand("{destiny}split1/split2/collapsed/head/final/", destiny=DESTINY),
sample=expand("{sample}", sample=SAMPLES),
removal=expand("{destiny}", destiny=DESTINY)
output:
expand("{destiny}split1/{sample}", destiny=DESTINY, sample=SAMPLES)
shell:"""
touch {output}
cat {input[0]} | fastx_barcode_splitter.pl --bcfile {input[1]} --bol --prefix {output} --suffix .fasta --exact
rm {params.fo1}*unmatched*
find {params.fo1} -name "*fasta" -size -4k -delete
mkdir -p {params.fo1}split2/
for f in {params.fo1}*fasta ; do x="${{f##*/}}" ; cat $f | fastx_barcode_splitter.pl --bcfile {input[2]} --eol --prefix {params.fo1}split2/${{x%.*}} --suffix .fasta ; done > {params.removal}BC.stats.txt
grep -v 'total' {params.removal}BC.stats.txt > {params.removal}tmp && mv {params.removal}tmp {params.removal}BC.stats.txt
grep -v 'Barcode' {params.removal}BC.stats.txt > {params.removal}tmp && mv {params.removal}tmp {params.removal}BC.stats.txt
rm {params.fo2}*unmatched*
find {params.fo2} -name "*fasta" -size -4k -delete
mkdir -p {params.fo2}collapsed/
for f in {params.fo2}*fasta ; do x="${{f##*/}}" ; fastx_collapser -i $f -o {params.fo2}collapsed/${{x%.*}}.fasta ; done
for f in {params.fo3}*fasta ; do x="${{f##*/}}" ; grep ">" $f | sed s/^.*-//g | sort -nr > {params.fo3}${{x%.*}}.collapsed.stats.txt ; done
ls {params.fo3}*txt | xargs | sed 's/ /\t/g' | sed 's/.collapsed.stats.txt//g' > {params.fo3}col.names
paste {params.fo3}*txt > {params.fo3}collapsed.stats
cat {params.fo3}col.names {params.fo3}collapsed.stats > {params.removal}collapsed.stats.txt
Rscript ./data/scripts/plate_stats.R {params.removal} {params.removal}{params.sample}
mkdir -p {params.fo3}head/
for f in {params.fo3}*fasta ; do x="${{f##*/}}" ; head -n{params.head} $f > {params.fo3}head/${{x%.*}}.fasta ; done
mkdir -p {params.fo4}final/
awk '/>/{{sub(">","&"FILENAME"_")}}1' {params.fo4}*fasta > {params.fo5}{params.sample}.fasta
sed -i 's/>.*\//>/g' {params.fo5}{params.sample}.fasta
sed -i 's/\.fasta//g' {params.fo5}{params.sample}.fasta
"""
rule final:
input:
expand("{destiny}split1/{sample}", destiny=DESTINY, sample=SAMPLES),
output:
expand("{destiny}{sample}.zip", destiny=DESTINY, sample=SAMPLES)
params:
removal=expand("{destiny}", destiny=DESTINY),
finalFasta=expand("{destiny}split1/split2/collapsed/head/final/{sample}.fasta", destiny=DESTINY, sample=SAMPLES),
finalRes=expand("{destiny}{sample}.fasta", destiny=DESTINY, sample=SAMPLES),
QC=expand("{destiny}/QC", destiny=DESTINY),
sample=expand("{sample}.zip", sample=SAMPLES),
server=expand("{server}", server=SERVER),
email=expand("{email}", email=EMAIL),
subject=expand("{sample}", sample=SAMPLES)
shell:"""
mv {params.finalFasta} {params.finalRes}
zip -r {output} {params.finalRes} {params.QC} {params.removal}*pdf {params.removal}*tsv
if [ {params.server} == "mucidalab3" ]; then smbclient '//mucidalab3/LMI_shared' -A ~/.smbclient1.conf -c 'cd ./Pipelines/clonality/results/ ; put {output} {params.sample}'; fi
if [ {params.server} == "victoranas" ]; then smbclient '//victoranas/victoracfs' -A ~/.smbclient2.conf -c 'cd ./Pipelines/clonality/results/; put {output} {params.sample}'; fi
mail -s "Your Clonality Run {params.subject} is completed" {params.email} <<< "Your clonality run is completed sucessfuly and it was uploaded to the folder: Pipelines/clonality/results/ of your lab server."
"""