forked from abdelrahmanMA/ngs1_project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
codes.txt
173 lines (134 loc) · 6.16 KB
/
codes.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#Install packages required
source activate ngs1
conda install -c bioconda sra-tools
conda install -c bioconda seqkit
#Create a new file directory for the assignment
mkdir assign && cd assign
#Download dataset SRA file
wget -c ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR879/SRR8797509/SRR8797509.sra
# Converting SRA to FASTQ files
mkdir fastq_files && cd fastq_files
fastq-dump --split-files -X 5000000 ~/assign/SRR8797509.sra
# Splitting unshuffled samples
cd..
mkdir unshuffled_samples && cd unshuffled_samples
seqkit split2 -1 SRR8797509_1.fastq -2 SRR8797509_2.fastq -p 5 -O out -f
#QC for the unshuffled sample one
mkdir fastqc_reports && cd fastqc_reports
for f in ~/assign/unshuffled_samples/out/SRR8797509_*.part_001.fastq;do
fastqc -t 1 -f fastq -noextract -o ~/assign/unshuffled_samples/fastqc_reports $f;done
multiqc -z -o . .
#Shuffling 5M reads and splitting to 5 samples
cd..
mkdir shuffled_samples && cd shuffled_samples
seqkit shuffle ~/assign/fastq_files/SRR8797509_1.fastq >shuffled_1
seqkit shuffle ~/assign/fastq_files/SRR8797509_2.fastq >shuffled_2
seqkit split2 -1 shuffled_1.fastq -2 shuffled_2.fastq -p 5 -O out -f
#QC for shuffled sample one
mkdir fastqc_reports && cd fastqc_reports
for f in ~/assign/shuffled_samples/out/shuffled_*.part_001.fastq;do
fastqc -t 1 -f fastq -noextract -o ~/assign/shuffled_samples/fastqc_reports $f;done
multiqc -z -o . .
#Trimming unshuffled reads:
cd ..
cd unshuffled_samples
mkdir mild_trim && cd mild_trim
for SAMPLE in 1 2 3 4 5;
do
R1="/home/nehal/assign/unshuffled_samples/out/SRR8797509_1.part_00${SAMPLE}.fastq"
R2="/home/nehal/assign/unshuffled_samples/out/SRR8797509_2.part_00${SAMPLE}.fastq"
newf1="/home/nehal/assign/unshuffled_samples/out/SRR8797509_1.part_00${SAMPLE}.pe.trim.fastq"
newf2="/home/nehal/assign/unshuffled_samples/out/SRR8797509_2.part_00${SAMPLE}.pe.trim.fastq"
newf1U="/home/nehal/assign/unshuffled_samples/out/SRR8797509_1.part_00${SAMPLE}.se.trim.fastq"
newf2U="/home/nehal/assign/unshuffled_samples/out/SRR8797509_2.part_00${SAMPLE}.se.trim.fastq"
adap="/home/nehal/miniconda3/envs/ngs1/share/trimmomatic-0.39-1/adapters"
trimmomatic PE -threads 1 -phred33 -trimlog trimLogFile -summary statsSummaryFile $R1 $R2 $newf1 $newf1U $newf2 $newf2U \
ILLUMINACLIP:$adap/TruSeq3-PE.fa:2:30:10:1 SLIDINGWINDOW:4:15 MINLEN:36
done
#Trimming shuffled reads
#I increased the minimum length, widened the sliding window and raised the required quality for aggressive trimming
cd
cd assign/shuffled_samples
mkdir agg_trim && cd agg_trim
for SAMPLE in 1 2 3 4 5;
do
R1="/home/nehal/assign/shuffled_samples/out/shuffled_1.part_00${SAMPLE}.fastq"
R2="/home/nehal/assign/shuffled_samples/out/shuffled_2.part_00${SAMPLE}.fastq"
newf1="/home/nehal/assign/shuffled_samples/out/shuffled_1.part_00${SAMPLE}.pe.trim.fastq"
newf2="/home/nehal/assign/shuffled_samples/out/shuffled_2.part_00${SAMPLE}.pe.trim.fastq"
newf1U="/home/nehal/assign/shuffled_samples/out/shuffled_1.part_00${SAMPLE}.se.trim.fastq"
newf2U="/home/nehal/assign/shuffled_samples/out/shuffled_2.part_00${SAMPLE}.se.trim.fastq"
adap="/home/nehal/miniconda3/envs/ngs1/share/trimmomatic-0.39-1/adapters"
trimmomatic PE -threads 1 -phred33 -trimlog trimLogFile -summary statsSummaryFile $R1 $R2 $newf1 $newf1U $newf2 $newf2U \
ILLUMINACLIP:$adap/TruSeq3-PE.fa:2:30:10:1 SLIDINGWINDOW:6:18 MINLEN:38
done
#BWA alignment for unshuffled samples
#Indexing
cd
mkdir -p ~/assign/bwa_align/bwaIndex && cd ~/assign/bwa_align/bwaIndex
ln -s ~/workdir/sample_data/chr22_with_ERCC92.fa .
bwa index -a bwtsw chr22_with_ERCC92.fa
#Alignment
cd ..
for SAMPLE in 1 2 3 4 5;
do
R1="/home/nehal/assign/unshuffled_samples/out/SRR8797509_1.part_00${SAMPLE}.pe.trim.fastq"
R2="/home/nehal/assign/unshuffled_samples/out/SRR8797509_2.part_00${SAMPLE}.pe.trim.fastq"
/usr/bin/time -v bwa mem bwaIndex/chr22_with_ERCC92.fa $R1 $R2 > SRR8797509_part_00${SAMPLE}.sam
done
#Hisat alignment for shuffled reads
#Indexing (Already done before)
cd ..
mkdir -p ~/assign/hisat_align && cd ~/assign/hisat_align
ln -s ~/workdir/hisat_align/hisatIndex
#Alignment
for SAMPLE in 1 2 3 4 5;
do
R1="/home/nehal/assign/shuffled_samples/out/shuffled_1.part_00${SAMPLE}.pe.trim.fastq"
R2="/home/nehal/assign/shuffled_samples/out/shuffled_2.part_00${SAMPLE}.pe.trim.fastq"
hisat2 -p 1 -x hisatIndex/chr22_with_ERCC92 --dta --rna-strandness RF -1 $R1 -2 $R2 -S shuff_SRR8797509_part_00${SAMPLE}.sam
done
#Assembly_unshuffled samples with previous known annotations
cd ..
cd bwa_align
for SAMPLE in 1 2 3 4 5
do
R="/home/nehal/assign/bwa_align/SRR8797509_part_00${SAMPLE}.sam"
samtools view -bS $R > SRR8797509_part_00${SAMPLE}.bam
done
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/bwa_align/SRR8797509_part_00${SAMPLE}.bam"
samtools sort $R -o SRR8797509_part_00${SAMPLE}.sorted.bam
done
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/bwa_align/SRR8797509_part_00${SAMPLE}.sorted.bam"
stringtie $R --rf -l ref_sup -G ~/workdir/sample_data/chr22_with_ERCC92.gtf -o ref_sup.gtf
done
#Assembly_shuffled samples with previous known annotations
cd ..
cd hisat_align
for SAMPLE in 1 2 3 4 5
do
R="/home/nehal/assign/hisat_align/shuff_SRR8797509_part_00${SAMPLE}.sam"
samtools view -bS $R > shuff_SRR8797509_part_00${SAMPLE}.bam
done
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/hisat_align/shuff_SRR8797509_part_00${SAMPLE}.bam"
samtools sort $R -o shuff_SRR8797509_part_00${SAMPLE}.sorted.bam
done
for SAMPLE in 1 2 3 4 5;
do
R="/home/nehal/assign/hisat_align/shuff_SRR8797509_part_00${SAMPLE}.sorted.bam"
stringtie $R --rf -l ref_sup -G ~/workdir/sample_data/chr22_with_ERCC92.gtf -o ref_sup.gtf
done
#GTF Compare Unshuffled
cd ..
mkdir -p ~/assign/gtf-compare-unshuff/gtfs && cd ~/assign/gtf-compare-unshuff/gtfs
ln -s ~/assign/bwa_align/ref_sup.gtf .
ln -s ~/workdir/gtf-compare/comp.py .
ln -s ~/workdir/gtf-compare/stat.py .
python comp.py -r ../gtfs/ref_sup.gtf
python stat.py