forked from ospupegam/ngs2-assignment
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Commands
138 lines (108 loc) · 5.07 KB
/
Commands
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#download human genome reference Release 30 (GRCh38.p12)
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh38.p12.genome.fa.gz
gunzip GRCh38.p12.genome.fa.gz
git clone https://github.com/ne1al/ngs2-assignment.git
cd ngs2-assignment/
echo "Nehal Adel Abdelsalam, [email protected]">user_info.md
git add user_info.md
git commit -m "create user_info.md"
git push
-----------------------------------------------------------------------------------------------------------------------------------
source activate ngs1
#Download packages required
conda install -c bioconda star
#create a file for the assignment
mkdir assign2 && cd assign2
#download sample files (sample of one million reads)
wget -c ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR879/SRR8797509/SRR8797509.sra
fastq-dump --split-files -X 1000000 ~/assign2/SRR8797509.sra
#STAR mapping
mkdir STARmap && cd STARmap
ln -s ~/workdir/sample_data/chr22_with_ERCC92.fa .
ln -s ~/workdir/sample_data/chr22_with_ERCC92.gtf .
STAR --runThreadN 1 --runMode genomeGenerate --genomeDir ~/assign2/STARmap --genomeFastaFiles chr22_with_ERCC92.fa --sjdbGTFfile chr22_with_ERCC92.gtf --sjdbOverhang 149
STAR --runThreadN 1 --genomeDir ~/assign2/STARmap --readFilesIn ~/assign2/SRR8797509_1.fastq ~/assign2/SRR8797509_2.fastq
STAR --runThreadN 4 --genomeDir ~/assign2/STARmap --twopassMode Basic --readFilesIn ~/assign2/SRR8797509_1.fastq ~/assign2/SRR8797509_2.fastq
#Mark duplicates and sort
for samfile in *.sam;do
sample=${samfile%.sam}
samtools view -hbo $sample.bam $samfile
samtools sort $sample.bam -o $sample.sorted.bam
done
java -Xmx2g -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar MergeSamFiles I=Aligned.out.sorted.bam OUTPUT=Aligned.out_merged.sorted.bam
samtools view -H Aligned.out.sorted.bam
samtools view -H Aligned.out_merged.sorted.bam
rm Aligned.out.sorted.bam
for bamFile in *.sorted.bam;do
output=${bamFile%.sorted.bam}
samtools depth $bamFile | awk '{{sum+=$3}} END {{print "Average = ",sum/NR}}' > $output.cov
samtools flagstat $bamFile > $output.stat
done
for sample in *.sorted.bam;do
name=${sample%.sorted.bam}
java -Xmx2g -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar MarkDuplicates INPUT=$sample OUTPUT=$name.dedup.bam METRICS_FILE=$name.metrics.txt
done
#Indexing
for sample in *.dedup.bam;do
#name=${sample%.dedup.bam}
java -Xmx2g -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar BuildBamIndex VALIDATION_STRINGENCY=LENIENT INPUT=$sample
done
# Reference
java -Xmx2g -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar CreateSequenceDictionary R=chr22_with_ERCC92.fa O=chr22_with_ERCC92.dict
samtools faidx chr22_with_ERCC92.fa
#SplitNtrim
gatk SplitNCigarReads -R chr22_with_ERCC92.fa -I Aligned.out_merged.dedup.bam -O split.bam
#Download known variants
wget 'ftp://ftp.ensembl.org/pub/release-96/variation/vcf/homo_sapiens/homo_sapiens-chr22.vcf.gz' -O human_chr22.vcf.gz
gunzip human_chr22.vcf.gz
grep "^22" human_chr22.vcf | sed 's/^22/chr22/' >> human_fam_chr22.vcf
gatk IndexFeatureFile -F human_fam_chr22.vcf
#Recalibration (failed to recalibrate, explanation in troubleshooting file)
#adding dictionary to vcf
gatk UpdateVCFSequenceDictionary \
-V human_chr22.vcf \
-R chr22_with_ERCC92.fa \
--output human_chr22_newcontiglines.vcf
#sorting VCF
java -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar SortVcf \
I=human_chr22_newcontiglines.vcf \
O=human_chr22.sorted.vcf
#indexing vcf
bgzip -c human_chr22.sorted.vcf > human_chr22.sorted.vcf.gz
tabix -p vcf human_chr22.sorted.vcf.gz
#rename
grep "^22" human_chr22.sorted.vcf.gz.tbi | sed 's/^22/chr22/' >> human_chr22.sorted_fam.vcf.gz.tbi
gatk IndexFeatureFile -F human_chr22.sorted_fam.vcf.gz.tbi
gatk BaseRecalibrator \
-I ~/assign2/STARmap/Aligned.out_merged.dedup.bam \
-R ~/assign2/STARmap/chr22_with_ERCC92.fa \
--known-sites ~/assign2/STARmap/human_chr22.sorted_fam.vcf.gz.tbi\
-O recal_data.table
gatk ApplyBQSR \
-R ~/assign2/STARmap/chr22_with_ERCC92.fa \
-I ~/assign2/STARmap/Aligned.out_merged.dedup.bam \
--bqsr-recal_data.table \
-O output.bam
#Haplotype Caller
gatk --java-options "-Xmx2G" HaplotypeCaller \
-R chr22_with_ERCC92.fa -I Aligned.out_merged.dedup.bam\
--emit-ref-confidence GVCF \
--pcr-indel-model NONE \
-O $name.gvcf
gatk --java-options "-Xmx4g" HaplotypeCaller \
-R chr22_with_ERCC92.fa \
-I Aligned.out_merged.dedup.bam \
-O output.g.vcf.gz
java -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar ValidateSamFile \
I=Aligned.out_merged.dedup.bam \
R=chr22_with_ERCC92.fa \
MODE=VERBOSE
java -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar ValidateSamFile \
I=Aligned.out_merged.sorted.bam \
R=chr22_with_ERCC92.fa \
MODE=VERBOSE
samtools calmd -bAr Aligned.out_merged.dedup.bam chr22_with_ERCC92.fa > Aligned.out_merged_fixeddedup.bam
java -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar ValidateSamFile \
I=Aligned.out_merged_fixeddedup.bam \
R=chr22_with_ERCC92.fa \
MODE=VERBOSE