Commands

#download human genome reference Release 30 (GRCh38.p12)
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh38.p12.genome.fa.gz
gunzip GRCh38.p12.genome.fa.gz


git clone https://github.com/ne1al/ngs2-assignment.git
cd ngs2-assignment/
echo "Nehal Adel Abdelsalam, nehal.abdelsalam@outlook.com">user_info.md
git add user_info.md
git commit -m "create user_info.md"
git push
-----------------------------------------------------------------------------------------------------------------------------------

source activate ngs1
#Download packages required
conda install -c bioconda star 

#create a file for the assignment
mkdir assign2 && cd assign2
#download sample files (sample of one million reads)
wget -c ftp://ftp-trace.ncbi.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/SRR879/SRR8797509/SRR8797509.sra
fastq-dump --split-files -X 1000000 ~/assign2/SRR8797509.sra

#STAR mapping
mkdir STARmap && cd STARmap
ln -s ~/workdir/sample_data/chr22_with_ERCC92.fa .
ln -s ~/workdir/sample_data/chr22_with_ERCC92.gtf .

STAR --runThreadN 1 --runMode genomeGenerate --genomeDir ~/assign2/STARmap --genomeFastaFiles chr22_with_ERCC92.fa --sjdbGTFfile chr22_with_ERCC92.gtf --sjdbOverhang 149

STAR --runThreadN 1 --genomeDir ~/assign2/STARmap --readFilesIn ~/assign2/SRR8797509_1.fastq ~/assign2/SRR8797509_2.fastq
STAR --runThreadN 4 --genomeDir ~/assign2/STARmap --twopassMode Basic --readFilesIn ~/assign2/SRR8797509_1.fastq ~/assign2/SRR8797509_2.fastq

#Mark duplicates and sort
for samfile in *.sam;do
  sample=${samfile%.sam}
  samtools view -hbo $sample.bam $samfile
  samtools sort $sample.bam -o $sample.sorted.bam
done

java  -Xmx2g -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar MergeSamFiles I=Aligned.out.sorted.bam OUTPUT=Aligned.out_merged.sorted.bam

samtools view -H Aligned.out.sorted.bam
samtools view -H Aligned.out_merged.sorted.bam

rm Aligned.out.sorted.bam
for bamFile in *.sorted.bam;do
  output=${bamFile%.sorted.bam}
  samtools depth $bamFile | awk '{{sum+=$3}} END {{print "Average = ",sum/NR}}' > $output.cov
  samtools flagstat $bamFile > $output.stat
done

for sample in *.sorted.bam;do
  name=${sample%.sorted.bam}
  java  -Xmx2g -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar MarkDuplicates INPUT=$sample OUTPUT=$name.dedup.bam METRICS_FILE=$name.metrics.txt
done

#Indexing
for sample in *.dedup.bam;do
  #name=${sample%.dedup.bam}
  java -Xmx2g -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar BuildBamIndex VALIDATION_STRINGENCY=LENIENT INPUT=$sample
done

# Reference
java -Xmx2g -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar CreateSequenceDictionary R=chr22_with_ERCC92.fa O=chr22_with_ERCC92.dict
samtools faidx chr22_with_ERCC92.fa

#SplitNtrim
gatk SplitNCigarReads -R chr22_with_ERCC92.fa -I Aligned.out_merged.dedup.bam -O split.bam 

#Download known variants
wget 'ftp://ftp.ensembl.org/pub/release-96/variation/vcf/homo_sapiens/homo_sapiens-chr22.vcf.gz' -O human_chr22.vcf.gz
gunzip human_chr22.vcf.gz
grep "^22" human_chr22.vcf | sed 's/^22/chr22/' >> human_fam_chr22.vcf
gatk IndexFeatureFile -F human_fam_chr22.vcf

#Recalibration (failed to recalibrate, explanation  in troubleshooting file)
#adding dictionary to vcf
 gatk UpdateVCFSequenceDictionary \
     -V human_chr22.vcf \
     -R chr22_with_ERCC92.fa \
     --output human_chr22_newcontiglines.vcf
#sorting VCF
java -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar SortVcf \
      I=human_chr22_newcontiglines.vcf \
      O=human_chr22.sorted.vcf
#indexing vcf
bgzip -c human_chr22.sorted.vcf > human_chr22.sorted.vcf.gz
tabix -p vcf human_chr22.sorted.vcf.gz
#rename
grep "^22" human_chr22.sorted.vcf.gz.tbi | sed 's/^22/chr22/' >> human_chr22.sorted_fam.vcf.gz.tbi
gatk IndexFeatureFile -F human_chr22.sorted_fam.vcf.gz.tbi
 
gatk BaseRecalibrator \
   -I ~/assign2/STARmap/Aligned.out_merged.dedup.bam \
   -R ~/assign2/STARmap/chr22_with_ERCC92.fa \
   --known-sites ~/assign2/STARmap/human_chr22.sorted_fam.vcf.gz.tbi\
   -O recal_data.table

gatk ApplyBQSR \
   -R ~/assign2/STARmap/chr22_with_ERCC92.fa \
   -I ~/assign2/STARmap/Aligned.out_merged.dedup.bam \
   --bqsr-recal_data.table \
   -O output.bam

#Haplotype Caller
 gatk --java-options "-Xmx2G" HaplotypeCaller \
  -R chr22_with_ERCC92.fa -I Aligned.out_merged.dedup.bam\
  --emit-ref-confidence GVCF \
  --pcr-indel-model NONE \
  -O $name.gvcf

gatk --java-options "-Xmx4g" HaplotypeCaller  \
   -R chr22_with_ERCC92.fa \
   -I Aligned.out_merged.dedup.bam \
   -O output.g.vcf.gz 

java -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar ValidateSamFile \
      I=Aligned.out_merged.dedup.bam \
      R=chr22_with_ERCC92.fa \
      MODE=VERBOSE

java -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar ValidateSamFile \
      I=Aligned.out_merged.sorted.bam \
      R=chr22_with_ERCC92.fa \
      MODE=VERBOSE


samtools calmd -bAr Aligned.out_merged.dedup.bam chr22_with_ERCC92.fa > Aligned.out_merged_fixeddedup.bam

java -jar ~/miniconda3/envs/ngs1/share/picard-2.19.0-0/picard.jar ValidateSamFile \
      I=Aligned.out_merged_fixeddedup.bam \
      R=chr22_with_ERCC92.fa \
      MODE=VERBOSE