-
Notifications
You must be signed in to change notification settings - Fork 1
/
smaltalign_indel.sh
executable file
·185 lines (157 loc) · 5.7 KB
/
smaltalign_indel.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/bin/bash
### defaults
script_dir=$( dirname "$(readlink -f "$0")" )
n_reads=1000000
iterations=4
### arguments
if [ $# == 0 ]; then
echo
echo 'Usage: smaltalign.sh -r <reference_file> [options] <fastq_file/directory> '
echo 'Options:'
echo ' -r reference'
echo ' -n INT number of reads (default 200000)'
echo ' -i INT iterations (default 4)'
echo
exit
fi
while [[ $# -gt 1 ]]; do
key="$1"
case $key in
-r)
ref="$2"
shift # past argument
;;
-n)
n_reads="$2"
shift # past argument
;;
-i)
iterations="$2"
shift # past argument
;;
*)
# unknown option
;;
esac
shift # past argument or value
done
if [[ -n $1 ]]; then
sample_dir=$1
fi
### convert relative to absolute path
ref=$( readlink -f $ref )
sample_dir=$( readlink -f $sample_dir )
#if [[ $sample_dir =~ \.gz$ ]]; then
# echo "counting number of reads"
# lines=`zcat $sample_dir|wc -l`
# max_seq=`expr $lines / 4`
# n_reads=$max_seq
#elif [[ $sample_dir =~ \.fastq$ ]]; then
# echo "counting number of reads"
# lines=`cat $sample_dir|wc -l`
# max_seq=`expr $lines / 4`
# n_reads=$max_seq
#fi
#echo "max n_reads is $n_reads"
### print arguments
echo -e 'sample_dir: ' $sample_dir
echo -e 'script_dir: ' $script_dir
echo -e 'ref: ' $ref
echo -e 'n_reads: ' $n_reads
echo -e 'iterations: ' $iterations
### loop over list of files to analyse
if [ -d $sample_dir ]; then list=$(ls $sample_dir | grep .fastq); else list=$sample_dir; fi
for i in $list; do
name=$(basename $i | sed 's/_L001_R.*//' | sed 's/.fastq.gz//'| sed 's/.fastq//')
### sample reads with seqtk
seqtk sample $i $n_reads > ${name}_reads.fastq
n_sample=$(wc -l ${name}_reads.fastq | cut -f 1 -d " ")
n_sample=$(($n_sample / 4))
### de novo aligment
#start=`date +%s`
echo
echo sample $name, $n_sample reads, de-novo alignment
echo "*******************************************************************************"
velveth ${name} 29 -fastq ${name}_reads.fastq
velvetg ${name} -min_contig_lgth 200
### fake fastq of contigs and cat to reads in triplicate
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < ${name}/contigs.fa | awk 'BEGIN {RS = ">" ; FS = "\n"} NR > 1 {print "@"$1"\n"$2"\n+"$1"\n"gensub(/./, "I", "g", $2)}' > ${name}/contigs.fastq
cat ${name}_reads.fastq ${name}/contigs.fastq ${name}/contigs.fastq ${name}/contigs.fastq > ${name}_reads_contigs.fasta
#end=`date +%s`
#runtime=$((end-start))
#echo "Assmebling runtime is $runtime"
it=1
while [ "$it" -le "$iterations" ]; do
echo
echo sample $name, $n_sample reads, iteration $it
if [ -s $ref ]; then
echo reference $ref
echo "*******************************************************************************"
else
echo ERROR: No reference
exit
fi
### align with smalt to reference, reads either ${name}_reads.fastq or ${name}_reads_contigs.fasta
#start=`date +%s`
smalt index -k 7 -s 2 ${name}_${it}_smalt_index $ref
samtools faidx $ref
if [ "$it" -eq 1 ]; then
smalt map -n 24 -x -y 0.5 -f samsoft -o ${name}_${it}.sam ${name}_${it}_smalt_index ${name}_reads_contigs.fasta
else
smalt map -n 24 -x -y 0.5 -f samsoft -o ${name}_${it}.sam ${name}_${it}_smalt_index ${name}_reads.fastq
fi
samtools view -Su ${name}_${it}.sam | samtools sort -o ${name}_${it}.bam
samtools index ${name}_${it}.bam
#end=`date +%s`
#runtime2=$((end-start))
#echo "Alignment smalt runtime is $runtime2"
### create consensus with freebayes
freebayes -f $ref -p 1 ${name}_${it}.bam > ${name}_${it}.vcf
muts=$(grep -c -v "^#" ${name}_${it}.vcf)
if [ "$muts" -eq "0" ]; then
echo WARNING: vcf file empty
cat $ref > ${name}_${it}_cons.fasta
else
vcf2fasta -f $ref -p ${name}_${it}_ -P 1 ${name}_${it}.vcf
mv ${name}_${it}_unknown* ${name}_${it}_cons.fasta
fi
rm -f ${name}_${it}_lofreq.vcf
if [ "$(uname)" == "Darwin" ]; then
cores=$(sysctl -n hw.ncpu) ### Mac OS X platform
elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
cores=$(nproc) ### GNU/Linux platform
else
cores=2 ### unkonwn/other platforms
fi
cores=$(expr $cores / 2) ### only run on half the cores
cores="${cores/0/1}"
echo =-=-= lofreq with $cores cores =-=-=
lofreq call-parallel --pp-threads $cores -f $ref -o ${name}_${it}_lofreq.vcf ${name}_${it}.bam
samtools sort ${name}_${it}.bam > ${name}_${it}_sorted.bam
lofreq indelqual --dindel -f $ref ${name}_${it}_sorted.bam -o ${name}_${it}_indel.bam
samtools index -b ${name}_${it}_indel.bam
lofreq call-parallel --pp-threads $cores --call-indels -f $ref -o ${name}_${it}_lofreq_indel.vcf ${name}_${it}_indel.bam
if [ $(grep -v "^#" ${name}_${it}_lofreq_indel.vcf | wc -l) -gt 0 ] ; then
lofreq filter --only-indels -a 0.15 -v 10 --indelqual-thresh 20 -i ${name}_${it}_lofreq_indel.vcf -o ${name}_${it}_lofreq_indel_temp_hq.vcf
echo lofreq filter is done ${name}_${it}_lofreq_indel_temp_hq.vcf
if [ $(grep -v "^#" ${name}_${it}_lofreq_indel_temp_hq.vcf | wc -l) -gt 0 ] ; then
cp ${name}_${it}_lofreq_indel_temp_hq.vcf ${name}_lofreq_indel_hq.vcf
echo lofreq_indel_hq.vcf file is created.
fi
fi
### calculate depth
samtools depth -d 1000000 ${name}_${it}.bam > ${name}_${it}.depth
### remove temporary files of this iteration
rm -f ${name}_${it}.sam
rm -f ${name}_${it}.vcf
rm -f ${name}_${it}_smalt_index.*
rm -f ${name}_*_cons.fasta.fai
### new ref for next iteration
ref=${name}_${it}_cons.fasta
((it+=1))
done
### remove temporary files
rm -f ${name}_reads.fastq
rm -f ${name}_reads_contigs.fasta
rm -rf ${name}
done