-
Notifications
You must be signed in to change notification settings - Fork 1
/
smaltalign.sh
executable file
·149 lines (126 loc) · 4.26 KB
/
smaltalign.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/bin/bash
### defaults
script_dir=$( dirname "$(readlink -f "$0")" )
n_reads=200000
iterations=4
### arguments
if [ $# == 0 ]; then
echo
echo 'Usage: smaltalign.sh -r <reference_file> [options] <fastq_file/directory> '
echo 'Options:'
echo ' -r reference'
echo ' -n INT number of reads (default 200000)'
echo ' -i INT iterations (default 4)'
echo
exit
fi
while [[ $# -gt 1 ]]; do
key="$1"
case $key in
-r)
ref="$2"
shift # past argument
;;
-n)
n_reads="$2"
shift # past argument
;;
-i)
iterations="$2"
shift # past argument
;;
*)
# unknown option
;;
esac
shift # past argument or value
done
if [[ -n $1 ]]; then
sample_dir=$1
fi
### convert relative to absolute path
ref=$( readlink -f $ref )
sample_dir=$( readlink -f $sample_dir )
### print arguments
echo -e 'sample_dir: ' $sample_dir
echo -e 'script_dir: ' $script_dir
echo -e 'ref: ' $ref
echo -e 'n_reads: ' $n_reads
echo -e 'iterations: ' $iterations
### loop over list of files to analyse
if [ -d $sample_dir ]; then list=$(ls $sample_dir | grep .fastq); else list=$sample_dir; fi
for i in $list; do
name=$(basename $i | sed 's/_L001_R.*//' | sed 's/.fastq.gz//'| sed 's/.fastq//')
### sample reads with seqtk
seqtk sample $i $n_reads > ${name}_reads.fastq
n_sample=$(wc -l ${name}_reads.fastq | cut -f 1 -d " ")
n_sample=$(($n_sample / 4))
### de novo aligment
echo
echo sample $name, $n_sample reads, de-novo alignment
echo "*******************************************************************************"
velveth ${name} 29 -fastq ${name}_reads.fastq
velvetg ${name} -min_contig_lgth 200
### fake fastq of contigs and cat to reads in triplicate
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < ${name}/contigs.fa | awk 'BEGIN {RS = ">" ; FS = "\n"} NR > 1 {print "@"$1"\n"$2"\n+"$1"\n"gensub(/./, "I", "g", $2)}' > ${name}/contigs.fastq
cat ${name}_reads.fastq ${name}/contigs.fastq ${name}/contigs.fastq ${name}/contigs.fastq > ${name}_reads_contigs.fasta
it=1
while [ "$it" -le "$iterations" ]; do
echo
echo sample $name, $n_sample reads, iteration $it
if [ -s $ref ]; then
echo reference $ref
echo "*******************************************************************************"
else
echo ERROR: No reference
exit
fi
### align with smalt to reference, reads either ${name}_reads.fastq or ${name}_reads_contigs.fasta
smalt index -k 7 -s 2 ${name}_${it}_smalt_index $ref
samtools faidx $ref
if [ "$it" -eq 1 ]; then
smalt map -n 24 -x -y 0.5 -f samsoft -o ${name}_${it}.sam ${name}_${it}_smalt_index ${name}_reads_contigs.fasta
else
smalt map -n 24 -x -y 0.5 -f samsoft -o ${name}_${it}.sam ${name}_${it}_smalt_index ${name}_reads.fastq
fi
samtools view -Su ${name}_${it}.sam | samtools sort -o ${name}_${it}.bam
samtools index ${name}_${it}.bam
### create consensus with freebayes
freebayes -f $ref -p 1 ${name}_${it}.bam > ${name}_${it}.vcf
muts=$(grep -c -v "^#" ${name}_${it}.vcf)
if [ "$muts" -eq "0" ]; then
echo WARNING: vcf file empty
cat $ref > ${name}_${it}_cons.fasta
else
vcf2fasta -f $ref -p ${name}_${it}_ -P 1 ${name}_${it}.vcf
mv ${name}_${it}_unknown* ${name}_${it}_cons.fasta
fi
### create vcf with lofreq
rm -f ${name}_${it}_lofreq.vcf
if [ "$(uname)" == "Darwin" ]; then
cores=$(sysctl -n hw.ncpu) ### Mac OS X platform
elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
cores=$(nproc) ### GNU/Linux platform
else
cores=2 ### unkonwn/other platforms
fi
cores=$(expr $cores / 2) ### only run on half the cores
cores="${cores/0/1}"
echo =-=-= lofreq with $cores cores =-=-=
lofreq call-parallel --pp-threads $cores -f $ref -o ${name}_${it}_lofreq.vcf ${name}_${it}.bam
### calculate depth
samtools depth -d 1000000 ${name}_${it}.bam > ${name}_${it}.depth
### remove temporary files of this iteration
rm -f ${name}_${it}.sam
rm -f ${name}_${it}.vcf
rm -f ${name}_${it}_smalt_index.*
rm -f ${name}_*_cons.fasta.fai
### new ref for next iteration
ref=${name}_${it}_cons.fasta
((it+=1))
done
### remove temporary files
rm -f ${name}_reads.fastq
rm -f ${name}_reads_contigs.fasta
rm -rf ${name}
done