-
Notifications
You must be signed in to change notification settings - Fork 1
/
influtable.py
executable file
·143 lines (134 loc) · 5.31 KB
/
influtable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/opt/miniconda/bin/python3
"""Summarise SmaltAlign results from an influenza run.
Run on SmaltAlign results and extract base counts with bam-readcount. Then, extract coverage summary statistics
with bedtools.
"""
import sys
import os
import glob
import re
import shlex
import subprocess
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def mean_coverage(bamfile):
"""Run samtools depth to compute mean and stdev of coverage"""
from math import sqrt
cml = shlex.split('samtools depth -a -d 100000 %s' % bamfile)
proc = subprocess.Popen(cml, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, universal_newlines=True)
with proc.stdout as handle:
m = var = 0.0
for l in handle:
pos, c = (int(i) for i in l.strip().split()[1:])
m += c
var += c * c
return round(m / pos, 2), round(sqrt(var / (pos - 1)), 2)
def coverage_info(bamfile, thresh=50):
"""Run bedtools to extract coverage information."""
cml = shlex.split("bedtools genomecov -ibam %s -max %d" % (bamfile, thresh))
proc = subprocess.Popen(cml, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, universal_newlines=True)
with proc.stdout as handle:
for l in handle:
if not l.startswith('genome'):
continue
lsp = l.strip().split()
if int(lsp[1]) == thresh:
return float(l.strip().split()[-1])
return 0.0
def get_bases(reffile, bamfile, name=''):
"""Run bam-readcount, parse the results, and return a dictionary with base information."""
info_dict = {
'refname': [],
'position': [],
'refbase': [],
'coverage': [],
'=': [],
'A': [],
'C': [],
'G': [],
'T': [],
'N': []
}
cml = shlex.split('bam-readcount -f %s -w 1 %s %s' % (reffile, bamfile, name))
proc = subprocess.Popen(cml, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, universal_newlines=True)
with proc.stdout as handle:
for i, line in enumerate(handle):
chrom, pos, ref_b, depth = line.split('\t')[:4]
info_dict['refname'].append(chrom)
info_dict['position'].append(pos)
info_dict['refbase'].append(ref_b)
info_dict['coverage'].append(depth)
for bi in line.split('\t')[4:]:
base_info = bi.strip().split(':')
try:
info_dict[base_info[0]].append(int(base_info[1]))
except KeyError:
print('%s entities %s found at pos %s' % (base_info[1], base_info[0], pos), file=sys.stderr)
if i == 4:
pass
return info_dict
def main():
"""What the main does."""
rundir = sys.argv[1]
run_name = os.path.split(rundir.rstrip('/'))[1]
try:
os.mkdir('tables-%s' % run_name)
except OSError:
sys.exit('Directory exists. Delete/rename old one.')
molis_vec = []
sample_vec = []
segment_vec = []
cov_20 = []
cov_1k = []
ave_cov = []
stdev_cov = []
# Run on all segments of all samples
for segmentdir in glob.glob('%s/1000*_S*_L001_R1_001/segment*/' % rundir):
print('## running %s ##' % segmentdir, file=sys.stderr)
molis, sample, segment = re.search(r"\d*_[^/]*/(\d{10}).*(S\d*)_.*/segment(\d*)/", segmentdir).group(1, 2, 3)
# extract iteration 4, skip if absent
try:
ref = glob.glob('%s/*4_cons.fasta' % segmentdir)[0]
bam = glob.glob('%s/*4.bam' % segmentdir)[0]
except IndexError:
print('files not found in %s' % segmentdir, file=sys.stderr)
continue
refname = list(s.id for s in SeqIO.parse(ref, 'fasta'))[0]
baminfo = get_bases(ref, bam, refname)
# write base information to pandas dataframe, manipulate
df = pd.DataFrame(baminfo)
l = df.shape[0]
df['molis'] = [molis] * l
df['segment'] = [segment] * l
df.drop(columns=['='], inplace=True)
# extract nucleotides with max frequency to write consensus sequence
nucleotides = df.loc[:, ['A', 'C', 'G', 'T', 'N']]
nucleotides['MAX'] = nucleotides.idxmax(axis=1)
seq = ''.join(nucleotides['MAX'].tolist())
sr = SeqRecord(Seq(seq), id='%s-%s' % (molis, segment), description='')
SeqIO.write(sr, 'tables-%s/%s-%s.fasta' % (run_name, molis, segment), 'fasta')
# save table
filename = 'tables-%s/%s-%s.csv' % (run_name, molis, segment)
df = df.set_index('position')
df.to_csv(filename)
molis_vec.append(molis)
sample_vec.append(sample)
segment_vec.append(segment)
cov_20.append(coverage_info(bam, 20))
cov_1k.append(coverage_info(bam, 1000))
m, s = mean_coverage(bam)
ave_cov.append(m)
stdev_cov.append(s)
print('###############################', file=sys.stderr)
# save coverage info to file
cov_df = pd.DataFrame({'molis': molis_vec,
'segment': segment_vec,
'cov_20': cov_20,
'cov_1k': cov_1k,
'ave_cov': ave_cov,
'stdev_cov': stdev_cov})
cov_df.to_csv('tables-%s/coverage_info.csv' % rundir, index=False)
if __name__ == '__main__':
main()