-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfasta.qual_summary_plot.py
125 lines (105 loc) · 4.09 KB
/
fasta.qual_summary_plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Get quality information from fasta.screen.qual file generated by phred.
Reads are from ABI3730.
"""
import sys
from pylab import *
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
from itertools import chain
if len(sys.argv) < 3:
## print >>sys.stderr, "Usage: %prog <x.fasta.qual> quality_cutoff(0-99, recommend using 20)"
print >>sys.stderr, "Usage: python2.7 fasta.qual_summary_plot.py <x.fasta.qual> quality_cutoff(0-99, recommend using 20)"
sys.exit()
quality_cut = int(sys.argv[2])
sflag=0
low=0
fw1 = sys.argv[1]+".summary"
fw = open(fw1,'w')
with file(sys.argv[1],'r') as f:
for row in f:
if row[0]==">" and sflag==1:
if nhigh==0: low+=1; print "Read {} has no high quality base calls.".format(name)
else:
high_proportion = float(nhigh)/ntotal
high_avg = float(sumhigh)/nhigh
print >>fw, "{}\t{:.2f}\t{:.1f}".format(name, high_proportion, high_avg)
name = row.split(" ")[0].lstrip(">").replace(".ab1","",1)
nhigh = 0
ntotal = 0
sumhigh = 0
elif sflag==0:
name = row.split(" ")[0].lstrip(">").replace(".ab1","",1)
nhigh = 0
ntotal = 0
sumhigh = 0
sflag=1
elif row[0]!=">":
numbers = [int(x) for x in row.strip("\n").split()]
nhigh += len([x for x in numbers if x > quality_cut])
ntotal += len(numbers)
sumhigh += sum([x for x in numbers if x > quality_cut])
fw.close()
print "number of reads with no high quality bases: ",low
## frequency plot
bin_num=30
image_name = "hists.png"
fig = plt.figure(1, (5,10), dpi=300)
ax = axes([0.15,0.55,.8,.4])
ax2 = axes([0.15,0.05,.8,.4])
fin=file(fw1,'r')
x11 = [float(row.strip('\n').split('\t')[1]) for row in fin if "b1" in row.strip('\n').split('\t')[0]]
fin.seek(0)
x12 = [float(row.strip('\n').split('\t')[1]) for row in fin if "g1" in row.strip('\n').split('\t')[0]]
fin.seek(0)
x21 = [float(row.strip('\n').split('\t')[2]) for row in fin if "b1" in row.strip('\n').split('\t')[0]]
fin.seek(0)
x22 = [float(row.strip('\n').split('\t')[2]) for row in fin if "g1" in row.strip('\n').split('\t')[0]]
for x,col in zip([x11, x12],["g","r"]):
# calc frequencies
n, bins = np.histogram(x, bin_num)
nsum = float(sum(n))
n=map(lambda x: x/nsum, n)
# plot
left = np.array(bins[:-1])
right = np.array(bins[1:])
bottom = np.zeros(len(left))
top = bottom + n
XY = np.array([[left,left,right,right], [bottom,top,top,bottom]]).T
barpath = path.Path.make_compound_path_from_polys(XY)
patch = patches.PathPatch(barpath, facecolor=col, edgecolor='gray', alpha=0.5)
ax.add_patch(patch)
ax.set_xlim(left[0], right[-1])
ax.set_ylim(bottom.min(), top.max()+.05)
ax.set_xlabel('proportion of bases quality > {}'.format(quality_cut))
ax.set_ylabel('frequency')
ax.legend(('b1', 'g1'),'upper left', shadow=True)
ax.set_title(sys.argv[1])
ax.grid(True)
for x,col in zip([x21, x22],["g","r"]):
# calc frequencies
n, bins = np.histogram(x, bin_num)
nsum = float(sum(n))
n=map(lambda x: x/nsum, n)
# plot
left = np.array(bins[:-1])
right = np.array(bins[1:])
bottom = np.zeros(len(left))
top = bottom + n
XY = np.array([[left,left,right,right], [bottom,top,top,bottom]]).T
barpath = path.Path.make_compound_path_from_polys(XY)
patch = patches.PathPatch(barpath, facecolor=col, edgecolor='gray', alpha=0.5)
ax2.add_patch(patch)
ax2.set_xlim(left[0], right[-1])
ax2.set_ylim(bottom.min(), top.max()+.05)
ax2.set_xlabel('Avg quality of high quality (>{}) bases of reads'.format(quality_cut))
ax2.set_ylabel('frequency')
ax2.legend(('b1', 'g1'),'upper left', shadow=True)
ax2.grid(True)
fin.close()
plt.savefig(image_name,dpi=300)
print >>sys.stderr, "print image to %s" % image_name