-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathprimer_bam_to_non_targets.py
209 lines (203 loc) · 8.42 KB
/
primer_bam_to_non_targets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# This function reads BAM-file of mapping primer sequence onto reference genome
# and returns dictionary that contains all regions to which each primer sequence
# was mapped
# As input it gets:
# - AlignedSegment from pysam (read)
# - maximum number of nonspecific regions that user is ready to save (maxPrimerNonspec)
# - reference genome opened with pysam (refFa)
# - information about primers for storing target region (primersInfo):
# {amplicon_name:[amplicon_number,left_primer,right_primer,
# amplicon_name,amplicon_start,amplicon_end]}
# As output it gives:
# - sequence of primer that was mapped onto the reference genome
# - list of unspecific regions to which it was mapped:
# [[chrom,strand,position,regionLen]]
import re
import logging
from Bio import Seq
from Bio import pairwise2
logger=logging.getLogger(__name__)
def revComplement(nuc):
try:
return(str(Seq.Seq(nuc).reverse_complement()))
except ValueError as e:
if 'Mixed RNA/DNA found' in str(e):
print('ERROR (3): Mixed RNA/DNA found:')
print(nuc)
exit(3)
else:
print('ERROR: Unknown ValueError!')
exit(0)
def readToPrimerNonTargets(read,maxPrimerNonspec,refFa,primersInfo=None):
indelsPat=re.compile('(\d+)([ID])')
matchPat=re.compile('(\d+)M')
# Extract strand of the main match in genome
if read.flag==0:
strand=1
elif read.flag==16 or read.flag==20:
strand=-1
elif read.flag==4:
return(read.qname,[])
else:
print('ERROR! Unknown value of FLAG:',read.flag)
print(read)
exit(1)
if primersInfo:
primersName=read.qname
for primerPairNum,primerPairInfo in primersInfo.items():
if primersName in primerPairInfo[1:3]:
primerNumInPair=primerPairInfo[1:3].index(primersName)
infoStrand=int((-1)**primerNumInPair)
try:
## pos=int(primersInfo[primerPairNum][5+primerNumInPair*4])+primerNumInPair
pos=int(primersInfo[primerPairNum][5+primerNumInPair])
except IndexError:
print('ERROR (2): Incorrect index:')
print(primersInfo)
print(primersName)
print(primerPairNum)
print(primerNumInPair)
exit(2)
try:
targetRegion=[primersInfo[primerPairNum][4],
infoStrand*pos]
except TypeError:
print('ERROR!',primersInfo[primerPairNum][4],primersInfo[primerPair][0],primerNumInPair)
exit(15)
break
else:
targetRegion=[]
if strand==-1:
mainMapping=[read.reference_name,strand*(read.pos+len(read.qname))]
else:
mainMapping=[read.reference_name,strand*(read.pos+1)]
if (mainMapping!=targetRegion):
## The next string is necessary for human genomes
## to exclude unsorted chromosome fragments from the analysis
## and '_' not in read.reference_name
nonSpecRegions=[','.join([read.reference_name,
str(strand*(read.pos+1)),
read.cigarstring,
str(read.get_tag('NM'))])]
else:
nonSpecRegions=[]
if read.has_tag('XA'):
# XA tag of read ends with ; so the last element is empty
nonSpecRegions.extend(read.get_tag('XA').split(';')[:-1])
qname=read.qname
if len(nonSpecRegions)>maxPrimerNonspec:
return(qname,nonSpecRegions)
primerNonSpecRegions=[]
for region in nonSpecRegions:
# We go through all these regions and check
# that 3'-nucleotide matches primer's 3'-end
chrom,pos,cigar,subst=region.split(',')
# Determine length of sequence of interest by parsing CIGAR.
# The length depends only on the deletions
# but not mismatches nor insertions into reference genome
# So we count number of deletions
indelsMatch=indelsPat.findall(cigar)
matchMatch=matchPat.findall(cigar)
if len(indelsMatch)==0:
regionLen=int(matchMatch[0])
else:
sumDeletions=0
sumMatch=0
for m in indelsMatch:
if m[1]=='D':
sumDeletions+=int(m[0])
for m in matchMatch:
sumMatch+=int(m)
regionLen=sumMatch+sumDeletions
if int(pos)<0:
pos2=-(int(pos)+regionLen)
else:
pos2=int(pos)
if ([chrom,int(pos)]==targetRegion or
(len(targetRegion)>0 and
chrom==targetRegion[0] and
abs(targetRegion[1]-pos2)<=len(read.seq))):
## print([chrom,int(pos)])
continue
attempts=0
seq=None
while(seq is None):
try:
seq=refFa.fetch(region=chrom+':'
''+str(abs(int(pos)))+'-'
''+str(abs(int(pos))+regionLen-1))
except:
seq=None
attempts+=1
if attempts>=10:
print('ERROR!')
## logger.error(str(e))
print(refFa.filename)
logger.error(refFa.filename)
print(chrom,pos,regionLen)
exit(1)
# Determine, which sequence we should take:
# forward or reverse-complement
# If primer is on + strand and found region is on opposite
# or primer is on - strand and found region is on the same
# we take reverse-complement
if int(pos)>0:
regStrand=1
elif int(pos)<0:
regStrand=-1
if (strand>0 and int(pos)<0) or (strand<0 and int(pos)>0):
try:
seq=str(Seq.Seq(seq).reverse_complement()).upper()
# If there is an error like 'Mixed RNA/DNA found'
except ValueError as e:
if 'Mixed RNA/DNA found' in str(e):
print('ERROR (1): Mixed RNA/DNA found:')
print(seq)
exit(1)
else:
print('ERROR: Unknown ValueError!')
exit(0)
else:
seq=seq.upper()
# Check if read sequence is the same as read qname
if read.qname==read.seq:
primerSeq=read.seq
regionSeq=seq
else:
primerSeq=read.qname
regionSeq=revComplement(seq)
if len(regionSeq)==1:
continue
# If there is some insertions or deletion in found region
if 'I' in cigar or 'D' in cigar:
# We need to align its sequence with primer sequence
align=pairwise2.align.globalxx(primerSeq,regionSeq)
# Check that 3'-ends of primers are identical
## and one of two nucleotides before 3'-ends are identical, too
if (align[0][0][-1]==align[0][1][-1] or
align[0][0][-2]==align[0][1][-2]):
# Then we consider this region as a non-specific
# for this primer
primerNonSpecRegions.append([chrom,
regStrand,
abs(int(pos)),
regionLen])
else:
try:
if (regionSeq[-1]==primerSeq[-1] or
regionSeq[-2]==primerSeq[-2]):
# Then we consider this region as a non-specific for this primer
primerNonSpecRegions.append([chrom,
regStrand,
abs(int(pos)),
regionLen])
except IndexError:
print('ERROR (2): incorrect index for sequences:')
print('regionSeq:',regionSeq)
print('primerSeq:',primerSeq)
print('chr:',chrom)
print('position:',pos)
print('regionLen:',regionLen)
print(-1,-2)
exit(2)
return(read.qname,primerNonSpecRegions)