#!/usr/bin/python3
# Removes duplicate reads (picking one at random per position) and filters out unmapped reads.
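#
# Example usage (file names are illustrative):
#   ./rmdup.py -p input.bam output.rmdup.bam   # paired-end mode
#   ./rmdup.py input.bam output.rmdup.bam      # single-end mode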
import sys
import os
import argparse
import random

import psutil
import pysam
class SlimRead:
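    # Keeps only the coordinates needed for duplicate detection, plus the read's
    # line number in the BAM, so full pysam read objects need not stay in memory.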
def __init__(self, read, line_num):
self.line_num = line_num
self.tid = read.tid
self.reference_start = read.reference_start
self.reference_end = read.reference_end
def memory_usage_psutil():
# return the memory usage in MB
process = psutil.Process(os.getpid())
    mem = process.memory_info().rss / float(2 ** 20)
return mem
#@profile
def rmdup_pe(in_bam_file_name,out_bam_file_name,do_rand_quality):
sys.stderr.write("Running in paired-ends mode\n")
in_bamfile = pysam.AlignmentFile(in_bam_file_name,"rb")
sys.stderr.write("#refrences:%d\n" % in_bamfile.nreferences)
# build hash of read pairs and uniq position reads pairs
reads_dict = {}
uniq_reads_corr_dict = {}
r=0
for read in in_bamfile:
r += 1
if (r % 1000000 == 0):
#sys.stderr.write("parsing read %d\n" %r)
sys.stderr.write("parsing read %d, memory: %dMB\n" % (r,memory_usage_psutil()))
if read.is_paired:
#sys.stderr.write("read id: %s " % read.qname)
cur_read_key = read.qname.__hash__()
            # skip unmapped reads: keep only FLAG values where both the read and
            # its mate are mapped (neither the 0x4 nor the 0x8 bit is set)
            mapped_flags = [99, 147, 83, 163, 67, 131, 115, 179, 81, 161, 97, 145, 65, 129, 113, 177]
if read.flag in mapped_flags:
# pair already in reads_dict
if cur_read_key in reads_dict:
# already one read in the list
if len(reads_dict[cur_read_key]) == 1:
# saving the line of the read for memory efficiency
reads_dict[cur_read_key].append(SlimRead(read,r))
# found a pair -> add id to the uniq set
cur_first_read = reads_dict[cur_read_key][0]
cur_second_read = reads_dict[cur_read_key][1]
#print("found a pair\n%s\n%s\n" % (cur_first_read, cur_second_read) )
#sys.stderr.write("found a pair: %s" % read.qname)
                        if (cur_first_read.tid != cur_second_read.tid):
                            sys.stderr.write("Removing a pair mapped to two different reference sequences (chromosomes)\n")
                            continue
# converting reference id to chr for the output to be sorted
cur_pair_key_str = (str(chr(65+cur_first_read.tid)) + "_" + str(chr(65+cur_second_read.tid)) + "_" +
str(min(cur_first_read.reference_start,cur_first_read.reference_end, cur_second_read.reference_start,cur_second_read.reference_end)) + "_" +
str(max(cur_first_read.reference_start,cur_first_read.reference_end, cur_second_read.reference_start,cur_second_read.reference_end)))
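                        # e.g. a pair on the first reference (tid 0) spanning
                        # positions 1000-1150 yields the key "A_A_1000_1150"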
cur_pair_key = cur_pair_key_str.__hash__()
#sys.stderr.write("%s\n" % cur_pair_key)
# replacing the dictionary entry by just the line numbers
reads_dict[cur_read_key] = [cur_first_read.line_num, cur_second_read.line_num]
#sys.stderr.write("Read pair position key: %s" % cur_pair_key)
#if (cur_pair_key == "0_0_1472172_1472342"):
# print("found a pair\n%s\n%s\n" % (cur_first_read, cur_second_read) )
# adding to the dictionary of unique pairs
if cur_pair_key in uniq_reads_corr_dict:
#sys.stderr.write("-----in dict: %s, %s" % (cur_pair_key,read.qname) )
uniq_reads_corr_dict[cur_pair_key].append(cur_read_key)
else:
#sys.stderr.write("not in dict: %s, %s" % (cur_pair_key,read.qname))
uniq_reads_corr_dict[cur_pair_key] = [cur_read_key]
                else:  # more than two reads with the same id? error!
raise Exception("More than two reads with the same id: %s\n" % read)
else: #adding the first read in a pair
#sys.stderr.write("Adding first read: %s\n" % read.qname)
# saving the line of the read for memory efficiency
reads_dict[cur_read_key] = [SlimRead(read,r)]
else:
continue
else:
sys.stderr.write("Found a single end read: %s\n" % read)
#print read
#print("YYYYYYYYYYYYYYYYY")
in_bamfile.close()
# implemented for pair ends
sys.stderr.write("Found %d uniq reads pairs (%d reads), memory: %dMB\n" % (len(uniq_reads_corr_dict),len(uniq_reads_corr_dict)*2, memory_usage_psutil()))
sys.stderr.write("Looking for duplicates...\n")
    max_num_of_duplicates = 0
cnt_dup_pos = 0
out_reads_line_numbers = []
for uniq_pos,pairs_ids in sorted(uniq_reads_corr_dict.items()):
cur_num_pairs = len(pairs_ids)
if (cur_num_pairs>1):
cnt_dup_pos += 1
            max_num_of_duplicates = max(max_num_of_duplicates, cur_num_pairs)
#print("duplicated position %s, #pairs: %d\n" % (uniq_pos, cur_num_pairs))
#print("position %s, #pairs: %d\n" % (uniq_pos, cur_num_pairs))
        cur_pair_id = random.choice(pairs_ids)
cur_pair = reads_dict[cur_pair_id]
# writing to the output file
#out_bamfile.write(cur_pair[0])
#out_bamfile.write(cur_pair[1])
# choosing which lines to print
out_reads_line_numbers.append(cur_pair[0])
out_reads_line_numbers.append(cur_pair[1])
    # sorting the selected line numbers
out_reads_line_numbers.sort()
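    # with sorted line numbers, the input BAM can be streamed a second time and
    # matched sequentially, so full reads never need to be held in memory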
sys.stderr.write("Found %d duplicated segments in %d uniq segments (%f), max #duplictaes: %d" % (cnt_dup_pos,len(uniq_reads_corr_dict),(1.0*cnt_dup_pos)/len(uniq_reads_corr_dict),max_num_of_dupliactes) )
# printing the output bam
in_bamfile = pysam.AlignmentFile(in_bam_file_name,"rb")
out_bamfile = pysam.AlignmentFile(out_bam_file_name, "wb", template=in_bamfile)
sys.stderr.write("Writing output bam file...\n")
out_ind=0
r=0
for read in in_bamfile:
r += 1
if (r<out_reads_line_numbers[out_ind]):
next
elif (r==out_reads_line_numbers[out_ind]):
out_bamfile.write(read)
out_ind += 1
if (out_ind >= len(out_reads_line_numbers)):
break
else:
sys.stderr.write("Error bug in printing the out file %d,%d,%d\n" % (r,out_ind,out_reads_line_numbers[out_ind]))
exit()
if (r % 1000000 == 0):
sys.stderr.write("writing. going over read %d , printed %d reads memroy: %dMB\n" % (r,out_ind,memory_usage_psutil()))
out_bamfile.close()
in_bamfile.close()
    # a single random pair from each unique position has been written
    sys.stderr.write("Finished writing output file: %s\n" % out_bam_file_name)
def rmdup_se(in_bam_file_name,out_bam_file_name,do_rand_quality):
sys.stderr.write("Running in single end mode\n")
in_bamfile = pysam.AlignmentFile(in_bam_file_name,"rb")
out_bamfile = pysam.AlignmentFile(out_bam_file_name, "wb", template=in_bamfile)
# build hash of read pairs and uniq position reads pairs
reads_dict = {}
uniq_reads_corr_dict = {}
r=0
for read in in_bamfile:
r += 1
if (r % 10000 == 0):
sys.stderr.write("parsing read %d, memroy: %dMB\n" % (r,memory_usage_psutil()))
# adding to dict
reads_dict[read.qname] = read
        # converting the reference id to a character so the output keys sort;
        # note that, unlike the paired-end path, unmapped reads are not filtered here
        cur_pair_key = str(chr(65+read.tid)) + "_" + str(read.tid) + "_" + str(min(read.reference_start, read.reference_end)) + "_" + str(max(read.reference_start, read.reference_end))
# adding to the dictionary of unique pairs
if cur_pair_key in uniq_reads_corr_dict:
#sys.stderr.write("-----in dict: %s, %s" % (cur_pair_key,read.qname) )
uniq_reads_corr_dict[cur_pair_key].append(read.qname)
else:
#sys.stderr.write("not in dict: %s, %s" % (cur_pair_key,read.qname))
uniq_reads_corr_dict[cur_pair_key] = [read.qname]
sys.stderr.write("Found %d uniq positions for %d reads\n" % (len(uniq_reads_corr_dict),r))
sys.stderr.write("Current memory: %d MB\n" % memory_usage_psutil())
    max_num_of_duplicates = 0
cnt_dup_pos = 0
for uniq_pos,read_ids in sorted(uniq_reads_corr_dict.items()):
cur_num_reads = len(read_ids)
if (cur_num_reads>1):
cnt_dup_pos += 1
            max_num_of_duplicates = max(max_num_of_duplicates, cur_num_reads)
        cur_read_id = random.choice(read_ids)
cur_read = reads_dict[cur_read_id]
# writing to the output file
out_bamfile.write(cur_read)
# writing a single random pair from each unique position
sys.stderr.write("Finished! found %d duplicated segments in %d uniq segments (%f), max #duplictaes: %d" % (cnt_dup_pos,len(uniq_reads_corr_dict),(1.0*cnt_dup_pos)/len(uniq_reads_corr_dict),max_num_of_dupliactes) )
out_bamfile.close()
in_bamfile.close()
def main():
    parser = argparse.ArgumentParser(description="Identifies duplicated reads and selects one (pair) at random")
    parser.add_argument("-r", action='store_true', dest='do_rand_quality', default=False)
    parser.add_argument("-p", action='store_true', dest='is_paired_ends', default=False)
parser.add_argument("in_bam_file", action='store')
parser.add_argument("out_bam_file", action='store')
options = parser.parse_args()
in_bam_file_name = options.in_bam_file
out_bam_file_name = options.out_bam_file
    # -r was meant to randomize quality scores so that samtools would select a
    # random pair rather than the highest-scoring one; note that this would make
    # quality scores in downstream analysis random
    if options.do_rand_quality:
        sys.stderr.write("Warning: the -r flag is not implemented; the selection is already random, i.e. quality independent\n")
    if (options.is_paired_ends):
        rmdup_pe(in_bam_file_name, out_bam_file_name, options.do_rand_quality)
    else:
        # TODO - implement a memory-efficient version
        rmdup_se(in_bam_file_name, out_bam_file_name, options.do_rand_quality)
sys.stderr.write("\nDone!\n")
if __name__ == '__main__':
main()