-
Notifications
You must be signed in to change notification settings - Fork 0
/
DetMappability.py
executable file
·428 lines (353 loc) · 13.7 KB
/
DetMappability.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
#!/usr/bin/python
# Created on: 2015.04.13
# Author: Carlo Artieri
##############################
# HISTORY AND THINGS TO FIX #
##############################
#2015.04.13 - Initial script
# - Non-functional, in planning stages.
#2015.04.14 - Finished preliminary script. Resolve following bugs:
# First position in the output file is wrong and has '0' for edit_dist, no NA
# Two soft-clips in the same read cause two 'S' in type
# Make sure that -c/-clipmax takes into account sum of all clips
# Test NM tag on I/D CIGARs to see how edit distance is handled.
#2015.04.15 - Fixed first position error
# Fixed clipmax
# Added I/D edit distance calc, but still not sure if I/D is included in NM tag,
# information I've found seems to suggest no.
# - Want to output the edit distance in BedGraph format to load onto IGV.
###########
# MODULES #
###########
import sys #Access to simple command-line arguments
import argparse #Access to long command-line parsing
import datetime #Access to calendar/clock functions
import re #Access to REGEX splitting
import math #Access to math functions
import subprocess #Access to shell commands
import random #Access to random number generator
import os #Access to shell commands
##########################
# COMMAND-LINE ARGUMENTS #
##########################
epilog = """\
This script determines the mappability of reads of a specified length to a specified genome
using the Burrows-Wheeler Aligner in MEM mode (http://bio-bwa.sourceforge.net/), which must be
installed and in the PATH.
An input FASTA is tiled into segments of length N, and mapped back to the whole genome,
recording the minimum 'edit distance' to the original location among all reads that can be
mapped elsewhere.
edit distance = # of mismatches + # indel bases + # of soft-clips
Reads that cannot be mapped elsewhere are assigned an edit distance of int(N/2).
More detailed description of options follows:
-f/--fasta
An input FASTA file containing the sequences upon which to determine mappability. The
script runs in two 'modes'. If the FASTA is the whole genome (or whole chromsomes),
mappabiltiy for the whole genome will be reported. If the FASTA contains segments of the
genome, then a GFF/GTF file MUST be provided with the -g/--gff command giving the genomic
coordinates of the segments. Each segment must be a single GFF line with a column 9
identifier matching the header of the segment in the FASTA (e.g. gene=ADH; for a GFF,
gene "ADH"; for a GTF)
-l/--length
The length of the reads for which to determine mappability.
-o/--outfile
The name of a table that will contain the following columns:
CHROMOSOME
POSITION 1-based position
FEATURE Name of the feature if running in -g/--gff mode
MIN_EDIT_DIST The minimum edit distance to map non-specifically for this position
EDIT_TYPE The min necessary edits: M, mismatch; I, insertion; D, deletion: S, soft-clip
-x/--index
Location of the bwa mem index.
-t/--threads
Number of CPUs to use for BWA.
-g/--gff
GFF or GTF annotation file for running the script on genomic segments. See above.
-i/--identifier
The identifier in column 9 to use for matching the annotations to the FASTA file. In GFF
files, this is typically 'gene', whereas in GTF files it's 'gene_id'
-b/--bedgraph
In addition to the output table, produce a track of edit distance in BedGraph format
(see: http://genome.ucsc.edu/goldenpath/help/bedgraph.html)
-c/--clipmax
The maximum value that soft-clips can contribute to the edit distance. For example, if
mapping without allowing clipping, set this to 0.
"""
class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass
parser = argparse.ArgumentParser(description='Determine the DNA-seq mappability of genomic regions using BWA', add_help=False, epilog=epilog, formatter_class=CustomFormatter)
req = parser.add_argument_group('Required arguments:')
req.add_argument('-f','--fasta', action='store', dest='fasta', help='Input fasta file', required=True, metavar='')
req.add_argument('-l','--length', action='store', dest='length', type=int, help='Length of sequences to determine mappability', required=True, metavar='')
req.add_argument('-o','--outfile', action='store', dest='outfile', help='Output table containing mappability information', required=True, metavar='')
bwa = parser.add_argument_group('bwa-specific arguments:')
bwa.add_argument('-x','--index', action='store', dest='index', help='bwa mem index', default=None, metavar='', required=True,)
bwa.add_argument('-t','--threads', action='store', dest='threads', type=int, help='Number of bwa threads', default='1', metavar='')
opt = parser.add_argument_group('Optional arguments:')
opt.add_argument('-g','--gff', action='store', dest='gff', help='Supply gff and run in feature mode (see below)', metavar='')
opt.add_argument('-i','--identifier', action='store', dest='id', help='GFF/GTF column 9 identifier of interest', default='gene', metavar='')
opt.add_argument('-b','--bedgraph', action='store_true', dest='bg', help='Produce a BedGraph formatted track from the output table named <output>.bg')
opt.add_argument('-c','--clipmax', action='store', dest='clip', help='Max soft-clips to add to edit distance', type=int, default='30', metavar='')
opt.add_argument('-h', '--help', action='help', help='show this help message and exit')
args = parser.parse_args()
#############
# FUNCTIONS #
#############
#Creates a dictionary from a FASTA file where: dict[header] = sequence
def fasta_to_dict(file):
fasta_file = open(file, "r") #Open the file for reading
fasta_dic = {}
for line in fasta_file:
line = line.rstrip('\n')
if re.match('^>', line):
line_split = line.split(' ') #By default keeps only first word of the header.
header = line_split[0].translate(None, '>')
fasta_dic[header] = ''
else:
fasta_dic[header] += line
fasta_file.close()
return fasta_dic
#Splits a CIGAR string into two lists where the one letter type and numerical value are in
#the same order as found in the string.
def split_CIGAR(cigar):
cig_types_tmp = re.split('[0-9]',cigar)
cig_vals_tmp = re.split('[MIDNSHP\=X]',cigar)
cig_types = []
cig_vals = []
for i in cig_types_tmp:
if i != '':
cig_types.append(i)
for i in cig_vals_tmp:
if i != '':
cig_vals.append(i)
return cig_types,cig_vals
##########
# SCRIPT #
##########
#First step is to read in the FASTA sequence and create the 'dummy' FASTQ
fasta_dict = fasta_to_dict(args.fasta)
out_fastq_name = args.fasta + '.temp.fastq'
out_fastq = open(out_fastq_name, 'w') #Open a temporary file where we'll hold
#our dummy FASTQ
#Initialize a mismatch dictionary for plotting purposes.
mismatch_dist = {}
mismatch_min_type = {}
#If we're running in 'feature' mode, then we supply a gff file to determine the actual genomic
#coordinates of the sequences.
if args.gff is not None:
gff_dict = {}
gff_file = open(args.gff, 'r')
for line in gff_file:
if re.match('^#', line):
continue
gff_type = args.id + '='
gtf_type = args.id + ' "'
line = line.rstrip('\n')
line_t = line.split('\t')
if gff_type in line_t[8]:
line_t8_split = line_t[8].split(';')
for i in line_t8_split:
if gff_type in i:
type_split = i.split('=')
start = int(line_t[3])-1
gff_dict[type_split[1]] = [line_t[0],start]
elif gtf_type in line_t[8]:
line_t8_split = line_t[8].split(';')
for i in line_t8_split:
if gtf_type in i:
type_split = i.split('"')
type_split[1] = type_split[1].strip()
start = int(line_t[3])-1
gff_dict[type_split[1]] = [line_t[0],start]
gff_file.close()
keys = sorted(fasta_dict.keys())
for key in keys:
seq_len = len(fasta_dict[key])
mismatch_dist[key] = [0]*seq_len
#Tile across the sequence with substrings
for i in range(seq_len):
#If we're running in feature mode, then we need to adjust the read positions by
#the gff coordinates. Header will be in format @GENE|CHR|0-START|0-STOP
if args.gff is not None:
start = i + int(gff_dict[key][1])
stop = start + args.length
chrom = gff_dict[key][0]
else:
start = i
stop = start + args.length
chrom = key
i_stop = i + args.length #Set the substr
if i_stop <= seq_len:
subseq = fasta_dict[key][i:i_stop]
qual = '~' * args.length #Quality will be max value
out_fastq.write('@' + str(key)+'|'+str(chrom)+'|'+str(start)+'|'+str(stop-1)+'\n'+subseq+'\n'+'+\n'+qual+'\n')
out_fastq.close()
#Now we map the FASTQ back to the genome using BWA.
os.system('bwa mem -a -t ' + str(args.threads) + str(args.index) + ' ' + str(out_fastq_name) + '> ' + str(args.fasta) + '.temp.sam')
#Now go through the SAM file and extract mismatching reads.
sam_file = open(args.fasta + '.temp.sam', 'r')
curr_read = None
min_edit_dist = None
edit_dist = None
min_mis_type = None
mis_type = None
#Open the outfile where we'll write our mismatch table
mismatch_table = open(args.outfile, 'w')
mismatch_table.write('CHROMOSOME\tPOSITION\tFEATURE\tMIN_EDIT_DIST\tEDIT_TYPE\n')
for line in sam_file:
if re.match('^@', line):
continue
line = line.rstrip('\n')
line_t = line.split('\t')
read_deets = line_t[0].split('|')
edit_dist = None
mis_type = []
#Determine the edit distance, which is the value of soft-clips 'S' in the CIGAR string plus
#mismatches in the NM flag.
#First soft-clips if they exist
cigar_types,cigar_vals = split_CIGAR(line_t[5])
rem_clip = args.clip
for i in range(len(cigar_types)):
if cigar_types[i] == 'S':
mis_type.append(str(cigar_types[i])+cigar_vals[i])
#Adjust for max clip distance:
if int(cigar_vals[i]) > rem_clip:
if edit_dist is None:
edit_dist = int(rem_clip)
rem_clip = 0
else:
edit_dist += int(rem_clip)
rem_clip = 0
else:
if edit_dist is None:
edit_dist = int(cigar_vals[i])
rem_clip -= int(cigar_vals[i])
else:
edit_dist += int(cigar_vals[i])
rem_clip -= int(cigar_vals[i])
elif cigar_types[i] == 'D':
mis_type.append(str(cigar_types[i])+cigar_vals[i])
if edit_dist is None:
edit_dist = int(cigar_vals[i])
else:
edit_dist += int(cigar_vals[i])
elif cigar_types[i] == 'I':
mis_type.append(str(cigar_types[i])+cigar_vals[i])
if edit_dist is None:
edit_dist = int(cigar_vals[i])
else:
edit_dist += int(cigar_vals[i])
#Now NM tags
tags = line_t[11].split(' ')
for tag in tags:
if 'NM' in tag:
tag_val = tag.split(':')
if int(tag_val[2]) == 0:
pass
else:
mis_type.append('M'+str(tag_val[2]))
if edit_dist is None:
edit_dist = int(tag_val[2])
else:
edit_dist += int(tag_val[2])
if curr_read is None:
curr_read = line_t[0]
curr_read_deets = read_deets
min_edit_dist = None
if line_t[3] == int(read_deets[2])+1 and line_t[2] == read_deets[1]:
pass
else:
min_edit_dist = edit_dist
min_mis_type = mis_type
elif curr_read == line_t[0]:
curr_read_deets = read_deets
if line_t[3] == int(read_deets[2])+1 and line_t[2] == read_deets[1]:
pass
else:
#Is the edit-distance less than the minimum edit distance?
if min_edit_dist is None:
min_edit_dist = edit_dist
min_mis_type = mis_type
elif edit_dist < min_edit_dist:
min_edit_dist = edit_dist
min_mis_type = mis_type
elif curr_read != line_t[0]:
#Print everything to the table
if min_edit_dist is None:
min_edit_dist = int(args.length/2)
mistype = 'NA'
else:
if len(min_mis_type) == 1:
mistype = min_mis_type[0]
else:
mistype = '|'.join(min_mis_type)
if args.gff is None:
feat = 'NA'
else:
feat = read_deets[0]
mismatch_table.write(str(curr_read_deets[1]) + '\t' + str(int(curr_read_deets[2])+1) + '\t' + feat + '\t' + str(min_edit_dist) + '\t' + mistype + '\n')
#Reset everything
min_edit_dist = None
edit_dist = None
min_mis_type = None
mis_type = None
#Do analysis with new read
curr_read = line_t[0]
curr_read_deets = read_deets
if line_t[3] == int(read_deets[2])+1 and line_t[2] == read_deets[1]:
pass
else:
min_edit_dist = edit_dist
min_mis_type = mis_type
#Write out the last line before closing the output file
if min_edit_dist is None:
min_edit_dist = int(args.length/2)
mistype = 'NA'
else:
if len(min_mis_type) == 1:
mistype = min_mis_type[0]
else:
mistype = '|'.join(min_mis_type)
if args.gff is None:
feat = 'NA'
else:
feat = read_deets[0]
mismatch_table.write(str(curr_read_deets[1]) + '\t' + str(int(curr_read_deets[2])+1) + '\t' + feat + '\t' + str(min_edit_dist) + '\t' + mistype + '\n')
sam_file.close()
mismatch_table.close()
#Generate BedGraph track if args.bg is True
if args.bg is True:
outbg_file_name = str(args.outfile) + '.bedgraph'
outbg_file = open(outbg_file_name, 'w')
#Track header
outbg_file.write('track type=bedGraph name='+ str(args.outfile) + ' description=' + str(args.outfile) +' color=0,0,0\n')
#BedGraph options: track type=bedGraph name=track_label description=center_label
#visibility=display_mode color=r,g,b altColor=r,g,b
#priority=priority autoScale=on|off alwaysZero=on|off
#gridDefault=on|off maxHeightPixels=max:default:min
#graphType=bar|points viewLimits=lower:upper
#yLineMark=real-value yLineOnOff=on|off
#windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16
infile = open(args.outfile, 'r')
val = None
curr_chr = None
start_pos = 0
for line in infile:
if re.match('^CHROMOSOME', line):
continue
line = line.rstrip('\n')
line_t = line.split('\t')
if val is None:
val = line_t[3]
curr_chr = line_t[0]
start_pos = int(line_t[1])-1
stop_pos = int(line_t[1])-1
elif val == line_t[3] and curr_chr == line_t[0]:
stop_pos = int(line_t[1])-1
else:
outbg_file.write(str(curr_chr)+'\t'+str(start_pos)+'\t'+str(stop_pos)+'\t'+str(val)+'\n')
val = line_t[3]
curr_chr = line_t[0]
start_pos = int(line_t[1])-1
stop_pos = int(line_t[1])-1
outbg_file.close()