in_silico_genetic_map_constructor.py
### Boas Pucker ###
### [email protected] ###
### v0.15 ###

__usage__ = """
	python in_silico_genetic_map_constructor.py
	--assembly <ASSEMBLY_FASTA_FILE>
	--ref <REFERENCE_FASTA_FILE>
	--out <OUTPUT_FOLDER>
	
	optional:
	--cluster <flag activates submission of jobs to compute cluster>
	
	bug reports and feature requests: [email protected]
	"""

import os, sys, re, time, datetime
from operator import itemgetter

# --- end of imports --- #
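
# --- workflow overview --- #
# (1) cut each sufficiently long reference sequence into non-overlapping chunks ("in silico markers")
# (2) map these markers to the assembly via BLASTn (locally or on a compute cluster)
# (3) keep only unambiguously placed markers and write them to an in silico genetic map table
# (4) convert this table into ALLMAPS input (BED + weights) and run ALLMAPS to anchor the assembly
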
def load_sequences_with_order( fasta_file, len_cutoff ):
	"""! @brief load sequences from given FASTA file while preserving their order; sequences shorter than len_cutoff are discarded """
	sequences = {}
	order = []
	with open( fasta_file ) as f:
		header = f.readline()[1:].strip().split( " " )[0]
		seq = []
		line = f.readline()
		while line:
			if line[0] == '>':
				if len( "".join( seq ) ) >= len_cutoff:	#only keep sufficiently long sequences
					sequences.update( { header: "".join( seq ) } )
					order.append( header )
				header = line.strip()[1:].split( " " )[0]
				seq = []
			else:
				seq.append( line.strip() )
			line = f.readline()
		if len( "".join( seq ) ) >= len_cutoff:
			sequences.update( { header: "".join( seq ) } )
			order.append( header )
	return sequences, order

def chunk_generator( sequence, kmer_len ):
	"""! @brief split given sequence into non-overlapping chunks of kmer_len; a trailing remainder shorter than kmer_len is discarded """
	chunks = []
	for i in range( len( sequence ) / kmer_len ):	#integer division (Python 2)
		chunks.append( sequence[ i*kmer_len : (i+1)*kmer_len ] )
	return chunks

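# Since the chunks are consecutive and non-overlapping, the chunk index increases monotonically with
# the physical position on the reference sequence; downstream it therefore serves as a pseudo genetic
# position (cM) of the marker.
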
def generate_in_silico_markers( in_silico_marker_file, reference_file, len_cutoff, kmer_len ):
	"""! @brief generate in silico markers for in silico genetic map """
	seqs, seq_order = load_sequences_with_order( reference_file, len_cutoff )
	marker_counter = 0
	with open( in_silico_marker_file, "w" ) as out:
		for header in seq_order:
			seq = seqs[ header ]
			chunks = chunk_generator( seq, kmer_len )
			for idx, chunk in enumerate( chunks ):
				out.write( '>' + header + "_%_" + str( idx+1 ) + '\n' + chunk + '\n' )	#'_%_' separates sequence name from chunk index
			marker_counter += len( chunks )
	print "number of generated in silico markers: " + str( marker_counter )

def load_BLAST_results( blast_result_file, match_sim_cutoff, match_len_cutoff, unique_score_ratio ):
	"""! @brief load BLAST hits from tabular (outfmt 6) result file """
	# outfmt 6 columns: qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore
	# --- load raw hits --- #
	blast_hits = {}
	with open( blast_result_file, "r" ) as f:
		line = f.readline()
		while line:
			parts = line.strip().split('\t')
			if int( parts[3] ) >= match_len_cutoff:	#alignment length
				if float( parts[2] ) >= match_sim_cutoff:	#percent identity
					hit = { 'LG': parts[0].split('_%_')[0],	#reference sequence name = linkage group
								'cm': float( parts[0].split('_%_')[1] ),	#chunk index = pseudo genetic position
								'chr': parts[1],	#assembly sequence carrying the hit
								'pos': ( int( parts[8] ) + int( parts[9] ) ) / 2.0,	#center of hit on subject
								'score': float( parts[-1] )	#bit score
							}
					try:
						blast_hits[ parts[0] ].append( hit )
					except KeyError:
						blast_hits.update( { parts[0]: [ hit ] } )
			line = f.readline()
	# --- screen and clean --- #
	final_hits = []
	for hits in blast_hits.values():
		if len( hits ) == 1:
			final_hits.append( hits[0] )
		else:
			sorted_hits = sorted( hits, key=itemgetter( 'score' ) )	#ascending by bit score
			if sorted_hits[-2]['score'] / sorted_hits[-1]['score'] <= unique_score_ratio:	#best hit must be clearly better than second best
				final_hits.append( sorted_hits[-1] )
	return final_hits

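# Markers with several similarly strong hits typically stem from repetitive or duplicated regions;
# the unique_score_ratio filter above discards them to avoid anchoring scaffolds at the wrong locus.
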
def generate_genetic_map_file( allmaps_input_file, marker_info ):
	"""! @brief generate ALLMAPS input file """
	markers = sorted( marker_info, key=itemgetter('LG', 'cm') )	#sort by linkage group and genetic position
	with open( allmaps_input_file, "w" ) as out:
		out.write( "ScaffoldID\tScaffoldPosition\tLinkageGroup\tGeneticPosition\n" )
		for marker in markers:
			out.write( "\t".join( map( str, [ marker['chr'], marker['pos'], marker['LG'], marker['cm'] ] ) ) + '\n' )

def submit_jobs_to_cluster( prefix, query_file_names, reference_blastn_db, para_jobs, additional_options ):
	"""! @brief submit one BLASTn job per query file to compute cluster (SGE) """
	IDs_to_check = []
	batch_ID = str( datetime.datetime.now() )[-3:]	#pseudo-random ID to distinguish batches
	for idx, file_name in enumerate( query_file_names ):
		ID = "B_" + batch_ID + '_' + str( idx ).zfill(4)
		IDs_to_check.append( ID )
		sh_file = prefix + ID + '.sh'
		out_file = prefix + ID + '.out'
		err_file = prefix + ID + '.err'
		cmd = "/vol/biotools/bin/blastn -query " + file_name + " -db " + reference_blastn_db + " -out " + '.'.join( file_name.split('.')[:-1] ) + ".txt " + additional_options
		with open( sh_file, "w" ) as out:
			out.write( "#!/bin/bash\n" + " ".join( [ "echo " + '"',
																	cmd + '"',
																	"| qsub -cwd",
																	"-N",
																	ID,
																	"-l vf=1G",
																	"-l arch=lx-amd64",
																	"-P denbi",
																	"-o",
																	out_file,
																	"-e",
																	err_file
																] ) + '\n'
						)
		os.popen( "chmod +x " + sh_file ).read()
		os.popen( sh_file ).read()	#reading the pipe blocks until the submission is done
		time.sleep( 1 )
		os.remove( sh_file )
		# --- limit parallel jobs: block while too many jobs of this batch are queued or running --- #
		waiting_status = True
		while waiting_status:
			qstat = os.popen( "qstat" )
			content = qstat.read()
			qstat_IDs = re.findall( r"B_" + batch_ID + r"_\d{4}", content )
			counter = 0
			for ID in qstat_IDs:
				if ID in IDs_to_check:
					counter += 1
			if counter < para_jobs:
				waiting_status = False
			else:
				time.sleep( 1 )
	# --- wait until all jobs of this batch are finished --- #
	waiting_status = True
	while waiting_status:
		qstat = os.popen( "qstat" )
		content = qstat.read()
		qstat_IDs = re.findall( r"B_" + batch_ID + r"_\d{4}", content )
		waiting_status = False
		for ID in IDs_to_check:
			if ID in qstat_IDs:
				for each in content.split('\n')[2:-1]:	#skip qstat header lines
					if ID in each.split()[2] and not 'd' in each.split()[4]:	#job name column; state column ('d' = deleted)
						waiting_status = True
		time.sleep( 10 )

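# NOTE: this submission routine assumes a (Sun/Univa) Grid Engine environment (qsub/qstat) with
# site-specific settings (BLASTn path /vol/biotools/bin/blastn, project -P denbi, resource flags);
# these values need adjustment when running on a different compute cluster.
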
def load_sequences( multiple_fasta_file ):
	"""! @brief load all sequences from given multiple FASTA file """
	sequences = {}
	with open( multiple_fasta_file ) as f:
		header = f.readline()[1:].strip()
		seq = ""
		line = f.readline()
		while line:
			if line[0] == '>':
				sequences.update( { header: seq } )
				header = line.strip()[1:]
				seq = ""
			else:
				seq += line.strip()
			line = f.readline()
		sequences.update( { header: seq } )
	return sequences

def produce_multiple_query_files( query_file, cutoff ):
	"""! @brief split query file into multiple files with up to cutoff bp of sequence each """
	prefix = query_file + str( datetime.datetime.now() )[-5:] + "/"	#pseudo-random name for temporary folder
	os.makedirs( prefix )
	sequences = load_sequences( query_file )
	query_file_names = []
	len_counter = 0
	name_counter = 1
	query_file = prefix + "0".zfill(4) + ".fasta"
	query_file_names.append( query_file )
	out = open( query_file, "w" )
	for seq_id in sorted( sequences.keys() ):
		if len_counter >= cutoff:	#start a new file once the length limit is reached
			len_counter = 0
			out.close()
			query_file = prefix + str( name_counter ).zfill(4) + ".fasta"
			query_file_names.append( query_file )
			out = open( query_file, "w" )
			name_counter += 1
		out.write( '>' + seq_id + '\n' + sequences[ seq_id ] + '\n' )
		len_counter += len( sequences[ seq_id ] )
	out.close()
	return prefix, query_file_names

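# Splitting the query by total sequence length (rather than by number of sequences) keeps the
# workload of the individual cluster jobs roughly balanced.
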
def final_processing( query_file_names, final_result_file ):
	"""! @brief concatenate all single BLAST result files into one final result file """
	result_file_names = []
	for filename in query_file_names:
		result_file_names.append( '.fasta'.join( filename.split('.fasta')[:-1] ) + '.txt' )
	cmd = "cat " + " ".join( result_file_names ) + " > " + final_result_file
	os.popen( cmd ).read()	#wait for concatenation to finish

def run_blastn_on_cluster( query_file, reference_blastn_db, output_file, para_jobs ):
	"""! @brief run BLASTn on cluster: split query, submit jobs, and collect results """
	additional_options = " -outfmt 6 -evalue 0.01 "
	cutoff = 5000000	#maximal amount of query sequence (bp) per job
	prefix, query_file_names = produce_multiple_query_files( query_file, cutoff )
	submit_jobs_to_cluster( prefix, query_file_names, reference_blastn_db, para_jobs, additional_options )
	final_processing( query_file_names, output_file )

def load_genetic_marker_infos( allmaps_input_file ):
	"""! @brief load genetic marker infos from given input file """
	infos = []
	with open( allmaps_input_file, "r" ) as f:
		f.readline()	#skip header line
		line = f.readline()
		while line:
			parts = line.strip().split('\t')
			infos.append( { 'chr': parts[0], 'pos': int( float( parts[1] ) ), 'LG': parts[2], 'cm': parts[3] } )
			line = f.readline()
	return infos

def main( arguments ):
	"""! @brief run everything """
	assembly_file = arguments[ arguments.index( '--assembly' )+1 ]
	reference_file = arguments[ arguments.index( '--ref' )+1 ]
	output_dir = arguments[ arguments.index( '--out' )+1 ]
	if output_dir[-1] != '/':
		output_dir += '/'
	if not os.path.exists( output_dir ):
		os.makedirs( output_dir )
	
	if '--cluster' in arguments:
		cluster_status = True
	else:
		cluster_status = False
	
	len_cutoff = 100000	#minimal length of reference sequences to be considered during genetic marker generation
	kmer_len = 1000	#length of in silico marker sequences
	cpus = 1	#number of cores to use for local BLAST
	word_size = 12	#word size for BLAST of in silico markers
	match_sim_cutoff = 70.0	#minimal percent identity required to keep BLAST hit (outfmt 6 reports identity in percent)
	match_len_cutoff = kmer_len*0.7	#minimal alignment length required to keep BLAST hit
	unique_score_ratio = 0.9	#maximal ratio between second best and best BLAST hit score to keep BLAST hit
	para_jobs = 500	#number of BLAST jobs to run in parallel on cluster
	
	# --- generating marker sequences --- #
	in_silico_marker_file = output_dir + "in_silico_markers.fasta"
	if not os.path.isfile( in_silico_marker_file ):
		generate_in_silico_markers( in_silico_marker_file, reference_file, len_cutoff, kmer_len )
	
	# --- map markers to assembly --- #
	blast_result_file = output_dir + "blast_results.txt"
	blastdb = output_dir + "blastdb"
	if not os.path.isfile( blast_result_file ):
		os.popen( "makeblastdb -in " + assembly_file + " -out " + blastdb + " -dbtype nucl" ).read()	#wait for database construction to finish
		if cluster_status:
			print "running BLASTn jobs on cluster ..."
			run_blastn_on_cluster( in_silico_marker_file, blastdb, blast_result_file, para_jobs )
		else:
			print "running BLASTn locally ..."
			os.popen( "blastn -query " + in_silico_marker_file + " -db " + blastdb + " -out " + blast_result_file + " -outfmt 6 -evalue 0.001 -word_size " + str( word_size ) + " -num_threads " + str( cpus ) ).read()
	
	# --- generate genetic map file for ALLMAPS --- #
	allmaps_input_file = output_dir + "in_silico_genetic_map.txt"
	if not os.path.isfile( allmaps_input_file ):
		marker_info = load_BLAST_results( blast_result_file, match_sim_cutoff, match_len_cutoff, unique_score_ratio )
		print "number of mapped genetic markers: " + str( len( marker_info ) )
		generate_genetic_map_file( allmaps_input_file, marker_info )
	
	# --- generate genetic map in BED format --- #
	allmaps_bed_file = output_dir + "genetic_map.bed"
	weight_file = output_dir + "weights.txt"
	if not os.path.isfile( allmaps_bed_file ):
		infos = load_genetic_marker_infos( allmaps_input_file )
		with open( allmaps_bed_file, "w" ) as out:
			for idx, x in enumerate( infos ):
				if idx % 20 == 0:	#reduce the number of genetic markers by keeping only every 20th
					out.write( "\t".join( [ x['chr'], str( x['pos'] ), str( x['pos']+1 ), "genetic_map-" + x['LG'] + ":" + x['cm'], x['chr'] + ":" + str( x['pos']+1 ) ] ) + '\n' )
		with open( weight_file, "w" ) as out:
			out.write( "genetic_map\t1\n" )
	
	# --- run ALLMAPS --- #
	os.chdir( output_dir )
	os.environ["PYTHONPATH"] = "/vol/biotools/lib/ALLMAPS/lib64/python3.7/site-packages"	#setting up environment for ALLMAPS
	cmd = [ "python3.7 -m jcvi.assembly.allmaps path",
				"-w " + weight_file,
				allmaps_bed_file,
				assembly_file
			]
	os.popen( " ".join( cmd ) ).read()	#wait for scaffolding to finish
	os.popen( "python3.7 -m jcvi.assembly.allmaps estimategaps " + allmaps_bed_file ).read()
if __name__ == '__main__':
	if '--assembly' in sys.argv and '--ref' in sys.argv and '--out' in sys.argv:
		main( sys.argv )
	else:
		sys.exit( __usage__ )