-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy paththreshold_candidates.py
executable file
·293 lines (268 loc) · 13.9 KB
/
threshold_candidates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#!/usr/bin/env python
# David Prihoda
# Convert a Domain CSV file with predictions into a BGC candidate CSV file using a given prediction threshold
# Generated by averaging domain predictions by protein and then merging consecutive proteins with prediction satisfying given threshold
# Candidates can be postprocessed by merging and filtering
import argparse
import pandas as pd
import json
import os
import numpy as np
from multiprocessing import Pool
import hashlib
from average_protein_prediction import average_protein_prediction
def num_bio_pfams(pfam_ids):
    """
    Count how many distinct biosynthetic pfams appear in the given list of pfams
    :param pfam_ids: List of pfam_ids
    :return: number of unique biosynthetic pfams in given list of pfams
    """
    # Imported lazily so the (project-local) pfam table is only loaded when needed
    from biosynthetic_pfams import AS_BIO_PFAM_IDS as BIO_PFAM_IDS
    unique_bio = {pfam_id for pfam_id in pfam_ids if pfam_id in BIO_PFAM_IDS}
    return len(unique_bio)
def parse_cand_pfam_ids(pfam_string):
    """
    Get flat list of pfam_ids from candidate csv field 'pfam_ids'
    :param pfam_string: pfam_ids string separated by '|' (between each protein) and ';' (between each pfam_id)
    :return: flat list of pfam_ids
    """
    flat = []
    for protein_part in pfam_string.split('|'):
        flat.extend(protein_part.split(';'))
    return flat
def filter_cands_min_bio_pfams(cands, min_pfams, verbose=1):
    """
    Keep only candidates that contain at least the given number of biosynthetic pfams
    :param cands: Candidate DataFrame to filter
    :param min_pfams: Minimum number of biosynthetic pfams
    :param verbose: Verbosity
    :return: candidates filtered by minimum number of biosynthetic pfams
    """
    # An empty frame has no 'num_bio_domains' column, so return it untouched
    if cands.empty:
        return cands
    mask = cands['num_bio_domains'] >= min_pfams
    kept = cands[mask]
    if verbose:
        print('Filtered candidates by minimum of {} biosynthetic pfams, {}/{} remained.'.format(min_pfams, len(kept), len(cands)))
    return kept
def filter_cands_min_nucleotides(cands, min_nucleotides, verbose=1):
    """
    Keep only candidates spanning at least the given number of nucleotides
    :param cands: Candidate DataFrame to filter
    :param min_nucleotides: Minimum number of nucleotides
    :param verbose: Verbosity
    :return: candidates filtered by minimum number of nucleotides
    """
    # An empty frame has no 'nucl_length' column, so return it untouched
    if cands.empty:
        return cands
    mask = cands['nucl_length'] >= min_nucleotides
    kept = cands[mask]
    if verbose:
        print('Filtered candidates by minimum of {} nucleotides, {}/{} remained.'.format(min_nucleotides, len(kept), len(cands)))
    return kept
def filter_cands_min_proteins(cands, min_proteins, verbose=1):
    """
    Keep only candidates that contain at least the given number of proteins
    :param cands: Candidate DataFrame to filter
    :param min_proteins: Minimum number of proteins
    :param verbose: Verbosity
    :return: candidates filtered by minimum number of proteins
    """
    # An empty frame has no 'num_proteins' column, so return it untouched
    if cands.empty:
        return cands
    mask = cands['num_proteins'] >= min_proteins
    kept = cands[mask]
    if verbose:
        print('Filtered candidates by minimum of {} proteins, {}/{} remained.'.format(min_proteins, len(kept), len(cands)))
    return kept
def get_candidate(start, end, pfam_ids, protein_ids, protein_predictions):
    """
    Build a single BGC candidate dictionary from its raw components
    :param start: nucleotide coordinate start
    :param end: nucleotide coordinate end
    :param pfam_ids: list of pfam ids in candidate
    :param protein_ids: list of protein ids in candidate
    :param protein_predictions: list of protein model prediction outputs
    :return: BGC candidate dictionary
    """
    return dict(
        nucl_start=start,
        nucl_end=end,
        num_proteins=len(protein_ids),
        num_bio_domains=num_bio_pfams(pfam_ids),
        num_all_domains=len(pfam_ids),
        protein_ids=';'.join(protein_ids),
        pfam_ids=';'.join(pfam_ids),
        avg_prediction=np.mean(protein_predictions),
    )
def threshold_contig_candidates(domain_predictions, threshold, max_protein_gap=0, max_nucl_gap=0):
    """
    Get a BGC candidate DataFrame for domain predictions in a single contig.
    Generated by averaging domain predictions by protein and then merging consecutive proteins with prediction satisfying given threshold
    :param domain_predictions: DataFrame of domains and their 'prediction' column
    :param threshold: Averaged protein prediction threshold (inclusive) used to include or discard BGC proteins
    :param max_protein_gap: Merge candidates with given (or smaller) number of non-BGC proteins between them
    :param max_nucl_gap: Merge candidates with given (or smaller) number of nucleotides between them
    :return: DataFrame of BGC candidates
    """
    candidates = []
    protein_predictions = average_protein_prediction(domain_predictions, y=domain_predictions['prediction'], concat_domains=True)
    # State of the currently open candidate region (None start = no open region)
    candidate_start = None
    candidate_end = None
    candidate_domains = []
    candidate_proteins = []
    candidate_predictions = []
    # State of the gap (run of inactive proteins) since the last candidate was closed
    gap_domains = []
    gap_proteins = []
    gap_predictions = []
    for i, protein in protein_predictions.iterrows():
        prediction = protein['prediction']
        # Inactive protein, add to gap
        if prediction < threshold:
            gap_proteins.append(protein['protein_id'])
            gap_domains += protein['pfam_ids'].split(';')
            gap_predictions.append(prediction)
            # We just changed from active to inactive, add previous region as candidate
            if candidate_start is not None:
                candidates.append((candidate_start, candidate_end, candidate_domains, candidate_proteins, candidate_predictions))
                candidate_start = None
                candidate_end = None
                candidate_domains = []
                candidate_proteins = []
                candidate_predictions = []
        # Active protein
        else:
            # BUGFIX: must compare with None explicitly. The previous truthiness test
            # ('if not candidate_start') treated a legitimate gene_start of 0 as "no open
            # candidate", re-entering this branch on every subsequent active protein and
            # wrongly re-triggering the merge logic (gap lists are empty mid-candidate,
            # so 'len(gap_proteins) <= max_protein_gap' was trivially satisfied).
            if candidate_start is None:
                candidate_start = protein['gene_start']
                if candidates:
                    # Check if we should merge with the previous candidate
                    prev_start, prev_end, prev_domains, prev_proteins, prev_predictions = candidates[-1]
                    if len(gap_proteins) <= max_protein_gap or (candidate_start - prev_end) <= max_nucl_gap:
                        # Remove previous candidate and continue where it started,
                        # absorbing the gap proteins/domains/predictions into the merged region
                        candidates = candidates[:-1]
                        candidate_start = prev_start
                        candidate_domains = prev_domains + gap_domains
                        candidate_proteins = prev_proteins + gap_proteins
                        candidate_predictions = prev_predictions + gap_predictions
            candidate_end = protein['gene_end']
            candidate_proteins.append(protein['protein_id'])
            candidate_domains += protein['pfam_ids'].split(';')
            candidate_predictions.append(prediction)
            gap_domains = []
            gap_proteins = []
            gap_predictions = []
    # Last protein was active, add previous region as candidate
    if candidate_start is not None:
        candidates.append((candidate_start, candidate_end, candidate_domains, candidate_proteins, candidate_predictions))
    candidates = [get_candidate(*args) for args in candidates]
    return pd.DataFrame(candidates)
def contig_id_from_filename(path):
    """
    Create a basic contig_id from a file name without extension
    :param path: Path of file
    :return: file name without extension that can be used as contig_id
    """
    basename = os.path.basename(path)
    contig_id, _ext = os.path.splitext(basename)
    return contig_id
def threshold_candidates(predictions, threshold, max_protein_gap=0, max_nucl_gap=0, min_bio_domains=0,
                         min_proteins=0, min_nucleotides=0, i=0, verbose=1):
    """
    Get a BGC candidate DataFrame for domain predictions in multiple contigs.
    Generated by averaging domain predictions by protein and then merging consecutive proteins with prediction satisfying given threshold
    Candidates can be postprocessed by merging and filtering
    :param predictions: DataFrame of domains and their 'prediction' column
    :param threshold: Averaged protein prediction threshold (inclusive) used to include or discard BGC proteins
    :param max_protein_gap: Merge candidates with given (or smaller) number of non-BGC proteins between them
    :param max_nucl_gap: Merge candidates with given (or smaller) number of nucleotides between them
    :param min_bio_domains: Discard candidates with less than min_bio_domains biosynthetic domains
    :param min_proteins: Discard candidates with less than min_proteins proteins
    :param min_nucleotides: Discard candidates with less than min_nucleotides nucleotides
    :param i: Work index for logging purposes
    :param verbose: Verbosity
    :return: DataFrame of BGC candidates
    """
    per_contig = []
    # One file can contain multiple contigs, get candidates separately for each contig_id group
    for contig_id, contig_predictions in predictions.groupby('contig_id'):
        if verbose:
            print('Thresholding {} predictions in {} (#{})'.format(len(contig_predictions), contig_id, i))
        cands = threshold_contig_candidates(contig_predictions, threshold, max_protein_gap=max_protein_gap, max_nucl_gap=max_nucl_gap)
        if cands.empty:
            continue
        # Annotate candidates with contig, coordinates, length, a content hash and a readable id
        cands['contig_id'] = contig_id
        cands['nucl_start'] = cands['nucl_start'].astype('int64')
        cands['nucl_end'] = cands['nucl_end'].astype('int64')
        cands['nucl_length'] = cands['nucl_end'] - cands['nucl_start'] + 1
        cands['candidate_hash'] = cands['pfam_ids'].apply(
            lambda ids: hashlib.md5(ids.encode('utf-8')).hexdigest())
        cands['candidate_id'] = cands.apply(
            lambda row: '{}({}-{})'.format(row['contig_id'], row['nucl_start'], row['nucl_end']), axis=1)
        column_order = ['contig_id', 'candidate_id', 'candidate_hash', 'avg_prediction', 'nucl_length', 'nucl_start', 'nucl_end',
                        'num_bio_domains', 'num_all_domains', 'num_proteins', 'protein_ids', 'pfam_ids']
        cands = cands[column_order]
        # Optional post-filters, applied only when a nonzero minimum was requested
        if min_bio_domains:
            cands = filter_cands_min_bio_pfams(cands, min_pfams=min_bio_domains, verbose=verbose)
        if min_proteins:
            cands = filter_cands_min_proteins(cands, min_proteins, verbose=verbose)
        if min_nucleotides:
            cands = filter_cands_min_nucleotides(cands, min_nucleotides, verbose=verbose)
        per_contig.append(cands)
    if not per_contig:
        return pd.DataFrame()
    return pd.concat(per_contig)
def task(args):
    """
    Single parallel task wrapper for the threshold_candidates function
    Processes a single Domain CSV file
    :param args: Task arguments
    :return: DataFrame of BGC candidates, result of threshold_candidates
    """
    i, path, threshold, options = args
    predictions = pd.read_csv(path)
    # Files without an explicit contig_id column get one derived from the file name
    if 'contig_id' not in predictions.columns:
        predictions['contig_id'] = contig_id_from_filename(path)
    return threshold_candidates(
        predictions=predictions,
        threshold=threshold,
        i=i,
        max_protein_gap=options.genegap,
        max_nucl_gap=options.nuclgap,
        min_bio_domains=options.min_bio_domains,
        min_proteins=options.min_proteins,
        min_nucleotides=options.min_nucleotides
    )
if __name__ == "__main__":
    # Parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--threshold", dest="threshold", required=False, type=float,
                        help="Prediction threshold to select proteins.", metavar="INT")
    parser.add_argument("-gg", "--gene-gap", dest="genegap", required=False, default=0, type=int,
                        help="Merge candidates with gene-gap or less genes in between.", metavar="INT")
    parser.add_argument("-ng", "--nucl-gap", dest="nuclgap", required=False, default=0, type=int,
                        help="Merge candidates with nucl-gap or less nucleotides in between.", metavar="INT")
    parser.add_argument("-md", "--min-bio-domains", dest="min_bio_domains", required=False, default=0, type=int,
                        help="Include only candidate with at least given number of known biosynthetic protein domains.", metavar="INT")
    parser.add_argument("-mp", "--min-proteins", dest="min_proteins", required=False, default=0, type=int,
                        help="Include only candidate with at least given number of proteins.", metavar="INT")
    parser.add_argument("-mn", "--min-nucleotides", dest="min_nucleotides", required=False, default=0, type=int,
                        help="Include only candidate with at least given number of nucleotides.", metavar="INT")
    parser.add_argument("-c", "--confusion", dest="confusion", required=False,
                        help="Confusion matrix JSON file to get threshold from.", metavar="FILE")
    parser.add_argument("-o", "--output", dest="output", required=True,
                        help="Output csv file path.", metavar="FILE")
    parser.add_argument(dest='predictions', nargs='+',
                        help="Paths to domain sequence CSVs with predictions.", metavar="FILE")
    options = parser.parse_args()

    # BUGFIX: compare with None — a truthiness test rejected an explicit '--threshold 0.0'
    # (0.0 is falsy), silently falling through to the confusion-matrix branch.
    if options.threshold is not None:
        threshold = options.threshold
    elif options.confusion:
        # Read the threshold from a validation confusion matrix JSON file
        with open(options.confusion) as f:
            rates = json.load(f)
        threshold = rates['threshold']
        print('Threshold {:.5f} with {}'.format(threshold, rates))
    else:
        raise AttributeError('Specify either threshold or path to validation file')

    # Expand any directory arguments into the list of files they contain
    paths = []
    for path in options.predictions:
        if os.path.isdir(path):
            files = [os.path.join(path, name) for name in os.listdir(path)]
            print('Adding {} files from directory: {}'.format(len(files), path))
            paths += files
        else:
            paths.append(path)

    # Process each Domain CSV file in parallel; the context manager ensures
    # the worker pool is properly terminated (the original leaked the Pool).
    with Pool() as pool:
        candidates = pool.map(task, [(i, path, threshold, options) for i, path in enumerate(paths)])
    candidates: pd.DataFrame = pd.concat(candidates)
    candidates.to_csv(options.output, index=False)
    print('Saved {} candidates to {}'.format(len(candidates), options.output))