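"""Build and decompose mutational signatures from MAF files.

`make` tallies each tumor sample's single-nucleotide variants into the 96
trinucleotide substitution contexts, `load_stratton_signatures` reads a
reference signature matrix (e.g. the Sanger/COSMIC "Stratton" signatures),
and `decompose`/`decompose_to_file`/`decompose_to_file_parallel` fit each
sample's counts as a non-negative mixture of the reference signatures by
basin-hopping on a log-likelihood objective.
"""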
import multiprocessing
import pdb

import numpy as np
from scipy.optimize import basinhopping


def make(maf_path, out_path=None, substitution_order=None):
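    """Tally per-sample trinucleotide substitution counts from a MAF file.

    Requires the MAF to contain Tumor_Sample_Barcode, Reference_Allele,
    Variant_Type, Tumor_Seq_Allele2 and Ref_Tri columns. If `out_path` is
    given, the counts are also written there as a tab-separated table.
    Returns a dict with the substitution order and a mapping from sample
    barcode to its vector of context counts, or None if a required column
    is missing.
    """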
    id_col = "Tumor_Sample_Barcode"
    ref_allele_col = "Reference_Allele"
    variant_type_col = "Variant_Type"
    tum_allele_col = "Tumor_Seq_Allele2"
    ref_tri_col = "Ref_Tri"
    required_columns = [id_col, ref_allele_col, variant_type_col,
                        tum_allele_col, ref_tri_col]
    maf_f = open(maf_path, 'r')
    column_labels = maf_f.readline().strip().split('\t')
    column_index = {}
    for i, x in enumerate(column_labels):
        column_index[x] = i
    for column_label in required_columns:
        if column_label not in column_labels:
            print("Fatal: required column '%s' missing" % column_label)
            return None
    id_index = column_index[id_col]
    ref_allele_index = column_index[ref_allele_col]
    variant_type_index = column_index[variant_type_col]
    tum_allele_index = column_index[tum_allele_col]
    ref_tri_index = column_index[ref_tri_col]
    nucleotides = ['A', 'C', 'G', 'T']
    nucleotide_complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # The six substitution classes with a pyrimidine reference base
    # (C>A, C>G, C>T, T>A, T>C, T>G).
    substitution_classes = ['CA', 'CG', 'CT', 'TA', 'TC', 'TG']
    if substitution_order is None:
        substitution_order = []
        for change in substitution_classes:
            for left in nucleotides:
                for right in nucleotides:
                    substitution_order.append(left + change + right)
    signatures = {}
    line_number = 0
    num_snps_counted = 0
    num_snps_skipped = 0
    num_nonsnps = 0
    for line in maf_f:
        line_number += 1
        sline = [x.strip() for x in line.split('\t')]
        if len(sline) != len(column_labels):
            print("Warning: line length doesn't match number of columns; "
                  "skipping line %d" % line_number)
            continue
        variant_type = sline[variant_type_index]
        if not variant_type.startswith("SNP"):
            num_nonsnps += 1
            continue
        # Count the SNP as skipped until it is successfully tallied below.
        num_snps_skipped += 1
        samp_id = sline[id_index]
        ref_allele = sline[ref_allele_index]
        tum_allele = sline[tum_allele_index]
        snp = ref_allele + tum_allele
        if snp not in substitution_classes:
            # Fold purine-reference substitutions onto the complementary strand.
            snp = nucleotide_complement[ref_allele] + nucleotide_complement[tum_allele]
        ref_trinuc = sline[ref_tri_index]
        if ref_trinuc == "NA":
            print("Warning: reference trinucleotide not available on "
                  "line %d; skipping line" % line_number)
            continue
        if ref_trinuc[1] != snp[0]:
            print("Warning: reference allele does not match reference "
                  "trinucleotide; skipping line %d" % line_number)
            continue
        snp_with_ctx = ref_trinuc[0] + snp + ref_trinuc[2]
        if samp_id not in signatures:
            signatures[samp_id] = [0] * len(substitution_order)
        if snp_with_ctx not in substitution_order:
            print("Warning: substitution on line %d is %s, not found among "
                  "possible substitutions; skipping line." % (line_number, snp_with_ctx))
            continue
        signatures[samp_id][substitution_order.index(snp_with_ctx)] += 1
        num_snps_counted += 1
        num_snps_skipped -= 1
    maf_f.close()
    print("%d lines read; %d SNPs counted, %d SNPs skipped, %d non-SNPs skipped"
          % (line_number, num_snps_counted, num_snps_skipped, num_nonsnps))
    if out_path is not None:
        out_f = open(out_path, 'w')
        out_f.write('\t'.join(["Tumor_Sample_Barcode"] + substitution_order))
        out_f.write('\n')
        for samp_id in signatures:
            out_f.write(samp_id)
            out_f.write('\t')
            out_f.write('\t'.join(map(str, signatures[samp_id])))
            out_f.write('\n')
        out_f.close()
    return {'substitution_order': substitution_order, 'signatures': signatures}
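

# The three helpers below compute the (symmetrised) Kullback-Leibler divergence
# between two signature vectors. They are not called elsewhere in this module,
# so they appear to be standalone utilities for comparing signatures.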
def alogb(a, b):
    """Return a * log(b), with both arguments clamped away from zero."""
    a = max(a, 0.0001)
    b = max(b, 0.0001)
    return a * np.log(b)


def div(to_array, from_array):
    """Kullback-Leibler divergence D(to_array || from_array)."""
    to_l = to_array.tolist()
    from_l = from_array.tolist()
    return sum(map(alogb, to_l, to_l)) - sum(map(alogb, to_l, from_l))


def sym_div(a, b):
    """Symmetrised KL divergence."""
    return div(a, b) + div(b, a)


def load_stratton_signatures(path):
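    """Read a reference signature file (one signature per row).

    Returns a dict with the signature matrix transposed to contexts x
    signatures, the signature names, and the substitution order taken
    from the header line.
    """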
    f = open(path, 'r')
    substitution_order = [x.strip() for x in f.readline().split('\t')]
    sigs = []
    names = []
    for line in f:
        sline = line.split('\t')
        names.append(sline[0])
        sigs.append([float(x) for x in sline[1:]])
    f.close()
    return {'signatures': np.array(sigs).T,
            'names': names,
            'substitution_order': substitution_order}


def load(path):
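    """Read a signature count table written by make() into a dict of sample -> counts array."""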
    f = open(path, 'r')
    f.readline()  # skip the substitution-order header
    ret = {}
    for line in f:
        sline = line.split('\t')
        ret[sline[0]] = np.array([float(x) for x in sline[1:]])
    f.close()
    return ret


def decompose(target, sigs, random_seed=None):
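    """Fit `target` as a non-negative mixture of the columns of `sigs`.

    The mixture weights are parametrised as squared values so they stay
    non-negative, normalised to sum to one, and optimised by basin-hopping
    on the negative log-likelihood of the normalised target counts.
    Returns the weight vector, or None if the sample has fewer than 5 mutations.
    """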
    num_muts = np.sum(target)
    if num_muts < 5:
        print("Warning: sample has fewer than 5 mutations; cancelling decomposition")
        return None
    num_sigs = sigs.shape[1]
    target = np.array(target) / float(np.sum(target))
    if random_seed is not None:
        np.random.seed(seed=random_seed)
    # Random starting point: a multinomial draw over the signatures, L2-normalised.
    seed = np.random.multinomial(int(num_muts), np.ones((num_sigs,)) / float(num_sigs), 1)[0].astype(float)
    seed = seed / np.linalg.norm(seed)

    class Step(object):
        """Custom basin-hopping step: uniform perturbation of every coordinate."""

        def __init__(self, stepsize=5):
            self.stepsize = stepsize

        def __call__(self, x):
            x += np.random.uniform(-self.stepsize, self.stepsize, np.shape(x))
            return x

    np.seterr(all="raise")

    def error(x):
        # Squaring and normalising keeps the weights non-negative and summing to one.
        coeff = x * x
        coeff = coeff / np.sum(coeff)
        approx = sigs.dot(coeff)
        try:
            return -target.dot(np.log(np.maximum(approx, 0.0001)))
        except FloatingPointError:
            # np.seterr(all="raise") turns numerical problems into exceptions;
            # drop into the debugger to inspect them.
            pdb.set_trace()

    result = basinhopping(error, seed, niter=3, disp=False, T=5, take_step=Step()).x
    result = result * result
    result = result / np.sum(result)
    return result


def decompose_to_file(targets, sigs, sigs_names, to_file, random_seed=None):
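    """Decompose every sample in `targets` and write the results to `to_file` as a tab-separated table."""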
    out_f = open(to_file, 'w')
    out_f.write('\t'.join(["Sample Name", "Number of Mutations"] + sigs_names))
    out_f.write('\n')
    num_targets = len(targets)
    num_targets_decomposed = 0
    for target_name in targets:
        if num_targets_decomposed % 50 == 0:
            print("%d/%d decomposed" % (num_targets_decomposed, num_targets))
        decomposition = None
        try:
            decomposition = decompose(targets[target_name], sigs, random_seed=random_seed)
        except Exception:
            print("DECOMPOSITION EXCEPTION FOR " + target_name)
        num_targets_decomposed += 1
        if decomposition is None:
            print("Sample %s not decomposed" % target_name)
            continue
        out_f.write(target_name)
        out_f.write('\t')
        out_f.write(str(np.sum(targets[target_name])))
        out_f.write('\t')
        out_f.write('\t'.join(map(str, decomposition)))
        out_f.write('\n')
    out_f.close()


def parse_decompose(arg_list):
    """Accessory function that keeps the sample name attached to its
    decomposition so results can be matched up after a pool.map call."""
    target_name, target, sigs = arg_list
    try:
        return [target_name, decompose(target, sigs)]
    except Exception:
        return [target_name, None]


def decompose_to_file_parallel(targets, sigs, sigs_names, out_file, threads):
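    """Decompose all samples with a multiprocessing pool and write the results to `out_file`.

    Unlike decompose_to_file, this path does not pass a random seed through to decompose.
    """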
    pool = multiprocessing.Pool(processes=threads)
    input_data = []
    for target_name in targets:
        input_data.append([target_name, targets[target_name], sigs])
    result = pool.map_async(parse_decompose, input_data)
    pool.close()
    pool.join()
    results = result.get()
    # Header
    out_f = open(out_file, 'w')
    out_f.write('\t'.join(["Sample Name", "Number of Mutations"] + sigs_names))
    out_f.write('\n')
    # One row per successfully decomposed sample
    for target_name, decomposition in results:
        if decomposition is None:
            continue
        out_f.write('\t'.join([target_name,
                               str(np.sum(targets[target_name])),
                               '\t'.join(map(str, decomposition))]))
        out_f.write('\n')
    out_f.close()
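

# A minimal usage sketch, not part of the original module: the file names below are
# hypothetical placeholders, and the MAF is assumed to carry the Ref_Tri column that
# make() requires. The reference signature file is assumed to list its contexts in
# the same order as make()'s default substitution_order, since decompose() does not
# reorder columns. The __main__ guard also matters for multiprocessing on platforms
# that spawn worker processes.
if __name__ == "__main__":
    # 1. Tally per-sample trinucleotide substitution counts from a MAF file.
    made = make("cohort.maf", out_path="cohort_signatures.txt")

    # 2. Load a reference signature matrix (returned as contexts x signatures).
    ref = load_stratton_signatures("reference_signatures.txt")

    # 3. Decompose every sample against the reference signatures, in parallel.
    if made is not None:
        targets = load("cohort_signatures.txt")
        decompose_to_file_parallel(targets, ref['signatures'], ref['names'],
                                   "decompositions.txt", threads=4)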