-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsimulate.py
274 lines (223 loc) · 9.86 KB
/
simulate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
import discsim
import ercs
import math
import multiprocessing
import random
import re
import subprocess as sub
import src.newick_conversion as newick
import time
from settings import settings
from src.validate_settings import validate
class Simulator(discsim.Simulator):
def setup(self, event_classes):
"""
Set simulation parameters
"""
self.sample = [None] + settings["sample_locations"]
self.event_classes = event_classes
self.recombination_probability = settings["recombination_probability"]
self.num_loci = int(settings["num_partitions"])
self.num_parents = settings["num_parents"]
def run_simulation(self, seed, seq_gen_seeds):
"""
Runs a single simulation. Converts simulation output to
newick format. Runs Seq-Gen with newick trees to generate
DNA sequence data. Returns dictionary of DNA sequences for
each sample.
"""
self.random_seed = seed
self.run()
result = self.get_history()
self.reset()
# Save file in Newick Tree Format
trees = self._make_trees(result)
filename = "tree_" + str(seed)
num_tree = 1
for tree in trees:
with open(filename + "_" + str(num_tree), "w") as f:
num_tree += 1
f.write(str(tree) + "\n")
# Call SeqGen
seqgen_sequences = self._run_seqgen(seed, seq_gen_seeds)
with open("DNA_" + str(seed), "w") as f:
for seq in seqgen_sequences:
sequence = "\t".join(seqgen_sequences[seq])
f.write("{0}\t{1}\n".format(seq, sequence))
# File cleanup
for i in range(1, num_tree):
sub.call(["rm", filename + "_" + str(i)])
sub.call(["rm", "DNA_" + str(seed) + "_" + str(i)])
sub.call(["mv", "DNA_" + str(seed), "output_sequences/"])
return seqgen_sequences
def _make_trees(self, oriented_trees):
"""
Takes pi and tau from the replicates generated by run_replicates
Converts them to Newick Tree Format.
Returns list of trees.
"""
pi = oriented_trees[0]
tau = oriented_trees[1]
trees = []
for i, j in zip(pi, tau):
# convert simulation time to generation time
j = self._rescale_coalescent_times(j)
# sometimes an extra 0L and 0.0 are added on to
# pi and tau respectively which breaks the newick script.
# the tree is otherwise correct
while(j[-1] == 0.0):
i.pop(-1)
j.pop(-1)
trees.append(newick.convert_tree(i, j))
return trees
def _run_seqgen(self, seed, seq_gen_seeds):
"""
Generates DNA sequences using HKY mutation model from newick trees.
Returns a dictionary from sample to list of sequences per loci.
"""
filename = "tree_" + str(seed)
partition_lengths = settings["partitions"][:]
num_tree = int(settings["num_partitions"])
for tree in range(1, num_tree + 1):
i = tree - 1
sub.call("./seqgen/seq-gen -mHKY -t2 -f0.35,0.15,0.25,0.25 -op -l{0} -s{1} -z{2} < {3}_{4} > DNA_{5}_{4}" \
.format(settings["partitions"][i], settings["mutation_rates"][i],
str(seq_gen_seeds[i]), filename, str(tree), str(seed)),
shell=True)
sequences = {}
for j in range(1, num_tree + 1):
with open("DNA_" + str(seed) + "_" + str(j)) as f:
lines = f.readlines()
for line in lines[1:]:
line.strip('\n')
split = line.split()
if split[0] in sequences:
sequences[split[0]].append(split[1])
else:
sequences[split[0]] = [split[1]]
return sequences
def _rescale_coalescent_times(self, tau):
"""
Rescales coalescent times from simulation time
to generation time.
"""
event = self.event_classes[0] # reproduction events
L_d = float(settings["length"] ** settings["dimensions"])
A = math.pi * (event.r**2) if settings["dimensions"] == 2 else 2 * event.r
scaled = map(lambda x: ( x * event.rate * event.u * A ) / L_d, tau )
return scaled
def generate_event_parameters(num_replicates):
"""
Make parameter sets for simulations, write to output file.
"""
def neighborhood_parameter_set():
seed = random.randint(1, 2**31 - 1)
seq_gen_seeds = [random.randint(1, 2**31 - 1) for i in range(int(settings["num_partitions"]))]
rate = random.uniform(settings["small_event"]["rate"][0], settings["small_event"]["rate"][1])
radius = random.uniform(settings["small_event"]["radius"][0], settings["small_event"]["radius"][1])
n_size = random.uniform(settings["neighborhood_size"][0], settings["neighborhood_size"][1])
u0 = settings["num_parents"] / float(n_size)
event_classes = [ ercs.DiscEventClass(rate = rate, r = radius, u = u0) ]
return (seed, seq_gen_seeds, event_classes, n_size)
def posterior_parameter_set(f_lines, length):
seed = random.randint(1, 2**31 - 1)
seq_gen_seeds = [random.randint(1, 2**31 - 1) for i in xrange(int(settings["num_partitions"]))]
rate = 1.0
line = f_lines[random.randint(1, length)].strip("\n").split(",")
radius = float(line[0])
u = settings["num_parents"] / float(line[1])
event_classes = [ ercs.DiscEventClass(rate = rate, r = radius, u = u)]
return (seed, seq_gen_seeds, event_classes, line[1])
random.seed()
filename = "parameters_{}_{}.txt".format(time.strftime("%x"), time.strftime("%X"))
filename = filename.replace("/","_").replace(":","")
if(settings["posterior_predictive_checks"]):
with open(settings["posterior_parameter_file"]) as f:
lines = f.readlines()
length = len(lines)
parameters = [ posterior_parameter_set(lines, length) for i in xrange(num_replicates) ]
with open(filename, "w") as f:
seed_parameters = {}
f.write("seed\trate\tradius\tu\tn_size\n")
for seed, seq_gen_seeds, event_classes, pop_size in parameters:
event = event_classes[0]
seed_parameters[str(seed)] = (event.rate, event.r, event.u, pop_size)
# Need to sort by string value to have same ordering as Arlequin
for seed in sorted(seed_parameters):
f.write("{}\t{}\t{}\t{}\t{}\n".format(seed, seed_parameters[seed][0], seed_parameters[seed][1],
seed_parameters[seed][2], seed_parameters[seed][3]))
else:
parameters = [ neighborhood_parameter_set() for i in xrange(num_replicates) ]
with open(filename, "w") as f:
seed_parameters = {}
f.write("seed\trate\tradius\tu\tn_size\n")
for seed, seq_gen_seeds, event_classes, pop_size in parameters:
event = event_classes[0]
seed_parameters[str(seed)] = (event.rate, event.r, event.u, pop_size)
# Need to sort by string value to have same ordering as Arlequin
for seed in sorted(seed_parameters):
f.write("{}\t{}\t{}\t{}\t{}\n".format(seed, seed_parameters[seed][0], seed_parameters[seed][1],
seed_parameters[seed][2], seed_parameters[seed][3]))
return parameters
def convert_to_arp(seed, sequence_dict):
"""
Generate an Arlequin file from DNA sequences.
"""
with open("ARQ_abc_" + str(seed) + ".arp", "w") as f:
f.write("[Profile]\n")
f.write("\tTitle=\"Discsim generated data\"\n")
f.write("\tNbSamples=1\n")
f.write("\tGenotypicData=0\n")
f.write("\tGameticPhase=1\n")
f.write("\tRecessiveData=0\n")
f.write("\tDataType=DNA\n")
f.write("\tLocusSeparator=TAB\n")
f.write("\tMissingData='?'\n")
f.write("\n")
f.write("[Data]\n")
f.write("\t[[Samples]]\n")
f.write("\n")
f.write("\t\tSampleName=\"Population1\"\n")
f.write("\t\tSampleSize=" + str(len(settings["sample_locations"])) + "\n")
f.write("\t\tSampleData= {\n")
for sample in sequence_dict:
f.write(re.search("\d+", sample).group(0) + "_1\t1\t")
f.write("\t".join(sequence_dict[sample]) + "\n")
f.write("\n}\n")
sub.call("mv " + "ARQ_abc_" + str(seed) + ".arp ./arlsumstat_files/ &> /dev/null &", shell=True)
def subprocess_worker(t):
"""
Runs a single simulation.
"""
# pop_size is unused here
seed, seq_gen_seeds, event_classes, pop_size = t
sim = Simulator(settings["length"])
sim.setup(event_classes)
seqgen_sequences = sim.run_simulation(seed, seq_gen_seeds)
convert_to_arp(seed, seqgen_sequences)
def run_simulations():
"""
Sets up multiprocessing parameters. Maps arguments
to subprocess workers.
"""
# Multiprocessing parameters
if settings["num_cpus"] == 0:
processes = multiprocessing.cpu_count()
else:
processes = settings["num_cpus"]
workers = multiprocessing.Pool(processes=processes, maxtasksperchild=1000)
args = generate_event_parameters(settings["num_replicates"])
if not settings["debug"]:
replicates = workers.map(subprocess_worker, args)
else:
for arg in args:
subprocess_worker(arg)
if __name__ == "__main__":
print "Validating settings file..."
validate()
print "Starting simulations..."
start_time = time.strftime("%x") + " " + time.strftime("%X")
run_simulations()
print "Done!"
print "Start Time: " + start_time
print "Stop Time: " + time.strftime("%x") + " " + time.strftime("%X")