-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgather_data.py
259 lines (211 loc) · 8.1 KB
/
gather_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Gather data script for NEST.
# Collects all the JSON data generated by each individual simulation in a given directory.
# Used by benchmark.sh after all the simulation runs.
# execute with:
# python3 gather_data.py PATH [--out=FILE]
# with PATH the path to the top directory containing the results of multiple simulations
# with FILE an optional argument to the path for the output file.
# defaults to "data.json"
import json
import numpy as np
from pathlib import Path
from argparse import ArgumentParser
def get_paths() -> tuple:
    """
    Parse the arguments given on the command line when launching the script.

    Required:
    - path (string): the path to the top directory containing the results of multiple simulations
    Optional:
    - --out (string): optional path for the output file. Defaults to "data.json".

    Returns:
    - Tuple of input directory (Path) and output file (Path)

    Exits with an argparse usage error (SystemExit) if the input path is not
    a directory, or if the output path exists but is not a regular file.
    """
    parser = ArgumentParser()
    parser.add_argument("path", type=str)
    parser.add_argument("--out", type=str, default="data.json")
    args = parser.parse_args()
    p = Path(args.path)
    o = Path(args.out)
    # Explicit validation instead of `assert`: asserts are stripped when the
    # interpreter runs with optimizations (`python -O`), which would silently
    # skip these checks.
    if not p.is_dir():
        parser.error(f"{p} is not a directory")
    if o.exists() and not o.is_file():
        parser.error(f"{o} exists but is not a regular file")
    if o.is_file():
        print(f"WARNING: overriding {o}")
    return p, o
def get_json_results(path: Path) -> dict:
    """
    Iteratively load the content of every JSON file found at depth=1 of the
    given directory.

    Builds a JSON-like dictionary with the same structure as the input JSONs,
    containing lists of all the collected values, keyed by configuration id
    ("p-<procs>-t-<threads>").

    Arguments:
    - path (Path): path to the top directory containing the results of multiple simulations

    Returns:
    - results (dict): all the collected data as a dictionary.

    Raises:
    - AssertionError: if a seed is duplicated within one configuration, or if
      the number of collected values is inconsistent with the number of seeds.
    """
    results = {}
    for p in path.glob("*/*.json"):
        with p.open() as f:
            data = json.load(f)
        d_conf = data["conf"]
        d_seed = d_conf.pop("seed")
        d_ranks = data["ranks"]
        d_all_values = data["all_values"]
        id_conf = f"p-{d_conf['procs']}-t-{d_conf['threads']}"
        if id_conf not in results:
            results[id_conf] = {
                "conf": d_conf,
                "seeds": [],
                "ranks": {},
                "all_values": {
                    "stats": {},
                    "timers": {}
                }
            }
        # Within one configuration each seed must be unique.
        r_seeds = results[id_conf]["seeds"]
        assert d_seed not in r_seeds
        r_seeds.append(d_seed)
        r_ranks = results[id_conf]["ranks"]
        r_all_values = results[id_conf]["all_values"]
        # Per-rank stats and timers: one scalar per file, accumulated in lists.
        # `setdefault` replaces the original duplicated
        # "first file -> create list / later file -> append" branches.
        for rank, rank_data in d_ranks.items():
            if rank not in r_ranks:
                r_ranks[rank] = {"stats": {}, "timers": {}}
            for section in ("stats", "timers"):
                target = r_ranks[rank][section]
                for name, value in rank_data[section].items():
                    target.setdefault(name, []).append(value)
        # Per-configuration stats: one scalar per file.
        for stat, value in d_all_values["stats"].items():
            r_all_values["stats"].setdefault(stat, []).append(value)
        # Per-configuration timers: one list (one entry per process) per file.
        # Copy with list() so the accumulator never aliases — and later
        # mutates — the list object loaded from the input file.
        for timer, values in d_all_values["timers"].items():
            r_all_values["timers"].setdefault(timer, []).extend(list(values))
    # Sanity check: every collected list must have one entry per seed
    # (all_values timers carry `procs` entries per seed).
    for cid, res in results.items():
        num_seeds = len(res["seeds"])
        for rank_data in res["ranks"].values():
            for values in rank_data["stats"].values():
                assert len(values) == num_seeds
            for values in rank_data["timers"].values():
                assert len(values) == num_seeds
        for values in res["all_values"]["stats"].values():
            assert len(values) == num_seeds
        for values in res["all_values"]["timers"].values():
            assert len(values) == num_seeds * res["conf"]["procs"]
    return results
def _mean_std(values, scale: float = 1.0) -> dict:
    """
    Return {"mean", "std"} of `values` (each divided by `scale`) as plain floats.

    Plain `float` (not `np.float64`) keeps the output robustly JSON-serializable.
    Uses the population standard deviation (numpy default, ddof=0).
    """
    arr = np.asarray(values, dtype=float) / scale
    return {"mean": float(np.mean(arr)), "std": float(np.std(arr))}


def get_statistics(results: dict) -> dict:
    """
    Given a dictionary of results, for each list in the stats and timers
    sub-dictionaries, compute the mean and standard deviation and replace the
    list by a sub-dictionary containing these values.
    Timer values are converted from nanoseconds to seconds.
    Builds a JSON-like dictionary with the same structure as the input.

    Arguments:
    - results (dict): all the collected data as a dictionary.
    Returns:
    - stats (dict): all the computed statistics as a dictionary.
    """
    NS_PER_S = 1e9  # raw timers are in nanoseconds
    stats = {}
    for cid, res in results.items():
        stats[cid] = {
            "conf": res["conf"],
            "seeds": res["seeds"],
            "ranks": {
                rank: {
                    "stats": {k: _mean_std(v)
                              for k, v in rank_data["stats"].items()},
                    "timers": {k: _mean_std(v, NS_PER_S)
                               for k, v in rank_data["timers"].items()},
                }
                for rank, rank_data in res["ranks"].items()
            },
            "all_values": {
                "stats": {k: _mean_std(v)
                          for k, v in res["all_values"]["stats"].items()},
                "timers": {k: _mean_std(v, NS_PER_S)
                           for k, v in res["all_values"]["timers"].items()},
            },
        }
    return stats
def save_statistics(stats: dict, out: Path) -> None:
    """
    Serialize a JSON-like dictionary of statistics to the given output file.

    Arguments:
    - stats (dict): all the computed data as a dictionary.
    - out (Path): Path object to the output file (overwritten if it exists).
    """
    out.write_text(json.dumps(stats, indent=4))
def main():
    """
    Entry point: gather the per-simulation JSON results from the input
    directory, reduce them to mean/std statistics, and write the result
    to the output file.
    """
    in_dir, out_file = get_paths()
    save_statistics(get_statistics(get_json_results(in_dir)), out_file)


if __name__ == "__main__":
    main()