calculate_mean_std.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import numpy as np
import logging
import multiprocessing as mp
import yaml
from os.path import join
import h5py
import importlib.resources
import d3d_signals
"""
This script calculates max, min, mean, and std individually for all signals in a dataset definition.
Max, min, mean, and std are calculated over all shots, and for each signal individually.
For 1d signals, all channels in the signal are used. That is, the calculation is over all shots
and all channels.
The results are stored in the file `normalization.yaml` as follows
predictor_name:
max:
mean:
min:
std:
"""
# Each worker task fetches a single variable from all shots and calculates
# its min, max, mean, and std.
def process_variable(args):
    """Fetch a single variable from all shots and calculate min, max, mean and std
    over all shots and channels.

    Args:
        datadir (string): Directory containing the HDF5 files
        group_name (tuple): Tuple (predictor_name, map_to). map_to is used to find
            the HDF5 group name for the predictor.
        shotlist (list[int]): List of shots to process
    """
    datadir, group_name, shotlist = args  # Unpack the arguments
    pred_name, map_to = group_name  # Unpack predictor name and map_to
    # 1. Cache the relevant data from the list of HDF5 files
    cache_data = []
    for shotnr in shotlist:
        with h5py.File(join(datadir, f"{shotnr}.h5"), "r") as df:
            cache_data.append(df[map_to]["zdata"][:].flatten())
    # Stack the cached per-shot arrays into a single 1d array. Re-raise with the
    # predictor name so a failing signal can be identified.
    try:
        all_signals = np.hstack(cache_data)
    except ValueError as err:
        raise ValueError(f"Could not stack data for predictor {pred_name}") from err
    # 2. Calculate min, max, mean and std. Convert numpy datatypes to float.
    # Otherwise yaml.safe_dump throws an error.
    return {pred_name: {"min": float(all_signals.min()),
                        "max": float(all_signals.max()),
                        "mean": float(all_signals.mean()),
                        "std": float(np.std(all_signals, dtype=np.float64))}}
if __name__ == "__main__":
    logging.basicConfig(filename="instantiate.log",
                        format="%(asctime)s %(message)s",
                        encoding="utf-8",
                        level=logging.INFO)
    parser = argparse.ArgumentParser(
        prog="calculate_mean_std.py",
        description="Calculates min, max, mean, and std for all signals in a dataset definition")
    parser.add_argument("--dataset_def", type=str,
                        help="YAML file that contains the definition of the dataset")
    parser.add_argument("--destination", type=str,
                        help="Directory containing the dataset HDF5 files")
    args = parser.parse_args()

    # Load dataset definition
    with open(args.dataset_def, "r") as stream:
        dataset_def = yaml.safe_load(stream)
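    # Illustrative shape of a dataset definition file. The keys `predictors`
    # and `shots` are used below; the values here are hypothetical:
    #
    #   predictors: ["q95", "pinj"]
    #   shots: [180000, 180001]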
    # The MDS/PTdata signals are stored under the field `map_to` in
    # signals_0d.yaml and signals_1d.yaml. Load these files and find the
    # map_to names that correspond to the field `predictors` in the
    # dataset definition.
    resource_path = importlib.resources.files("d3d_signals")
    with open(join(resource_path, "signals_0d.yaml"), "r") as fp:
        signals_0d = yaml.safe_load(fp)
    with open(join(resource_path, "signals_1d.yaml"), "r") as fp:
        signals_1d = yaml.safe_load(fp)
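    # Illustrative shape of an entry in signals_0d.yaml. Only the `map_to`
    # field is used below; the signal name and value are hypothetical:
    #
    #   q95:
    #     map_to: q95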
    # Gather the name of each predictor listed in the dataset definition and
    # its corresponding HDF5 group name. The result is a list of tuples
    # (predictor, map_to).
    group_list_0d = [(k, signals_0d[k]["map_to"]) for k in dataset_def["predictors"] if k in signals_0d]
    group_list_1d = [(k, signals_1d[k]["map_to"]) for k in dataset_def["predictors"] if k in signals_1d]
    group_list = group_list_0d + group_list_1d
    # Generate the list of shots in the dataset. The statistics are calculated
    # over all shots listed in the dataset definition.
    shot_list = list(dataset_def["shots"])
    # Calculate the statistics for each predictor in a separate worker task
    with mp.Pool(4) as pool:
        res_mean_std = pool.map(process_variable,
                                [(args.destination, grp, shot_list) for grp in group_list])
    # res_mean_std is a list of dicts:
    # [{"var1": {"min": ..., "max": ..., ...}}, {"var2": {"min": ..., ...}}, ...]
    # Combine them into a single dict
    dict_mean_std = {}
    for entry in res_mean_std:
        dict_mean_std.update(entry)

    # Serialize the dictionary to file
    with open("normalization.yaml", "w") as fp:
        fp.write(yaml.safe_dump(dict_mean_std))
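# Example invocation (paths are hypothetical):
#   python calculate_mean_std.py --dataset_def my_dataset.yaml --destination /path/to/data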
# end of file calculate_mean_std.py