-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathDistillFileStatsToDB.py
153 lines (132 loc) · 6 KB
/
DistillFileStatsToDB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import argparse
import os
import pandas as pd
import math
import datetime
import numpy as np
import h5py
import matplotlib.pyplot as plt
import sqlite3
import re
def make_wide(formatter, w=150, h=36):
"""Return a wider HelpFormatter, if possible."""
try:
# https://stackoverflow.com/a/5464440
# beware: "Only the name of this class is considered a public API."
kwargs = {'width': w, 'max_help_position': h}
formatter(None, **kwargs)
return lambda prog: formatter(prog, **kwargs)
except TypeError:
warnings.warn("argparse help formatter failed, falling back.")
return formatter
### Make a parser
parser = argparse.ArgumentParser(description='Loops over keys of an HDF5 file, derives stats for values and time, saves into a database.',
formatter_class=make_wide(argparse.ArgumentDefaultsHelpFormatter))
### Add options
parser.add_argument ('infilename')
parser.add_argument ('-v', dest='debug', action="store_true", default=False,
help="Turn on verbose debugging. (default: F)")
parser.add_argument ('-q','--quick', dest='quick', action="store_true", default=False,
help="Stop after maxparams plots are rendered. (default: F)")
parser.add_argument ('--maxparams', dest='maxparams', default=2,
help="Max parameters to loop if -q. (default: 2).")
parser.add_argument ('--maxrows', dest='maxrows', default=1E2,
help="Max rows to process if -q. (default: 1E2).")
parser.add_argument ('--db', dest='dbname', default='GMPSAI.db',
help="Name of sqlite3 database file to save caculated stats into. (default: GMPSAI.db)")
### Get the options and argument values from the parser....
options = parser.parse_args()
### ...and assign them to variables. (No declaration needed, just like bash!)
infilename = options.infilename
if not os.path.isfile(infilename): exit ('File not found: '+infilename)
debug = options.debug
quick = options.quick
maxparams = int(options.maxparams)
maxrows = int(options.maxrows)
dbname = options.dbname
# Open the file
infile = h5py.File(infilename, 'r')
filekeys = list (infile.keys())
infile.close()
keycount = len (filekeys)
keynum = 1
conn = sqlite3.connect(dbname)
dbcurs = conn.cursor()
# Some fiddly work to extract the start and end datetime for this file.
justthefile = infilename.split('/')[infilename.count('/')]
StartTime = re.findall(r"\d\d\d\d-\d\d-\d\d\+\d\d:\d\d:\d\d",justthefile)[0]
if debug: print ('StartTime:')
if debug: print (StartTime)
# Convert to epoch
utc_time = datetime.datetime.strptime(StartTime, '%Y-%m-%d+%H:%M:%S')
epoch_time = (utc_time - datetime.datetime(1970, 1, 1)).total_seconds()
if debug: print (epoch_time)
timedflist = []
# Loop over ACNET devices (hdf5 top set of keys)
for key in filekeys:
if quick and keynum > maxparams: break
keynum += 1
# Temporary: Dataframe of two columns: values and timestamps for this param ('key')
tempdf = pd.read_hdf(infilename, key)
if debug: print (" "+key+" shape:", tempdf.shape)
# Get the two column names of interest, the values and the times:
cols = list (tempdf.columns)
if debug: print (cols)
timecolname = ''
valcolname = ''
for colname in cols:
if colname.count('utc_seconds') > 0:
timecolname = colname
else:
valcolname = colname
if debug: print ('timecol = ',timecolname,' & valcol = ',valcolname)
# Append timestamps column to a separate dataframe of them, for analysis outside this loop.
newcol = tempdf[timecolname].rename('utc_'+key, axis=1)
timedflist.append(newcol)
# A temporary dataframe of just the values for this parameter, for stats calulation
valdf = tempdf[valcolname]
if debug: print (valdf.describe())
# Get some stats to make plots, and store for long-term time trends
statnames = ['mode','std','min','max']
valstatsdict = {}
valstatsdict['mode'] = valdf.mode()[0] # Zeroth mode only
valstatsdict['mean'] = valdf.mean()
valstatsdict['std' ] = valdf.std()
valstatsdict['min' ] = valdf.min()
valstatsdict['max' ] = valdf.max()
if debug: print (valstatsdict)
upsertstr = 'REPLACE INTO ACNETparameterStats (epochUTCsec, paramname, statname, statval) VALUES ('
upsertstr += str(epoch_time) +', "'+key +'", '
for statname in statnames:
cmdstr = upsertstr +'"'+statname +'", '+str(valstatsdict[statname])+');'
if debug: print (cmdstr)
dbcurs.execute(cmdstr)
conn.commit()
# Done with loop over statnames.
# Calculate sample-to-sample time deltas (units of timedel)
timelist = tempdf[timecolname]#.sort_values() # Force to be non-negative
timedeltas = timelist.diff() #/ timedel
if debug: print (timedeltas.describe())
if debug: print(type(timedeltas[1]))
# Get some stats on the sample-to-sample time intervals (deltas)
tdelstatsdict = {}
tdelstatsdict['mode'] = timedeltas.mode()[0] # Zeroth mode only
tdelstatsdict['std' ] = timedeltas.std()
tdelstatsdict['min' ] = timedeltas.min()
tdelstatsdict['max' ] = timedeltas.max()
if debug: print (tdelstatsdict)
# Also database entries for the time deltas
# Now the paramname is like Interval_B:IMIN
upsertstr = upsertstr.replace(key,'Interval_'+key)
for statname in statnames: # Same stats but on consecutive entry time deltas
cmdstr = upsertstr +'"'+statname +'", '+str(tdelstatsdict[statname])+');'
if debug: print (cmdstr)
dbcurs.execute(cmdstr)
conn.commit()
# Possible that we should be using recorded values of b:LINEFRQ to correct the expected range of time deltas.
# inliermin = list(refsub_timedf.min(numeric_only=True, axis=0))
# del inliermin[-1] # Drop the final entry, which is from the line frequency
# inliermin = min(inliermin)
# inliermax = list(refsub_timedf.max(numeric_only=True, axis=0))
# del inliermax[-1] # Drop the final entry, which is from the line frequency
# inliermax = max(inliermax)