Skip to content

Commit

Permalink
Merge pull request #76 from fact-project/fix_progress_bars
Browse files Browse the repository at this point in the history
Fix progress bars
  • Loading branch information
maxnoe authored Jan 8, 2019
2 parents 2be22fb + 1c2e897 commit ac9571f
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 30 deletions.
2 changes: 1 addition & 1 deletion aict_tools/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def apply_cuts_h5py_chunked(
with h5py.File(input_path, 'r') as infile, h5py.File(output_path, 'w') as outfile:
group = outfile.create_group(key)

for chunk in tqdm(range(n_chunks), disable=not progress):
for chunk in tqdm(range(n_chunks), disable=not progress, total=n_chunks):
start = chunk * chunksize
end = min(n_events, (chunk + 1) * chunksize)

Expand Down
76 changes: 56 additions & 20 deletions aict_tools/io.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from os import path
import os
from sklearn.externals import joblib
from sklearn2pmml import sklearn2pmml, PMMLPipeline
import logging
Expand Down Expand Up @@ -43,32 +43,68 @@ def read_telescope_data_chunked(path, aict_config, chunksize, columns, feature_g
'''
Reads data from hdf5 file given as PATH and yields dataframes for each chunk
'''
n_rows = h5py_get_n_rows(path, aict_config.telescope_events_key)
if chunksize:
n_chunks = int(np.ceil(n_rows / chunksize))
else:
n_chunks = 1
chunksize = n_rows
log.info('Splitting data into {} chunks'.format(n_chunks))

for chunk in range(n_chunks):

start = chunk * chunksize
end = min(n_rows, (chunk + 1) * chunksize)
return TelescopeDataIterator(
path,
aict_config,
chunksize,
columns,
feature_generation_config=feature_generation_config,
)


class TelescopeDataIterator:
    '''
    Iterator that yields ``(df, start, end)`` tuples, one per chunk of the
    telescope-events table in the hdf5 file at ``path``.

    Unlike a plain generator, this class implements ``__len__`` (the number
    of chunks), so wrapping it in ``tqdm`` produces a progress bar with a
    known total.

    Parameters
    ----------
    path: str
        Path to the input hdf5 file.
    aict_config:
        Configuration object; ``telescope_events_key`` is used to locate
        the table and the whole object is forwarded to
        ``read_telescope_data``.
    chunksize: int or None
        Number of rows per chunk. Falsy (``None`` / ``0``) means a single
        chunk containing all rows.
    columns:
        Columns to read, forwarded to ``read_telescope_data``.
    feature_generation_config: optional
        If given, ``feature_generation`` is applied in place to every
        yielded dataframe.
    '''

    def __init__(
        self,
        path,
        aict_config,
        chunksize,
        columns,
        feature_generation_config=None,
    ):
        self.aict_config = aict_config
        self.columns = columns
        self.feature_generation_config = feature_generation_config
        self.n_rows = h5py_get_n_rows(path, aict_config.telescope_events_key)
        self.path = path
        if chunksize:
            self.chunksize = chunksize
            self.n_chunks = int(np.ceil(self.n_rows / chunksize))
        else:
            # no chunksize given: read everything as one single chunk
            self.n_chunks = 1
            self.chunksize = self.n_rows
        log.info('Splitting data into {} chunks'.format(self.n_chunks))

        self.current_chunk = 0

    def __len__(self):
        # number of chunks, so tqdm can show a total without `total=`
        return self.n_chunks

    def __iter__(self):
        return self

    def __next__(self):
        if self.current_chunk == self.n_chunks:
            raise StopIteration

        chunk = self.current_chunk
        start = chunk * self.chunksize
        # clamp the last chunk to the actual number of rows
        end = min(self.n_rows, (chunk + 1) * self.chunksize)
        self.current_chunk += 1

        df = read_telescope_data(
            self.path,
            aict_config=self.aict_config,
            columns=self.columns,
            first=start,
            last=end
        )
        # give the chunk a global row index into the full table
        df.index = np.arange(start, end)

        if self.feature_generation_config:
            feature_generation(df, self.feature_generation_config, inplace=True)

        return df, start, end


def read_telescope_data(path, aict_config, columns, feature_generation_config=None, n_sample=None, first=None, last=None):
Expand Down Expand Up @@ -131,7 +167,7 @@ def read_telescope_data(path, aict_config, columns, feature_generation_config=No


def pickle_model(classifier, feature_names, model_path, label_text='label'):
p, extension = path.splitext(model_path)
p, extension = os.path.splitext(model_path)
classifier.feature_names = feature_names

if (extension == '.pmml'):
Expand Down
3 changes: 2 additions & 1 deletion aict_tools/scripts/train_disp_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def main(configuration_path, signal_path, predictions_path, disp_model_path, sig
random_state=config.seed,
)

for fold, (train, test) in tqdm(enumerate(kfold.split(df_train.values))):
total = model_config.n_cross_validations
for fold, (train, test) in enumerate(tqdm(kfold.split(df_train.values), total=total)):

cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]

Expand Down
16 changes: 8 additions & 8 deletions aict_tools/scripts/train_energy_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ def main(configuration_path, signal_path, predictions_path, model_path, verbose)
if model_config.log_target is True:
target = np.log(target)

n_cross_validations = model_config.n_cross_validations
n_cv = model_config.n_cross_validations
regressor = model_config.model
log.info('Starting {} fold cross validation... '.format(n_cross_validations))
log.info('Starting {} fold cross validation... '.format(n_cv))
scores = []
cv_predictions = []

kfold = model_selection.KFold(n_splits=n_cross_validations, shuffle=True, random_state=config.seed)
kfold = model_selection.KFold(n_splits=n_cv, shuffle=True, random_state=config.seed)

for fold, (train, test) in tqdm(enumerate(kfold.split(df_train.values))):
for fold, (train, test) in enumerate(tqdm(kfold.split(df_train.values), total=n_cv)):

cv_x_train, cv_x_test = df_train.values[train], df_train.values[test]
cv_y_train, cv_y_test = target.values[train], target.values[test]
Expand Down Expand Up @@ -109,10 +109,10 @@ def main(configuration_path, signal_path, predictions_path, model_path, verbose)

log.info('Pickling model to {} ...'.format(model_path))
pickle_model(
regressor,
feature_names=list(df_train.columns),
model_path=model_path,
label_text='estimated_energy',
regressor,
feature_names=list(df_train.columns),
model_path=model_path,
label_text='estimated_energy',
)


Expand Down

0 comments on commit ac9571f

Please sign in to comment.