From 099e9c67c08211936c63e9605a6f567a5b9bf48c Mon Sep 17 00:00:00 2001 From: Dacheng Xu Date: Wed, 31 Jul 2024 16:33:02 +0800 Subject: [PATCH] Separate and save lineage for all levels (#179) * Separate and save lineage for all levels * Debug * Debug --- appletree/component.py | 62 ++++++++++++++++++++---------------- appletree/config.py | 69 +++++++++++++++++++++-------------------- appletree/context.py | 42 +++++++++++++++---------- appletree/likelihood.py | 58 ++++++++++++++++++++-------------- appletree/plugin.py | 30 ++++++++++-------- appletree/share.py | 2 +- appletree/utils.py | 12 +++++++ 7 files changed, 161 insertions(+), 114 deletions(-) diff --git a/appletree/component.py b/appletree/component.py index 925302ce..be273a39 100644 --- a/appletree/component.py +++ b/appletree/component.py @@ -1,3 +1,4 @@ +import os from warnings import warn from functools import partial from typing import Tuple, List, Dict, Optional, Union, Set @@ -183,9 +184,13 @@ def compile(self): pass @property - def lineage_hash(self): + def lineage(self): raise NotImplementedError + @property + def lineage_hash(self): + return deterministic_hash(self.lineage) + @export class ComponentSim(Component): @@ -564,24 +569,26 @@ def new_component(self, llh_name: Optional[str] = None, pass_binning: bool = Tru return component @property - def lineage_hash(self): - return deterministic_hash( - { - **{ - "rate_name": self.rate_name, - "norm_type": self.norm_type, - "bins": self.bins, - "bins_type": self.bins_type, - "code": self.code, - }, - **dict( + def lineage(self): + return { + **{ + "rate_name": self.rate_name, + "norm_type": self.norm_type, + "bins": ( + tuple(b.tolist() for b in self.bins) if self.bins is not None else self.bins + ), + "bins_type": self.bins_type, + "code": self.code, + }, + **{ + "instances": dict( zip( self.instances, - [_cached_functions[self.llh_name][p].lineage_hash for p in self.instances], + [_cached_functions[self.llh_name][p].lineage for p in self.instances], ) - 
), - } - ) + ) + }, + } @export @@ -627,16 +634,19 @@ def simulate_weighted_data(self, parameters, *args, **kwargs): return result @property - def lineage_hash(self): - return deterministic_hash( - { - "rate_name": self.rate_name, - "norm_type": self.norm_type, - "bins": self.bins, - "bins_type": self.bins_type, - "file_name": calculate_sha256(get_file_path(self._file_name)), - } - ) + def lineage(self): + return { + "rate_name": self.rate_name, + "norm_type": self.norm_type, + "bins": tuple(b.tolist() for b in self.bins) if self.bins is not None else self.bins, + "bins_type": self.bins_type, + "file_path": ( + os.path.basename(self._file_name) + if not utils.FULL_PATH_LINEAGE + else get_file_path(self._file_name) + ), + "sha256": calculate_sha256(get_file_path(self._file_name)), + } @export diff --git a/appletree/config.py b/appletree/config.py index cc5076da..b058b8f7 100644 --- a/appletree/config.py +++ b/appletree/config.py @@ -8,6 +8,7 @@ import numpy as np from strax import deterministic_hash +from appletree import utils from appletree.share import _cached_configs from appletree.utils import ( exporter, @@ -112,9 +113,13 @@ def required_parameter(self, llh_name=None): return None @property - def lineage_hash(self): + def lineage(self): raise NotImplementedError + @property + def lineage_hash(self): + return deterministic_hash(self.lineage) + @export class Constant(Config): @@ -145,13 +150,11 @@ def build(self, llh_name: Optional[str] = None): self.value = value @property - def lineage_hash(self): - return deterministic_hash( - { - "llh_name": self.llh_name, - "value": self.value, - } - ) + def lineage(self): + return { + "llh_name": self.llh_name, + "value": self.value, + } @export @@ -338,15 +341,17 @@ def pdf_to_cdf(self, x, pdf): return x, cdf @property - def lineage_hash(self): - return deterministic_hash( - { - "llh_name": self.llh_name, - "method": self.method, - "file_path": os.path.basename(self.file_path), - "sha256": 
calculate_sha256(get_file_path(self.file_path)), - } - ) + def lineage(self): + return { + "llh_name": self.llh_name, + "method": self.method, + "file_path": ( + os.path.basename(self.file_path) + if not utils.FULL_PATH_LINEAGE + else get_file_path(self.file_path) + ), + "sha256": calculate_sha256(get_file_path(self.file_path)), + } @export @@ -500,16 +505,14 @@ def apply(self, pos, parameters): return median + add @property - def lineage_hash(self): - return deterministic_hash( - { - "llh_name": self.llh_name, - "method": self.method, - "median": self.median.lineage_hash, - "lower": self.lower.lineage_hash, - "upper": self.upper.lineage_hash, - } - ) + def lineage(self): + return { + "llh_name": self.llh_name, + "method": self.method, + "median": self.median.lineage, + "lower": self.lower.lineage, + "upper": self.upper.lineage, + } @export @@ -559,10 +562,8 @@ def _sanity_check(self): assert np.all(np.isclose(volumes, volumes[0])), mesg @property - def lineage_hash(self): - return deterministic_hash( - { - "llh_name": self.llh_name, - "value": self.value, - } - ) + def lineage(self): + return { + "llh_name": self.llh_name, + "value": self.value, + } diff --git a/appletree/context.py b/appletree/context.py index d56e85cc..2e346d1d 100644 --- a/appletree/context.py +++ b/appletree/context.py @@ -14,7 +14,7 @@ import appletree as apt from appletree import randgen from appletree import Parameter -from appletree.utils import load_json, get_file_path +from appletree.utils import JSON_OPTIONS, load_json, get_file_path from appletree.share import _cached_configs, set_global_config os.environ["OMP_NUM_THREADS"] = "1" @@ -303,19 +303,23 @@ def _dump_meta(self, batch_size, metadata=None): if self.backend_h5 is not None: name = self.sampler.backend.name with h5py.File(self.backend_h5, "r+") as opt: - opt[name].attrs["metadata"] = json.dumps(metadata) + opt[name].attrs["metadata"] = json.dumps(metadata, **JSON_OPTIONS) # parameters prior configuration - 
opt[name].attrs["par_config"] = json.dumps(self.par_manager.par_config) + opt[name].attrs["par_config"] = json.dumps( + self.par_manager.par_config, **JSON_OPTIONS + ) # max posterior parameters - opt[name].attrs["post_parameters"] = json.dumps(self.get_post_parameters()) + opt[name].attrs["post_parameters"] = json.dumps( + self.get_post_parameters(), **JSON_OPTIONS + ) # the order of parameters saved in backend opt[name].attrs["parameter_fit"] = self.par_manager.parameter_fit # instructions - opt[name].attrs["instruct"] = json.dumps(self.instruct) + opt[name].attrs["instruct"] = json.dumps(self.instruct, **JSON_OPTIONS) # configs - opt[name].attrs["config"] = json.dumps(self.config) + opt[name].attrs["config"] = json.dumps(self.config, **JSON_OPTIONS) # configurations, maybe users will manually add some maps - opt[name].attrs["_cached_configs"] = json.dumps(_cached_configs) + opt[name].attrs["_cached_configs"] = json.dumps(_cached_configs, **JSON_OPTIONS) # batch size opt[name].attrs["batch_size"] = batch_size @@ -392,16 +396,20 @@ def update_parameter_config(self, likelihoods): return needed_parameters @property - def lineage_hash(self): - return deterministic_hash( - { - **self.instruct, - **self.par_config, - **dict( + def lineage(self): + return { + **self.instruct, + **{"par_config": self.par_config}, + **{ + "likelihoods": dict( zip( self.likelihoods.keys(), - [v.lineage_hash for v in self.likelihoods.values()], + [v.lineage for v in self.likelihoods.values()], ) - ), - } - ) + ) + }, + } + + @property + def lineage_hash(self): + return deterministic_hash(self.lineage) diff --git a/appletree/likelihood.py b/appletree/likelihood.py index f1b0ff2c..df4f47a1 100644 --- a/appletree/likelihood.py +++ b/appletree/likelihood.py @@ -1,3 +1,4 @@ +import os from warnings import warn from typing import Type, Dict, Set, Optional, cast import inspect @@ -7,6 +8,7 @@ from scipy.stats import norm from strax import deterministic_hash +from appletree import utils from 
appletree import randgen from appletree.hist import make_hist_mesh_grid, make_hist_irreg_bin_1d, make_hist_irreg_bin_2d from appletree.utils import ( @@ -151,7 +153,7 @@ def set_binning(self, config): clip=config["clip"], which_np=np, ) - self._bins = [self._bins] + self._bins = (self._bins,) self.data_hist = make_hist_irreg_bin_1d( self.data[:, 0], bins=self._bins[0], @@ -228,6 +230,7 @@ def set_binning(self, config): ) else: raise ValueError("'bins_type' should either be meshgrid, equiprob or irreg") + assert isinstance(self._bins, tuple), "bins should be tuple after setting binning!" def register_component( self, component_cls: Type[Component], component_name: str, file_name: Optional[str] = None @@ -410,21 +413,30 @@ def print_likelihood_summary(self, indent: str = " " * 4, short: bool = True): print("-" * 40) @property - def lineage_hash(self): - return deterministic_hash( - { - **{ - "config": self._config, - "sha256": calculate_sha256(get_file_path(self._data_file_name)), - }, - **dict( + def lineage(self): + return { + **{ + "config": self._config, + "file_path": ( + os.path.basename(self._data_file_name) + if not utils.FULL_PATH_LINEAGE + else get_file_path(self._data_file_name) + ), + "sha256": calculate_sha256(get_file_path(self._data_file_name)), + }, + **{ + "components": dict( zip( self.components.keys(), - [v.lineage_hash for v in self.components.values()], + [v.lineage for v in self.components.values()], ) - ), - } - ) + ) + }, + } + + @property + def lineage_hash(self): + return deterministic_hash(self.lineage) class LikelihoodLit(Likelihood): @@ -577,17 +589,17 @@ def print_likelihood_summary(self, indent: str = " " * 4, short: bool = True): print("-" * 40) @property - def lineage_hash(self): - return deterministic_hash( - { - **{ - "config": self._config, - }, - **dict( + def lineage(self): + return { + **{ + "config": self._config, + }, + **{ + "components": dict( zip( self.components.keys(), [v.lineage_hash for v in self.components.values()], ) 
- ), - } - ) + ) + }, + } diff --git a/appletree/plugin.py b/appletree/plugin.py index 19e494e6..3d55941c 100644 --- a/appletree/plugin.py +++ b/appletree/plugin.py @@ -92,22 +92,26 @@ def sanity_check(self): raise ValueError(mesg) @property - def lineage_hash(self): - return deterministic_hash( - { - **{ - "depends_on": self.depends_on, - "provides": self.provides, - "parameters": self.parameters, - }, - **dict( + def lineage(self): + return { + **{ + "depends_on": self.depends_on, + "provides": self.provides, + "parameters": self.parameters, + }, + **{ + "takes_config": dict( zip( self.takes_config.keys(), - [v.lineage_hash for v in self.takes_config.values()], + [v.lineage for v in self.takes_config.values()], ) - ), - } - ) + ) + }, + } + + @property + def lineage_hash(self): + return deterministic_hash(self.lineage) @export diff --git a/appletree/share.py b/appletree/share.py index 5d4e2901..e2dd33c9 100644 --- a/appletree/share.py +++ b/appletree/share.py @@ -15,7 +15,7 @@ def __setitem__(self, key, value): return super().__setitem__(key, value) def __repr__(self): - return json.dumps(self, indent=4) + return json.dumps(self, sort_keys=True, indent=4) def __str__(self): return self.__repr__() diff --git a/appletree/utils.py b/appletree/utils.py index 4e6bd050..1ddb98b4 100644 --- a/appletree/utils.py +++ b/appletree/utils.py @@ -28,6 +28,10 @@ SKIP_MONGO_DB = True +JSON_OPTIONS = dict(sort_keys=True, indent=4) + +FULL_PATH_LINEAGE = False + def exporter(export_self=False): """Export utility modified from https://stackoverflow.com/a/41895194 @@ -206,6 +210,7 @@ def get_file_path(fname): @export def calculate_sha256(file_path): + """Get sha256 hash of the file.""" sha256_hash = hashlib.sha256() with open(file_path, "rb") as f: for byte_block in iter(lambda: f.read(4096), b""): @@ -213,6 +218,13 @@ def calculate_sha256(file_path): return sha256_hash.hexdigest() +@export +def dump_lineage(file_path, entity): + """Dump lineage of whatever level into .json file.""" 
+ with open(file_path, "w") as f: + f.write(json.dumps(entity.lineage, **JSON_OPTIONS)) + + @export def timeit(indent=""): """Use timeit as a decorator.