From af68d9022e7d96312d84847834930abc786a98c5 Mon Sep 17 00:00:00 2001
From: Matteo Bunino <48362942+matbun@users.noreply.github.com>
Date: Tue, 3 Dec 2024 17:15:03 +0100
Subject: [PATCH] Add Path in loggers and clearer error handling (#261)

* Add Path in loggers and clearer error handling

* Cleanup

* Fix typo

* ADD warning for training config

* remove cmake as horovod is not there anymore

* backup

* cleanup

* Fix path problems

* Use elif when possible

* Fix type hints

* Remove identifier casting

* Refactor action
---
 .github/workflows/lint.yml  |   6 +-
 .readthedocs.yaml           |   2 +-
 src/itwinai/loggers.py      | 108 +++++++++++++++++-------------------
 src/itwinai/parser.py       |  40 ++++++++++---
 src/itwinai/torch/config.py |  10 ++++
 5 files changed, 96 insertions(+), 70 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index e8d0816f..fcc0a716 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -58,6 +58,6 @@ jobs:
           VALIDATE_ALL_CODEBASE: false
           # Fail on errors
           DISABLE_ERRORS: false
-          # Skip linting of docs
-          FILTER_REGEX_EXCLUDE: .*docs/index.md|.*docs/docs/.*|.*ISSUE_TEMPLATE/.*|use-cases/.*|experimental/.*
-          BASH_SEVERITY: error
+          # Skip linting of some locations
+          FILTER_REGEX_EXCLUDE: .*ISSUE_TEMPLATE/.*|use-cases/.*
+          BASH_SEVERITY: warning
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index e1dd2f69..52bfc6b0 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -22,7 +22,7 @@ build:
   apt_packages:
     - gcc-11
     - g++-11
-    - cmake
+    # - cmake
     - pandoc
 
 # Build documentation in the "docs/" directory with Sphinx
diff --git a/src/itwinai/loggers.py b/src/itwinai/loggers.py
index f7a322e3..fcca584c 100644
--- a/src/itwinai/loggers.py
+++ b/src/itwinai/loggers.py
@@ -113,7 +113,7 @@ class Logger(LogMixin):
     """Base class for logger
 
     Args:
-        savedir (str, optional): filesystem location where logs are stored.
+        savedir (Union[Path, str], optional): filesystem location where logs are stored.
            Defaults to 'mllogs'.
         log_freq (Union[int, Literal['epoch', 'batch']], optional):
             how often should the logger fulfill calls to the `log()`
@@ -136,7 +136,7 @@ class Logger(LogMixin):
     """
 
     #: Location on filesystem where to store data.
-    savedir: str = None
+    savedir: Path
     #: Supported logging 'kind's.
     supported_kinds: Tuple[str]
     #: Current worker global rank
@@ -146,13 +146,13 @@ def __init__(
         self,
-        savedir: str = "mllogs",
+        savedir: Union[Path, str] = "mllogs",
         log_freq: Union[int, Literal["epoch", "batch"]] = "epoch",
         log_on_workers: Union[int, List[int]] = 0,
         experiment_id: Optional[str] = None,
         run_id: Optional[Union[int, str]] = None,
     ) -> None:
-        self.savedir = savedir
+        self.savedir = Path(savedir)
         self.log_freq = log_freq
         self.log_on_workers = log_on_workers
         self._experiment_id = experiment_id
@@ -209,8 +209,7 @@ def start_logging(self, rank: Optional[int] = None):
 
     @abstractmethod
     def create_logger_context(self, rank: Optional[int] = None) -> Any:
-        """
-        Initializes the logger context.
+        """Initializes the logger context.
 
         Args:
             rank (Optional[int]): global rank of current process,
@@ -240,7 +239,7 @@ def serialize(self, obj: Any, identifier: str) -> str:
         Returns:
             str: local path of the serialized object to be logged.
         """
-        itm_path = os.path.join(self.savedir, identifier)
+        itm_path = self.savedir / identifier
         with open(itm_path, "wb") as itm_file:
             pickle.dump(obj, itm_file)
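With the changes above, `Logger.__init__` normalizes `savedir` to a `pathlib.Path`, so subclasses can use path arithmetic such as `self.savedir / identifier` regardless of what the caller passed. A minimal usage sketch, assuming an environment where itwinai is installed (the concrete `ConsoleLogger` subclass appears further down in this diff):

    from pathlib import Path
    from itwinai.loggers import ConsoleLogger

    # Both calls are expected to be equivalent now that __init__
    # wraps savedir in Path(...)
    logger_str = ConsoleLogger(savedir="mllogs")
    logger_path = ConsoleLogger(savedir=Path("mllogs"))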
""" - itm_path = os.path.join(self.savedir, identifier) + itm_path = self.savedir / identifier with open(itm_path, "wb") as itm_file: pickle.dump(obj, itm_file) @@ -301,7 +300,7 @@ class _EmptyLogger(Logger): def __init__( self, - savedir: str = "mllogs", + savedir: Union[Path, str] = "mllogs", log_freq: int | Literal["epoch"] | Literal["batch"] = "epoch", log_on_workers: int | List[int] = 0, ) -> None: @@ -332,7 +331,7 @@ class ConsoleLogger(Logger): """Simplified logger. Args: - savedir (str, optional): where to store artifacts. + savedir (Union[Path, str], optional): where to store artifacts. Defaults to 'mllogs'. log_freq (Union[int, Literal['epoch', 'batch']], optional): determines whether the logger should fulfill or ignore @@ -349,16 +348,15 @@ class ConsoleLogger(Logger): def __init__( self, - savedir: str = "mllogs", + savedir: Union[Path, str] = "mllogs", log_freq: Union[int, Literal["epoch", "batch"]] = "epoch", log_on_workers: Union[int, List[int]] = 0, ) -> None: - savedir = savedir = Path(savedir) / "simple-logger" - super().__init__(savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers) + cl_savedir = Path(savedir) / "simple-logger" + super().__init__(savedir=cl_savedir, log_freq=log_freq, log_on_workers=log_on_workers) def create_logger_context(self, rank: Optional[int] = None): - """ - Initializes the logger context. + """Initializes the logger context. Args: rank (Optional[int]): global rank of current process, @@ -462,7 +460,7 @@ class MLFlowLogger(Logger): """Abstraction around MLFlow logger. Args: - savedir (str, optional): path on local filesystem where logs are + savedir (Union[Path, str], optional): path on local filesystem where logs are stored. Defaults to 'mllogs'. experiment_name (str, optional): experiment name. Defaults to ``itwinai.loggers.BASE_EXP_NAME``. @@ -501,7 +499,7 @@ class MLFlowLogger(Logger): def __init__( self, - savedir: str = "mllogs", + savedir: Union[Path, str] = "mllogs", experiment_name: str = BASE_EXP_NAME, tracking_uri: Optional[str] = None, run_description: Optional[str] = None, @@ -509,8 +507,8 @@ def __init__( log_freq: Union[int, Literal["epoch", "batch"]] = "epoch", log_on_workers: Union[int, List[int]] = 0, ): - savedir = os.path.join(savedir, "mlflow") - super().__init__(savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers) + mfl_savedir = Path(savedir) / "mlflow" + super().__init__(savedir=mfl_savedir, log_freq=log_freq, log_on_workers=log_on_workers) self.tracking_uri = tracking_uri self.run_description = run_description self.run_name = run_name @@ -523,8 +521,7 @@ def __init__( ) def create_logger_context(self, rank: Optional[int] = None) -> mlflow.ActiveRun: - """ - Initializes the logger context. Start MLFLow run. + """Initializes the logger context. Start MLFLow run. 
@@ -655,7 +652,7 @@ class WandBLogger(Logger):
     """Abstraction around WandB logger.
 
     Args:
-        savedir (str, optional): location on local filesystem where logs
+        savedir (Union[Path, str], optional): location on local filesystem where logs
            are stored. Defaults to 'mllogs'.
         project_name (str, optional): experiment name. Defaults to
             ``itwinai.loggers.BASE_EXP_NAME``.
@@ -685,18 +682,17 @@ def __init__(
         self,
-        savedir: str = "mllogs",
+        savedir: Union[Path, str] = "mllogs",
         project_name: str = BASE_EXP_NAME,
         log_freq: Union[int, Literal["epoch", "batch"]] = "epoch",
         log_on_workers: Union[int, List[int]] = 0,
     ) -> None:
-        savedir = os.path.join(savedir, "wandb")
-        super().__init__(savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers)
+        wbl_savedir = Path(savedir) / "wandb"
+        super().__init__(savedir=wbl_savedir, log_freq=log_freq, log_on_workers=log_on_workers)
         self.project_name = project_name
 
     def create_logger_context(self, rank: Optional[int] = None) -> None:
-        """
-        Initializes the logger context. Init WandB run.
+        """Initializes the logger context. Init WandB run.
 
         Args:
             rank (Optional[int]): global rank of current process,
@@ -707,10 +703,11 @@ def create_logger_context(self, rank: Optional[int] = None) -> None:
         if not self.should_log():
             return
 
-        os.makedirs(os.path.join(self.savedir, "wandb"), exist_ok=True)
-        self.active_run = wandb.init(
-            dir=os.path.abspath(self.savedir), project=self.project_name
+        (self.savedir / "wandb").mkdir(
+            exist_ok=True,
+            parents=True,
         )
+        self.active_run = wandb.init(dir=self.savedir.resolve(), project=self.project_name)
 
     def destroy_logger_context(self):
         """Destroy logger."""
@@ -767,7 +764,7 @@ class TensorBoardLogger(Logger):
     TensorFlow.
 
     Args:
-        savedir (str, optional): location on local filesystem where logs
+        savedir (Union[Path, str], optional): location on local filesystem where logs
            are stored. Defaults to 'mllogs'.
         log_freq (Union[int, Literal['epoch', 'batch']], optional):
             determines whether the logger should fulfill or ignore
@@ -793,29 +790,28 @@ def __init__(
         self,
-        savedir: str = "mllogs",
+        savedir: Union[Path, str] = "mllogs",
         log_freq: Union[int, Literal["epoch", "batch"]] = "epoch",
         framework: Literal["tensorflow", "pytorch"] = "pytorch",
         log_on_workers: Union[int, List[int]] = 0,
     ) -> None:
-        savedir = os.path.join(savedir, "tensorboard")
-        super().__init__(savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers)
+        tbl_savedir = Path(savedir) / "tensorboard"
+        super().__init__(savedir=tbl_savedir, log_freq=log_freq, log_on_workers=log_on_workers)
         self.framework = framework
         if framework.lower() == "tensorflow":
             import tensorflow as tf
 
             self.tf = tf
-            self.writer = tf.summary.create_file_writer(savedir)
+            self.writer = tf.summary.create_file_writer(tbl_savedir.resolve().as_posix())
         elif framework.lower() == "pytorch":
             from torch.utils.tensorboard import SummaryWriter
 
-            self.writer = SummaryWriter(savedir)
+            self.writer = SummaryWriter(tbl_savedir.resolve().as_posix())
         else:
             raise ValueError("Framework must be either 'tensorflow' or 'pytorch'")
 
     def create_logger_context(self, rank: Optional[int] = None) -> None:
-        """
-        Initializes the logger context. Init Tensorboard run.
+        """Initializes the logger context. Init Tensorboard run.
 
         Args:
             rank (Optional[int]): global rank of current process,
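`TensorBoardLogger` now resolves its save directory to an absolute POSIX string before handing it to either backend's writer. A hedged sketch of the two supported configurations (each assumes the corresponding framework is installed):

    # Default: PyTorch backend, backed by torch.utils.tensorboard.SummaryWriter
    tb_torch = TensorBoardLogger(savedir="mllogs", framework="pytorch")

    # TensorFlow backend, backed by tf.summary.create_file_writer
    tb_tf = TensorBoardLogger(savedir="mllogs", framework="tensorflow")

    # Any other framework value raises ValueError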
@@ -914,7 +910,7 @@ class LoggersCollection(Logger):
     supported_kinds: Tuple[str]
 
     def __init__(self, loggers: List[Logger]) -> None:
-        super().__init__(savedir="/tmp/mllogs_LoggersCollection", log_freq=1)
+        super().__init__(savedir=Path("/tmp/mllogs_LoggersCollection"), log_freq=1)
         self.loggers = loggers
 
     def should_log(self, batch_idx: int = None) -> bool:
@@ -964,8 +960,7 @@ def log(
         )
 
     def create_logger_context(self, rank: Optional[int] = None) -> Any:
-        """
-        Initializes all loggers.
+        """Initializes all loggers.
 
         Args:
             rank (Optional[int]): global rank of current process,
@@ -998,7 +993,7 @@ class Prov4MLLogger(Logger):
            files will be uploaded. Defaults to "www.example.org".
         experiment_name (str, optional): experiment name.
             Defaults to "experiment_name".
-        provenance_save_dir (str, optional): path where to store provenance
+        provenance_save_dir (Union[Path, str], optional): path where to store provenance
            files and logs. Defaults to "prov".
         save_after_n_logs (Optional[int], optional): how often to save
             logs to disk from main memory. Defaults to 100.
@@ -1031,9 +1026,9 @@ class Prov4MLLogger(Logger):
 
     def __init__(
         self,
-        prov_user_namespace="www.example.org",
-        experiment_name="experiment_name",
-        provenance_save_dir="mllogs/prov_logs",
+        prov_user_namespace: str = "www.example.org",
+        experiment_name: str = "experiment_name",
+        provenance_save_dir: Union[Path, str] = "mllogs/prov_logs",
         save_after_n_logs: Optional[int] = 100,
         create_graph: Optional[bool] = True,
         create_svg: Optional[bool] = True,
@@ -1054,8 +1049,7 @@ def __init__(
 
     @override
     def create_logger_context(self, rank: Optional[int] = None):
-        """
-        Initializes the logger context.
+        """Initializes the logger context.
 
         Args:
             rank (Optional[int]): global rank of current process,
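All loggers in this file share the `Logger` interface, which is what lets `LoggersCollection` (touched above) fan a single call out to several backends at once. A hedged usage sketch, with constructor arguments as in the signatures shown earlier in this diff:

    from itwinai.loggers import ConsoleLogger, LoggersCollection, MLFlowLogger

    # Each sub-logger keeps its own subdirectory (simple-logger/, mlflow/, ...)
    loggers = LoggersCollection([
        ConsoleLogger(savedir="mllogs"),
        MLFlowLogger(savedir="mllogs", experiment_name="my-experiment"),
    ])
    loggers.create_logger_context()
    loggers.log(item=0.5, identifier="train_loss", kind="metric", step=0)
    loggers.destroy_logger_context()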
diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py
index 8d0a9efc..735e9056 100644
--- a/src/itwinai/parser.py
+++ b/src/itwinai/parser.py
@@ -27,6 +27,12 @@
 from .utils import load_yaml
 
 
+class _ArgumentParser(JAPArgumentParser):
+    def error(self, message: str, ex: Optional[Exception] = None) -> None:
+        """Patch error method to re-raise exception instead of exiting execution."""
+        raise ex
+
+
 def add_replace_field(config: Dict, key_chain: str, value: Any) -> None:
     """Replace or add (if not present) a field in a dictionary, following a
     path of dot-separated keys. Adding is not supported for list items.
@@ -61,9 +67,16 @@ def add_replace_field(config: Dict, key_chain: str, value: Any) -> None:
             sub_config[k] = value
 
 
+def get_root_cause(exception: Exception) -> Exception:
+    """Recursively extract the first exception in the exception chain."""
+    root = exception
+    while root.__cause__ is not None:  # Traverse the exception chain
+        root = root.__cause__
+    return root
+
+
 class ConfigParser:
-    """
-    Parses a pipeline from a configuration file.
+    """Parses a pipeline from a configuration file.
     It also provides functionalities for dynamic override
     of fields by means of nested key notation.
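`get_root_cause` follows the `__cause__` links that `raise ... from ...` records, so the parser methods below can surface the original configuration error instead of jsonargparse's generic wrapper. A self-contained sketch of that behaviour (mirroring the function added above):

    def get_root_cause(exception: Exception) -> Exception:
        root = exception
        while root.__cause__ is not None:
            root = root.__cause__
        return root

    try:
        try:
            raise ValueError("invalid field in config")
        except ValueError as err:
            raise RuntimeError("could not instantiate pipeline") from err
    except RuntimeError as wrapper:
        # The chain is unwound back to the original ValueError
        assert isinstance(get_root_cause(wrapper), ValueError)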
""" - pipe_parser = JAPArgumentParser() + pipe_parser = _ArgumentParser() pipe_parser.add_subclass_arguments(Pipeline, "pipeline") pipe_dict = self.config @@ -163,9 +176,13 @@ def parse_pipeline( print("Assembled pipeline:") print(json.dumps(pipe_dict, indent=4)) - # Parse pipeline dict once merged with steps - conf = pipe_parser.parse_object(pipe_dict) - pipe = pipe_parser.instantiate_classes(conf) + try: + # Parse pipeline dict once merged with steps + conf = pipe_parser.parse_object(pipe_dict) + pipe = pipe_parser.instantiate_classes(conf) + except Exception as exc: + exc = get_root_cause(exc) + raise exc self.pipeline = pipe["pipeline"] return self.pipeline @@ -187,10 +204,15 @@ def parse_step( # Wrap config under "step" field and parse it step_dict_config = {"step": step_dict_config} - step_parser = JAPArgumentParser() + step_parser = _ArgumentParser() step_parser.add_subclass_arguments(BaseComponent, "step") - parsed_namespace = step_parser.parse_object(step_dict_config) - return step_parser.instantiate_classes(parsed_namespace)["step"] + try: + parsed_namespace = step_parser.parse_object(step_dict_config) + step = step_parser.instantiate_classes(parsed_namespace)["step"] + except Exception as exc: + exc = get_root_cause(exc) + raise exc + return step class ArgumentParser(JAPArgumentParser): diff --git a/src/itwinai/torch/config.py b/src/itwinai/torch/config.py index 687e7264..d9e0e5d0 100644 --- a/src/itwinai/torch/config.py +++ b/src/itwinai/torch/config.py @@ -35,6 +35,16 @@ class TrainingConfiguration(Configuration): >>> from rich import print >>> print(cfg) # pretty-print of configuration + .. warning:: + + Don't reinvent parameters that already exist in the training coniguration, if possible. + Instead, use the name of the parameters in the training configuration when possible to + avoid inconsistencies. For instance, the training configuration defines the learning + rate as ``optim_lr``, so if you redefine it as ``lr`` by doing + ``TrainingConfiguration(lr=0.005)`` in the configuration you will now have both + ``optim_lr`` (created by default) and ``lr`` (created by you). This may create + confusion and potentially (and silently) break the logic in your code. + """ #: Batch size. In a distributed environment it is usually the