From af68d9022e7d96312d84847834930abc786a98c5 Mon Sep 17 00:00:00 2001
From: Matteo Bunino <48362942+matbun@users.noreply.github.com>
Date: Tue, 3 Dec 2024 17:15:03 +0100
Subject: [PATCH] Add Path in loggers and clearer error handling (#261)

* Add Path in loggers and clearer error handling

* Cleanup

* Fix typo

* ADD warning for training config

* remove cmake as horovod is not there anymore

* backup

* cleanup

* Fix path problems

* Use elif when possible

* Fix type hints

* Remove identifier casting

* Refactor action
---
 .github/workflows/lint.yml  |   6 +-
 .readthedocs.yaml           |   2 +-
 src/itwinai/loggers.py      | 108 +++++++++++++++++-------------------
 src/itwinai/parser.py       |  40 ++++++++++---
 src/itwinai/torch/config.py |  10 ++++
 5 files changed, 96 insertions(+), 70 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index e8d0816f..fcc0a716 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -58,6 +58,6 @@ jobs:
           VALIDATE_ALL_CODEBASE: false
           # Fail on errors
           DISABLE_ERRORS: false
-          # Skip linting of docs
-          FILTER_REGEX_EXCLUDE: .*docs/index.md|.*docs/docs/.*|.*ISSUE_TEMPLATE/.*|use-cases/.*|experimental/.*
-          BASH_SEVERITY: error
+          # Skip linting of some locations
+          FILTER_REGEX_EXCLUDE: .*ISSUE_TEMPLATE/.*|use-cases/.*
+          BASH_SEVERITY: warning
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index e1dd2f69..52bfc6b0 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -22,7 +22,7 @@ build:
   apt_packages:
     - gcc-11
     - g++-11
-    - cmake
+    # - cmake
     - pandoc
 
 # Build documentation in the "docs/" directory with Sphinx
diff --git a/src/itwinai/loggers.py b/src/itwinai/loggers.py
index f7a322e3..fcca584c 100644
--- a/src/itwinai/loggers.py
+++ b/src/itwinai/loggers.py
@@ -113,7 +113,7 @@ class Logger(LogMixin):
     """Base class for logger
 
     Args:
-        savedir (str, optional): filesystem location where logs are stored.
+        savedir (Union[Path, str], optional): filesystem location where logs are stored.
            Defaults to 'mllogs'.
         log_freq (Union[int, Literal['epoch', 'batch']], optional):
             how often should the logger fulfill calls to the `log()`
@@ -136,7 +136,7 @@ class Logger(LogMixin):
     """
 
     #: Location on filesystem where to store data.
-    savedir: str = None
+    savedir: Path
     #: Supported logging 'kind's.
     supported_kinds: Tuple[str]
     #: Current worker global rank
@@ -146,13 +146,13 @@ def __init__(
         self,
-        savedir: str = "mllogs",
+        savedir: Union[Path, str] = "mllogs",
         log_freq: Union[int, Literal["epoch", "batch"]] = "epoch",
         log_on_workers: Union[int, List[int]] = 0,
         experiment_id: Optional[str] = None,
         run_id: Optional[Union[int, str]] = None,
     ) -> None:
-        self.savedir = savedir
+        self.savedir = Path(savedir)
         self.log_freq = log_freq
         self.log_on_workers = log_on_workers
         self._experiment_id = experiment_id
@@ -209,8 +209,7 @@ def start_logging(self, rank: Optional[int] = None):
 
     @abstractmethod
     def create_logger_context(self, rank: Optional[int] = None) -> Any:
-        """
-        Initializes the logger context.
+        """Initializes the logger context.
 
         Args:
             rank (Optional[int]): global rank of current process,
@@ -240,7 +239,7 @@ def serialize(self, obj: Any, identifier: str) -> str:
         Returns:
             str: local path of the serialized object to be logged.
         """
-        itm_path = os.path.join(self.savedir, identifier)
+        itm_path = self.savedir / identifier
         with open(itm_path, "wb") as itm_file:
             pickle.dump(obj, itm_file)
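With the changes above, `Logger.__init__` normalizes `savedir` to a `pathlib.Path`, so subclasses can use path arithmetic such as `self.savedir / identifier` regardless of what the caller passed. A minimal usage sketch, assuming an environment where itwinai is installed (the concrete `ConsoleLogger` subclass appears further down in this diff):

    from pathlib import Path
    from itwinai.loggers import ConsoleLogger

    # Both calls are expected to be equivalent now that __init__
    # wraps savedir in Path(...)
    logger_str = ConsoleLogger(savedir="mllogs")
    logger_path = ConsoleLogger(savedir=Path("mllogs"))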
""" - itm_path = os.path.join(self.savedir, identifier) + itm_path = self.savedir / identifier with open(itm_path, "wb") as itm_file: pickle.dump(obj, itm_file) @@ -301,7 +300,7 @@ class _EmptyLogger(Logger): def __init__( self, - savedir: str = "mllogs", + savedir: Union[Path, str] = "mllogs", log_freq: int | Literal["epoch"] | Literal["batch"] = "epoch", log_on_workers: int | List[int] = 0, ) -> None: @@ -332,7 +331,7 @@ class ConsoleLogger(Logger): """Simplified logger. Args: - savedir (str, optional): where to store artifacts. + savedir (Union[Path, str], optional): where to store artifacts. Defaults to 'mllogs'. log_freq (Union[int, Literal['epoch', 'batch']], optional): determines whether the logger should fulfill or ignore @@ -349,16 +348,15 @@ class ConsoleLogger(Logger): def __init__( self, - savedir: str = "mllogs", + savedir: Union[Path, str] = "mllogs", log_freq: Union[int, Literal["epoch", "batch"]] = "epoch", log_on_workers: Union[int, List[int]] = 0, ) -> None: - savedir = savedir = Path(savedir) / "simple-logger" - super().__init__(savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers) + cl_savedir = Path(savedir) / "simple-logger" + super().__init__(savedir=cl_savedir, log_freq=log_freq, log_on_workers=log_on_workers) def create_logger_context(self, rank: Optional[int] = None): - """ - Initializes the logger context. + """Initializes the logger context. Args: rank (Optional[int]): global rank of current process, @@ -462,7 +460,7 @@ class MLFlowLogger(Logger): """Abstraction around MLFlow logger. Args: - savedir (str, optional): path on local filesystem where logs are + savedir (Union[Path, str], optional): path on local filesystem where logs are stored. Defaults to 'mllogs'. experiment_name (str, optional): experiment name. Defaults to ``itwinai.loggers.BASE_EXP_NAME``. @@ -501,7 +499,7 @@ class MLFlowLogger(Logger): def __init__( self, - savedir: str = "mllogs", + savedir: Union[Path, str] = "mllogs", experiment_name: str = BASE_EXP_NAME, tracking_uri: Optional[str] = None, run_description: Optional[str] = None, @@ -509,8 +507,8 @@ def __init__( log_freq: Union[int, Literal["epoch", "batch"]] = "epoch", log_on_workers: Union[int, List[int]] = 0, ): - savedir = os.path.join(savedir, "mlflow") - super().__init__(savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers) + mfl_savedir = Path(savedir) / "mlflow" + super().__init__(savedir=mfl_savedir, log_freq=log_freq, log_on_workers=log_on_workers) self.tracking_uri = tracking_uri self.run_description = run_description self.run_name = run_name @@ -523,8 +521,7 @@ def __init__( ) def create_logger_context(self, rank: Optional[int] = None) -> mlflow.ActiveRun: - """ - Initializes the logger context. Start MLFLow run. + """Initializes the logger context. Start MLFLow run. 
@@ -655,7 +652,7 @@ class WandBLogger(Logger):
     """Abstraction around WandB logger.
 
     Args:
-        savedir (str, optional): location on local filesystem where logs
+        savedir (Union[Path, str], optional): location on local filesystem where logs
            are stored. Defaults to 'mllogs'.
         project_name (str, optional): experiment name. Defaults to
             ``itwinai.loggers.BASE_EXP_NAME``.
@@ -685,18 +682,17 @@ def __init__(
         self,
-        savedir: str = "mllogs",
+        savedir: Union[Path, str] = "mllogs",
         project_name: str = BASE_EXP_NAME,
         log_freq: Union[int, Literal["epoch", "batch"]] = "epoch",
         log_on_workers: Union[int, List[int]] = 0,
     ) -> None:
-        savedir = os.path.join(savedir, "wandb")
-        super().__init__(savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers)
+        wbl_savedir = Path(savedir) / "wandb"
+        super().__init__(savedir=wbl_savedir, log_freq=log_freq, log_on_workers=log_on_workers)
         self.project_name = project_name
 
     def create_logger_context(self, rank: Optional[int] = None) -> None:
-        """
-        Initializes the logger context. Init WandB run.
+        """Initializes the logger context. Init WandB run.
 
         Args:
             rank (Optional[int]): global rank of current process,
@@ -707,10 +703,11 @@ def create_logger_context(self, rank: Optional[int] = None) -> None:
         if not self.should_log():
             return
 
-        os.makedirs(os.path.join(self.savedir, "wandb"), exist_ok=True)
-        self.active_run = wandb.init(
-            dir=os.path.abspath(self.savedir), project=self.project_name
+        (self.savedir / "wandb").mkdir(
+            exist_ok=True,
+            parents=True,
         )
+        self.active_run = wandb.init(dir=self.savedir.resolve(), project=self.project_name)
 
     def destroy_logger_context(self):
         """Destroy logger."""
@@ -767,7 +764,7 @@ class TensorBoardLogger(Logger):
     TensorFlow.
 
     Args:
-        savedir (str, optional): location on local filesystem where logs
+        savedir (Union[Path, str], optional): location on local filesystem where logs
            are stored. Defaults to 'mllogs'.
         log_freq (Union[int, Literal['epoch', 'batch']], optional):
             determines whether the logger should fulfill or ignore
@@ -793,29 +790,28 @@ def __init__(
         self,
-        savedir: str = "mllogs",
+        savedir: Union[Path, str] = "mllogs",
         log_freq: Union[int, Literal["epoch", "batch"]] = "epoch",
         framework: Literal["tensorflow", "pytorch"] = "pytorch",
         log_on_workers: Union[int, List[int]] = 0,
     ) -> None:
-        savedir = os.path.join(savedir, "tensorboard")
-        super().__init__(savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers)
+        tbl_savedir = Path(savedir) / "tensorboard"
+        super().__init__(savedir=tbl_savedir, log_freq=log_freq, log_on_workers=log_on_workers)
         self.framework = framework
         if framework.lower() == "tensorflow":
             import tensorflow as tf
 
             self.tf = tf
-            self.writer = tf.summary.create_file_writer(savedir)
+            self.writer = tf.summary.create_file_writer(tbl_savedir.resolve().as_posix())
         elif framework.lower() == "pytorch":
             from torch.utils.tensorboard import SummaryWriter
 
-            self.writer = SummaryWriter(savedir)
+            self.writer = SummaryWriter(tbl_savedir.resolve().as_posix())
         else:
             raise ValueError("Framework must be either 'tensorflow' or 'pytorch'")
 
     def create_logger_context(self, rank: Optional[int] = None) -> None:
-        """
-        Initializes the logger context. Init Tensorboard run.
+        """Initializes the logger context. Init Tensorboard run.
 
         Args:
             rank (Optional[int]): global rank of current process,
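`TensorBoardLogger` now resolves its save directory to an absolute POSIX string before handing it to either backend's writer. A hedged sketch of the two supported configurations (each assumes the corresponding framework is installed):

    # Default: PyTorch backend, backed by torch.utils.tensorboard.SummaryWriter
    tb_torch = TensorBoardLogger(savedir="mllogs", framework="pytorch")

    # TensorFlow backend, backed by tf.summary.create_file_writer
    tb_tf = TensorBoardLogger(savedir="mllogs", framework="tensorflow")

    # Any other framework value raises ValueError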
@@ -914,7 +910,7 @@ class LoggersCollection(Logger):
     supported_kinds: Tuple[str]
 
     def __init__(self, loggers: List[Logger]) -> None:
-        super().__init__(savedir="/tmp/mllogs_LoggersCollection", log_freq=1)
+        super().__init__(savedir=Path("/tmp/mllogs_LoggersCollection"), log_freq=1)
         self.loggers = loggers
 
     def should_log(self, batch_idx: int = None) -> bool:
@@ -964,8 +960,7 @@ def log(
         )
 
     def create_logger_context(self, rank: Optional[int] = None) -> Any:
-        """
-        Initializes all loggers.
+        """Initializes all loggers.
 
         Args:
             rank (Optional[int]): global rank of current process,
@@ -998,7 +993,7 @@ class Prov4MLLogger(Logger):
            files will be uploaded. Defaults to "www.example.org".
         experiment_name (str, optional): experiment name.
             Defaults to "experiment_name".
-        provenance_save_dir (str, optional): path where to store provenance
+        provenance_save_dir (Union[Path, str], optional): path where to store provenance
            files and logs. Defaults to "prov".
         save_after_n_logs (Optional[int], optional): how often to save
             logs to disk from main memory. Defaults to 100.
@@ -1031,9 +1026,9 @@ class Prov4MLLogger(Logger):
 
     def __init__(
         self,
-        prov_user_namespace="www.example.org",
-        experiment_name="experiment_name",
-        provenance_save_dir="mllogs/prov_logs",
+        prov_user_namespace: str = "www.example.org",
+        experiment_name: str = "experiment_name",
+        provenance_save_dir: Union[Path, str] = "mllogs/prov_logs",
         save_after_n_logs: Optional[int] = 100,
         create_graph: Optional[bool] = True,
         create_svg: Optional[bool] = True,
@@ -1054,8 +1049,7 @@ def __init__(
 
     @override
     def create_logger_context(self, rank: Optional[int] = None):
-        """
-        Initializes the logger context.
+        """Initializes the logger context.
 
         Args:
             rank (Optional[int]): global rank of current process,
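All loggers in this file share the `Logger` interface, which is what lets `LoggersCollection` (touched above) fan a single call out to several backends at once. A hedged usage sketch, with constructor arguments as in the signatures shown earlier in this diff:

    from itwinai.loggers import ConsoleLogger, LoggersCollection, MLFlowLogger

    # Each sub-logger keeps its own subdirectory (simple-logger/, mlflow/, ...)
    loggers = LoggersCollection([
        ConsoleLogger(savedir="mllogs"),
        MLFlowLogger(savedir="mllogs", experiment_name="my-experiment"),
    ])
    loggers.create_logger_context()
    loggers.log(item=0.5, identifier="train_loss", kind="metric", step=0)
    loggers.destroy_logger_context()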
diff --git a/src/itwinai/parser.py b/src/itwinai/parser.py
index 8d0a9efc..735e9056 100644
--- a/src/itwinai/parser.py
+++ b/src/itwinai/parser.py
@@ -27,6 +27,12 @@
 from .utils import load_yaml
 
 
+class _ArgumentParser(JAPArgumentParser):
+    def error(self, message: str, ex: Optional[Exception] = None) -> None:
+        """Patch error method to re-raise exception instead of exiting execution."""
+        raise ex
+
+
 def add_replace_field(config: Dict, key_chain: str, value: Any) -> None:
     """Replace or add (if not present) a field in a dictionary, following a
     path of dot-separated keys. Adding is not supported for list items.
@@ -61,9 +67,16 @@ def add_replace_field(config: Dict, key_chain: str, value: Any) -> None:
             sub_config[k] = value
 
 
+def get_root_cause(exception: Exception) -> Exception:
+    """Recursively extract the first exception in the exception chain."""
+    root = exception
+    while root.__cause__ is not None:  # Traverse the exception chain
+        root = root.__cause__
+    return root
+
+
 class ConfigParser:
-    """
-    Parses a pipeline from a configuration file.
+    """Parses a pipeline from a configuration file.
     It also provides functionalities for dynamic override
     of fields by means of nested key notation.
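`get_root_cause` follows the `__cause__` links that `raise ... from ...` records, so the parser methods below can surface the original configuration error instead of jsonargparse's generic wrapper. A self-contained sketch of that behaviour (mirroring the function added above):

    def get_root_cause(exception: Exception) -> Exception:
        root = exception
        while root.__cause__ is not None:
            root = root.__cause__
        return root

    try:
        try:
            raise ValueError("invalid field in config")
        except ValueError as err:
            raise RuntimeError("could not instantiate pipeline") from err
    except RuntimeError as wrapper:
        # The chain is unwound back to the original ValueError
        assert isinstance(get_root_cause(wrapper), ValueError)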
""" - pipe_parser = JAPArgumentParser() + pipe_parser = _ArgumentParser() pipe_parser.add_subclass_arguments(Pipeline, "pipeline") pipe_dict = self.config @@ -163,9 +176,13 @@ def parse_pipeline( print("Assembled pipeline:") print(json.dumps(pipe_dict, indent=4)) - # Parse pipeline dict once merged with steps - conf = pipe_parser.parse_object(pipe_dict) - pipe = pipe_parser.instantiate_classes(conf) + try: + # Parse pipeline dict once merged with steps + conf = pipe_parser.parse_object(pipe_dict) + pipe = pipe_parser.instantiate_classes(conf) + except Exception as exc: + exc = get_root_cause(exc) + raise exc self.pipeline = pipe["pipeline"] return self.pipeline @@ -187,10 +204,15 @@ def parse_step( # Wrap config under "step" field and parse it step_dict_config = {"step": step_dict_config} - step_parser = JAPArgumentParser() + step_parser = _ArgumentParser() step_parser.add_subclass_arguments(BaseComponent, "step") - parsed_namespace = step_parser.parse_object(step_dict_config) - return step_parser.instantiate_classes(parsed_namespace)["step"] + try: + parsed_namespace = step_parser.parse_object(step_dict_config) + step = step_parser.instantiate_classes(parsed_namespace)["step"] + except Exception as exc: + exc = get_root_cause(exc) + raise exc + return step class ArgumentParser(JAPArgumentParser): diff --git a/src/itwinai/torch/config.py b/src/itwinai/torch/config.py index 687e7264..d9e0e5d0 100644 --- a/src/itwinai/torch/config.py +++ b/src/itwinai/torch/config.py @@ -35,6 +35,16 @@ class TrainingConfiguration(Configuration): >>> from rich import print >>> print(cfg) # pretty-print of configuration + .. warning:: + + Don't reinvent parameters that already exist in the training coniguration, if possible. + Instead, use the name of the parameters in the training configuration when possible to + avoid inconsistencies. For instance, the training configuration defines the learning + rate as ``optim_lr``, so if you redefine it as ``lr`` by doing + ``TrainingConfiguration(lr=0.005)`` in the configuration you will now have both + ``optim_lr`` (created by default) and ``lr`` (created by you). This may create + confusion and potentially (and silently) break the logic in your code. + """ #: Batch size. In a distributed environment it is usually the