Addition of loggers and dist strategy #180

**Open** · wants to merge 18 commits into base `main`
189 changes: 159 additions & 30 deletions src/itwinai/torch/inference.py

> **Reviewer comment (Collaborator):** This file has changed on `main` and there are some conflicts to resolve here.

@@ -5,19 +5,28 @@
#
# Credit:
# - Matteo Bunino <[email protected]> - CERN
# - Rakesh Sarma <[email protected]> - FZJ
# --------------------------------------------------------------------------------------

import abc
import os
from typing import Any, Dict, Optional, Union
from typing import Any, Dict, List, Literal, Optional, Union

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

from ..components import Predictor, monitor_exec
from ..loggers import Logger
from ..serialization import ModelLoader
from ..utils import clear_key, dynamically_import_class
from .config import TrainingConfiguration
from .distributed import (
DeepSpeedStrategy,
HorovodStrategy,
NonDistributedStrategy,
TorchDDPStrategy,
TorchDistributedStrategy,
distributed_resources_available,
)
from .type import Batch


@@ -42,10 +51,11 @@ def __call__(self) -> nn.Module:
"""
if os.path.exists(self.model_uri):
# Model is on local filesystem.
model = torch.load(self.model_uri)
return model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

> **Reviewer comment (Collaborator):** If this is to be done distributed, would you not need a more sophisticated way of getting the specific device? I might be wrong here.
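A possible refinement along those lines, as a minimal sketch: it assumes a torchrun-style launcher that exports `LOCAL_RANK` for each worker process (the strategy classes added in this PR may already expose an equivalent helper):

```python
import os

import torch


def resolve_device() -> torch.device:
    """Pick the per-process GPU in a distributed run; fall back to CPU."""
    if torch.cuda.is_available():
        # torchrun and torch.distributed launchers export LOCAL_RANK per worker.
        local_rank = int(os.environ.get("LOCAL_RANK", "0"))
        return torch.device(f"cuda:{local_rank}")
    return torch.device("cpu")
```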

model = torch.load(self.model_uri, map_location=device)
return model

> **Reviewer comment (Collaborator):** Return the model in inference mode, as it was before, i.e. `return model.eval()`.


if self.model_uri.startswith("mlflow+"):
if self.model_uri.startswith('mlflow+'):
# Model is on an MLFLow server
# Form is 'mlflow+MLFLOW_TRACKING_URI+RUN_ID+ARTIFACT_PATH'
import mlflow
@@ -68,7 +78,7 @@ def __call__(self) -> nn.Module:
tracking_uri=mlflow.get_tracking_uri(),
)
model = torch.load(ckpt_path)
return model.eval()
return model

> **Reviewer comment (Collaborator):** I saw that the `model.eval()` was removed here as well. Is there any reason?


raise ValueError(
"Unrecognized model URI: model may not be there! "
@@ -79,43 +89,115 @@
class TorchPredictor(Predictor):
"""Applies a pre-trained torch model to unseen data."""

_strategy: TorchDistributedStrategy = None
#: PyTorch ``DataLoader`` for inference dataset.
inference_dataloader: DataLoader = None
#: Pre-trained PyTorch model used to make predictions.
model: nn.Module = None
#: ``Dataset`` on which to make predictions (ML inference).
test_dataset: Dataset
#: ``DataLoader`` for test dataset.
test_dataloader: DataLoader = None
#: PyTorch random number generator (PRNG).
torch_rng: torch.Generator = None
#: itwinai ``itwinai.Logger``
logger: Logger = None

def __init__(
self,
config: Union[Dict, TrainingConfiguration],
model: Union[nn.Module, ModelLoader],
test_dataloader_class: str = "torch.utils.data.DataLoader",
test_dataloader_kwargs: Optional[Dict] = None,
name: str = None,
strategy: Literal["ddp", "deepspeed", "horovod"] = 'ddp',
logger: Optional[Logger] = None,
checkpoints_location: str = "checkpoints",
name: str = None
) -> None:
super().__init__(model=model, name=name)
self.save_parameters(**self.locals2params(locals()))
self.model = self.model.eval()

# Train and validation dataloaders
self.test_dataloader_class = dynamically_import_class(test_dataloader_class)
test_dataloader_kwargs = (
test_dataloader_kwargs if test_dataloader_kwargs is not None else {}
if isinstance(config, dict):
self.config = TrainingConfiguration(**config)
else:
self.config = config
self.strategy = strategy
self.logger = logger
self.checkpoints_location = checkpoints_location

@property
def strategy(self) -> TorchDistributedStrategy:
"""Strategy currently in use."""
return self._strategy

@strategy.setter
def strategy(self, strategy: Union[str, TorchDistributedStrategy]) -> None:
if isinstance(strategy, TorchDistributedStrategy):
self._strategy = strategy
else:
self._strategy = self._detect_strategy(strategy)

@property
def device(self) -> str:
"""Current device from distributed strategy."""
return self.strategy.device()

def _detect_strategy(self, strategy: str) -> TorchDistributedStrategy:
if not distributed_resources_available():
print("WARNING: falling back to non-distributed strategy.")
dist_str = NonDistributedStrategy()
elif strategy == 'ddp':
dist_str = TorchDDPStrategy(backend='nccl')
elif strategy == 'horovod':
dist_str = HorovodStrategy()
elif strategy == 'deepspeed':
dist_str = DeepSpeedStrategy(backend='nccl')
else:
raise NotImplementedError(
f"Strategy '{strategy}' is not recognized/implemented.")

> **Reviewer comment (Collaborator):** Make sure to add a newline before the closing parenthesis here.
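For reference, the layout the reviewer is asking for would look like:

```python
raise NotImplementedError(
    f"Strategy '{strategy}' is not recognized/implemented."
)
```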

return dist_str

def _init_distributed_strategy(self) -> None:
if not self.strategy.is_initialized:
self.strategy.init()
> **Reviewer comment (Collaborator), on lines +121 to +155:** These methods could be inherited from the `TorchTrainer` to avoid code duplication. Some time ago @annaelisalappe, @jarlsondre, and I were brainstorming ways to move all these methods into a common parent class, from which both trainer(s) and predictor can inherit. It is an option, but not critical. The benefit is that we would reduce code duplication, resulting in easier code maintenance. On the other hand, we would add an element to the hierarchy.


> **Reviewer comment (Collaborator):** Another option could be adding a `predict` method to the `TorchTrainer` and avoiding all of the above. This is how Lightning does it.
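A rough sketch of the shared-parent idea, purely illustrative; the name `TorchDistributedMixin` is invented here and is not part of itwinai:

```python
from typing import Union

from itwinai.torch.distributed import TorchDistributedStrategy


class TorchDistributedMixin:
    """Hypothetical common parent holding the strategy plumbing that is
    currently duplicated between trainer(s) and predictor."""

    _strategy: TorchDistributedStrategy = None

    @property
    def strategy(self) -> TorchDistributedStrategy:
        """Strategy currently in use."""
        return self._strategy

    @strategy.setter
    def strategy(self, strategy: Union[str, TorchDistributedStrategy]) -> None:
        if isinstance(strategy, TorchDistributedStrategy):
            self._strategy = strategy
        else:
            # _detect_strategy would also move here from the predictor/trainer.
            self._strategy = self._detect_strategy(strategy)

    @property
    def device(self) -> str:
        """Current device from the distributed strategy."""
        return self.strategy.device()


# Both classes would then inherit the shared plumbing:
# class TorchTrainer(TorchDistributedMixin, Trainer): ...
# class TorchPredictor(TorchDistributedMixin, Predictor): ...
```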


def distribute_model(self) -> None:
"""
Distribute the torch model with the chosen strategy.

> **Reviewer comment (Collaborator), on lines +158 to +159:** Avoid a newline after the opening `"""`.
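That is, collapsing the summary onto the opening quotes:

```python
def distribute_model(self) -> None:
    """Distribute the torch model with the chosen strategy."""
```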

"""
if self.model is None:
raise ValueError(
"self.model is None! Mandatory constructor argument "

> **Reviewer comment (Collaborator):** Either move this check into the constructor, or remove "constructor" from the error message. It may be a bit misleading otherwise.

)
distribute_kwargs = {}
# Distributed model
self.model, _, _ = self.strategy.distributed(
self.model, None, None, **distribute_kwargs

> **Reviewer comment (Collaborator):** In this case you would be better off using keyword arguments. Just reading this, I don't know which parameters are being set to `None`, etc.
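A keyword-argument version might read as follows; the parameter names are assumptions, since the signature of `strategy.distributed` is not shown in this diff:

```python
# Hypothetical keyword names; check TorchDistributedStrategy.distributed.
self.model, _, _ = self.strategy.distributed(
    model=self.model,
    optimizer=None,
    lr_scheduler=None,
    **distribute_kwargs,
)
```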

)
self.test_dataloader_kwargs = clear_key(
test_dataloader_kwargs, "train_dataloader_kwargs", "dataset"

def create_dataloaders(
self,
inference_dataset: Dataset
) -> None:
"""
Create inference dataloader.

> **Reviewer comment (Collaborator), on lines +175 to +176:** Avoid a newline after the opening `"""`.


Args:
inference_dataset (Dataset): inference dataset object.
"""

self.inference_dataloader = self.strategy.create_dataloader(
dataset=inference_dataset,
batch_size=self.config.batch_size,
num_workers=self.config.num_workers_dataloader,
pin_memory=self.config.pin_gpu_memory,
generator=self.torch_rng,
shuffle=self.config.shuffle_test
)

@monitor_exec
def execute(
self,
test_dataset: Dataset,
inference_dataset: Dataset,
model: nn.Module = None,

> **Reviewer comment (Collaborator):** Since this is defaulted to `None`, you have to include that in the type annotation using `nn.Module | None`.
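Applied to this signature, the suggestion reads:

```python
def execute(
    self,
    inference_dataset: Dataset,
    model: nn.Module | None = None,
) -> Dict[str, Any]:
    ...
```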

) -> Dict[str, Any]:
"""Applies a torch model to a dataset for inference.

Args:
test_dataset (Dataset[str, Any]): each item in this dataset is a
inference_dataset (Dataset[str, Any]): each item in this dataset is a
couple (item_unique_id, item)
model (nn.Module, optional): torch model. Overrides the existing
model, if given. Defaults to None.
@@ -124,18 +206,27 @@ def execute(
Dict[str, Any]: maps each item ID to the corresponding predicted
value(s).
"""
self._init_distributed_strategy()
if model is not None:
# Overrides existing "internal" model
self.model = model

test_dataloader = self.test_dataloader_class(
test_dataset, **self.test_dataloader_kwargs
self.create_dataloaders(
inference_dataset=inference_dataset
)

> **Reviewer comment (Collaborator), on lines +214 to 216:** Why the newline here? Surely this fits on a single line?
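Presumably the intended single-line form:

```python
self.create_dataloaders(inference_dataset=inference_dataset)
```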


self.distribute_model()

if self.logger:
self.logger.create_logger_context(rank=self.strategy.global_rank())
hparams = self.config.model_dump()
hparams['distributed_strategy'] = self.strategy.__class__.__name__
self.logger.save_hyperparameters(hparams)

all_predictions = dict()
for samples_ids, samples in test_dataloader:
for ids, (samples_ids, samples) in enumerate(self.inference_dataloader):

> **Reviewer comment (Collaborator):** What are `ids` and `enumerate` used for?

with torch.no_grad():
pred = self.model(samples)
pred = self.model(samples.to(self.device))

> **Reviewer comment (Collaborator):** I can only leave a comment on this line, as GitHub is not letting me review lines that were not changed. I would suggest moving the block from line 215 to line 226 into a separate method called `predict`, which is the equivalent of the `train` method in the trainer.
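A sketch of that extraction (hypothetical; the per-item sparse/dense post-processing is summarized in a comment because those lines are collapsed in this diff view):

```python
def predict(self) -> Dict[str, Any]:
    """Hypothetical inference-loop counterpart of the trainer's train()."""
    all_predictions = dict()
    for samples_ids, samples in self.inference_dataloader:
        with torch.no_grad():
            pred = self.model(samples.to(self.device))
        pred = self.transform_predictions(pred)
        for idx, pre in zip(samples_ids, pred):
            # Convert each prediction as in execute() (dense/sparse handling).
            all_predictions[idx] = pre
    return all_predictions
```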

pred = self.transform_predictions(pred)
for idx, pre in zip(samples_ids, pred):
# For each item in the batch
@@ -144,11 +235,49 @@
else:
pre = pre.to_dense().tolist()
all_predictions[idx] = pre

if self.logger:
self.logger.destroy_logger_context()

self.strategy.clean_up()

return all_predictions

@abc.abstractmethod
def log(
self,
item: Union[Any, List[Any]],
identifier: Union[str, List[str]],
kind: str = 'metric',
step: Optional[int] = None,
batch_idx: Optional[int] = None,

> **Reviewer comment (Collaborator), on lines +248 to +252:** When doing type hints, we have decided to use `|` instead of `Union`, and `| None` instead of `Optional`, in the codebase.
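Rewritten with that convention, the signature would become:

```python
def log(
    self,
    item: Any | List[Any],
    identifier: str | List[str],
    kind: str = 'metric',
    step: int | None = None,
    batch_idx: int | None = None,
    **kwargs,
) -> None:
    ...
```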

**kwargs
) -> None:
"""Log ``item`` with ``identifier`` name of ``kind`` type at ``step``
time step.

Args:
item (Union[Any, List[Any]]): element to be logged (e.g., metric).
identifier (Union[str, List[str]]): unique identifier for the
element to log(e.g., name of a metric).
kind (str, optional): type of the item to be logged. Must be one
among the list of self.supported_types. Defaults to 'metric'.
step (Optional[int], optional): logging step. Defaults to None.
batch_idx (Optional[int], optional): DataLoader batch counter
(i.e., batch idx), if available. Defaults to None.
"""
if self.logger:
self.logger.log(
item=item,
identifier=identifier,
kind=kind,
step=step,
batch_idx=batch_idx,
**kwargs
)


> **Reviewer comment (Collaborator):** Again, I cannot comment on unchanged lines, but it would be nice to remove the newline after the opening `"""` in the docstring below. It is something that we are trying to gradually clean up.

def transform_predictions(self, batch: Batch) -> Batch:
"""Post-process the predictions of the torch model (e.g., apply
"""

> **Reviewer comment (Collaborator):** Remove the newline here.

Post-process the predictions of the torch model (e.g., apply
threshold in case of multi-label classifier).
"""

3 changes: 2 additions & 1 deletion tests/use-cases/test_mnist.py
@@ -87,7 +87,8 @@ def test_mnist_inference_torch(torch_env, install_requirements):
)
with tempfile.TemporaryDirectory() as temp_dir:
# Create fake inference dataset and checkpoint
generate_model_cmd = f"{torch_env}/bin/python {exec} " f"--root {temp_dir}"
generate_model_cmd = (f"{torch_env}/bin/python {exec} "

> **Reviewer comment (Collaborator):** Generally, when we have to use multiple lines for the contents of some parentheses (`(`, `[`, and `{`), we use newlines after the opening parenthesis and before the closing parenthesis. In this case, the code would thus be:
>
> ```python
> generate_model_cmd = (
>     ...
> )
> ```

f"--root {temp_dir}")
subprocess.run(generate_model_cmd.split(), check=True, cwd=temp_dir)

# Running inference
10 changes: 7 additions & 3 deletions use-cases/mnist/torch/config.yaml
@@ -98,11 +98,15 @@ inference_pipeline:
class_path: itwinai.torch.inference.TorchModelLoader
init_args:
model_uri: ${inference_model_mlflow_uri}
test_dataloader_kwargs:
batch_size: ${batch_size}
config:
batch_size: 32
num_workers_dataloader: 4
pin_gpu_memory: true
shuffle_test: false
strategy: ${strategy}

- class_path: saver.TorchMNISTLabelSaver
init_args:
save_dir: ${predictions_dir}
predictions_file: ${predictions_file}
class_labels: ${class_labels}
class_labels: ${class_labels}
59 changes: 54 additions & 5 deletions use-cases/xtclim/README.md
@@ -24,14 +24,63 @@ The file `train.py` trains the network. Caution: It will overwrite the weights o

The `anomaly.py` file evaluates the network on the available datasets - train, test, and projection.

## How to launch pipeline
## Installation

The config file `pipeline.yaml` contains all the steps to execute the workflow. You can launch it from the root of the repository with:
Please follow the documentation to install the itwinai environment.
After that, install the required libraries within the itwinai environment with:

```bash
python train.py -p pipeline.yaml
pip install -r Requirements.txt

> **Reviewer comment (Collaborator):** To be consistent with the other use cases (and, more generally, with pip standards), it would be good to rename `Requirements.txt` to `requirements.txt`.

```

## How to launch pipeline locally

The config file `pipeline.yaml` contains all the steps to execute the workflow.
This file also contains all the seasons, and a separate run is launched for each season.
You can launch the pipeline through `train.py` from the root of the repository with:

```bash
python train.py

```

## TODOs
Integration of post-processing step + distributed strategies
## How to launch pipeline on an HPC system

The `startscript` job script can be used to launch a pipeline with SLURM on an HPC system.
Follow these steps to export the environment variables required by the script:

```bash
# Distributed training with torch DistributedDataParallel
PYTHON_VENV="../../envAI_hdfml"

> **Reviewer comment (Collaborator):** Keep in mind that we have changed the default name of the venv to `.venv`, as this generalizes better across HPC systems. This happened when we migrated to the uv package manager. You are free to use different names locally, but then we suggest that you symlink it to `.venv`. Check out our document on uv; I believe it is called `uv-tutorial.md`.

DIST_MODE="ddp"
RUN_NAME="ddp-cerfacs"
sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",PYTHON_VENV="$PYTHON_VENV" \
startscript
```

The results and/or errors are available in `job.out` and `job.err` log files.
Training and inference steps are defined in the pipeline, and distributed resources
are exploited in both steps.

With the MLFlow logger, the logs can be visualized in the MLFlow UI:

```bash
mlflow ui --backend-store-uri mllogs/mlflow

# In background
mlflow ui --backend-store-uri mllogs/mlflow > /dev/null 2>&1 &
```

### Hyperparameter Optimization (HPO)

The repository also provides functionality to perform HPO with Ray. With HPO,
multiple trials with different hyperparameter configurations are run on a distributed
infrastructure, typically in an HPC environment. This helps find the optimal
configuration, i.e. the one that minimizes/maximizes the loss for the investigated network.
The `hpo.py` file contains the implementation, which launches the `pipeline.yaml` pipeline.
To launch an HPO experiment, simply run:

```bash
sbatch slurm_ray.sh
```

The arguments parsed by `hpo.py` can be changed to customize which parameters
are considered in the HPO process.