Fix/async callbacks #102

Open. Wants to merge 108 commits into base: develop. Changes shown are from all commits.

Commits (108)
b12fac8  Refactor Callbacks (HCookie, Sep 24, 2024)
29a8477  Update changelog (HCookie, Sep 24, 2024)
15824be  Fix TypeError (HCookie, Sep 24, 2024)
4077bf4  Move to hydra.instantiate (HCookie, Sep 25, 2024)
494d39d  Merge remote-tracking branch 'origin/develop' into fix/refactor_callb… (HCookie, Sep 25, 2024)
fe37c02  Add __all__ (HCookie, Sep 25, 2024)
2d8275c  Add to base config (HCookie, Sep 25, 2024)
230eb0e  Fix nested list (HCookie, Sep 25, 2024)
5547b20  Fix nested get issue (HCookie, Sep 26, 2024)
1d80cfb  Fix type checking (HCookie, Sep 27, 2024)
e79dfc7  Merge branch 'develop' into fxi/refactor_callbacks (HCookie, Oct 1, 2024)
96ab74c  feat: edge plot in callbacks (JPXKQX, Oct 1, 2024)
4aeb1a5  feat: set default extra callbacks (JPXKQX, Oct 1, 2024)
816b3af  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Oct 2, 2024)
644038f  fix: typing & refactoring (JPXKQX, Oct 2, 2024)
8356cd4  fix: remove list comprehension (JPXKQX, Oct 2, 2024)
930e4d2  Refactor according to PR (HCookie, Oct 2, 2024)
52ea91f  Update deprecation warning (HCookie, Oct 4, 2024)
0dd81b7  Merge branch 'fxi/refactor_callbacks' into feature/graph-features-cal… (JPXKQX, Oct 4, 2024)
332f746  Merge pull request #71 from ecmwf/feature/graph-features-callback (HCookie, Oct 4, 2024)
bb8b9bb  Refactor: Remove backwards compatability, (HCookie, Oct 10, 2024)
0349be2  Fix tests (HCookie, Oct 10, 2024)
1e97ff1  PR Fixes (HCookie, Oct 15, 2024)
d7f713e  Merge branch 'develop' into fix/refactor_callbacks (HCookie, Oct 18, 2024)
ebfaf90  Merge remote-tracking branch 'origin/develop' into fix/refactor_callb… (HCookie, Oct 18, 2024)
460c8ba  Update Changelog (HCookie, Oct 18, 2024)
5671c7e  Merge branch 'develop' into fix/refactor_callbacks (HCookie, Oct 21, 2024)
21c05de  Refactor rollout (#87) (HCookie, Oct 21, 2024)
3c5e144  Remove batch frequency from LongRolloutPlots (HCookie, Oct 21, 2024)
5742754  Merge remote-tracking branch 'origin/develop' into fix/refactor_callb… (HCookie, Oct 21, 2024)
8671543  Merge branch 'develop' into fxi/refactor_callbacks (HCookie, Oct 22, 2024)
382728c  Remove TP reference (HCookie, Oct 22, 2024)
6fa66cc  Remove missing config reference (HCookie, Oct 23, 2024)
752a94b  Authentication support for mlflow sync (#51) (gmertes, Oct 11, 2024)
11af912  New mlflow authentication API (#78) (gmertes, Oct 11, 2024)
a21bea6  Update changelog (HCookie, Sep 24, 2024)
4453557  rebase (mc4117, Oct 23, 2024)
a3dd9c5  Update deprecation warning (HCookie, Oct 4, 2024)
046275b  Refactor: Remove backwards compatability, (HCookie, Oct 10, 2024)
ee2fcc3  add scatter plot (mc4117, Oct 22, 2024)
799f314  adding async (mc4117, Oct 23, 2024)
30a26b3  fix (mc4117, Oct 23, 2024)
73ce9e5  tests (mc4117, Oct 23, 2024)
a1a9019  fix failing tests (mc4117, Oct 23, 2024)
82d7b63  rm change to ds valid (mc4117, Oct 23, 2024)
3f49079  precommit hooks (mc4117, Oct 23, 2024)
474e271  fix linting (mc4117, Oct 23, 2024)
72af5f8  rebase (mc4117, Oct 23, 2024)
22a96cd  Update deprecation warning (HCookie, Oct 4, 2024)
02ac1c6  Refactor: Remove backwards compatability, (HCookie, Oct 10, 2024)
ae39622  add scatter plot (mc4117, Oct 22, 2024)
51557f6  adding async (mc4117, Oct 23, 2024)
4a97f2b  fix (mc4117, Oct 23, 2024)
bb6d3e9  tests (mc4117, Oct 23, 2024)
d53bf50  fix failing tests (mc4117, Oct 23, 2024)
c9cd810  rm change to ds valid (mc4117, Oct 23, 2024)
64a1144  precommit hooks (mc4117, Oct 23, 2024)
07efe0b  fix linting (mc4117, Oct 23, 2024)
a3122ec  Merge branch 'fix/refactor_callbacks' into fxi/async_callbacks (HCookie, Oct 23, 2024)
da4a22b  revert unnecessary config changes (mc4117, Oct 23, 2024)
ff8f8c4  fix merge conflict (mc4117, Oct 23, 2024)
443bbef  change config files (mc4117, Oct 23, 2024)
110fb64  Swapped histogram and spectrum (HCookie, Oct 23, 2024)
23cc785  Update copyright notice (HCookie, Oct 23, 2024)
bfe76f3  Merge branch 'develop' into fxi/refactor_callbacks (HCookie, Oct 23, 2024)
6b759fe  Merge branch 'fxi/refactor_callbacks' into fxi/async_callbacks (HCookie, Oct 23, 2024)
5a6880e  Merge branch 'develop' into fxi/refactor_callbacks (HCookie, Oct 24, 2024)
51a455d  Fix issues with split of PlotAdditionalMetrics (HCookie, Oct 24, 2024)
3318675  Merge branch 'fxi/refactor_callbacks' of github.com:ecmwf/anemoi-trai… (HCookie, Oct 24, 2024)
77bd65d  Merge remote-tracking branch 'origin/develop' into fix/refactor_callb… (HCookie, Oct 24, 2024)
3c6e1af  Fix CHANGELOG (HCookie, Oct 25, 2024)
86059a9  Fix documentation for callbacks (HCookie, Oct 25, 2024)
0bce490  Add all callback submodules to docs (HCookie, Oct 25, 2024)
f5057c6  Merge branch 'develop' into fxi/refactor_callbacks (HCookie, Oct 25, 2024)
d6e1d9c  Apply suggestions from code review (HCookie, Oct 25, 2024)
4c145bb  Merge branch 'fxi/refactor_callbacks' into fxi/async_callbacks (mc4117, Oct 25, 2024)
6073d84  Fix init args issue in RolloutPlots (HCookie, Oct 25, 2024)
f1d883f  Add rollout_eval config (HCookie, Oct 25, 2024)
66bd306  Add training mode to rollout step (HCookie, Oct 28, 2024)
8dfe25d  Force LongRolloutPlots to plot in serial (HCookie, Oct 28, 2024)
942e06f  Add warning to LongRolloutPlots when async (HCookie, Oct 28, 2024)
8e6ab30  Merge branch 'develop' into fxi/refactor_callbacks (HCookie, Oct 28, 2024)
84072a6  Fix asserrt calculation (HCookie, Oct 28, 2024)
42b59e5  Merge branch 'develop' into fxi/refactor_callbacks (HCookie, Oct 28, 2024)
30dfd45  Apply post_processors before plotting in LongRolloutPlots (HCookie, Oct 28, 2024)
eebaf16  Fix reference to batch (HCookie, Oct 28, 2024)
b31da0e  Merge branch 'develop' into fxi/refactor_callbacks (HCookie, Oct 28, 2024)
8b2a30e  Fix debug config (HCookie, Oct 28, 2024)
f0b4ad7  Merge branch 'fxi/refactor_callbacks' into fxi/async_callbacks (HCookie, Oct 29, 2024)
7fe2c05  Merge remote-tracking branch 'origin/develop' into fix/refactor_callb… (HCookie, Oct 29, 2024)
44efa8c  brinding plot for mean wave direction and fixing type hinting (anaprietonem, Oct 29, 2024)
288c2a1  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Oct 29, 2024)
5392363  Merge branch 'fxi/refactor_callbacks' into fxi/async_callbacks (anaprietonem, Oct 29, 2024)
426a4e3  Merge remote-tracking branch 'origin/develop' into fxi/async_callbacks (HCookie, Oct 29, 2024)
f42dbaa  Merge remote-tracking branch 'refs/remotes/origin/fxi/async_callbacks… (anaprietonem, Oct 29, 2024)
ca6974c  Merge branch 'fxi/async_callbacks' of github.com:ecmwf/anemoi-trainin… (anaprietonem, Oct 29, 2024)
15c634d  Merge branch 'develop' into fxi/async_callbacks (anaprietonem, Oct 29, 2024)
5918a58  add changelog entry (anaprietonem, Oct 29, 2024)
40141f6  fixes for async plots to work (anaprietonem, Oct 29, 2024)
c8e7138  fix pre-commit styling (anaprietonem, Oct 29, 2024)
31e5e82  improved loop closing and readability (anaprietonem, Nov 1, 2024)
848e76b  fixing for pre-commit hooks (anaprietonem, Nov 1, 2024)
ab2919f  Merge branch 'develop' into fxi/async_callbacks (anaprietonem, Nov 1, 2024)
806b663  remove commented block (anaprietonem, Nov 4, 2024)
d2b5b17  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Nov 4, 2024)
606194b  address sugestion for args and kwargs and missing type hints (anaprietonem, Nov 6, 2024)
2b2a4fa  update flag to datashader rather than scatter (anaprietonem, Nov 6, 2024)
97d7cbf  Merge branch 'fxi/async_callbacks' of github.com:ecmwf/anemoi-trainin… (anaprietonem, Nov 6, 2024)
CHANGELOG.md (2 additions, 0 deletions)

@@ -18,6 +18,7 @@ Keep it human-readable, your future self will thank you!
- Enable longer validation rollout than training
### Added
- Included more loss functions and allowed configuration [#70](https://github.com/ecmwf/anemoi-training/pull/70)
- Include option to use datashader and optimised asyncronohous callbacks [#102](https://github.com/ecmwf/anemoi-training/pull/102)
- Fix that applies the metric_ranges in the post-processed variable space [#116](https://github.com/ecmwf/anemoi-training/pull/116)
- Sub-hour datasets [#63](https://github.com/ecmwf/anemoi-training/pull/63)
- Add synchronisation workflow [#92](https://github.com/ecmwf/anemoi-training/pull/92)
@@ -95,6 +96,7 @@ Keep it human-readable, your future self will thank you!

- Updated configuration examples in documentation and corrected links - [#46](https://github.com/ecmwf/anemoi-training/pull/46)
- Remove credential prompt from mlflow login, replace with seed refresh token via web - [#78](https://github.com/ecmwf/anemoi-training/pull/78)
- Modified training configuration to support max_steps and tied lr iterations to max_steps by default [#67](https://github.com/ecmwf/anemoi-training/pull/67)
- Update CODEOWNERS

## [0.1.0 - Anemoi training - First release](https://github.com/ecmwf/anemoi-training/releases/tag/0.1.0) - 2024-08-16
pyproject.toml (1 addition, 0 deletions)

@@ -46,6 +46,7 @@ dependencies = [
"anemoi-graphs",
"anemoi-models>=0.3",
"anemoi-utils[provenance]>=0.3.10",
"datashader>=0.16.3",
"einops>=0.6.1",
"hydra-core>=1.3",
"matplotlib>=3.7.1",
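The new `datashader>=0.16.3` pin backs the rasterised plotting option added in this PR: datashader aggregates large point sets onto a fixed-size raster, so rendering cost scales with pixels rather than points. For orientation, a minimal sketch of the standard datashader pipeline; the column names, canvas size, and synthetic data are illustrative only, not taken from this PR:

```python
# Minimal datashader pipeline: aggregate points onto a raster, then shade it.
# Column names ("lon", "lat", "value") and canvas size are illustrative.
import datashader as ds
import datashader.transfer_functions as tf
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(
    {
        "lon": rng.uniform(-180, 180, 100_000),
        "lat": rng.uniform(-90, 90, 100_000),
        "value": rng.normal(size=100_000),
    }
)

canvas = ds.Canvas(plot_width=800, plot_height=400)
agg = canvas.points(df, "lon", "lat", agg=ds.mean("value"))  # mean value per pixel
img = tf.shade(agg, how="linear")  # a fixed-size image, however many points went in
```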
src/anemoi/training/config/diagnostics/plot/detailed.yaml (1 addition, 0 deletions)
@@ -1,4 +1,5 @@
asynchronous: True # Whether to plot asynchronously
scatter: False # Choose which technique to use for plotting
frequency: # Frequency of the plotting
batch: 750
epoch: 5
src/anemoi/training/config/diagnostics/plot/simple.yaml (1 addition, 0 deletions)
@@ -1,4 +1,5 @@
asynchronous: True # Whether to plot asynchronously
scatter: False # Choose which technique to use for plotting
frequency: # Frequency of the plotting
batch: 750
epoch: 10
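Both plot config variants gain the same toggle. One wrinkle worth noting: the YAML here names the flag `scatter`, while the callback code further down reads `config.diagnostics.plot.datashader`; commit 2b2a4fa ("update flag to datashader rather than scatter") appears to be the rename that reconciles the two. A small sketch of how the callbacks consume such a config via OmegaConf; the inline dict is illustrative, standing in for the YAML after the rename:

```python
from omegaconf import OmegaConf

# Illustrative config mirroring simple.yaml / detailed.yaml after the rename.
config = OmegaConf.create(
    {"diagnostics": {"plot": {"asynchronous": True, "datashader": False}}}
)

# Mirrors the gating in BasePlotCallback.__init__ shown further down.
if config.diagnostics.plot.asynchronous:
    print("plots will be scheduled on a background event loop")
```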
src/anemoi/training/data/datamodule.py (1 addition, 2 deletions)

@@ -140,8 +140,7 @@ def ds_train(self) -> NativeGridDataset:

@cached_property
def ds_valid(self) -> NativeGridDataset:
r = self.rollout
r = max(r, self.config.dataloader.get("validation_rollout", 1))
r = max(self.rollout, self.config.dataloader.get("validation_rollout", 1))

assert self.config.dataloader.training.end < self.config.dataloader.validation.start, (
f"Training end date {self.config.dataloader.training.end} is not before"
src/anemoi/training/diagnostics/callbacks/__init__.py (1 addition, 0 deletions)

@@ -14,6 +14,7 @@
from typing import TYPE_CHECKING
from typing import Callable
from typing import Iterable
from typing import Optional

from hydra.utils import instantiate
from omegaconf import DictConfig
src/anemoi/training/diagnostics/callbacks/plot.py (81 additions, 78 deletions)

@@ -7,13 +7,13 @@
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

# ruff: noqa: ANN001

from __future__ import annotations

import asyncio
import copy
import logging
import sys
import threading
import time
import traceback
from abc import ABC
@@ -23,8 +23,6 @@
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Any
from typing import Callable

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
@@ -43,33 +41,14 @@
from anemoi.training.losses.weightedloss import BaseWeightedLoss

if TYPE_CHECKING:
from typing import Any

import pytorch_lightning as pl
from omegaconf import OmegaConf

LOGGER = logging.getLogger(__name__)


class ParallelExecutor(ThreadPoolExecutor):
"""Wraps parallel execution and provides accurate information about errors.

Extends ThreadPoolExecutor to preserve the original traceback and line number.

Reference: https://stackoverflow.com/questions/19309514/getting-original-line-
number-for-exception-in-concurrent-futures/24457608#24457608
"""

def submit(self, fn: Any, *args, **kwargs) -> Callable:
"""Submits the wrapped function instead of `fn`."""
return super().submit(self._function_wrapper, fn, *args, **kwargs)

def _function_wrapper(self, fn: Any, *args: list, **kwargs: dict) -> Callable:
"""Wraps `fn` in order to preserve the traceback of any kind of."""
try:
return fn(*args, **kwargs)
except Exception as exc:
raise sys.exc_info()[0](traceback.format_exc()) from exc


class BasePlotCallback(Callback, ABC):
"""Factory for creating a callback that plots data to Experiment Logging."""

@@ -93,11 +72,21 @@ def __init__(self, config: OmegaConf) -> None:

self.plot = self._plot
self._executor = None
self._error: BaseException = None
self.datashader_plotting = config.diagnostics.plot.datashader

if self.config.diagnostics.plot.asynchronous:
self._executor = ParallelExecutor(max_workers=1)
self._error: BaseException | None = None
LOGGER.info("Setting up asynchronous plotting ...")
self.plot = self._async_plot
self._executor = ThreadPoolExecutor(max_workers=1)
self.loop_thread = threading.Thread(target=self.start_event_loop, daemon=True)
self.loop_thread.start()

def start_event_loop(self) -> None:
"""Start the event loop in a separate thread."""
self.loop = asyncio.new_event_loop()
asyncio.set_event_loop(self.loop)
self.loop.run_forever()

@rank_zero_only
def _output_figure(
@@ -113,27 +102,48 @@ def _output_figure(
save_path = Path(
self.save_basedir,
"plots",
f"{tag}_epoch{epoch:03d}.png",
f"{tag}_epoch{epoch:03d}.jpg",
)

save_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(save_path, dpi=100, bbox_inches="tight")
fig.canvas.draw()
image_array = np.array(fig.canvas.renderer.buffer_rgba())
plt.imsave(save_path, image_array, dpi=100)
if self.config.diagnostics.log.wandb.enabled:
import wandb

logger.experiment.log({exp_log_tag: wandb.Image(fig)})

if self.config.diagnostics.log.mlflow.enabled:
run_id = logger.run_id
logger.experiment.log_artifact(run_id, str(save_path))

plt.close(fig) # cleanup

@rank_zero_only
def _plot_with_error_catching(self, trainer: pl.Trainer, args: Any, kwargs: Any) -> None:
"""To execute the plot function but ensuring we catch any errors."""
try:
self._plot(trainer, *args, **kwargs)
except BaseException:
import os

LOGGER.exception(traceback.format_exc())
os._exit(1) # to force exit when sanity val steps are used

def teardown(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) -> None:
"""Method is called to close the threads."""
"""Teardown the callback."""
del trainer, pl_module, stage # unused
LOGGER.info("Teardown of the Plot Callback ...")

if self._executor is not None:
self._executor.shutdown(wait=True)
LOGGER.info("waiting and shutting down the executor ...")
self._executor.shutdown(wait=False, cancel_futures=True)

self.loop.call_soon_threadsafe(self.loop.stop)
self.loop_thread.join()
# Step 3: Close the asyncio event loop
self.loop_thread._stop()
self.loop_thread._delete()

def apply_output_mask(self, pl_module: pl.LightningModule, data: torch.Tensor) -> torch.Tensor:
if hasattr(pl_module, "output_mask") and pl_module.output_mask is not None:
Expand All @@ -147,31 +157,39 @@ def _plot(
self,
trainer: pl.Trainer,
pl_module: pl.LightningModule,
*args,
**kwargs,
*args: Any,
**kwargs: Any,
) -> None:
"""Plotting function to be implemented by subclasses."""

# Async function to run the plot function in the background thread
async def submit_plot(self, trainer: pl.Trainer, *args: Any, **kwargs: Any) -> None:
"""Async function or coroutine to schedule the plot function."""
loop = asyncio.get_running_loop()
# run_in_executor doesn't support keyword arguments,
await loop.run_in_executor(
self._executor,
self._plot_with_error_catching,
trainer,
args,
kwargs,
) # One because loop.run_in_executor expects positional arguments, not keyword arguments

@rank_zero_only
def _async_plot(
self,
trainer: pl.Trainer,
*args: list,
**kwargs: dict,
*args: Any,
**kwargs: Any,
) -> None:
"""To execute the plot function but ensuring we catch any errors."""
future = self._executor.submit(
self._plot,
trainer,
*args,
**kwargs,
)
# otherwise the error won't be thrown till the validation epoch is finished
try:
future.result()
except Exception:
LOGGER.exception("Critical error occurred in asynchronous plots.")
sys.exit(1)
"""Run the plot function asynchronously.

This is the function that is called by the callback. It schedules the plot
function to run in the background thread. Since we have an event loop running in
the background thread, we need to schedule the plot function to run in that
loop.
"""
asyncio.run_coroutine_threadsafe(self.submit_plot(trainer, *args, **kwargs), self.loop)


class BasePerBatchPlotCallback(BasePlotCallback):
@@ -192,26 +210,12 @@ def __init__(self, config: OmegaConf, batch_frequency: int | None = None):
super().__init__(config)
self.batch_frequency = batch_frequency or self.config.diagnostics.plot.frequency.batch

@abstractmethod
@rank_zero_only
def _plot(
def on_validation_batch_end(
self,
trainer: pl.Trainer,
pl_module: pl.LightningModule,
outputs: list[torch.Tensor],
batch: torch.Tensor,
batch_idx: int,
epoch: int,
**kwargs,
) -> None:
"""Plotting function to be implemented by subclasses."""

@rank_zero_only
def on_validation_batch_end(
self,
trainer,
pl_module,
output,
output: list[torch.Tensor],
batch: torch.Tensor,
batch_idx: int,
**kwargs,
@@ -310,12 +314,12 @@ def __init__(
@rank_zero_only
def _plot(
self,
trainer,
trainer: pl.Trainer,
pl_module: pl.LightningModule,
output: list[torch.Tensor],
batch: torch.Tensor,
batch_idx,
epoch,
batch_idx: int,
epoch: int,
) -> None:
_ = output

@@ -406,9 +410,9 @@ def _plot(
@rank_zero_only
def on_validation_batch_end(
self,
trainer,
pl_module,
output,
trainer: pl.Trainer,
pl_module: pl.LightningModule,
output: list[torch.Tensor],
batch: torch.Tensor,
batch_idx: int,
) -> None:
@@ -451,18 +455,17 @@ def _plot(
pl_module: pl.LightningModule,
epoch: int,
) -> None:
_ = epoch
model = pl_module.model.module.model if hasattr(pl_module.model, "module") else pl_module.model.model

fig = plot_graph_node_features(model)
fig = plot_graph_node_features(model, datashader=self.datashader_plotting)

tag = "node_trainable_params"
exp_log_tag = "node_trainable_params"

self._output_figure(
trainer.logger,
fig,
epoch=trainer.current_epoch,
epoch=epoch,
tag=tag,
exp_log_tag=exp_log_tag,
)
@@ -493,7 +496,6 @@ def _plot(
pl_module: pl.LightningModule,
epoch: int,
) -> None:
_ = epoch

model = pl_module.model.module.model if hasattr(pl_module.model, "module") else pl_module.model.model
fig = plot_graph_edge_features(model)
Expand All @@ -504,7 +506,7 @@ def _plot(
self._output_figure(
trainer.logger,
fig,
epoch=trainer.current_epoch,
epoch=epoch,
tag=tag,
exp_log_tag=exp_log_tag,
)
@@ -785,6 +787,7 @@ def _plot(
data[0, ...].squeeze(),
data[rollout_step + 1, ...].squeeze(),
output_tensor[rollout_step, ...],
datashader=self.datashader_plotting,
precip_and_related_fields=self.precip_and_related_fields,
)

@@ -874,7 +877,7 @@ def _plot(
self,
trainer: pl.Trainer,
pl_module: pl.LightningModule,
outputs: list,
outputs: list[torch.Tensor],
batch: torch.Tensor,
batch_idx: int,
epoch: int,
@@ -956,7 +959,7 @@ def _plot(
self,
trainer: pl.Trainer,
pl_module: pl.LightningModule,
outputs: list,
outputs: list[torch.Tensor],
batch: torch.Tensor,
batch_idx: int,
epoch: int,
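Taken together, the plot.py changes replace the old ParallelExecutor (a ThreadPoolExecutor subclass that re-raised worker exceptions with their tracebacks) with a persistent asyncio event loop running in a daemon thread: `_async_plot` schedules a coroutine onto that loop with `run_coroutine_threadsafe`, and the coroutine hands the blocking matplotlib work to a single-worker ThreadPoolExecutor via `run_in_executor`. Below is a self-contained sketch of that pattern with names simplified; unlike the diff's teardown, it stops the loop and joins the thread rather than touching `Thread` internals, and it adds an explicit ready-event so nothing can be scheduled before the loop exists. It is an illustration of the pattern, not the PR's code:

```python
import asyncio
import threading
import time
from concurrent.futures import ThreadPoolExecutor


class AsyncPlotter:
    """Sketch of the BasePlotCallback pattern: loop thread plus one worker."""

    def __init__(self) -> None:
        self._executor = ThreadPoolExecutor(max_workers=1)
        self._loop_ready = threading.Event()
        self.loop_thread = threading.Thread(target=self._start_event_loop, daemon=True)
        self.loop_thread.start()

    def _start_event_loop(self) -> None:
        """Run a private event loop forever in this background thread."""
        self.loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self.loop)
        self._loop_ready.set()
        self.loop.run_forever()

    def _render(self, tag: str) -> None:
        """Stands in for the expensive, blocking matplotlib work."""
        time.sleep(0.1)
        print(f"plotted {tag}")

    async def _submit(self, tag: str) -> None:
        # run_in_executor forwards positional arguments only, which is why the
        # diff packs the plot args/kwargs into positional parameters.
        await asyncio.get_running_loop().run_in_executor(self._executor, self._render, tag)

    def plot(self, tag: str) -> None:
        """Called from the training thread; returns immediately."""
        self._loop_ready.wait()
        asyncio.run_coroutine_threadsafe(self._submit(tag), self.loop)

    def teardown(self) -> None:
        # cancel_futures requires Python 3.9+.
        self._executor.shutdown(wait=False, cancel_futures=True)
        self.loop.call_soon_threadsafe(self.loop.stop)
        self.loop_thread.join()


plotter = AsyncPlotter()
plotter.plot("gnn_epoch000")
time.sleep(0.5)  # let the background worker finish before tearing down
plotter.teardown()
```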
src/anemoi/training/diagnostics/maps.py (1 addition, 1 deletion)

@@ -32,7 +32,7 @@ def __init__(self) -> None:
def __call__(self, lon: np.ndarray, lat: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
lon_rad = np.radians(lon)
lat_rad = np.radians(lat)
x = [v - 2 * np.pi if v > np.pi else v for v in lon_rad]
x = np.array([v - 2 * np.pi if v > np.pi else v for v in lon_rad], dtype=lon_rad.dtype)
y = lat_rad
return x, y

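The maps.py fix makes `x` a proper `np.ndarray` with the input's dtype instead of a Python list, so both return values behave uniformly downstream. As an aside, a fully vectorised equivalent of that comprehension would avoid the Python-level loop entirely; this is a suggestion under the same wrapping convention, not what the PR ships:

```python
import numpy as np


def wrap_longitude(lon: np.ndarray) -> np.ndarray:
    """Map longitudes in degrees to radians in (-pi, pi], as in __call__ above."""
    lon_rad = np.radians(lon)
    # Vectorised form of: [v - 2 * np.pi if v > np.pi else v for v in lon_rad]
    return np.where(lon_rad > np.pi, lon_rad - 2 * np.pi, lon_rad)


print(wrap_longitude(np.array([0.0, 90.0, 270.0])))  # [ 0.  1.5708  -1.5708]
```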