ecmwf · JPXKQX · Sep 13, 2024 · Sep 2, 2024 · Sep 2, 2024 · Sep 4, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,8 +20,9 @@ Keep it human-readable, your future self will thank you!
 
 - Enable the callback for plotting a histogram for variables containing NaNs
 - Enforce same binning for histograms comparing true data to predicted data
-- Fix: Inference checkpoints are now saved according the frequency settings defined in the config
+- Fix: Inference checkpoints are now saved according to the frequency settings defined in the config [#37](https://github.com/ecmwf/anemoi-training/pull/37)
 - Feature: Add configurable models [#50](https://github.com/ecmwf/anemoi-training/pulls/50)
+- Feature: Support training for datasets with missing time steps [#48](https://github.com/ecmwf/anemoi-training/pull/48)
 
 ### Fixed
 

diff --git a/src/anemoi/training/data/dataset.py b/src/anemoi/training/data/dataset.py
@@ -19,6 +19,7 @@
 from torch.utils.data import get_worker_info
 
 from anemoi.training.utils.seeding import get_base_seed
+from anemoi.training.utils.usable_indices import get_usable_indices
 
 LOGGER = logging.getLogger(__name__)
 
@@ -110,6 +111,20 @@ def resolution(self) -> dict:
         """Return dataset resolution."""
         return self.data.resolution
 
+    @cached_property
+    def valid_date_indices(self) -> np.ndarray:
+        """Return the date indices that can anchor a sample.
+
+        Date t is valid when (t - multistep + 1, ..., t + rollout) can be sampled
+        without missing data. With no missing dates, the number of valid ICs is the
+        dataset length minus rollout minus the additional multistep inputs.
+        Both statements are written for time_increment 1.
+        """
+        dataset = self.data
+        return get_usable_indices(
+            dataset.missing, len(dataset), self.rollout, self.multi_step, timeincrement=self.timeincrement
+        )
+
     def per_worker_init(self, n_workers: int, worker_id: int) -> None:
         """Called by worker_init_func on each copy of dataset.
 
@@ -125,13 +140,10 @@ def per_worker_init(self, n_workers: int, worker_id: int) -> None:
         """
         self.worker_id = worker_id
 
-        # Total number of valid ICs is dataset length minus rollout minus additional multistep inputs
-        len_corrected = len(self.data) - (self.rollout + (self.multi_step - 1)) * self.timeincrement
-
         # Divide this equally across shards (one shard per group!)
-        shard_size = len_corrected // self.model_comm_num_groups
-        shard_start = self.model_comm_group_id * shard_size + (self.multi_step - 1) * self.timeincrement
-        shard_end = min((self.model_comm_group_id + 1) * shard_size, len(self.data) - self.rollout * self.timeincrement)
+        shard_size = len(self.valid_date_indices) // self.model_comm_num_groups
+        shard_start = self.model_comm_group_id * shard_size
+        shard_end = (self.model_comm_group_id + 1) * shard_size
 
         shard_len = shard_end - shard_start
         self.n_samples_per_worker = shard_len // n_workers
@@ -149,7 +161,7 @@ def per_worker_init(self, n_workers: int, worker_id: int) -> None:
             high,
         )
 
-        self.chunk_index_range = np.arange(low, high, dtype=np.uint32)
+        self.chunk_index_range = self.valid_date_indices[np.arange(low, high, dtype=np.uint32)]
 
         # each worker must have a different seed for its random number generator,
         # otherwise all the workers will output exactly the same data

diff --git a/src/anemoi/training/utils/usable_indices.py b/src/anemoi/training/utils/usable_indices.py
@@ -0,0 +1,56 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def get_usable_indices(
+    missing_indices: set[int] | None,
+    series_length: int,
+    rollout: int,
+    multistep: int,
+    timeincrement: int = 1,
+) -> np.ndarray:
+    """Get the usable indices of a series with missing indices.
+
+    Parameters
+    ----------
+    missing_indices : set[int] | None
+        Indices missing from the series (None means no gaps); left unmodified.
+    series_length : int
+        Length of the series.
+    rollout : int
+        Number of steps to roll out.
+    multistep : int
+        Number of previous indices to include as predictors.
+    timeincrement : int
+        Time increment, by default 1.
+
+    Returns
+    -------
+    usable_indices : np.ndarray
+        Array of usable indices.
+    """
+    prev_invalid_dates = (multistep - 1) * timeincrement
+    next_invalid_dates = rollout * timeincrement
+
+    usable_indices = np.arange(series_length)  # set of all indices
+
+    # Build a new set instead of updating the argument in place, so the caller's
+    # set never gains the sentinels. -1 and series_length behave like gaps just
+    # outside the series, which filters the initial and final indices.
+    blocked_indices = {-1, series_length}.union(() if missing_indices is None else missing_indices)
+
+    # Drop every index whose input/rollout window would reach a blocked index
+    for i in blocked_indices:
+        usable_indices = usable_indices[
+            (usable_indices < i - next_invalid_dates) | (usable_indices > i + prev_invalid_dates)
+        ]
+
+    return usable_indices
diff --git a/tests/utils/test_usable_indices.py b/tests/utils/test_usable_indices.py
@@ -0,0 +1,45 @@
+# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts.
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import numpy as np
+
+from anemoi.training.utils.usable_indices import get_usable_indices
+
+
+def test_get_usable_indices() -> None:
+    """Test get_usable_indices; np.array_equal is used because np.allclose would broadcast."""
+    # Base case: only the last index is dropped (no room left for one rollout step)
+    valid_indices = get_usable_indices(missing_indices=None, series_length=10, rollout=1, multistep=1, timeincrement=1)
+    expected_values = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8])
+    assert np.array_equal(valid_indices, expected_values)
+
+    # Multiple input steps: index 0 is dropped as well (no previous step to use as input)
+    valid_indices = get_usable_indices(missing_indices=None, series_length=10, rollout=1, multistep=2, timeincrement=1)
+    expected_values = np.array([1, 2, 3, 4, 5, 6, 7, 8])
+    assert np.array_equal(valid_indices, expected_values)
+
+    # Roll out of two steps: the last two indices are dropped
+    valid_indices = get_usable_indices(missing_indices=None, series_length=10, rollout=2, multistep=1, timeincrement=1)
+    expected_values = np.array([0, 1, 2, 3, 4, 5, 6, 7])
+    assert np.array_equal(valid_indices, expected_values)
+
+    # Longer time increments: offsets scale with timeincrement, so two indices are lost at each end
+    valid_indices = get_usable_indices(missing_indices=None, series_length=10, rollout=1, multistep=2, timeincrement=2)
+    expected_values = np.array([2, 3, 4, 5, 6, 7])
+    assert np.array_equal(valid_indices, expected_values)
+
+    # Missing indices: each gap removes itself, the index before it (rollout) and the one after it (input)
+    missing_indices = {7, 5}
+    valid_indices = get_usable_indices(
+        missing_indices=missing_indices,
+        series_length=10,
+        rollout=1,
+        multistep=2,
+        timeincrement=1,
+    )
+    expected_values = np.array([1, 2, 3])
+    assert np.array_equal(valid_indices, expected_values)