Draft: xarray/main #413

Open
wants to merge 8 commits into base: master
Changes from all commits
1 change: 1 addition & 0 deletions ci/ci_test_env.yml
@@ -18,6 +18,7 @@ dependencies:
- pillow
- pyproj
- scipy
- xarray
# Optional dependencies
- dask
- pyfftw
12 changes: 9 additions & 3 deletions pysteps/decorators.py
@@ -22,6 +22,8 @@

import numpy as np

from pysteps.xarray_helpers import convert_input_to_xarray_dataset


def _add_extra_kwrds_to_docstrings(target_func, extra_kwargs_doc_text):
"""
@@ -66,7 +68,7 @@ def postprocess_import(fillna=np.nan, dtype="double"):
def _postprocess_import(importer):
@wraps(importer)
def _import_with_postprocessing(*args, **kwargs):
precip, *other_args = importer(*args, **kwargs)
precip, quality, metadata = importer(*args, **kwargs)

_dtype = kwargs.get("dtype", dtype)

@@ -88,7 +90,9 @@ def _import_with_postprocessing(*args, **kwargs):
mask = ~np.isfinite(precip)
precip[mask] = _fillna

return (precip.astype(_dtype),) + tuple(other_args)
return convert_input_to_xarray_dataset(
precip.astype(_dtype), quality, metadata
)

extra_kwargs_doc = """
Other Parameters
@@ -124,7 +128,9 @@ def new_function(*args, **kwargs):
target motion_method_func function.
"""

input_images = args[0]
dataset = args[0]
precip_var = dataset.attrs["precip_var"]
input_images = dataset[precip_var].values
if input_images.ndim != 3:
raise ValueError(
"input_images dimension mismatch.\n"
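The net effect of the decorator changes is that a decorated importer now hands back a single xarray Dataset whose precipitation variable is named by the ``precip_var`` attribute, which is exactly how the motion-method wrapper above unpacks its first argument. A minimal sketch of that access pattern (the dataset here is fabricated for illustration):

```python
import numpy as np
import xarray as xr

# Fabricated stand-in for what a decorated importer returns: the name of
# the precipitation variable is recorded in the "precip_var" attribute.
dataset = xr.Dataset(
    {"precip_intensity": (("time", "y", "x"), np.zeros((3, 4, 5)))},
    attrs={"precip_var": "precip_intensity"},
)

# This mirrors how new_function() unpacks the dataset it receives.
precip_var = dataset.attrs["precip_var"]
input_images = dataset[precip_var].values
print(input_images.ndim)  # 3
```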
102 changes: 99 additions & 3 deletions pysteps/io/importers.py
@@ -65,6 +65,104 @@
| zr_b | the Z-R exponent b in Z = a*R**b |
+------------------+----------------------------------------------------------+

The data and metadata are then postprocessed into an xarray dataset. This dataset will
always contain x and y dimensions, but can be extended with a time dimension and/or
an ensemble member dimension over the course of the process.

The dataset can contain the following coordinate variables:


.. tabularcolumns:: |p{2cm}|L|

+--------------------+-------------------------------------------------------------------------------------------+
| Coordinate | Description |
+====================+===========================================================================================+
| y | y-coordinate in Cartesian system, with units determined by ``metadata["cartesian_unit"]`` |
+--------------------+-------------------------------------------------------------------------------------------+
| x | x-coordinate in Cartesian system, with units determined by ``metadata["cartesian_unit"]`` |
+--------------------+-------------------------------------------------------------------------------------------+
| lat | latitude coordinate in degrees |
+--------------------+-------------------------------------------------------------------------------------------+
| lon | longitude coordinate in degrees |
+--------------------+-------------------------------------------------------------------------------------------+
| time | forecast time in seconds since forecast start time |
+--------------------+-------------------------------------------------------------------------------------------+
| ens_number | ensemble member number (integer) |
+--------------------+-------------------------------------------------------------------------------------------+
| direction | used by proesmans to return the forward and backward advection and consistency fields |
+--------------------+-------------------------------------------------------------------------------------------+

The time, x and y dimensions all MUST be regularly spaced, with the stepsize included
in a ``stepsize`` attribute. The stepsize is given in the unit of the dimension (this
is always seconds for the time dimension).
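A quick way to validate the regular-spacing requirement, and to derive the value that belongs in the ``stepsize`` attribute, is sketched below (the coordinate values are made up):

```python
import numpy as np

x = np.array([0.0, 1000.0, 2000.0, 3000.0])  # made-up x coordinate in meters
steps = np.diff(x)
if not np.allclose(steps, steps[0]):
    raise ValueError("coordinate values must be regularly spaced")
stepsize = float(steps[0])  # the value to store in the "stepsize" attribute
print(stepsize)  # 1000.0
```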

The dataset can contain the following data variables:

.. tabularcolumns:: |p{2cm}|L|

+-------------------+----------------------------------------------------------+
| Variable          | Description                                              |
+===================+==========================================================+
| precip_intensity, | precipitation data, stored in one of these three         |
| precip_accum      | variables depending on its unit: precip_intensity if     |
| or reflectivity   | the unit is ``mm/h``, precip_accum if it is ``mm`` and   |
|                   | reflectivity if it is ``dBZ``; the attributes of this    |
|                   | variable contain metadata relevant to the data (see      |
|                   | below)                                                   |
+-------------------+----------------------------------------------------------+
| velocity_x        | x-component of the advection field in cartesian_unit     |
|                   | per timestep                                             |
+-------------------+----------------------------------------------------------+
| velocity_y        | y-component of the advection field in cartesian_unit     |
|                   | per timestep                                             |
+-------------------+----------------------------------------------------------+
| quality           | value between 0 and 1 denoting the quality of the        |
|                   | precipitation data, currently not used for anything      |
+-------------------+----------------------------------------------------------+
| velocity_quality  | value between 0 and 1 denoting the quality of the        |
|                   | velocity data, currently only returned by proesmans      |
+-------------------+----------------------------------------------------------+

Some of the metadata in the metadata dictionary is not explicitly stored in the dataset,
but is still implicitly present. For example, ``x1`` can easily be found by taking the first
value of the x coordinate variable. Metadata that is not implicitly present is explicitly
stored either in the dataset's global attributes or as attributes of the precipitation
variable. Data that relates to the entire dataset is stored in the global attributes. The
following data is stored in the global attributes:

.. tabularcolumns:: |p{2cm}|L|

+------------------+----------------------------------------------------------+
| Key | Value |
+==================+==========================================================+
| projection | PROJ.4-compatible projection definition |
+------------------+----------------------------------------------------------+
| institution      | name of the institution that provides the data           |
+------------------+----------------------------------------------------------+
| precip_var | the name of the precipitation variable in this dataset |
+------------------+----------------------------------------------------------+

The following data is stored as attributes of the precipitation variable:

.. tabularcolumns:: |p{2cm}|L|

+------------------+----------------------------------------------------------+
| Key | Value |
+==================+==========================================================+
| units | the physical unit of the data: 'mm/h', 'mm' or 'dBZ' |
+------------------+----------------------------------------------------------+
| transform | the transformation of the data: None, 'dB', 'Box-Cox' or |
| | others |
+------------------+----------------------------------------------------------+
| accutime | the accumulation time in minutes of the data, float |
+------------------+----------------------------------------------------------+
| threshold        | the rain/no rain threshold with the same unit,           |
|                  | transformation and accutime as the data.                 |
+------------------+----------------------------------------------------------+
| zerovalue        | the value assigned to the no rain pixels with the same   |
|                  | unit, transformation and accutime as the data.           |
+------------------+----------------------------------------------------------+
| zr_a | the Z-R constant a in Z = a*R**b |
+------------------+----------------------------------------------------------+
| zr_b | the Z-R exponent b in Z = a*R**b |
+------------------+----------------------------------------------------------+

Furthermore, the dataset can contain additional metadata that makes the dataset
CF-compliant.
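Putting the conventions above together, a minimal dataset honouring them might be assembled as follows (all concrete values, the projection string and the institution name are invented for illustration):

```python
import numpy as np
import xarray as xr

stepsize = 1000.0  # meters; must match the coordinate spacing
x = np.arange(0.0, 5000.0, stepsize)
y = np.arange(0.0, 5000.0, stepsize)

dataset = xr.Dataset(
    {
        "precip_intensity": (
            ("y", "x"),
            np.zeros((y.size, x.size)),
            {
                "units": "mm/h",    # one of mm/h, mm or dBZ
                "transform": None,  # no transformation applied
                "accutime": 5.0,    # accumulation time in minutes
                "threshold": 0.1,
                "zerovalue": 0.0,
            },
        )
    },
    coords={
        "x": ("x", x, {"stepsize": stepsize}),
        "y": ("y", y, {"stepsize": stepsize}),
    },
    attrs={
        "projection": "+proj=stere +lat_0=90 +lon_0=0",  # invented PROJ.4 string
        "institution": "Example Met Institute",          # invented
        "precip_var": "precip_intensity",
    },
)
print(dataset.attrs["precip_var"])  # precip_intensity
```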

Available Importers
-------------------

@@ -89,12 +187,10 @@
from functools import partial

import numpy as np

from matplotlib.pyplot import imread

from pysteps.decorators import postprocess_import
from pysteps.exceptions import DataModelError
from pysteps.exceptions import MissingOptionalDependency
from pysteps.exceptions import DataModelError, MissingOptionalDependency
from pysteps.utils import aggregate_fields

try:
77 changes: 46 additions & 31 deletions pysteps/io/readers.py
@@ -12,13 +12,14 @@
"""

import numpy as np
import xarray as xr


def read_timeseries(inputfns, importer, **kwargs):
def read_timeseries(inputfns, importer, timestep=None, **kwargs) -> xr.Dataset | None:
"""
Read a time series of input files using the methods implemented in the
:py:mod:`pysteps.io.importers` module and stack them into a 3d array of
shape (num_timesteps, height, width).
:py:mod:`pysteps.io.importers` module and stack them into a 3d xarray
dataset of shape (num_timesteps, height, width).

Parameters
----------
@@ -27,55 +28,69 @@ def read_timeseries(inputfns, importer, **kwargs):
:py:mod:`pysteps.io.archive` module.
importer: function
A function implemented in the :py:mod:`pysteps.io.importers` module.
timestep: int, optional
The timestep in seconds. This value is optional if more than one input
filename is given, in which case the timestep is derived from the timestamps.
kwargs: dict
Optional keyword arguments for the importer.

Returns
-------
out: tuple
A three-element tuple containing the read data and quality rasters and
out: Dataset
A dataset containing the read data and quality rasters and
associated metadata. If an input file name is None, the corresponding
precipitation and quality fields are filled with nan values. If all
input file names are None or if the length of the file name list is
zero, a three-element tuple containing None values is returned.
zero, None is returned.

"""

# check for missing data
precip_ref = None
dataset_ref = None
if all(ifn is None for ifn in inputfns):
return None, None, None
return None
else:
if len(inputfns[0]) == 0:
return None, None, None
return None
for ifn in inputfns[0]:
if ifn is not None:
precip_ref, quality_ref, metadata = importer(ifn, **kwargs)
dataset_ref = importer(ifn, **kwargs)
break

if precip_ref is None:
return None, None, None
if dataset_ref is None:
return None

precip = []
quality = []
timestamps = []
startdate = min(inputfns[1])
sorted_dates = sorted(inputfns[1])
timestep_dates = int((sorted_dates[1] - sorted_dates[0]).total_seconds())

if timestep is None:
timestep = timestep_dates
if timestep != timestep_dates:
raise ValueError("given timestep does not match inputfns")
for i in range(len(sorted_dates) - 1):
if int((sorted_dates[i + 1] - sorted_dates[i]).total_seconds()) != timestep:
raise ValueError("supplied dates are not evenly spaced")

datasets = []
for i, ifn in enumerate(inputfns[0]):
if ifn is not None:
precip_, quality_, _ = importer(ifn, **kwargs)
precip.append(precip_)
quality.append(quality_)
timestamps.append(inputfns[1][i])
dataset_ = importer(ifn, **kwargs)
else:
precip.append(precip_ref * np.nan)
if quality_ref is not None:
quality.append(quality_ref * np.nan)
else:
quality.append(None)
timestamps.append(inputfns[1][i])

# Replace this with stack?
precip = np.concatenate([precip_[None, :, :] for precip_ in precip])
# TODO: Q should be organized as R, but this is not trivial as Q_ can be also None or a scalar
metadata["timestamps"] = np.array(timestamps)
dataset_ = dataset_ref * np.nan
dataset_ = dataset_.expand_dims(dim="time", axis=0)
dataset_ = dataset_.assign_coords(
time=(
"time",
[inputfns[1][i]],
{
"long_name": "forecast time",
"units": f"seconds since {startdate:%Y-%m-%d %H:%M:%S}",
"stepsize": timestep,
},
)
)
datasets.append(dataset_)

return precip, quality, metadata
dataset = xr.concat(datasets, dim="time")
return dataset
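In isolation, the stacking logic of ``read_timeseries`` reduces to giving each per-file dataset a time dimension and concatenating along it. A toy sketch of that pattern, with fabricated data in place of the importer output:

```python
import datetime

import numpy as np
import xarray as xr

timestep = 300  # seconds between fields (fabricated)
startdate = datetime.datetime(2024, 1, 1, 0, 0)
timestamps = [startdate + datetime.timedelta(seconds=i * timestep) for i in range(3)]

datasets = []
for t in timestamps:
    # Stand-in for the dataset a single importer call would return.
    ds = xr.Dataset({"precip_intensity": (("y", "x"), np.zeros((2, 2)))})
    # Give it a time dimension and coordinate, as read_timeseries does.
    ds = ds.expand_dims(dim="time", axis=0)
    ds = ds.assign_coords(
        time=(
            "time",
            [t],
            {
                "long_name": "forecast time",
                "units": f"seconds since {startdate:%Y-%m-%d %H:%M:%S}",
                "stepsize": timestep,
            },
        )
    )
    datasets.append(ds)

stacked = xr.concat(datasets, dim="time")
print(stacked["precip_intensity"].shape)  # (3, 2, 2)
```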
26 changes: 17 additions & 9 deletions pysteps/motion/constant.py
@@ -14,27 +14,32 @@

import numpy as np
import scipy.optimize as op
import xarray as xr
from scipy.ndimage import map_coordinates


def constant(R, **kwargs):
def constant(dataset: xr.Dataset, **kwargs):
"""
Compute a constant advection field by finding a translation vector that
maximizes the correlation between two successive images.

Parameters
----------
R: array_like
Array of shape (T,m,n) containing a sequence of T two-dimensional input
images of shape (m,n). If T > 2, two last elements along axis 0 are used.
dataset: xarray.Dataset
Input dataset as described in the documentation of
:py:mod:`pysteps.io.importers`. It has to contain a precipitation data variable.
The dataset has to have a time dimension. If the size of this dimension
is larger than 2, the last 2 entries of this dimension are used.

Returns
-------
out: array_like
The constant advection field having shape (2, m, n), where out[0, :, :]
contains the x-components of the motion vectors and out[1, :, :]
contains the y-components.
out: xarray.Dataset
The input dataset with the constant advection field added in the ``velocity_x``
and ``velocity_y`` data variables.
"""
dataset = dataset.copy(deep=True)
precip_var = dataset.attrs["precip_var"]
R = dataset[precip_var].values
m, n = R.shape[1:]
X, Y = np.meshgrid(np.arange(n), np.arange(m))

@@ -51,4 +56,7 @@ def f(v):
options = {"initial_simplex": (np.array([(0, 1), (1, 0), (1, 1)]))}
result = op.minimize(f, (1, 1), method="Nelder-Mead", options=options)

return np.stack([-result.x[0] * np.ones((m, n)), -result.x[1] * np.ones((m, n))])
output = np.stack([-result.x[0] * np.ones((m, n)), -result.x[1] * np.ones((m, n))])
dataset["velocity_x"] = (["y", "x"], output[0])
dataset["velocity_y"] = (["y", "x"], output[1])
return dataset
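Callers therefore read the motion field back out of the returned dataset rather than from a bare ``(2, m, n)`` array. A sketch of that consuming side (the dataset and field values below are fabricated):

```python
import numpy as np
import xarray as xr

# Fabricated stand-in for the dataset returned by constant():
# the motion field lives in the velocity_x / velocity_y data variables.
m, n = 4, 5
dataset = xr.Dataset(
    {
        "velocity_x": (("y", "x"), np.full((m, n), -1.0)),
        "velocity_y": (("y", "x"), np.full((m, n), 0.5)),
    }
)

# Reassemble the (2, m, n) array that pre-xarray callers expected.
velocity = np.stack([dataset["velocity_x"].values, dataset["velocity_y"].values])
print(velocity.shape)  # (2, 4, 5)
```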