diff --git a/CHANGELOG.md b/CHANGELOG.md index 14fbe571..72215fb4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ Keep it human-readable, your future self will thank you! ## [Unreleased](https://github.com/ecmwf/anemoi-datasets/compare/0.5.5...HEAD) +- Update documentation + ## [0.5.5](https://github.com/ecmwf/anemoi-datasets/compare/0.5.4...0.5.5) - 2024-10-04 ### Changed diff --git a/docs/building/filters/rename.rst b/docs/building/filters/rename.rst index 4af23be5..2b9edada 100644 --- a/docs/building/filters/rename.rst +++ b/docs/building/filters/rename.rst @@ -2,5 +2,19 @@ rename ######## +When combining several sources, it is common to have different values +for a given attribute to represents the same concept. For example, +``temperature_850hPa`` and ``t_850`` are two different ways to represent +the temperature at 850 hPa. The ``rename`` filter allows to rename a key +to another key. It is a :ref:`filter ` that needs to follow a +:ref:`source ` or another filter in a :ref:`building-pipe` +operation. + .. literalinclude:: yaml/rename.yaml :language: yaml + +.. note:: + + The ``rename`` filter was mostly designed to rename the ``param`` + attribute, but any key can be renamed. The ``rename`` filter can take + several renaming keys. diff --git a/docs/building/filters/yaml/rename.yaml b/docs/building/filters/yaml/rename.yaml index bcfd4f6f..137448e9 100644 --- a/docs/building/filters/yaml/rename.yaml +++ b/docs/building/filters/yaml/rename.yaml @@ -1,4 +1,11 @@ -rename: - param: - from: "old" - to: "new" +input: + pipe: + - source: # mars, grib, netcdf, etc. + # source attributes here + # ... + + - rename: + param: # Map old `param` names to new ones + temperature_2m: 2t + temperature_850hPa: t_850 + # ... diff --git a/src/anemoi/datasets/create/functions/sources/xarray/coordinates.py b/src/anemoi/datasets/create/functions/sources/xarray/coordinates.py index c5543d51..56bf8fe6 100644 --- a/src/anemoi/datasets/create/functions/sources/xarray/coordinates.py +++ b/src/anemoi/datasets/create/functions/sources/xarray/coordinates.py @@ -30,6 +30,8 @@ def extract_single_value(variable): if np.issubdtype(variable.values.dtype, np.datetime64): if len(shape) == 0: return to_datetime(variable.values) # Convert to python datetime + if shape == (1,): + return to_datetime(variable.values[0]) assert False, (shape, variable.values[:2]) if np.issubdtype(variable.values.dtype, np.timedelta64): diff --git a/src/anemoi/datasets/create/functions/sources/xarray/flavour.py b/src/anemoi/datasets/create/functions/sources/xarray/flavour.py index 1bdd5d15..638bf296 100644 --- a/src/anemoi/datasets/create/functions/sources/xarray/flavour.py +++ b/src/anemoi/datasets/create/functions/sources/xarray/flavour.py @@ -228,20 +228,15 @@ def _x_y_provided(self, x, y, variable): x = x[0] y = y[0] - _, unstructured = self._check_dims(variable, x, y) + dim_vars, unstructured = self._check_dims(variable, x, y) - if x.variable.dims != y.variable.dims: - raise ValueError(f"Dimensions do not match {x.name}{x.variable.dims} != {y.name}{y.variable.dims}") - - if (x.name, y.name) in self._cache: - return self._cache[(x.name, y.name)] - - if (x.name, y.name) in self._cache: - return self._cache[(x.name, y.name)] - - assert len(x.variable.shape) == len(y.variable.shape), (x.variable.shape, y.variable.shape) + if (x.name, y.name, dim_vars) in self._cache: + return self._cache[(x.name, y.name, dim_vars)] grid_mapping = variable.attrs.get("grid_mapping", None) + if grid_mapping is not None: + print(f"grid_mapping: {grid_mapping}") + print(self.ds[grid_mapping]) if grid_mapping is None: LOG.warning(f"No 'grid_mapping' attribute provided for '{variable.name}'") @@ -288,11 +283,19 @@ def _x_y_provided(self, x, y, variable): grid_mapping = self.ds.attrs["crs"] LOG.warning(f"Using CRS {grid_mapping} from global attributes") + grid = None if grid_mapping is not None: + + grid_mapping = dict(self.ds[grid_mapping].attrs) + if unstructured: - return UnstructuredProjectionGrid(x, y, grid_mapping) + grid = UnstructuredProjectionGrid(x, y, grid_mapping) else: - return MeshProjectionGrid(x, y, grid_mapping) + grid = MeshProjectionGrid(x, y, grid_mapping) + + if grid is not None: + self._cache[(x.name, y.name, dim_vars)] = grid + return grid LOG.error("Could not fine a candidate for 'grid_mapping'") raise NotImplementedError(f"Unstructured grid {x.name} {y.name}") @@ -340,12 +343,14 @@ def _is_time(self, c, *, axis, name, long_name, standard_name, units): if standard_name == "time": return TimeCoordinate(c) - if name == "time": + # That is the output of `cfgrib` for forecasts + if name == "time" and standard_name != "forecast_reference_time": return TimeCoordinate(c) def _is_date(self, c, *, axis, name, long_name, standard_name, units): if standard_name == "forecast_reference_time": return DateCoordinate(c) + if name == "forecast_reference_time": return DateCoordinate(c) diff --git a/src/anemoi/datasets/create/functions/sources/xarray/grid.py b/src/anemoi/datasets/create/functions/sources/xarray/grid.py index 36131640..1c639992 100644 --- a/src/anemoi/datasets/create/functions/sources/xarray/grid.py +++ b/src/anemoi/datasets/create/functions/sources/xarray/grid.py @@ -100,6 +100,7 @@ def transformer(self): data_crs = CRS.from_cf(self.projection) else: data_crs = self.projection + wgs84_crs = CRS.from_epsg(4326) # WGS84 return Transformer.from_crs(data_crs, wgs84_crs, always_xy=True) diff --git a/src/anemoi/datasets/create/functions/sources/xarray/time.py b/src/anemoi/datasets/create/functions/sources/xarray/time.py index ac48013e..281e6975 100644 --- a/src/anemoi/datasets/create/functions/sources/xarray/time.py +++ b/src/anemoi/datasets/create/functions/sources/xarray/time.py @@ -9,9 +9,12 @@ import datetime +import logging from anemoi.utils.dates import as_datetime +LOG = logging.getLogger(__name__) + class Time: @@ -36,7 +39,28 @@ def from_coordinates(cls, coordinates): if len(date_coordinate) == 1 and len(time_coordinate) == 0 and len(step_coordinate) == 1: return ForecastFromBaseTimeAndDate(date_coordinate[0], step_coordinate[0]) - raise NotImplementedError(f"{date_coordinate=} {time_coordinate=} {step_coordinate=}") + if len(date_coordinate) == 1 and len(time_coordinate) == 1 and len(step_coordinate) == 1: + return ForecastFromValidTimeAndStep(time_coordinate[0], step_coordinate[0], date_coordinate[0]) + + LOG.error("") + LOG.error(f"{len(date_coordinate)} date_coordinate") + for c in date_coordinate: + LOG.error(" %s %s %s %s", c, c.is_date, c.is_time, c.is_step) + # LOG.error(' %s', c.variable) + + LOG.error("") + LOG.error(f"{len(time_coordinate)} time_coordinate") + for c in time_coordinate: + LOG.error(" %s %s %s %s", c, c.is_date, c.is_time, c.is_step) + # LOG.error(' %s', c.variable) + + LOG.error("") + LOG.error(f"{len(step_coordinate)} step_coordinate") + for c in step_coordinate: + LOG.error(" %s %s %s %s", c, c.is_date, c.is_time, c.is_step) + # LOG.error(' %s', c.variable) + + raise NotImplementedError(f"{len(date_coordinate)=} {len(time_coordinate)=} {len(step_coordinate)=}") class Constant(Time): @@ -62,9 +86,10 @@ def fill_time_metadata(self, coords_values, metadata): class ForecastFromValidTimeAndStep(Time): - def __init__(self, time_coordinate, step_coordinate): + def __init__(self, time_coordinate, step_coordinate, date_coordinate=None): self.time_coordinate_name = time_coordinate.variable.name self.step_coordinate_name = step_coordinate.variable.name + self.date_coordinate_name = date_coordinate.variable.name if date_coordinate else None def fill_time_metadata(self, coords_values, metadata): valid_datetime = coords_values[self.time_coordinate_name] @@ -79,6 +104,16 @@ def fill_time_metadata(self, coords_values, metadata): metadata["date"] = as_datetime(base_datetime).strftime("%Y%m%d") metadata["time"] = as_datetime(base_datetime).strftime("%H%M") metadata["step"] = int(hours) + + # When date is present, it should be compatible with time and step + + if self.date_coordinate_name is not None: + # Not sure that this is the correct assumption + assert coords_values[self.date_coordinate_name] == base_datetime, ( + coords_values[self.date_coordinate_name], + base_datetime, + ) + return valid_datetime diff --git a/src/anemoi/datasets/testing.py b/src/anemoi/datasets/testing.py new file mode 100644 index 00000000..6ed19ce8 --- /dev/null +++ b/src/anemoi/datasets/testing.py @@ -0,0 +1,59 @@ +# (C) Copyright 2024 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# + +# A collection of functions to support pytest testing + +import logging + +LOG = logging.getLogger(__name__) + + +def assert_field_list(fs, size=None, start=None, end=None, constant=False, skip=None): + import numpy as np + + if size is None: + assert len(fs) > 0, fs + else: + assert len(fs) == size, (len(fs), size) + + first = fs[0] + last = fs[-1] + + if constant: + # TODO: add a check for constant fields + pass + else: + assert start is None or first.metadata("valid_datetime") == start, (first.metadata("valid_datetime"), start) + assert end is None or last.metadata("valid_datetime") == end, (last.metadata("valid_datetime"), end) + print(first.datetime()) + + print(last.metadata()) + + first = first + latitudes, longitudes = first.grid_points() + + assert len(latitudes.shape) == 1, latitudes.shape + assert len(longitudes.shape) == 1, longitudes.shape + + assert len(latitudes) == len(longitudes), (len(latitudes), len(longitudes)) + data = first.to_numpy(flatten=True) + + assert len(data) == len(latitudes), (len(data), len(latitudes)) + + north = np.max(latitudes) + south = np.min(latitudes) + east = np.max(longitudes) + west = np.min(longitudes) + + assert north >= south, (north, south) + assert east >= west, (east, west) + assert north <= 90, north + assert south >= -90, south + assert east <= 360, east + assert west >= -180, west diff --git a/tests/xarray/test_netcdf.py b/tests/xarray/test_netcdf.py index 18c49365..0a74e4da 100644 --- a/tests/xarray/test_netcdf.py +++ b/tests/xarray/test_netcdf.py @@ -18,16 +18,22 @@ "https://get.ecmwf.int/repository/test-data/metview/gallery/era5_2000_aug.nc": dict(length=3), "https://get.ecmwf.int/repository/test-data/metview/gallery/era5_ozone_1999.nc": dict(length=4), "https://get.ecmwf.int/repository/test-data/earthkit-data/test-data/fa_ta850.nc": dict(length=37), - # 'https://get.ecmwf.int/repository/test-data/earthkit-data/test-data/htessel_points.nc': dict(length=1), + "https://get.ecmwf.int/repository/test-data/earthkit-data/test-data/htessel_points.nc": dict(length=1), "https://get.ecmwf.int/repository/test-data/earthkit-data/test-data/test_single.nc": dict(length=1), - # 'https://get.ecmwf.int/repository/test-data/earthkit-data/test-data/zgrid_rhgmet_metop_200701_R_2305_0010.nc': dict(length=1), - # 'https://get.ecmwf.int/repository/test-data/earthkit-data/test-data/20210101-C3S-L2_GHG-GHG_PRODUCTS-TANSO2-GOSAT2-SRFP-DAILY-v2.0.0.nc': dict(length=1), + "https://get.ecmwf.int/repository/test-data/earthkit-data/test-data/zgrid_rhgmet_metop_200701_R_2305_0010.nc": dict( + length=1 + ), + "https://get.ecmwf.int/repository/test-data/earthkit-data/test-data/20210101-C3S-L2_GHG-GHG_PRODUCTS-TANSO2-GOSAT2-SRFP-DAILY-v2.0.0.nc": dict( + length=1 + ), "https://get.ecmwf.int/repository/test-data/earthkit-data/test-data/20220401-C3S-L3S_FIRE-BA-OLCI-AREA_3-fv1.1.nc": dict( length=3 ), - # 'https://github.com/ecmwf/magics-test/raw/master/test/efas/tamir.nc': dict(length=1), - # 'https://github.com/ecmwf/magics-test/raw/master/test/gallery/C3S_OZONE-L4-TC-ASSIM_MSR-201608-fv0020.nc': dict(length=1), - # 'https://github.com/ecmwf/magics-test/raw/master/test/gallery/avg_data.nc': dict(length=1), + "https://github.com/ecmwf/magics-test/raw/master/test/efas/tamir.nc": dict(length=1), + "https://github.com/ecmwf/magics-test/raw/master/test/gallery/C3S_OZONE-L4-TC-ASSIM_MSR-201608-fv0020.nc": dict( + length=1 + ), + "https://github.com/ecmwf/magics-test/raw/master/test/gallery/avg_data.nc": dict(length=1), "https://github.com/ecmwf/magics-test/raw/master/test/gallery/era5_2000_aug_1.nc": dict(length=3), "https://github.com/ecmwf/magics-test/raw/master/test/gallery/missing.nc": dict(length=20), "https://github.com/ecmwf/magics-test/raw/master/test/gallery/netcdf3_t_z.nc": dict(length=30), diff --git a/tests/xarray/test_opendap.py b/tests/xarray/test_opendap.py index 6c544b70..5844c72e 100644 --- a/tests/xarray/test_opendap.py +++ b/tests/xarray/test_opendap.py @@ -7,6 +7,7 @@ import xarray as xr from anemoi.datasets.create.functions.sources.xarray import XarrayFieldList +from anemoi.datasets.testing import assert_field_list def test_opendap(): @@ -16,10 +17,7 @@ def test_opendap(): ) fs = XarrayFieldList.from_xarray(ds) - - assert len(fs) == 79529 - - print(fs[0].datetime()) + assert_field_list(fs, 79529, "2023-01-01T00:00:00", "2023-01-03T18:00:00") if __name__ == "__main__": diff --git a/tests/xarray/test_samples.py b/tests/xarray/test_samples.py new file mode 100644 index 00000000..437d8e51 --- /dev/null +++ b/tests/xarray/test_samples.py @@ -0,0 +1,65 @@ +# (C) Copyright 2024 European Centre for Medium-Range Weather Forecasts. +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +import pytest +import requests +import xarray as xr + +from anemoi.datasets.create.functions.sources.xarray import XarrayFieldList +from anemoi.datasets.testing import assert_field_list + +URL = "https://object-store.os-api.cci1.ecmwf.int/ml-tests/test-data/samples/" + +SAMPLES = list(range(23)) +SKIP = [0, 1, 2, 3, 4] + + +def _test_samples(n, check_skip=True): + + r = requests.get(f"{URL}sample-{n:04d}.json") + if r.status_code not in [200, 404]: + r.raise_for_status() + + if r.status_code == 404: + kwargs = {} + else: + kwargs = r.json() + + skip = kwargs.pop("skip", None) + if skip and check_skip: + pytest.skip(skip if isinstance(skip, str) else "Skipping test") + + open_zarr_kwargs = kwargs.pop("open_zarr_kwargs", {}) + + ds = xr.open_zarr(f"{URL}sample-{n:04d}.zarr", consolidated=True, **open_zarr_kwargs) + + print(ds) + + from_xarray_kwargs = kwargs.pop("from_xarray_kwargs", {}) + + fs = XarrayFieldList.from_xarray(ds, **from_xarray_kwargs) + + assert_field_list(fs, **kwargs) + + +@pytest.mark.parametrize("n", SAMPLES) +def test_samples(n): + _test_samples(n) + + +if __name__ == "__main__": + import sys + + if len(sys.argv) > 1: + for n in sys.argv[1:]: + _test_samples(int(n), check_skip=False) + else: + for s in SAMPLES: + if s in SKIP: + continue + print("-------------------", s) + _test_samples(s, check_skip=False) diff --git a/tests/xarray/test_zarr.py b/tests/xarray/test_zarr.py index 24da2072..c34a188f 100644 --- a/tests/xarray/test_zarr.py +++ b/tests/xarray/test_zarr.py @@ -8,30 +8,7 @@ import xarray as xr from anemoi.datasets.create.functions.sources.xarray import XarrayFieldList - - -def _check(fs, size, start, end): - assert len(fs) == size - - first = fs[0] - last = fs[-1] - - assert first.metadata("valid_datetime") == start, (first.metadata("valid_datetime"), start) - assert last.metadata("valid_datetime") == end, (last.metadata("valid_datetime"), end) - - print(first.datetime()) - print(last.metadata()) - - first = first - latitudes, longitudes = first.grid_points() - - assert len(latitudes.shape) == 1, latitudes.shape - assert len(longitudes.shape) == 1, longitudes.shape - - assert len(latitudes) == len(longitudes), (len(latitudes), len(longitudes)) - data = first.to_numpy(flatten=True) - - assert len(data) == len(latitudes), (len(data), len(latitudes)) +from anemoi.datasets.testing import assert_field_list def test_arco_era5_1(): @@ -43,7 +20,7 @@ def test_arco_era5_1(): ) fs = XarrayFieldList.from_xarray(ds) - _check( + assert_field_list( fs, 128677526, "1959-01-01T00:00:00", @@ -60,7 +37,7 @@ def test_arco_era5_2(): ) fs = XarrayFieldList.from_xarray(ds) - _check( + assert_field_list( fs, 128677526, "1959-01-01T00:00:00", @@ -86,7 +63,7 @@ def test_weatherbench(): fs = XarrayFieldList.from_xarray(ds, flavour) - _check( + assert_field_list( fs, 2430240, "2020-01-01T06:00:00", diff --git a/tools/make-sample-dataset.py b/tools/make-sample-dataset.py new file mode 100755 index 00000000..85cab7ee --- /dev/null +++ b/tools/make-sample-dataset.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + + +import argparse +import os +import shutil + +import xarray as xr + +parser = argparse.ArgumentParser(description="Create a sample dataset") +parser.add_argument("input", type=str, help="Input file name") +parser.add_argument("output", type=str, help="Output file name") +args = parser.parse_args() + +if os.path.exists(args.output): + if os.path.isdir(args.output): + shutil.rmtree(args.output) + else: + os.unlink(args.output) + +if args.input.endswith(".zarr"): + ds = xr.open_zarr(args.input) +else: + ds = xr.open_dataset(args.input) + +if args.output.endswith(".zarr"): + ds.to_zarr(args.output, consolidated=True) +else: + ds.to_netcdf(args.output)