Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Zarr V3 metadata fixes #248

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion virtualizarr/tests/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
chunks=(2, 2),
compressor=None,
filters=None,
fill_value=np.nan,
fill_value=0,
order="C",
),
chunkmanifest=manifest,
Expand Down
6 changes: 3 additions & 3 deletions virtualizarr/tests/test_manifests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_create_manifestarray(self):

def test_create_manifestarray_from_kerchunk_refs(self):
arr_refs = {
".zarray": '{"chunks":[2,3],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[2,3],"zarr_format":2}',
".zarray": '{"chunks":[2,3],"compressor":null,"dtype":"<i8","fill_value":0,"filters":null,"order":"C","shape":[2,3],"zarr_format":2}',
"0.0": ["test1.nc", 6144, 48],
}
marr = ManifestArray._from_kerchunk_refs(arr_refs)
Expand All @@ -46,13 +46,13 @@ def test_create_manifestarray_from_kerchunk_refs(self):
assert marr.chunks == (2, 3)
assert marr.dtype == np.dtype("int64")
assert marr.zarray.compressor is None
assert marr.zarray.fill_value is np.nan
assert marr.zarray.fill_value == 0
assert marr.zarray.filters is None
assert marr.zarray.order == "C"

def test_create_scalar_manifestarray_from_kerchunk_refs(self):
arr_refs = {
".zarray": '{"chunks":[],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[],"zarr_format":2}',
".zarray": '{"chunks":[],"compressor":null,"dtype":"<i8","fill_value":0,"filters":null,"order":"C","shape":[],"zarr_format":2}',
"0": ["test1.nc", 6144, 48],
}
marr = ManifestArray._from_kerchunk_refs(arr_refs)
Expand Down
2 changes: 1 addition & 1 deletion virtualizarr/tests/test_readers/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def test_dataset_from_df_refs():

assert da.data.zarray.compressor is None
assert da.data.zarray.filters is None
assert da.data.zarray.fill_value is np.nan
assert da.data.zarray.fill_value == 0
assert da.data.zarray.order == "C"

assert da.data.manifest.dict() == {
Expand Down
8 changes: 4 additions & 4 deletions virtualizarr/tests/test_writers/test_kerchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def test_accessor_to_kerchunk_dict(self):
chunks=(2, 3),
compressor=None,
filters=None,
fill_value=np.nan,
fill_value=0,
order="C",
),
)
Expand All @@ -30,7 +30,7 @@ def test_accessor_to_kerchunk_dict(self):
"refs": {
".zgroup": '{"zarr_format":2}',
".zattrs": "{}",
"a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
"a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":0,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
"a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
"a/0.0": ["test.nc", 6144, 48],
},
Expand All @@ -51,7 +51,7 @@ def test_accessor_to_kerchunk_json(self, tmp_path):
chunks=(2, 3),
compressor=None,
filters=None,
fill_value=np.nan,
fill_value=0,
order="C",
),
)
Expand All @@ -69,7 +69,7 @@ def test_accessor_to_kerchunk_json(self, tmp_path):
"refs": {
".zgroup": '{"zarr_format":2}',
".zattrs": "{}",
"a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
"a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":0,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
"a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
"a/0.0": ["test.nc", 6144, 48],
},
Expand Down
4 changes: 2 additions & 2 deletions virtualizarr/writers/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) ->
"configuration": {"chunk_shape": metadata.pop("chunks")},
}
metadata["chunk_key_encoding"] = {
"name": "default",
"configuration": {"separator": "/"},
"name": "v2",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems wrong? For writing v3 metadata?

In general if we're not planning to use this format any more (see #262 (comment)), how much of this PR do you want to keep @LDeakin ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Presumably all the rest of the fixes are still relevant?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems wrong? For writing v3 metadata?

The chunk manifest example in zarr-developers/zarr-specs#287 and virtualizarr produces "0.0" style chunk key encoding, which is v2 with . separator. default with / would be "c/0/0".

If the chunk key encoding of the array and the chunk manifest matches, then the chunk-manifest-json storage transformer does not need to concern itself with chunk key encodings, which makes sense to me.

In general if we're not planning to use this format any more (see #262 (comment)), how much of this PR do you want to keep @LDeakin ?

Not fussed, this PR was just the minimal changes I needed to use the chunk-manifest-json as currently spec'd and produced by virtualizarr. I'd hope most of these changes would be superseded by bringing in zarr-python V3 as a dependency anyway.

I haven't looked thoroughly at the spec for icechunk yet, but do you see it replacing chunk-manifest-json entirely? Can the time travel stuff be decoupled from the chunk manifests?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The chunk manifest example in zarr-developers/zarr-specs#287 and virtualizarr produces "0.0" style chunk key encoding, which is v2 with . separator. default with / would be "c/0/0".

My intention was to test out writing to and reading from a v3-compatible json-based chunk manifest spec. If what I actually did looks more like v2 then that's my bad for not understanding the spec properly!

Not fussed, this PR was just the minimal changes I needed to use the chunk-manifest-json as currently spec'd and produced by virtualizarr. I'd hope most of these changes would be superseded by bringing in zarr-python V3 as a dependency anyway.

Okay thanks. Maybe we get virtualizarr working fully, then look at the updated diff, as I would expect @mpiannucci's efforts on icechunk compatibility should iron out similar concerns around fill values?

I'd hope most of these changes would be superseded by bringing in zarr-python V3 as a dependency anyway.

👍 We're close to being able to do that now that zarr-python v3 alpha (beta today actually) is out.

I haven't looked thoroughly at the spec for icechunk yet, but do you see it replacing chunk-manifest-json entirely?

I think that is Earthmover's intention.

Can the time travel stuff be decoupled from the chunk manifests?

In theory it probably could, but in practice unless there is a strong use case for using chunk manifests where you wouldn't also like to have all the other features of icechunk, I'm not really sure why you would bother separating them. All the features of icechunk are closely related in that they all involve/require adding a new layer of indirection into the store, i.e. the manifests + snapshots (which are kind of like time-stamped consolidated metadata IIUC). This question deserves discussion on that zarr spec proposal issue though.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This question deserves discussion on that zarr spec proposal issue though.

I've asked in zarr-developers/zarr-specs#287 (comment)

"configuration": {"separator": "."},
}
metadata["codecs"] = zarray._v3_codec_pipeline()
metadata.pop("filters")
Expand Down
31 changes: 28 additions & 3 deletions virtualizarr/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,39 @@ def __post_init__(self) -> None:
if self.fill_value is None:
self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0)

# Handle non-finite fill values
if not isinstance(self.fill_value, list):
if self.fill_value is np.nan:
self.fill_value = "NaN"
elif self.fill_value is np.inf:
self.fill_value = "Infinity"
elif self.fill_value is -np.inf: # TODO: does this work?
self.fill_value = "-Infinity"
# TODO: Handle other data types (complex, etc.)

@property
def codec(self) -> Codec:
"""For comparison against other arrays."""
return Codec(compressor=self.compressor, filters=self.filters)

@classmethod
def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
dtype = np.dtype(decoded_arr_refs_zarray["dtype"])

# coerce type of fill_value as kerchunk can be inconsistent with this
fill_value = decoded_arr_refs_zarray["fill_value"]
if fill_value is None or fill_value == "NaN" or fill_value == "nan":
fill_value = np.nan
if dtype.kind == "f":
fill_value = np.nan
elif dtype.kind == "c":
fill_value = [np.nan, np.nan]
elif dtype.kind == "i":
fill_value = 0
else:
# TODO: Handle other data types
raise ValueError(
f"Fill value {fill_value} is not valid for dtype {dtype}"
)

compressor = decoded_arr_refs_zarray["compressor"]
zarr_format = int(decoded_arr_refs_zarray["zarr_format"])
Expand Down Expand Up @@ -185,8 +207,11 @@ def _v3_codec_pipeline(self) -> list:
# https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097
# "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec"
bytes = dict(
name="bytes", configuration={}
) # TODO need to handle endianess configuration
name="bytes",
configuration={
"endian": "little" # TODO need to handle endianess configuration, but little is a sensible default for now
},
)

# The order here is significant!
# [ArrayArray] -> ArrayBytes -> [BytesBytes]
Expand Down
Loading