Skip to content

Commit

Permalink
Merge pull request #33 from grlee77/add_nested_data
Browse files Browse the repository at this point in the history
Add nested data generation and tests (zarr-python and zarrita)
  • Loading branch information
joshmoore authored Apr 21, 2021
2 parents 6786ca2 + a0f1984 commit 7349e03
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 42 deletions.
42 changes: 26 additions & 16 deletions generate_data/generate_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,38 @@

# TODO use more compressors from numcodecs and more blosc filter_ids
def generate_zarr_format(compressors=['gzip', 'blosc', 'zlib', None]):
    """Write the reference image with zarr-python under several store types.

    One store tree is written per (store class, nesting) combination:
    DirectoryStore and FSStore for flat chunk layouts, and
    NestedDirectoryStore and FSStore (with ``key_separator='/'``) for nested
    layouts.  Within each store, one dataset is created per compressor name
    in *compressors* (``None`` becomes a dataset called ``"raw"``).

    Parameters
    ----------
    compressors : list of str or None
        Keys into ``STR_TO_COMPRESSOR``; ``None`` means no compression.
        The default list is never mutated, so the mutable default is safe.
    """
    # Loop-invariant: load the reference image once, not once per store.
    im = astronaut()

    for nested, StoreClass, store_kwargs in [
        (False, zarr.storage.DirectoryStore, {}),
        (False, zarr.storage.FSStore, {}),
        (True, zarr.storage.NestedDirectoryStore, {}),
        (True, zarr.storage.FSStore,
         {'key_separator': '/', 'auto_mkdir': True}),
    ]:
        # Encode the store class and nesting type in the path; the test
        # suite recovers them later by parsing this filename.
        nested_str = '_nested' if nested else '_flat'
        path = f'data/zarr_{StoreClass.__name__}{nested_str}.zr'
        store = StoreClass(path, **store_kwargs)

        f = zarr.open(store, mode='w')
        for compressor in compressors:
            copts = COMPRESSION_OPTIONS.get(compressor, {})
            if compressor is None:
                name = "raw"
            elif compressor == "blosc":
                # Include the blosc inner codec (e.g. 'lz4') in the name.
                name = "%s/%s" % (compressor, copts.get("cname"))
            else:
                name = compressor
            compressor_impl = STR_TO_COMPRESSOR[compressor](**copts) if compressor is not None else None
            f.create_dataset(name, data=im, chunks=CHUNKS,
                             compressor=compressor_impl)


def generate_n5_format(compressors=['gzip', None]):
path = 'data/zarr.n5'
im = astronaut()

f = zarr.open(path, mode='w')
f = zarr.open('data/zarr.n5', mode='w')
for compressor in compressors:
name = compressor if compressor is not None else 'raw'
compressor_impl = STR_TO_COMPRESSOR[compressor]() if compressor is not None else None
Expand Down
18 changes: 13 additions & 5 deletions generate_data/generate_zarrita.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,16 @@
COMPRESSION_OPTIONS = {"blosc": {"cname": "lz4"}}


def generate_zr3_format(compressors=['gzip', 'blosc', 'zlib', None]):
def generate_zr3_format(compressors=['gzip', 'blosc', 'zlib', None],
nested=True):
im = astronaut()

h = zarrita.create_hierarchy('data/zarrita.zr3')
if nested:
chunk_separator = '/'
fname = 'data/zarrita_nested.zr3'
else:
chunk_separator = '.'
fname = 'data/zarrita.zr3'
h = zarrita.create_hierarchy(fname)
for compressor in compressors:
copts = COMPRESSION_OPTIONS.get(compressor, {})
if compressor is None:
Expand All @@ -26,9 +32,11 @@ def generate_zr3_format(compressors=['gzip', 'blosc', 'zlib', None]):
name = compressor
compressor_impl = STR_TO_COMPRESSOR[compressor](**copts) if compressor is not None else None
a = h.create_array('/' + name, shape=im.shape, chunk_shape=CHUNKS,
dtype=im.dtype, compressor=compressor_impl)
chunk_separator=chunk_separator, dtype=im.dtype,
compressor=compressor_impl)
a[...] = im


if __name__ == '__main__':
    # Generate both chunk layouts so readers of either scheme can be tested.
    for nested in [False, True]:
        generate_zr3_format(nested=nested)
132 changes: 111 additions & 21 deletions test/test_read_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,25 @@
The matrix of tests is automatically generated,
and individual tests correctly fail on unavailable imports.
Call ``pytest`` in the root directory to run all of the tests.
The tests in this folder assume that the data generation scripts generate
filenames named as follows:
{writing_library}.{fmt}
where {fmt} is '.n5', '.zr' or '.zr3'.
For writers where multiple store and/or file nesting formats are tested, the
following filenaming scheme is used in the generators:
{writing_library}_{storage_class_name}_{nesting_type}.{fmt}
'_{storage_class_name}' is optional and is currently used by the zarr-python
implementations to indicate which storage class was used to write the data.
'_{nesting_type}' should be either '_nested' or '_flat' to indicate if a
nested or flat chunk storage scheme is used.
"""
import os
from typing import Dict, List
Expand Down Expand Up @@ -46,26 +65,41 @@
}


def read_with_zarr(fpath, ds_name, nested):
    """Read dataset *ds_name* from *fpath* using zarr-python.

    For '.zr' (zarr v2) paths, the store class is chosen to match how the
    data was written: an FSStore when 'FSStore' appears in the path, else a
    (Nested)DirectoryStore depending on *nested*.  Other paths (e.g. '.n5')
    are handed to ``zarr.open`` as plain path strings.
    """
    import zarr
    if ds_name == "blosc":
        # Blosc datasets are stored under their inner codec name.
        ds_name = "blosc/lz4"
    if str(fpath).endswith('.zr'):
        is_fsstore = 'FSStore' in str(fpath)
        if nested:
            if is_fsstore:
                # NOTE(review): mode='r' is only passed in this branch; the
                # flat FSStore branch opens with the default mode — confirm
                # whether that asymmetry is intentional.
                store = zarr.storage.FSStore(
                    os.fspath(fpath), key_separator='/', mode='r'
                )
            else:
                store = zarr.storage.NestedDirectoryStore(os.fspath(fpath))
        else:
            if is_fsstore:
                store = zarr.storage.FSStore(os.fspath(fpath))
            else:
                store = zarr.storage.DirectoryStore(fpath)
    else:
        store = os.fspath(fpath)
    return zarr.open(store)[ds_name][:]


def read_with_pyn5(fpath, ds_name, nested):
    """Read dataset *ds_name* from *fpath* using pyn5.

    *nested* is accepted only to keep a uniform reader signature; pyn5
    does not need it.
    """
    import pyn5
    return pyn5.File(fpath)[ds_name][:]


def read_with_z5py(fpath, ds_name, nested):
    """Read dataset *ds_name* from *fpath* using z5py.

    *nested* is accepted only to keep a uniform reader signature (nested
    reads are skipped for z5py in the test matrix).
    """
    import z5py
    if ds_name == "blosc":
        # Blosc datasets are stored under their inner codec name.
        ds_name = "blosc/lz4"
    return z5py.File(fpath)[ds_name][:]


def read_with_zarrita(fpath, ds_name):
def read_with_zarrita(fpath, ds_name, nested):
import zarrita
if ds_name == "blosc":
ds_name = "blosc/lz4"
Expand All @@ -85,6 +119,9 @@ def read_with_zarrita(fpath, ds_name):
HERE = Path(__file__).resolve().parent
DATA_DIR = HERE.parent / "data"

# Optional filename strings indicating a specific storage class was used
KNOWN_STORAGE_CLASSES = {"DirectoryStore", "FSStore", "NestedDirectoryStore"}


def libraries_for_format(format: str):
return {
Expand All @@ -102,23 +139,62 @@ def codecs_for_file(fpath: Path):
return sorted(d.name for d in fpath.iterdir() if d.is_dir())


def _get_write_attrs(file_stem: str):
"""Parse a filename to determine the writing library name
If present in the filename, the storage class and nesting type are also
determined.
"""
nested_str = ""
if "nested" in file_stem:
nested_str = "nested"
writing_library = file_stem.replace("_nested", "")
else:
writing_library = file_stem

if "_flat" in file_stem:
writing_library = file_stem.replace("_flat", "")

store_str = ""
for store_name in KNOWN_STORAGE_CLASSES:
_store_name = '_' + store_name
if _store_name in writing_library:
if store_str:
raise ValueError(
f"multiple store names in file_stem: {file_stem}"
)
store_str = store_name
writing_library = writing_library.replace(_store_name, "")

return writing_library, store_str, nested_str


def create_params():
    """Build the pytest parametrization matrix for the read tests.

    Returns
    -------
    (argnames, params, ids)
        ``argnames`` names the test arguments, ``params`` is one tuple per
        (writer file, reader, codec) combination, and ``ids`` are the
        human-readable test ids.
    """
    argnames = ["fmt", "writing_library", "reading_library", "codec", "nested",
                "store_name", "fpath"]
    params = []
    ids = []
    for fmt in EXTENSIONS:
        for file_stem, fpath in libraries_for_format(fmt).items():
            writing_library, store_str, nested_str = _get_write_attrs(file_stem)
            nested = nested_str == "nested"
            written_codecs = codecs_for_file(fpath)
            # Loop-invariant: the " (store, nesting)" id suffix does not
            # depend on the reader or codec, so build it once per file.
            write_attrs = ', '.join(
                [s for s in (store_str, nested_str) if s != ""]
            )
            if write_attrs:
                write_attrs = ' (' + write_attrs + ')'
            for reading_library, available_fmts in READABLE_CODECS.items():
                available_codecs = available_fmts.get(fmt, [])
                # Only test codecs both written to disk and readable here.
                for codec in sorted(
                    set(available_codecs).intersection(written_codecs)
                ):
                    params.append(
                        (fmt, writing_library, reading_library, codec, nested,
                         store_str, fpath)
                    )
                    ids.append(
                        f"read {writing_library}{write_attrs} {fmt} using "
                        f"{reading_library}, {codec}"
                    )
    return argnames, params, ids
Expand All @@ -127,22 +203,30 @@ def create_params():
argnames, params, ids = create_params()


def _get_read_fn(reading_library):
    """Return the reader function for *reading_library*.

    Raises
    ------
    KeyError
        If *reading_library* has no registered reader.
    """
    return {
        "zarr": read_with_zarr,
        "pyn5": read_with_pyn5,
        "z5py": read_with_z5py,
        "zarrita": read_with_zarrita,
    }[reading_library]


@pytest.mark.parametrize(argnames, params, ids=ids)
def test_correct_read(fmt, writing_library, reading_library, codec, nested,
                      store_name, fpath):
    """Read a generated dataset and compare it to the reference image."""
    if nested and reading_library == 'z5py':
        pytest.skip("nested read not implemented in z5py")

    reference = imread(DATA_DIR / "reference_image.png")
    read_fn = _get_read_fn(reading_library)
    if not os.path.exists(fpath):
        # Fail loudly with guidance rather than with an opaque read error.
        raise RuntimeError(
            f"file not found: {fpath}. Make sure you have generated the data "
            "using 'make data'"
        )
    test = read_fn(fpath, codec, nested)
    assert test.shape == reference.shape
    assert np.allclose(test, reference)

Expand All @@ -151,11 +235,12 @@ def tabulate_test_results(params, per_codec_tables=False):
reference = imread(DATA_DIR / "reference_image.png")

all_results = {}
for fmt, writing_library, reading_library, codec in params:
fpath, read_fn = _get_read_fn(fmt, writing_library, reading_library)
for (fmt, writing_library, reading_library, codec, nested, store_name,
fpath) in params:
read_fn = _get_read_fn(reading_library)
fail_type = None
try:
test = read_fn(fpath, codec)
test = read_fn(fpath, codec, nested)
except Exception as e:
fail_type = f"{type(e).__name__}: {e}"

Expand All @@ -165,12 +250,17 @@ def tabulate_test_results(params, per_codec_tables=False):
else:
result = fail_type

nstr = 'nested' if nested else 'flat'
if per_codec_tables:
table_key = fmt, codec
inner_key = (writing_library, reading_library)
inner_key = (', '.join(writing_library, store_name, nstr), reading_library)
else:
table_key = fmt
inner_key = (', '.join((writing_library, codec)), reading_library)
if store_name:
key_attributes = (writing_library, codec, store_name, nstr)
else:
key_attributes = (writing_library, codec, nstr)
inner_key = (', '.join(key_attributes), reading_library)

if table_key not in all_results:
all_results[table_key] = {}
Expand Down Expand Up @@ -226,7 +316,7 @@ def result_to_table(result, use_emojis=True, fmt='md'):
df_val = pass_str if v else f'{fail_str}: mismatched'
else:
df_val = pass_str if v else fail_str
df.at[k[0], k[1]] = df_val\
df.at[k[0], k[1]] = df_val

if fmt == 'html':
table = df.to_html()
Expand Down

0 comments on commit 7349e03

Please sign in to comment.