diff --git a/generate_data/generate_zarr.py b/generate_data/generate_zarr.py index 7ad03ea8..2fb095e9 100644 --- a/generate_data/generate_zarr.py +++ b/generate_data/generate_zarr.py @@ -14,28 +14,38 @@ # TODO use more compressors from numcodecs and more blosc filter_ids def generate_zarr_format(compressors=['gzip', 'blosc', 'zlib', None]): - path = 'data/zarr.zr' - im = astronaut() - f = zarr.open(path, mode='w') - for compressor in compressors: - copts = COMPRESSION_OPTIONS.get(compressor, {}) - if compressor is None: - name = "raw" - elif compressor == "blosc": - name = "%s/%s" % (compressor, copts.get("cname")) - else: - name = compressor - compressor_impl = STR_TO_COMPRESSOR[compressor](**copts) if compressor is not None else None - f.create_dataset(name, data=im, chunks=CHUNKS, - compressor=compressor_impl) + for nested, StoreClass, store_kwargs in [ + (False, zarr.storage.DirectoryStore, {}), + (False, zarr.storage.FSStore, {}), + (True, zarr.storage.NestedDirectoryStore, {}), + (True, zarr.storage.FSStore, + {'key_separator': '/', 'auto_mkdir': True}), + ]: + + nested_str = '_nested' if nested else '_flat' + path = f'data/zarr_{StoreClass.__name__}{nested_str}.zr' + store = StoreClass(path, **store_kwargs) + im = astronaut() + + f = zarr.open(store, mode='w') + for compressor in compressors: + copts = COMPRESSION_OPTIONS.get(compressor, {}) + if compressor is None: + name = "raw" + elif compressor == "blosc": + name = "%s/%s" % (compressor, copts.get("cname")) + else: + name = compressor + compressor_impl = STR_TO_COMPRESSOR[compressor](**copts) if compressor is not None else None + f.create_dataset(name, data=im, chunks=CHUNKS, + compressor=compressor_impl) def generate_n5_format(compressors=['gzip', None]): - path = 'data/zarr.n5' im = astronaut() - f = zarr.open(path, mode='w') + f = zarr.open('data/zarr.n5', mode='w') for compressor in compressors: name = compressor if compressor is not None else 'raw' compressor_impl = STR_TO_COMPRESSOR[compressor]() if compressor is not None else None diff --git a/generate_data/generate_zarrita.py b/generate_data/generate_zarrita.py index f340d593..2765dea4 100644 --- a/generate_data/generate_zarrita.py +++ b/generate_data/generate_zarrita.py @@ -12,10 +12,16 @@ COMPRESSION_OPTIONS = {"blosc": {"cname": "lz4"}} -def generate_zr3_format(compressors=['gzip', 'blosc', 'zlib', None]): +def generate_zr3_format(compressors=['gzip', 'blosc', 'zlib', None], + nested=True): im = astronaut() - - h = zarrita.create_hierarchy('data/zarrita.zr3') + if nested: + chunk_separator = '/' + fname = 'data/zarrita_nested.zr3' + else: + chunk_separator = '.' + fname = 'data/zarrita.zr3' + h = zarrita.create_hierarchy(fname) for compressor in compressors: copts = COMPRESSION_OPTIONS.get(compressor, {}) if compressor is None: @@ -26,9 +32,11 @@ def generate_zr3_format(compressors=['gzip', 'blosc', 'zlib', None]): name = compressor compressor_impl = STR_TO_COMPRESSOR[compressor](**copts) if compressor is not None else None a = h.create_array('/' + name, shape=im.shape, chunk_shape=CHUNKS, - dtype=im.dtype, compressor=compressor_impl) + chunk_separator=chunk_separator, dtype=im.dtype, + compressor=compressor_impl) a[...] = im if __name__ == '__main__': - generate_zr3_format() + for nested in [False, True]: + generate_zr3_format(nested=nested) diff --git a/test/test_read_all.py b/test/test_read_all.py index cc07405b..a8baa2f3 100644 --- a/test/test_read_all.py +++ b/test/test_read_all.py @@ -12,6 +12,25 @@ The matrix of tests is automatically generated, and individual tests correctly fail on unavailable imports. Call ``pytest`` in the root directory to run all of the tests. + +The tests in this folder assume that the data generation scripts generate +filenames named as follows: + +{writing_library}.{fmt} + +where {fmt} is '.n5', '.zr' or '.zr3'. + +For writers where multiple store and/or file nesting formats are tested, the +following filenaming scheme is used in the generators: + +{writing_library}_{storage_class_name}_{nesting_type}.{fmt} + +'_{storage_class_name}' is optional and is currently used by the zarr-python +implementations to indicate which storage class was used to write the data. + +'_{nesting_type}' should be either '_nested' or '_flat' to indicate if a +nested or flat chunk storage scheme is used. + """ import os from typing import Dict, List @@ -46,26 +65,41 @@ } -def read_with_zarr(fpath, ds_name): +def read_with_zarr(fpath, ds_name, nested): import zarr if ds_name == "blosc": ds_name = "blosc/lz4" - return zarr.open(os.fspath(fpath))[ds_name][:] + if str(fpath).endswith('.zr'): + if nested: + if 'FSStore' in str(fpath): + store = zarr.storage.FSStore( + os.fspath(fpath), key_separator='/', mode='r' + ) + else: + store = zarr.storage.NestedDirectoryStore(os.fspath(fpath)) + else: + if 'FSStore' in str(fpath): + store = zarr.storage.FSStore(os.fspath(fpath)) + else: + store = zarr.storage.DirectoryStore(fpath) + else: + store = os.fspath(fpath) + return zarr.open(store)[ds_name][:] -def read_with_pyn5(fpath, ds_name): +def read_with_pyn5(fpath, ds_name, nested): import pyn5 return pyn5.File(fpath)[ds_name][:] -def read_with_z5py(fpath, ds_name): +def read_with_z5py(fpath, ds_name, nested): import z5py if ds_name == "blosc": ds_name = "blosc/lz4" return z5py.File(fpath)[ds_name][:] -def read_with_zarrita(fpath, ds_name): +def read_with_zarrita(fpath, ds_name, nested): import zarrita if ds_name == "blosc": ds_name = "blosc/lz4" @@ -85,6 +119,9 @@ def read_with_zarrita(fpath, ds_name): HERE = Path(__file__).resolve().parent DATA_DIR = HERE.parent / "data" +# Optional filename strings indicating a specific storage class was used +KNOWN_STORAGE_CLASSES = {"DirectoryStore", "FSStore", "NestedDirectoryStore"} + def libraries_for_format(format: str): return { @@ -102,12 +139,45 @@ def codecs_for_file(fpath: Path): return sorted(d.name for d in fpath.iterdir() if d.is_dir()) +def _get_write_attrs(file_stem: str): + """Parse a filename to determine the writing library name + + If present in the filename, the storage class and nesting type are also + determined. + """ + nested_str = "" + if "nested" in file_stem: + nested_str = "nested" + writing_library = file_stem.replace("_nested", "") + else: + writing_library = file_stem + + if "_flat" in file_stem: + writing_library = file_stem.replace("_flat", "") + + store_str = "" + for store_name in KNOWN_STORAGE_CLASSES: + _store_name = '_' + store_name + if _store_name in writing_library: + if store_str: + raise ValueError( + f"multiple store names in file_stem: {file_stem}" + ) + store_str = store_name + writing_library = writing_library.replace(_store_name, "") + + return writing_library, store_str, nested_str + + def create_params(): - argnames = ["fmt", "writing_library", "reading_library", "codec"] + argnames = ["fmt", "writing_library", "reading_library", "codec", "nested", + "store_name", "fpath"] params = [] ids = [] for fmt in EXTENSIONS: - for writing_library, fpath in libraries_for_format(fmt).items(): + for file_stem, fpath in libraries_for_format(fmt).items(): + writing_library, store_str, nested_str = _get_write_attrs(file_stem) + nested = nested_str == "nested" written_codecs = codecs_for_file(fpath) for reading_library, available_fmts in READABLE_CODECS.items(): available_codecs = available_fmts.get(fmt, []) @@ -115,10 +185,16 @@ def create_params(): set(available_codecs).intersection(written_codecs) ): params.append( - (fmt, writing_library, reading_library, codec) + (fmt, writing_library, reading_library, codec, nested, + store_str, fpath) ) + write_attrs = ', '.join( + [s for s in (store_str, nested_str) if s != ""] + ) + if write_attrs: + write_attrs = ' (' + write_attrs + ')' ids.append( - f"read {writing_library} {fmt} using " + f"read {writing_library}{write_attrs} {fmt} using " f"{reading_library}, {codec}" ) return argnames, params, ids @@ -127,22 +203,30 @@ def create_params(): argnames, params, ids = create_params() -def _get_read_fn(fmt, writing_library, reading_library): - fpath = DATA_DIR / f"{writing_library}{EXTENSIONS[fmt]}" +def _get_read_fn(reading_library): read_fn = { "zarr": read_with_zarr, "pyn5": read_with_pyn5, "z5py": read_with_z5py, "zarrita": read_with_zarrita, }[reading_library] - return fpath, read_fn + return read_fn @pytest.mark.parametrize(argnames, params, ids=ids) -def test_correct_read(fmt, writing_library, reading_library, codec): +def test_correct_read(fmt, writing_library, reading_library, codec, nested, + store_name, fpath): + if nested and reading_library == 'z5py': + pytest.skip("nested read not implemented in z5py") + reference = imread(DATA_DIR / "reference_image.png") - fpath, read_fn = _get_read_fn(fmt, writing_library, reading_library) - test = read_fn(fpath, codec) + read_fn = _get_read_fn(reading_library) + if not os.path.exists(fpath): + raise RuntimeError( + f"file not found: {fpath}. Make sure you have generated the data " + "using 'make data'" + ) + test = read_fn(fpath, codec, nested) assert test.shape == reference.shape assert np.allclose(test, reference) @@ -151,11 +235,12 @@ def tabulate_test_results(params, per_codec_tables=False): reference = imread(DATA_DIR / "reference_image.png") all_results = {} - for fmt, writing_library, reading_library, codec in params: - fpath, read_fn = _get_read_fn(fmt, writing_library, reading_library) + for (fmt, writing_library, reading_library, codec, nested, store_name, + fpath) in params: + read_fn = _get_read_fn(reading_library) fail_type = None try: - test = read_fn(fpath, codec) + test = read_fn(fpath, codec, nested) except Exception as e: fail_type = f"{type(e).__name__}: {e}" @@ -165,12 +250,17 @@ def tabulate_test_results(params, per_codec_tables=False): else: result = fail_type + nstr = 'nested' if nested else 'flat' if per_codec_tables: table_key = fmt, codec - inner_key = (writing_library, reading_library) + inner_key = (', '.join(writing_library, store_name, nstr), reading_library) else: table_key = fmt - inner_key = (', '.join((writing_library, codec)), reading_library) + if store_name: + key_attributes = (writing_library, codec, store_name, nstr) + else: + key_attributes = (writing_library, codec, nstr) + inner_key = (', '.join(key_attributes), reading_library) if table_key not in all_results: all_results[table_key] = {} @@ -226,7 +316,7 @@ def result_to_table(result, use_emojis=True, fmt='md'): df_val = pass_str if v else f'{fail_str}: mismatched' else: df_val = pass_str if v else fail_str - df.at[k[0], k[1]] = df_val\ + df.at[k[0], k[1]] = df_val if fmt == 'html': table = df.to_html()