Skip to content

Commit

Permalink
Merge pull request #33 from grlee77/add_nested_data
Browse files Browse the repository at this point in the history
Add nested data generation and tests (zarr-python and zarrita)
  • Loading branch information
joshmoore authored Apr 21, 2021
2 parents 6786ca2 + a0f1984 commit 7349e03
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 42 deletions.
42 changes: 26 additions & 16 deletions generate_data/generate_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,38 @@

# TODO use more compressors from numcodecs and more blosc filter_ids
def generate_zarr_format(compressors=['gzip', 'blosc', 'zlib', None]):
    """Write the reference image with zarr-python under several store types.

    One store tree is written per (store class, nesting) combination:
    DirectoryStore and FSStore for flat chunk layouts, and
    NestedDirectoryStore and FSStore (with ``key_separator='/'``) for nested
    layouts.  Within each store, one dataset is created per compressor name
    in *compressors* (``None`` becomes a dataset called ``"raw"``).

    Parameters
    ----------
    compressors : list of str or None
        Keys into ``STR_TO_COMPRESSOR``; ``None`` means no compression.
        The default list is never mutated, so the mutable default is safe.
    """
    # Loop-invariant: load the reference image once, not once per store.
    im = astronaut()

    for nested, StoreClass, store_kwargs in [
        (False, zarr.storage.DirectoryStore, {}),
        (False, zarr.storage.FSStore, {}),
        (True, zarr.storage.NestedDirectoryStore, {}),
        (True, zarr.storage.FSStore,
         {'key_separator': '/', 'auto_mkdir': True}),
    ]:
        # Encode the store class and nesting type in the path; the test
        # suite recovers them later by parsing this filename.
        nested_str = '_nested' if nested else '_flat'
        path = f'data/zarr_{StoreClass.__name__}{nested_str}.zr'
        store = StoreClass(path, **store_kwargs)

        f = zarr.open(store, mode='w')
        for compressor in compressors:
            copts = COMPRESSION_OPTIONS.get(compressor, {})
            if compressor is None:
                name = "raw"
            elif compressor == "blosc":
                # Include the blosc inner codec (e.g. 'lz4') in the name.
                name = "%s/%s" % (compressor, copts.get("cname"))
            else:
                name = compressor
            compressor_impl = STR_TO_COMPRESSOR[compressor](**copts) if compressor is not None else None
            f.create_dataset(name, data=im, chunks=CHUNKS,
                             compressor=compressor_impl)


def generate_n5_format(compressors=['gzip', None]):
path = 'data/zarr.n5'
im = astronaut()

f = zarr.open(path, mode='w')
f = zarr.open('data/zarr.n5', mode='w')
for compressor in compressors:
name = compressor if compressor is not None else 'raw'
compressor_impl = STR_TO_COMPRESSOR[compressor]() if compressor is not None else None
Expand Down
18 changes: 13 additions & 5 deletions generate_data/generate_zarrita.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,16 @@
COMPRESSION_OPTIONS = {"blosc": {"cname": "lz4"}}


def generate_zr3_format(compressors=['gzip', 'blosc', 'zlib', None]):
def generate_zr3_format(compressors=['gzip', 'blosc', 'zlib', None],
nested=True):
im = astronaut()

h = zarrita.create_hierarchy('data/zarrita.zr3')
if nested:
chunk_separator = '/'
fname = 'data/zarrita_nested.zr3'
else:
chunk_separator = '.'
fname = 'data/zarrita.zr3'
h = zarrita.create_hierarchy(fname)
for compressor in compressors:
copts = COMPRESSION_OPTIONS.get(compressor, {})
if compressor is None:
Expand All @@ -26,9 +32,11 @@ def generate_zr3_format(compressors=['gzip', 'blosc', 'zlib', None]):
name = compressor
compressor_impl = STR_TO_COMPRESSOR[compressor](**copts) if compressor is not None else None
a = h.create_array('/' + name, shape=im.shape, chunk_shape=CHUNKS,
dtype=im.dtype, compressor=compressor_impl)
chunk_separator=chunk_separator, dtype=im.dtype,
compressor=compressor_impl)
a[...] = im


if __name__ == '__main__':
    # Generate both chunk layouts so readers of either scheme can be tested.
    for nested in [False, True]:
        generate_zr3_format(nested=nested)
132 changes: 111 additions & 21 deletions test/test_read_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,25 @@
The matrix of tests is automatically generated,
and individual tests correctly fail on unavailable imports.
Call ``pytest`` in the root directory to run all of the tests.
The tests in this folder assume that the data generation scripts generate
filenames named as follows:
{writing_library}.{fmt}
where {fmt} is '.n5', '.zr' or '.zr3'.
For writers where multiple store and/or file nesting formats are tested, the
following filenaming scheme is used in the generators:
{writing_library}_{storage_class_name}_{nesting_type}.{fmt}
'_{storage_class_name}' is optional and is currently used by the zarr-python
implementations to indicate which storage class was used to write the data.
'_{nesting_type}' should be either '_nested' or '_flat' to indicate if a
nested or flat chunk storage scheme is used.
"""
import os
from typing import Dict, List
Expand Down Expand Up @@ -46,26 +65,41 @@
}


def read_with_zarr(fpath, ds_name, nested):
    """Read dataset *ds_name* from *fpath* using zarr-python.

    For '.zr' (zarr v2) paths, the store class is chosen to match how the
    data was written: an FSStore when 'FSStore' appears in the path, else a
    (Nested)DirectoryStore depending on *nested*.  Other paths (e.g. '.n5')
    are handed to ``zarr.open`` as plain path strings.
    """
    import zarr
    if ds_name == "blosc":
        # Blosc datasets are stored under their inner codec name.
        ds_name = "blosc/lz4"
    if str(fpath).endswith('.zr'):
        is_fsstore = 'FSStore' in str(fpath)
        if nested:
            if is_fsstore:
                # NOTE(review): mode='r' is only passed in this branch; the
                # flat FSStore branch opens with the default mode — confirm
                # whether that asymmetry is intentional.
                store = zarr.storage.FSStore(
                    os.fspath(fpath), key_separator='/', mode='r'
                )
            else:
                store = zarr.storage.NestedDirectoryStore(os.fspath(fpath))
        else:
            if is_fsstore:
                store = zarr.storage.FSStore(os.fspath(fpath))
            else:
                store = zarr.storage.DirectoryStore(fpath)
    else:
        store = os.fspath(fpath)
    return zarr.open(store)[ds_name][:]


def read_with_pyn5(fpath, ds_name, nested):
    """Read dataset *ds_name* from *fpath* using pyn5.

    *nested* is accepted only to keep a uniform reader signature; pyn5
    does not need it.
    """
    import pyn5
    return pyn5.File(fpath)[ds_name][:]


def read_with_z5py(fpath, ds_name, nested):
    """Read dataset *ds_name* from *fpath* using z5py.

    *nested* is accepted only to keep a uniform reader signature (nested
    reads are skipped for z5py in the test matrix).
    """
    import z5py
    if ds_name == "blosc":
        # Blosc datasets are stored under their inner codec name.
        ds_name = "blosc/lz4"
    return z5py.File(fpath)[ds_name][:]


def read_with_zarrita(fpath, ds_name):
def read_with_zarrita(fpath, ds_name, nested):
import zarrita
if ds_name == "blosc":
ds_name = "blosc/lz4"
Expand All @@ -85,6 +119,9 @@ def read_with_zarrita(fpath, ds_name):
HERE = Path(__file__).resolve().parent
DATA_DIR = HERE.parent / "data"

# Optional filename strings indicating a specific storage class was used
KNOWN_STORAGE_CLASSES = {"DirectoryStore", "FSStore", "NestedDirectoryStore"}


def libraries_for_format(format: str):
return {
Expand All @@ -102,23 +139,62 @@ def codecs_for_file(fpath: Path):
return sorted(d.name for d in fpath.iterdir() if d.is_dir())


def _get_write_attrs(file_stem: str):
"""Parse a filename to determine the writing library name
If present in the filename, the storage class and nesting type are also
determined.
"""
nested_str = ""
if "nested" in file_stem:
nested_str = "nested"
writing_library = file_stem.replace("_nested", "")
else:
writing_library = file_stem

if "_flat" in file_stem:
writing_library = file_stem.replace("_flat", "")

store_str = ""
for store_name in KNOWN_STORAGE_CLASSES:
_store_name = '_' + store_name
if _store_name in writing_library:
if store_str:
raise ValueError(
f"multiple store names in file_stem: {file_stem}"
)
store_str = store_name
writing_library = writing_library.replace(_store_name, "")

return writing_library, store_str, nested_str


def create_params():
    """Build the pytest parametrization matrix for the read tests.

    Returns
    -------
    (argnames, params, ids)
        ``argnames`` names the test arguments, ``params`` is one tuple per
        (writer file, reader, codec) combination, and ``ids`` are the
        human-readable test ids.
    """
    argnames = ["fmt", "writing_library", "reading_library", "codec", "nested",
                "store_name", "fpath"]
    params = []
    ids = []
    for fmt in EXTENSIONS:
        for file_stem, fpath in libraries_for_format(fmt).items():
            writing_library, store_str, nested_str = _get_write_attrs(file_stem)
            nested = nested_str == "nested"
            written_codecs = codecs_for_file(fpath)
            # Loop-invariant: the " (store, nesting)" id suffix does not
            # depend on the reader or codec, so build it once per file.
            write_attrs = ', '.join(
                [s for s in (store_str, nested_str) if s != ""]
            )
            if write_attrs:
                write_attrs = ' (' + write_attrs + ')'
            for reading_library, available_fmts in READABLE_CODECS.items():
                available_codecs = available_fmts.get(fmt, [])
                # Only test codecs both written to disk and readable here.
                for codec in sorted(
                    set(available_codecs).intersection(written_codecs)
                ):
                    params.append(
                        (fmt, writing_library, reading_library, codec, nested,
                         store_str, fpath)
                    )
                    ids.append(
                        f"read {writing_library}{write_attrs} {fmt} using "
                        f"{reading_library}, {codec}"
                    )
    return argnames, params, ids
Expand All @@ -127,22 +203,30 @@ def create_params():
argnames, params, ids = create_params()


def _get_read_fn(reading_library):
    """Return the reader function for *reading_library*.

    Raises
    ------
    KeyError
        If *reading_library* has no registered reader.
    """
    return {
        "zarr": read_with_zarr,
        "pyn5": read_with_pyn5,
        "z5py": read_with_z5py,
        "zarrita": read_with_zarrita,
    }[reading_library]


@pytest.mark.parametrize(argnames, params, ids=ids)
def test_correct_read(fmt, writing_library, reading_library, codec, nested,
                      store_name, fpath):
    """Read a generated dataset and compare it to the reference image."""
    if nested and reading_library == 'z5py':
        pytest.skip("nested read not implemented in z5py")

    reference = imread(DATA_DIR / "reference_image.png")
    read_fn = _get_read_fn(reading_library)
    if not os.path.exists(fpath):
        # Fail loudly with guidance rather than with an opaque read error.
        raise RuntimeError(
            f"file not found: {fpath}. Make sure you have generated the data "
            "using 'make data'"
        )
    test = read_fn(fpath, codec, nested)
    assert test.shape == reference.shape
    assert np.allclose(test, reference)

Expand All @@ -151,11 +235,12 @@ def tabulate_test_results(params, per_codec_tables=False):
reference = imread(DATA_DIR / "reference_image.png")

all_results = {}
for fmt, writing_library, reading_library, codec in params:
fpath, read_fn = _get_read_fn(fmt, writing_library, reading_library)
for (fmt, writing_library, reading_library, codec, nested, store_name,
fpath) in params:
read_fn = _get_read_fn(reading_library)
fail_type = None
try:
test = read_fn(fpath, codec)
test = read_fn(fpath, codec, nested)
except Exception as e:
fail_type = f"{type(e).__name__}: {e}"

Expand All @@ -165,12 +250,17 @@ def tabulate_test_results(params, per_codec_tables=False):
else:
result = fail_type

nstr = 'nested' if nested else 'flat'
if per_codec_tables:
table_key = fmt, codec
inner_key = (writing_library, reading_library)
inner_key = (', '.join(writing_library, store_name, nstr), reading_library)
else:
table_key = fmt
inner_key = (', '.join((writing_library, codec)), reading_library)
if store_name:
key_attributes = (writing_library, codec, store_name, nstr)
else:
key_attributes = (writing_library, codec, nstr)
inner_key = (', '.join(key_attributes), reading_library)

if table_key not in all_results:
all_results[table_key] = {}
Expand Down Expand Up @@ -226,7 +316,7 @@ def result_to_table(result, use_emojis=True, fmt='md'):
df_val = pass_str if v else f'{fail_str}: mismatched'
else:
df_val = pass_str if v else fail_str
df.at[k[0], k[1]] = df_val\
df.at[k[0], k[1]] = df_val

if fmt == 'html':
table = df.to_html()
Expand Down

0 comments on commit 7349e03

Please sign in to comment.