From 38278c8399564e2eeafaa5f44ecabba23df99ecd Mon Sep 17 00:00:00 2001
From: John Readey
Date: Fri, 1 Dec 2023 09:19:13 +0800
Subject: [PATCH] Custom filter support (#159)

* add support for bitshuffle

* bump version
---
 h5pyd/_apps/utillib.py | 47 +++++++++++++++++++++++++++++++++++++-----
 h5pyd/_hl/filters.py   | 19 +++++++++++++++--
 h5pyd/version.py       |  6 +++---
 setup.py               |  4 ++--
 4 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/h5pyd/_apps/utillib.py b/h5pyd/_apps/utillib.py
index d03bd1d..860ce25 100755
--- a/h5pyd/_apps/utillib.py
+++ b/h5pyd/_apps/utillib.py
@@ -28,6 +28,13 @@
 MIN_CHUNK_SIZE = 1 * 1024 * 1024
 MAC_CHUNK_SIZE = 8 * 1024 * 1024
 
+H5Z_FILTER_MAP = { 32001: "blosclz",
+                   32004: "lz4",
+                   32008: "bitshuffle",
+                   32015: "zstd",
+}
+
+
 # check if hdf5 library version supports chunk iteration
 hdf_library_version = h5py.version.hdf5_version_tuple
 library_has_chunk_iter = (hdf_library_version >= (1, 14, 0) or (hdf_library_version < (1, 12, 0) and (hdf_library_version >= (1, 10, 10))))
@@ -761,8 +768,8 @@ def create_chunktable(dset, dset_dims, ctx):
                 chunk_key += str(index[dim] // chunk_dims[dim])
                 if dim < rank - 1:
                     chunk_key += "_"
-                logging.debug(f"adding chunk_key: {chunk_key}")
-                chunk_map[chunk_key] = (chunk_info.byte_offset, chunk_info.size)
+            logging.debug(f"adding chunk_key: {chunk_key}")
+            chunk_map[chunk_key] = (chunk_info.byte_offset, chunk_info.size)
 
         chunks["class"] = "H5D_CHUNKED_REF"
         if not extend:
@@ -1121,6 +1128,7 @@ def create_dataset(dobj, ctx):
             # or vlen
             pass
         else:
+            logging.debug(f"filter setup for {dobj.name}")
            if not ctx["ignorefilters"]:
                 kwargs["compression"] = dobj.compression
                 kwargs["compression_opts"] = dobj.compression_opts
@@ -1134,7 +1142,7 @@ def create_dataset(dobj, ctx):
             # TBD: it would be better if HSDS could let us know what filters
             # are supported (like it does with compressors)
             # For now, just hard-code fletcher32 and scaleoffset to be ignored
             if dobj.fletcher32:
                 msg = f"fletcher32 filter used by dataset: {dobj.name} is not "
                 msg += "supported by HSDS, this filter will not be used"
                 logging.warning(msg)
@@ -1144,7 +1152,35 @@ def create_dataset(dobj, ctx):
                 msg = f"scaleoffset filter used by dataset: {dobj.name} is not "
                 msg += "supported by HSDS, this filter will not be used"
                 logging.warning(msg)
-                # kwargs["scaleoffset"] = dobj.scaleoffset
+
+        if is_h5py(dobj) and not kwargs.get("compression"):
+            # apply any custom filters as long as they are supported in HSDS
+            for filter_id in dobj._filters:
+                filter_opts = dobj._filters[filter_id]
+                try:
+                    filter_id = int(filter_id)
+                except ValueError:
+                    msg = f"unrecognized filter id: {filter_id} for {dobj.name}, ignoring"
+                    logging.warning(msg)
+
+                if not isinstance(filter_id, int):
+                    continue
+
+                if filter_id in H5Z_FILTER_MAP:
+                    filter_name = H5Z_FILTER_MAP[filter_id]
+                    if filter_name == "bitshuffle":
+                        kwargs["shuffle"] = filter_name
+                        logging.info(f"using bitshuffle on {dobj.name}")
+                    else:
+                        # supported non-standard compressor
+                        kwargs["compression"] = filter_name
+                        logging.info(f"using compressor: {filter_name} for {dobj.name}")
+                        kwargs["compression_opts"] = filter_opts
+                        logging.info(f"compression_opts: {filter_opts}")
+                else:
+                    logging.warning(f"filter id {filter_id} for {dobj.name} not supported")
+
+        # kwargs["scaleoffset"] = dobj.scaleoffset
         # setting the fillvalue is failing in some cases
         # see: https://github.com/HDFGroup/h5pyd/issues/119
         # don't set fill value for reference types
@@ -1501,6 +1537,7 @@ def load_file(
 
     logging.info(f"input file: {fin.filename}")
     logging.info(f"output file: {fout.filename}")
+    logging.info(f"dataload: {dataload}")
     if dataload != "ingest":
         if not dataload:
             logging.info("no data load")
@@ -1508,7 +1545,7 @@ def load_file(
             if not s3path:
                 logging.error("s3path expected to be set")
                 sys.exit(1)
-            logging.info("using s3path")
+            logging.info(f"using s3path: {s3path}")
         else:
             logging.error(f"unexpected dataload value: {dataload}")
             sys.exit(1)
diff --git a/h5pyd/_hl/filters.py b/h5pyd/_hl/filters.py
index 80aa1e6..6fcd326 100644
--- a/h5pyd/_hl/filters.py
+++ b/h5pyd/_hl/filters.py
@@ -177,8 +177,23 @@ def rq_tuple(tpl, name):
         filters.append(filter_scaleoffset)
 
     if shuffle:
-        filter_shuffle = {"class": "H5Z_FILTER_SHUFFLE"}
-        filter_shuffle["id"] = 2
+        if isinstance(shuffle, int) and shuffle == 32008:
+            bitshuffle = True
+        elif isinstance(shuffle, str) and shuffle == "bitshuffle":
+            bitshuffle = True
+        else:
+            bitshuffle = False
+
+        if bitshuffle:
+            filter_shuffle = {"class": "H5Z_FILTER_BITSHUFFLE"}
+            filter_shuffle["id"] = 32008
+            filter_shuffle["name"] = "bitshuffle"
+
+        else:
+            # regular shuffle filter
+            filter_shuffle = {"class": "H5Z_FILTER_SHUFFLE"}
+            filter_shuffle["id"] = 2
+            filter_shuffle["name"] = "shuffle"
         filters.append(filter_shuffle)
 
     if compression == "gzip":
diff --git a/h5pyd/version.py b/h5pyd/version.py
index d235102..b07e66b 100644
--- a/h5pyd/version.py
+++ b/h5pyd/version.py
@@ -16,7 +16,7 @@
 import sys
 import numpy
 
-version = "0.16.0"
+version = "0.17.0"
 
 hdf5_version = "REST"
 
@@ -28,8 +28,8 @@
     else ("",)
 )
 
-api_version_tuple = (0, 16, 0)
-api_version = "0.16.0"
+api_version_tuple = (0, 17, 0)
+api_version = "0.17.0"
 
 __doc__ = """\
 This is h5pyd **%s**
diff --git a/setup.py b/setup.py
index e48ae4d..645552d 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
 
 setup(
     name="h5pyd",
-    version="0.16.0",
+    version="0.17.0",
     description="h5py compatible client lib for HDF REST API",
     long_description=long_description,
     url="http://github.com/HDFGroup/h5pyd",
@@ -70,4 +70,4 @@
             "hsstat = h5pyd._apps.hsstat:main",
         ]
     },
-)
\ No newline at end of file
+)
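
A minimal sketch of how the new options surface through the h5pyd API once this
patch is applied; the domain path, shape, and dtype are hypothetical, and it
assumes the target HSDS server advertises the zstd compressor:

    import h5pyd

    # open a (hypothetical) HSDS domain for writing
    f = h5pyd.File("/home/myfolder/example.h5", "w")
    # shuffle="bitshuffle" now selects H5Z_FILTER_BITSHUFFLE (id 32008)
    # rather than the regular byte-shuffle filter (id 2)
    dset = f.create_dataset("data", (1024, 1024), dtype="f4",
                            compression="zstd", shuffle="bitshuffle")
    f.close()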
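A sketch of the hsload-side behavior gained in utillib.py: filter ids found in
the source dataset's _filters are looked up in H5Z_FILTER_MAP instead of being
dropped. The hdf5plugin package used here to write the source file is an
assumption, not part of this patch:

    import h5py
    import hdf5plugin  # assumption: registers bitshuffle (H5Z filter id 32008)

    with h5py.File("source.h5", "w") as f:
        # the dataset's filter pipeline now contains filter id 32008
        f.create_dataset("data", (1000,), dtype="i4", **hdf5plugin.Bitshuffle())

    # hsload (create_dataset in utillib.py) then maps 32008 -> "bitshuffle"
    # and passes shuffle="bitshuffle" to the HSDS copy of the dataset.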