From 65f4eeb202984ffc45dfae1b2451064f41f26b0f Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 10:10:15 -0700 Subject: [PATCH 01/22] Update version number and authors in 'pyproject.toml' --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 47bf6dbf..92d608f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,9 @@ [tool.poetry] name = "hashstore" -version = "1.0.0" +version = "1.1.0" description = "HashStore, an object storage system using content identifiers." -authors = ["Dou Mok ", "Matt Jones "] +authors = ["Dou Mok ", "Matt Jones ", + "Matthew Brooke", "Jing Tao", "Jeanette Clark", "Ian M. Nesbitt"] readme = "README.md" keywords = ["filesystem", "object storage", "hashstore", "storage"] classifiers = [ From a08a6b10e4eb889e59442068fcb211b4f0fa30e7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 10:46:25 -0700 Subject: [PATCH 02/22] Review and clean up 'filehashstore_interface' test module for 'store_object' & 'tag_object' pytests and resolve todo items for missing 'tag_object' pytests --- .../test_filehashstore_interface.py | 166 +++++++++++------- 1 file changed, 107 insertions(+), 59 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 3ada26eb..321b923a 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -14,6 +14,8 @@ NonMatchingObjSize, PidRefsDoesNotExist, UnsupportedAlgorithm, + HashStoreRefsAlreadyExists, + PidRefsAlreadyExistsError, ) # pylint: disable=W0212 @@ -29,12 +31,11 @@ def test_store_object_refs_files_and_object(pids, store): """Test store object stores objects and creates reference files.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) object_metadata = store.store_object(pid, path) assert object_metadata.cid == pids[pid][store.algorithm] - assert store._count(entity) == 3 + assert store._count("objects") == 3 assert store._count("pid") == 3 assert store._count("cid") == 3 @@ -42,12 +43,11 @@ def test_store_object_refs_files_and_object(pids, store): def test_store_object_only_object(pids, store): """Test store object stores an object only (no reference files will be created)""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) object_metadata = store.store_object(data=path) assert object_metadata.cid == pids[pid][store.algorithm] - assert store._count(entity) == 3 + assert store._count("objects") == 3 assert store._count("pid") == 0 assert store._count("cid") == 0 @@ -55,36 +55,33 @@ def test_store_object_only_object(pids, store): def test_store_object_files_path(pids, store): """Test store object when given a path object.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = Path(test_dir + pid.replace("/", "_")) _object_metadata = store.store_object(pid, path) - assert store._exists(entity, pids[pid][store.algorithm]) - assert store._count(entity) == 3 + assert store._exists("objects", pids[pid][store.algorithm]) + assert store._count("objects") == 3 def test_store_object_files_string(pids, store): """Test store object when given a string object.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path_string = test_dir + pid.replace("/", "_") _object_metadata = store.store_object(pid, path_string) - 
assert store._exists(entity, pids[pid][store.algorithm]) - assert store._count(entity) == 3 + assert store._exists("objects", pids[pid][store.algorithm]) + assert store._count("objects") == 3 def test_store_object_files_input_stream(pids, store): """Test store object when given a stream object.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") _object_metadata = store.store_object(pid, input_stream) input_stream.close() - assert store._exists(entity, pids[pid][store.algorithm]) - assert store._count(entity) == 3 + assert store._exists("objects", pids[pid][store.algorithm]) + assert store._count("objects") == 3 def test_store_object_cid(pids, store): @@ -170,6 +167,23 @@ def test_store_object_data_incorrect_type_empty_spaces(store): store.store_object(pid, data=path) +def test_store_object_data_incorrect_type_special_characters(store): + """Test store object raises error when data is empty string with special characters""" + pid = "jtao.1700.1" + path = " \n\t" + with pytest.raises(TypeError): + store.store_object(pid, data=path) + + +def test_store_object_data_incorrect_type_path_with_special_character(store): + """Test store object raises error when data path contains special characters.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + "\n" + with pytest.raises(ValueError): + store.store_object("", path) + + def test_store_object_additional_algorithm_invalid(store): """Test store object raises error when supplied with unsupported algorithm.""" test_dir = "tests/testdata/" @@ -183,20 +197,18 @@ def test_store_object_additional_algorithm_invalid(store): def test_store_object_additional_algorithm_hyphen_uppercase(pids, store): """Test store object accepts an additional algo that's supported in uppercase.""" test_dir = "tests/testdata/" - entity = "objects" pid = "jtao.1700.1" path = test_dir + pid algorithm_with_hyphen_and_upper = "SHA-384" object_metadata = store.store_object(pid, path, algorithm_with_hyphen_and_upper) sha256_cid = object_metadata.hex_digests.get("sha384") assert sha256_cid == pids[pid]["sha384"] - assert store._exists(entity, pids[pid][store.algorithm]) + assert store._exists("objects", pids[pid][store.algorithm]) def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): """Test store object accepts an additional algo that's supported in lowercase.""" test_dir = "tests/testdata/" - entity = "objects" pid = "jtao.1700.1" path = test_dir + pid algorithm_other = "sha3-256" @@ -206,13 +218,12 @@ def test_store_object_additional_algorithm_hyphen_lowercase(pids, store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - assert store._exists(entity, pids[pid][store.algorithm]) + assert store._exists("objects", pids[pid][store.algorithm]) def test_store_object_additional_algorithm_underscore(pids, store): """Test store object accepts an additional algo that's supported with underscore.""" test_dir = "tests/testdata/" - entity = "objects" pid = "jtao.1700.1" path = test_dir + pid algorithm_other = "sha3_256" @@ -222,13 +233,12 @@ def test_store_object_additional_algorithm_underscore(pids, store): "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) assert additional_sha3_256_hex_digest == sha3_256_checksum - assert store._exists(entity, pids[pid][store.algorithm]) + assert store._exists("objects", pids[pid][store.algorithm]) def 
test_store_object_checksum_correct(store): """Test store object does not throw exception with good checksum.""" test_dir = "tests/testdata/" - entity = "objects" pid = "jtao.1700.1" path = test_dir + pid checksum_algo = "sha3_256" @@ -238,7 +248,7 @@ def test_store_object_checksum_correct(store): _object_metadata = store.store_object( pid, path, checksum=checksum_correct, checksum_algorithm=checksum_algo ) - assert store._count(entity) == 1 + assert store._count("objects") == 1 def test_store_object_checksum_correct_and_additional_algo(store): @@ -285,18 +295,6 @@ def test_store_object_checksum_correct_and_additional_algo_duplicate(store): assert object_metadata.hex_digests.get("sha3_256") == checksum_correct -def test_store_object_checksum_algorithm_empty(store): - """Test store object raises error when checksum supplied with no checksum_algorithm.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - checksum_correct = ( - "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" - ) - with pytest.raises(ValueError): - store.store_object(pid, path, checksum=checksum_correct, checksum_algorithm="") - - def test_store_object_checksum_empty(store): """Test store object raises error when checksum_algorithm supplied with an empty checksum.""" @@ -323,21 +321,6 @@ def test_store_object_checksum_empty_spaces(store): ) -def test_store_object_checksum_algorithm_empty_spaces(store): - """Test store object raises error when checksum is supplied and with empty - spaces as the checksum_algorithm.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - checksum_correct = ( - "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" - ) - with pytest.raises(ValueError): - store.store_object( - pid, path, checksum=checksum_correct, checksum_algorithm=" " - ) - - def test_store_object_checksum_incorrect_checksum(store): """Test store object raises error when supplied with incorrect checksum.""" test_dir = "tests/testdata/" @@ -368,19 +351,60 @@ def test_store_object_checksum_unsupported_checksum_algo(store): ) +def test_store_object_checksum_algorithm_empty(store): + """Test store object raises error when checksum supplied with no checksum_algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + with pytest.raises(ValueError): + store.store_object(pid, path, checksum=checksum_correct, checksum_algorithm="") + + +def test_store_object_checksum_algorithm_empty_spaces(store): + """Test store object raises error when checksum is supplied and with empty + spaces as the checksum_algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + with pytest.raises(ValueError): + store.store_object( + pid, path, checksum=checksum_correct, checksum_algorithm=" " + ) + + +def test_store_object_checksum_algorithm_special_character(store): + """Test store object raises error when checksum is supplied and with special characters + as the checksum_algorithm.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + checksum_correct = ( + "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" + ) + with pytest.raises(ValueError): + store.store_object( + pid, path, checksum=checksum_correct, checksum_algorithm="\n" + ) + + def 
test_store_object_duplicate_does_not_store_duplicate(store): """Test that storing duplicate object does not store object twice.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid - entity = "objects" # Store first blob _object_metadata_one = store.store_object(pid, path) # Store second blob pid_that_refs_existing_cid = "dou.test.1" _object_metadata_two = store.store_object(pid_that_refs_existing_cid, path) # Confirm only one object exists and the tmp file created is deleted - assert store._count(entity) == 1 + assert store._count("objects") == 1 def test_store_object_duplicate_object_references_file_count(store): @@ -401,11 +425,12 @@ def test_store_object_duplicate_object_references_file_count(store): assert store._count("pid") == 3 # Confirm that there are 1 cid reference files assert store._count("cid") == 1 + assert store._count("objects") == 1 def test_store_object_duplicate_object_references_file_content(pids, store): """Test that storing duplicate object but different pid updates the cid refs file - with the correct amount of pids.""" + with the correct amount of pids and content.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid @@ -417,13 +442,17 @@ def test_store_object_duplicate_object_references_file_content(pids, store): # Store with third pid pid_three = "dou.test.2" store.store_object(pid_three, path) - # Confirm the content of the cid refence files + # Confirm the content of the cid reference files cid_ref_abs_path = store._get_hashstore_cid_refs_path(pids[pid][store.algorithm]) + cid_count = 0 with open(cid_ref_abs_path, "r", encoding="utf8") as f: for _, line in enumerate(f, start=1): + cid_count += 1 value = line.strip() assert value == pid or value == pid_two or value == pid_three + assert cid_count == 3 + def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, store): """Test store duplicate object throws exception when the data to validate against @@ -431,7 +460,6 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor test_dir = "tests/testdata/" pid = "jtao.1700.1" path = test_dir + pid - entity = "objects" # Store first blob _object_metadata_one = store.store_object(pid, path) # Store second blob @@ -439,10 +467,10 @@ def test_store_object_duplicate_raises_error_with_bad_validation_data(pids, stor _object_metadata_two = store.store_object( pid, path, checksum="nonmatchingchecksum", checksum_algorithm="sha256" ) - assert store._count(entity) == 1 + assert store._count("objects") == 1 # Confirm tmp files created during this process was handled assert store._count("tmp") == 0 - assert store._exists(entity, pids[pid][store.algorithm]) + assert store._exists("objects", pids[pid][store.algorithm]) def test_store_object_with_obj_file_size(store, pids): @@ -466,6 +494,7 @@ def test_store_object_with_obj_file_size_incorrect(store, pids): path = test_dir + pid.replace("/", "_") with pytest.raises(NonMatchingObjSize): store.store_object(pid, path, expected_object_size=obj_file_size) + assert store._count("objects") == 0 def test_store_object_with_obj_file_size_non_integer(store, pids): @@ -756,14 +785,33 @@ def test_tag_object_pid_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 -# TODO: Add tag_ojbect test for HashStoreRefsAlreadyExists -# TODO: Add tag_ojbect test for PidRefsAlreadyExistsError +def test_tag_object_hashstore_refs_already_exist(pids, store): + """Confirm that tag throws HashStoreRefsAlreadyExists when refs already exist""" + test_dir = 
"tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + + with pytest.raises(HashStoreRefsAlreadyExists): + store.tag_object(pid, object_metadata.cid) + + +def test_tag_object_pid_refs_already_exist(pids, store): + """Confirm that tag throws PidRefsAlreadyExistsError when a pid refs already exists""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + cid_refs_file_path = store._get_hashstore_cid_refs_path(object_metadata.cid) + os.remove(cid_refs_file_path) + + with pytest.raises(PidRefsAlreadyExistsError): + store.tag_object(pid, "adifferentcid") def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" - entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" @@ -777,7 +825,7 @@ def test_store_metadata(pids, store): store._get_store_path("metadata") / rel_path / metadata_document_name ) assert metadata_cid == str(full_path) - assert store._count(entity) == 3 + assert store._count("metadata") == 3 def test_store_metadata_one_pid_multiple_docs_correct_location(store): From 2e67b39a6c24111804495768d52a8b178f0610c9 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 10:47:24 -0700 Subject: [PATCH 03/22] Re-organize 'delete_if_invalid_object' in 'filehashstore' module to match java library interface order --- src/hashstore/filehashstore.py | 48 +++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 088d25d4..588d3043 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -587,6 +587,30 @@ def store_object( return object_metadata + def tag_object(self, pid, cid): + logging.debug( + "FileHashStore - tag_object: Tagging object cid: %s with pid: %s.", + cid, + pid, + ) + self._check_string(pid, "pid") + self._check_string(cid, "cid") + + try: + self._store_hashstore_refs_files(pid, cid) + except HashStoreRefsAlreadyExists as hrae: + err_msg = ( + f"FileHashStore - tag_object: reference files for pid: {pid} and {cid} " + "already exist. " + str(hrae) + ) + raise HashStoreRefsAlreadyExists(err_msg) + except PidRefsAlreadyExistsError as praee: + err_msg = ( + f"FileHashStore - tag_object: A pid can only reference one cid. " + + str(praee) + ) + raise PidRefsAlreadyExistsError(err_msg) + def delete_if_invalid_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): @@ -633,30 +657,6 @@ def delete_if_invalid_object( object_metadata.cid, ) - def tag_object(self, pid, cid): - logging.debug( - "FileHashStore - tag_object: Tagging object cid: %s with pid: %s.", - cid, - pid, - ) - self._check_string(pid, "pid") - self._check_string(cid, "cid") - - try: - self._store_hashstore_refs_files(pid, cid) - except HashStoreRefsAlreadyExists as hrae: - err_msg = ( - f"FileHashStore - tag_object: reference files for pid: {pid} and {cid} " - "already exist. " + str(hrae) - ) - raise HashStoreRefsAlreadyExists(err_msg) - except PidRefsAlreadyExistsError as praee: - err_msg = ( - f"FileHashStore - tag_object: A pid can only reference one cid. 
" - + str(praee) - ) - raise PidRefsAlreadyExistsError(err_msg) - def store_metadata(self, pid, metadata, format_id=None): logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid From 7ff4722b784c16b106dae535a7cd61e01f5468d1 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 10:52:47 -0700 Subject: [PATCH 04/22] Re-organize and clean up 'delete_if_invalid_object' method pytests in 'filehashstore' interface test module --- .../test_filehashstore_interface.py | 277 +++++++++--------- 1 file changed, 140 insertions(+), 137 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 321b923a..2fb45f2a 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -809,6 +809,146 @@ def test_tag_object_pid_refs_already_exist(pids, store): store.tag_object(pid, "adifferentcid") +def test_delete_if_invalid_object(pids, store): + """Test delete_if_invalid_object does not throw exception given good arguments.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + store.delete_if_invalid_object( + object_metadata, checksum, checksum_algorithm, expected_file_size + ) + assert store._exists("objects", object_metadata.cid) + + +def test_delete_if_invalid_object_supported_other_algo_not_in_default(pids, store): + """Test delete_if_invalid_object does not throw exception when supported add algo is + supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + supported_algo = "sha224" + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = pids[pid][supported_algo] + expected_file_size = object_metadata.obj_size + store.delete_if_invalid_object( + object_metadata, checksum, supported_algo, expected_file_size + ) + assert store._exists("objects", object_metadata.cid) + + +def test_delete_if_invalid_object_exception_incorrect_object_metadata_type(pids, store): + """Test delete_if_invalid_object throws exception when incorrect obj type is given to + object_metadata arg.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + with pytest.raises(ValueError): + store.delete_if_invalid_object( + "not_object_metadata", checksum, checksum_algorithm, expected_file_size + ) + + +def test_delete_if_invalid_object_exception_incorrect_size(pids, store): + """Test delete_if_invalid_object throws exception when incorrect size is supplied and that data + object is deleted as we are storing without a pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + + with pytest.raises(NonMatchingObjSize): + store.delete_if_invalid_object( + object_metadata, checksum, checksum_algorithm, 1000 + ) + + assert not store._exists("objects", object_metadata.cid) + + +def 
test_delete_if_invalid_object_exception_incorrect_size_object_exists(pids, store): + """Test delete_if_invalid_object throws exception when incorrect size is supplied and that data + object is not deleted since it already exists (a cid refs file is present).""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, data=path) + # Store again without pid and wrong object size + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + checksum_algorithm = store.algorithm + + with pytest.raises(NonMatchingObjSize): + store.delete_if_invalid_object( + object_metadata, checksum, checksum_algorithm, 1000 + ) + + assert store._exists("objects", object_metadata.cid) + assert store._count("tmp") == 0 + + +def test_delete_if_invalid_object_exception_incorrect_checksum(pids, store): + """Test delete_if_invalid_object throws exception when incorrect checksum is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum_algorithm = store.algorithm + expected_file_size = object_metadata.obj_size + + with pytest.raises(NonMatchingChecksum): + store.delete_if_invalid_object( + object_metadata, "abc123", checksum_algorithm, expected_file_size + ) + + assert not store._exists("objects", object_metadata.cid) + + +def test_delete_if_invalid_object_exception_incorrect_checksum_algo(pids, store): + """Test delete_if_invalid_object throws exception when unsupported algorithm is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(UnsupportedAlgorithm): + store.delete_if_invalid_object( + object_metadata, checksum, "md2", expected_file_size + ) + + assert store._exists("objects", object_metadata.cid) + assert store._count("tmp") == 0 + + +def test_delete_if_invalid_object_exception_supported_other_algo_bad_checksum( + pids, store +): + """Test delete_if_invalid_object throws exception when incorrect checksum is supplied.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum = object_metadata.hex_digests.get(store.algorithm) + expected_file_size = object_metadata.obj_size + with pytest.raises(NonMatchingChecksum): + store.delete_if_invalid_object( + object_metadata, checksum, "sha224", expected_file_size + ) + + assert not store._exists("objects", object_metadata.cid) + + def test_store_metadata(pids, store): """Test store metadata.""" test_dir = "tests/testdata/" @@ -1193,143 +1333,6 @@ def test_delete_object_pid_none(store): store.delete_object(pid) -def test_delete_invalid_object(pids, store): - """Test delete_invalid_object does not throw exception given good arguments.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - store.delete_if_invalid_object( - object_metadata, checksum, checksum_algorithm, expected_file_size - 
) - assert store._exists("objects", object_metadata.cid) - - -def test_delete_invalid_object_supported_other_algo_not_in_default(pids, store): - """Test delete_invalid_object does not throw exception when supported add algo is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - supported_algo = "sha224" - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = pids[pid][supported_algo] - expected_file_size = object_metadata.obj_size - store.delete_if_invalid_object( - object_metadata, checksum, supported_algo, expected_file_size - ) - assert store._exists("objects", object_metadata.cid) - - -def test_delete_invalid_object_exception_incorrect_object_metadata_type(pids, store): - """Test delete_invalid_object throws exception when incorrect class type is given to - object_metadata arg.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - with pytest.raises(ValueError): - store.delete_if_invalid_object( - "not_object_metadata", checksum, checksum_algorithm, expected_file_size - ) - - -def test_delete_invalid_object_exception_incorrect_size(pids, store): - """Test delete_invalid_object throws exception when incorrect size is supplied and that data - object is deleted as we are storing without a pid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - - with pytest.raises(NonMatchingObjSize): - store.delete_if_invalid_object( - object_metadata, checksum, checksum_algorithm, 1000 - ) - - assert not store._exists("objects", object_metadata.cid) - - -def test_delete_invalid_object_exception_incorrect_size_object_exists(pids, store): - """Test delete_invalid_object throws exception when incorrect size is supplied and that data - object is not deleted since it already exists (a cid refs file is present).""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - store.store_object(pid, data=path) - # Store again without pid and wrong object size - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - checksum_algorithm = store.algorithm - - with pytest.raises(NonMatchingObjSize): - store.delete_if_invalid_object( - object_metadata, checksum, checksum_algorithm, 1000 - ) - - assert store._exists("objects", object_metadata.cid) - assert store._count("tmp") == 0 - - -def test_delete_invalid_object_exception_incorrect_checksum(pids, store): - """Test delete_invalid_object throws exception when incorrect checksum is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum_algorithm = store.algorithm - expected_file_size = object_metadata.obj_size - - with pytest.raises(NonMatchingChecksum): - store.delete_if_invalid_object( - object_metadata, "abc123", checksum_algorithm, expected_file_size - ) - - assert not store._exists("objects", object_metadata.cid) - - -def 
test_delete_invalid_object_exception_incorrect_checksum_algo(pids, store): - """Test delete_invalid_object throws exception when unsupported algorithm is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - expected_file_size = object_metadata.obj_size - with pytest.raises(UnsupportedAlgorithm): - store.delete_if_invalid_object( - object_metadata, checksum, "md2", expected_file_size - ) - - assert store._exists("objects", object_metadata.cid) - assert store._count("tmp") == 0 - - -def test_delete_invalid_object_exception_supported_other_algo_bad_checksum(pids, store): - """Test delete_invalid_object throws exception when incorrect checksum is supplied.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(data=path) - checksum = object_metadata.hex_digests.get(store.algorithm) - expected_file_size = object_metadata.obj_size - with pytest.raises(NonMatchingChecksum): - store.delete_if_invalid_object( - object_metadata, checksum, "sha224", expected_file_size - ) - - assert not store._exists("objects", object_metadata.cid) - - def test_delete_metadata(pids, store): """Test delete_metadata successfully deletes metadata.""" test_dir = "tests/testdata/" From 73226da4449c456a349e86a2a96a59cc81236ebd Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 12:55:08 -0700 Subject: [PATCH 05/22] Review and clean-up 'store_metadata' pytests --- .../test_filehashstore_interface.py | 92 ++++++++++--------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 2fb45f2a..b87dc0f4 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -950,13 +950,13 @@ def test_delete_if_invalid_object_exception_supported_other_algo_bad_checksum( def test_store_metadata(pids, store): - """Test store metadata.""" + """Test store_metadata.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) @@ -964,12 +964,12 @@ def test_store_metadata(pids, store): full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) - assert metadata_cid == str(full_path) + assert stored_metadata_path == str(full_path) assert store._count("metadata") == 3 def test_store_metadata_one_pid_multiple_docs_correct_location(store): - """Test store metadata for a pid with multiple metadata documents.""" + """Test store_metadata for a pid with multiple metadata documents.""" test_dir = "tests/testdata/" entity = "metadata" pid = "jtao.1700.1" @@ -980,29 +980,31 @@ def test_store_metadata_one_pid_multiple_docs_correct_location(store): format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" - metadata_cid = store.store_metadata(pid, 
syspath, format_id) - metadata_cid3 = store.store_metadata(pid, syspath, format_id3) - metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) + stored_metadata_path4 = store.store_metadata(pid, syspath, format_id4) + metadata_document_name = store._computehash(pid + format_id) metadata_document_name3 = store._computehash(pid + format_id3) metadata_document_name4 = store._computehash(pid + format_id4) full_path = store._get_store_path("metadata") / rel_path / metadata_document_name full_path3 = store._get_store_path("metadata") / rel_path / metadata_document_name3 full_path4 = store._get_store_path("metadata") / rel_path / metadata_document_name4 - assert metadata_cid == str(full_path) - assert metadata_cid3 == str(full_path3) - assert metadata_cid4 == str(full_path4) + + assert stored_metadata_path == str(full_path) + assert stored_metadata_path3 == str(full_path3) + assert stored_metadata_path4 == str(full_path4) assert store._count(entity) == 3 def test_store_metadata_default_format_id(pids, store): - """Test store metadata returns expected id when storing with default format_id.""" + """Test store_metadata returns expected id when storing with default format_id.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath) + stored_metadata_path = store.store_metadata(pid, syspath) # Manually calculate expected path metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) @@ -1010,24 +1012,24 @@ def test_store_metadata_default_format_id(pids, store): full_path = ( store._get_store_path("metadata") / rel_path / metadata_document_name ) - assert metadata_cid == str(full_path) + assert stored_metadata_path == str(full_path) def test_store_metadata_files_string(pids, store): - """Test store metadata with a string object to the metadata.""" + """Test store_metadata with a string object to the metadata.""" test_dir = "tests/testdata/" entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) - metadata_cid = store.store_metadata(pid, syspath_string, format_id) - assert store._exists(entity, metadata_cid) + stored_metadata_path = store.store_metadata(pid, syspath_string, format_id) + assert store._exists(entity, stored_metadata_path) assert store._count(entity) == 3 def test_store_metadata_files_input_stream(pids, store): - """Test store metadata with an input stream to metadata.""" + """Test store_metadata with a stream to the metadata.""" test_dir = "tests/testdata/" entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" @@ -1035,13 +1037,13 @@ def test_store_metadata_files_input_stream(pids, store): filename = pid.replace("/", "_") + ".xml" syspath_string = str(Path(test_dir) / filename) syspath_stream = io.open(syspath_string, "rb") - _metadata_cid = store.store_metadata(pid, syspath_stream, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath_stream, format_id) syspath_stream.close() assert store._count(entity) == 3 def test_store_metadata_pid_empty(store): - """Test store metadata raises error with an 
empty string as the pid.""" + """Test store_metadata raises error with an empty string as the pid.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "" @@ -1052,7 +1054,7 @@ def test_store_metadata_pid_empty(store): def test_store_metadata_pid_empty_spaces(store): - """Test store metadata raises error with empty spaces as the pid.""" + """Test store_metadata raises error with empty spaces as the pid.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = " " @@ -1063,7 +1065,7 @@ def test_store_metadata_pid_empty_spaces(store): def test_store_metadata_pid_format_id_spaces(store): - """Test store metadata raises error with empty spaces as the format_id.""" + """Test store_metadata raises error with empty spaces as the format_id.""" test_dir = "tests/testdata/" format_id = " " pid = "jtao.1700.1" @@ -1074,7 +1076,7 @@ def test_store_metadata_pid_format_id_spaces(store): def test_store_metadata_metadata_empty(store): - """Test store metadata raises error with empty spaces as the metadata path.""" + """Test store_metadata raises error with empty spaces as the metadata path.""" pid = "jtao.1700.1" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" syspath_string = " " @@ -1083,7 +1085,7 @@ def test_store_metadata_metadata_empty(store): def test_store_metadata_metadata_none(store): - """Test store metadata raises error with empty None metadata path.""" + """Test store_metadata raises error with empty None metadata path.""" pid = "jtao.1700.1" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" syspath_string = None @@ -1092,7 +1094,7 @@ def test_store_metadata_metadata_none(store): def test_store_metadata_metadata_path(pids, store): - """Test store metadata returns expected path to metadata document.""" + """Test store_metadata returns expected path to metadata document.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): @@ -1100,13 +1102,13 @@ def test_store_metadata_metadata_path(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_path = store._get_hashstore_metadata_path(metadata_cid) - assert metadata_cid == metadata_path + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + metadata_path = store._get_hashstore_metadata_path(stored_metadata_path) + assert stored_metadata_path == metadata_path def test_store_metadata_thread_lock(store): - """Test store metadata thread lock.""" + """Test store_metadata thread lock.""" test_dir = "tests/testdata/" entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" @@ -1172,7 +1174,7 @@ def test_retrieve_metadata(store): filename = pid + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) metadata_stream = store.retrieve_metadata(pid, format_id) metadata_content = metadata_stream.read().decode("utf-8") metadata_stream.close() @@ -1188,7 +1190,7 @@ def test_retrieve_metadata_default_format_id(store): filename = pid + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = 
store.store_metadata(pid, syspath) + _stored_metadata_path = store.store_metadata(pid, syspath) metadata_stream = store.retrieve_metadata(pid) metadata_content = metadata_stream.read().decode("utf-8") metadata_stream.close() @@ -1222,7 +1224,7 @@ def test_retrieve_metadata_format_id_empty(store): def test_retrieve_metadata_format_id_empty_spaces(store): - """Test retrieve_metadata raises error when supplied with empty spaces asthe format_id.""" + """Test retrieve_metadata raises error when supplied with empty spaces as the format_id.""" format_id = " " pid = "jtao.1700.1" with pytest.raises(ValueError): @@ -1238,7 +1240,7 @@ def test_delete_object_object_deleted(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store._count("objects") == 0 @@ -1253,7 +1255,7 @@ def test_delete_object_metadata_deleted(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store._count("metadata") == 0 @@ -1267,7 +1269,7 @@ def test_delete_object_all_refs_files_deleted(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) assert store._count("pid") == 0 assert store._count("cid") == 0 @@ -1282,7 +1284,7 @@ def test_delete_object_pid_refs_file_deleted(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_object(pid) pid_refs_file_path = store._get_hashstore_pid_refs_path(pid) assert not os.path.exists(pid_refs_file_path) @@ -1297,7 +1299,7 @@ def test_delete_object_cid_refs_file_deleted(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) cid = object_metadata.cid store.delete_object(pid) cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) @@ -1343,7 +1345,7 @@ def test_delete_metadata(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_metadata(pid, format_id) assert store._count(entity) == 0 @@ -1359,9 +1361,9 @@ def test_delete_metadata_one_pid_multiple_metadata_documents(store): format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" - _metadata_cid = store.store_metadata(pid, syspath, format_id) - _metadata_cid3 = store.store_metadata(pid, 
syspath, format_id3) - _metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) + _stored_metadata_path4 = store.store_metadata(pid, syspath, format_id4) store.delete_metadata(pid) assert store._count(entity) == 0 @@ -1377,9 +1379,9 @@ def test_delete_metadata_specific_pid_multiple_metadata_documents(store): format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" - _metadata_cid = store.store_metadata(pid, syspath, format_id) - _metadata_cid3 = store.store_metadata(pid, syspath, format_id3) - _metadata_cid4 = store.store_metadata(pid, syspath, format_id4) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) + _stored_metadata_path4 = store.store_metadata(pid, syspath, format_id4) store.delete_metadata(pid, format_id4) assert store._count(entity) == 2 @@ -1401,7 +1403,7 @@ def test_delete_metadata_default_format_id(store, pids): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath) + _stored_metadata_path = store.store_metadata(pid, syspath) store.delete_metadata(pid) assert store._count(entity) == 0 @@ -1439,7 +1441,7 @@ def test_get_hex_digest(store): filename = pid + ".xml" syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _stored_metadata_path = store.store_metadata(pid, syspath, format_id) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) From 742eb883a0aa5ad306851cd54596ebaf61903dd2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 12:59:12 -0700 Subject: [PATCH 06/22] Review and clean-up 'retrieve_object' pytests --- tests/filehashstore/test_filehashstore_interface.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index b87dc0f4..1e5fd213 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1137,13 +1137,9 @@ def test_store_metadata_thread_lock(store): def test_retrieve_object(pids, store): """Test retrieve_object returns a stream to the correct object data.""" test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - store.store_metadata(pid, syspath, format_id) obj_stream = store.retrieve_object(pid) sha256_hex = store._computehash(obj_stream) obj_stream.close() From 6a1b072ace6fc6dc63f933e6bbd293dd4f3db07e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:01:27 -0700 Subject: [PATCH 07/22] Review and clean-up 'retrieve_metadata' pytests --- tests/filehashstore/test_filehashstore_interface.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 1e5fd213..528bf972 100644 
--- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1166,10 +1166,8 @@ def test_retrieve_metadata(store): test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "jtao.1700.1" - path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) _stored_metadata_path = store.store_metadata(pid, syspath, format_id) metadata_stream = store.retrieve_metadata(pid, format_id) metadata_content = metadata_stream.read().decode("utf-8") @@ -1182,10 +1180,8 @@ def test_retrieve_metadata_default_format_id(store): """Test retrieve_metadata retrieves expected metadata without a format_id.""" test_dir = "tests/testdata/" pid = "jtao.1700.1" - path = test_dir + pid filename = pid + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) _stored_metadata_path = store.store_metadata(pid, syspath) metadata_stream = store.retrieve_metadata(pid) metadata_content = metadata_stream.read().decode("utf-8") @@ -1195,16 +1191,15 @@ def test_retrieve_metadata_default_format_id(store): def test_retrieve_metadata_bytes_pid_invalid(store): - """Test retrieve_metadata raises error when supplied with bad pid.""" + """Test retrieve_metadata raises exception when supplied with pid with no system metadata.""" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - pid = "jtao.1700.1" - pid_does_not_exist = pid + "test" + pid_does_not_exist = "jtao.1700.1.metadata.does.not.exist" with pytest.raises(ValueError): store.retrieve_metadata(pid_does_not_exist, format_id) def test_retrieve_metadata_bytes_pid_empty(store): - """Test retrieve_metadata raises error when supplied with empty pid.""" + """Test retrieve_metadata raises exception when supplied with empty pid.""" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = " " with pytest.raises(ValueError): @@ -1220,7 +1215,7 @@ def test_retrieve_metadata_format_id_empty(store): def test_retrieve_metadata_format_id_empty_spaces(store): - """Test retrieve_metadata raises error when supplied with empty spaces as the format_id.""" + """Test retrieve_metadata raises exception when supplied with empty spaces as the format_id.""" format_id = " " pid = "jtao.1700.1" with pytest.raises(ValueError): From 1d5b21574ae59507e08ea616022e0301f2caf69a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:04:24 -0700 Subject: [PATCH 08/22] Review and clean-up 'delete_object' pytests --- tests/filehashstore/test_filehashstore_interface.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 528bf972..31caaf87 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1237,8 +1237,7 @@ def test_delete_object_object_deleted(pids, store): def test_delete_object_metadata_deleted(pids, store): - """Test delete_object successfully deletes relevant metadata - files and refs files.""" + """Test delete_object successfully deletes associated metadata files.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): @@ -1251,7 +1250,7 @@ def test_delete_object_metadata_deleted(pids, store): assert store._count("metadata") == 0 -def test_delete_object_all_refs_files_deleted(pids, 
store): +def test_delete_object_refs_files_deleted(pids, store): """Test delete_object successfully deletes refs files.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" @@ -1305,11 +1304,11 @@ def test_delete_object_cid_refs_file_with_pid_refs_remaining(pids, store): object_metadata = store.store_object(pid, path) cid = object_metadata.cid cid_refs_abs_path = store._get_hashstore_cid_refs_path(cid) - # pylint: disable=W0212 store._update_refs_file(cid_refs_abs_path, "dou.test.1", "add") store.delete_object(pid) cid_refs_file_path = store._get_hashstore_cid_refs_path(cid) assert os.path.exists(cid_refs_file_path) + assert store._count("cid") == 3 def test_delete_object_pid_empty(store): From cf1f11c815f0342f519cbfd6235d7b8f01a29f57 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:08:24 -0700 Subject: [PATCH 09/22] Review and clean-up 'delete_metadata' pytests --- .../test_filehashstore_interface.py | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 31caaf87..236ed6f9 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1328,21 +1328,18 @@ def test_delete_object_pid_none(store): def test_delete_metadata(pids, store): """Test delete_metadata successfully deletes metadata.""" test_dir = "tests/testdata/" - entity = "metadata" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _object_metadata = store.store_object(pid, path) _stored_metadata_path = store.store_metadata(pid, syspath, format_id) store.delete_metadata(pid, format_id) - assert store._count(entity) == 0 + assert store._count("metadata") == 0 def test_delete_metadata_one_pid_multiple_metadata_documents(store): """Test delete_metadata for a pid with multiple metadata documents deletes - all metadata files as expected.""" + all associated metadata files as expected.""" test_dir = "tests/testdata/" entity = "metadata" pid = "jtao.1700.1" @@ -1369,11 +1366,13 @@ def test_delete_metadata_specific_pid_multiple_metadata_documents(store): format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" format_id3 = "http://ns.dataone.org/service/types/v3.0" format_id4 = "http://ns.dataone.org/service/types/v4.0" - _stored_metadata_path = store.store_metadata(pid, syspath, format_id) - _stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + stored_metadata_path3 = store.store_metadata(pid, syspath, format_id3) _stored_metadata_path4 = store.store_metadata(pid, syspath, format_id4) store.delete_metadata(pid, format_id4) assert store._count(entity) == 2 + assert os.path.exists(stored_metadata_path) + assert os.path.exists(stored_metadata_path3) def test_delete_metadata_does_not_exist(pids, store): @@ -1387,19 +1386,16 @@ def test_delete_metadata_does_not_exist(pids, store): def test_delete_metadata_default_format_id(store, pids): """Test delete_metadata deletes successfully with default format_id.""" test_dir = "tests/testdata/" - entity = "metadata" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - 
_object_metadata = store.store_object(pid, path) _stored_metadata_path = store.store_metadata(pid, syspath) store.delete_metadata(pid) - assert store._count(entity) == 0 + assert store._count("metadata") == 0 def test_delete_metadata_pid_empty(store): - """Test delete_object raises error when empty pid supplied.""" + """Test delete_metadata raises error when empty pid supplied.""" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = " " with pytest.raises(ValueError): @@ -1407,7 +1403,7 @@ def test_delete_metadata_pid_empty(store): def test_delete_metadata_pid_none(store): - """Test delete_object raises error when pid is 'None'.""" + """Test delete_metadata raises error when pid is 'None'.""" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = None with pytest.raises(ValueError): @@ -1415,7 +1411,7 @@ def test_delete_metadata_pid_none(store): def test_delete_metadata_format_id_empty(store): - """Test delete_object raises error when empty format_id supplied.""" + """Test delete_metadata raises error when empty format_id supplied.""" format_id = " " pid = "jtao.1700.1" with pytest.raises(ValueError): From 56a43155be9c1346a81adcdfefa5ba91a9450d4a Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:11:36 -0700 Subject: [PATCH 10/22] Review and clean-up 'get_hex_digest' pytests --- tests/filehashstore/test_filehashstore_interface.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 236ed6f9..791d9d82 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1421,13 +1421,9 @@ def test_delete_metadata_format_id_empty(store): def test_get_hex_digest(store): """Test get_hex_digest for expected value.""" test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" pid = "jtao.1700.1" path = test_dir + pid - filename = pid + ".xml" - syspath = Path(test_dir) / filename _object_metadata = store.store_object(pid, path) - _stored_metadata_path = store.store_metadata(pid, syspath, format_id) sha3_256_hex_digest = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) From 9ea95726b4a674e95b8396ed3ec1a5a997a78324 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:26:32 -0700 Subject: [PATCH 11/22] Fix bug in 'find_object' where sysmeta path was not returned due to incorrect method call (isdir vs. exists), and clean-up & review find_object pytests --- src/hashstore/filehashstore.py | 2 +- tests/filehashstore/test_filehashstore.py | 234 ++++++++++++---------- 2 files changed, 129 insertions(+), 107 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 588d3043..503e0236 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1237,7 +1237,7 @@ def _find_object(self, pid): "pid_refs_path": pid_ref_abs_path, "sysmeta_path": ( sysmeta_full_path - if os.path.isdir(sysmeta_full_path) + if os.path.exists(sysmeta_full_path) else "Does not exist." 
), } diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index ec02eaeb..3bcf5a70 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -42,8 +42,9 @@ def test_init_directories_created(store): def test_init_existing_store_incorrect_algorithm_format(store): - """Confirm that exception is thrown when store_algorithm is not a DataONE - controlled value.""" + """Confirm that exception is thrown when store_algorithm is not a DataONE controlled value ( + the string must exactly match the expected format). DataONE uses the library of congress + vocabulary to standardize algorithm types.""" properties = { "store_path": store.root + "/incorrect_algo_format", "store_depth": 3, @@ -180,7 +181,6 @@ def test_validate_properties(store): "store_algorithm": "SHA-256", "store_metadata_namespace": "https://ns.dataone.org/service/types/v2.0#SystemMetadata", } - # pylint: disable=W0212 assert store._validate_properties(properties) @@ -193,7 +193,6 @@ def test_validate_properties_missing_key(store): "store_algorithm": "SHA-256", } with pytest.raises(KeyError): - # pylint: disable=W0212 store._validate_properties(properties) @@ -207,7 +206,6 @@ def test_validate_properties_key_value_is_none(store): "store_metadata_namespace": None, } with pytest.raises(ValueError): - # pylint: disable=W0212 store._validate_properties(properties) @@ -215,7 +213,6 @@ def test_validate_properties_incorrect_type(store): """Confirm exception raised when a bad properties value is given.""" properties = "etc/filehashstore/hashstore.yaml" with pytest.raises(ValueError): - # pylint: disable=W0212 store._validate_properties(properties) @@ -228,7 +225,6 @@ def test_set_default_algorithms_missing_yaml(store, pids): store._store_and_validate_data(pid, path) os.remove(store.hashstore_configuration_yaml) with pytest.raises(FileNotFoundError): - # pylint: disable=W0212 store._set_default_algorithms() @@ -238,11 +234,10 @@ def test_set_default_algorithms_missing_yaml(store, pids): def test_store_and_validate_data_files_path(pids, store): """Test _store_and_validate_data accepts path object for the path arg.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = Path(test_dir) / pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - assert store._exists(entity, object_metadata.cid) + assert store._exists("objects", object_metadata.cid) def test_store_and_validate_data_files_string(pids, store): @@ -252,20 +247,19 @@ def test_store_and_validate_data_files_string(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") object_metadata = store._store_and_validate_data(pid, path) - assert store._exists(entity, object_metadata.cid) + assert store._exists("objects", object_metadata.cid) def test_store_and_validate_data_files_stream(pids, store): """Test _store_and_validate_data accepts stream for the path arg.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") object_metadata = store._store_and_validate_data(pid, input_stream) input_stream.close() - assert store._exists(entity, object_metadata.cid) - assert store._count(entity) == 3 + assert store._exists("objects", object_metadata.cid) + assert store._count("objects") == 3 def test_store_and_validate_data_cid(pids, store): @@ -373,6 +367,127 @@ def test_store_data_only_hex_digests(pids, store): assert 
object_metadata.hex_digests.get("sha512") == pids[pid]["sha512"] +def test_find_object_no_sysmeta(pids, store): + """Test _find_object returns the correct content and expected value for non-existent sysmeta.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + obj_info_dict = store._find_object(pid) + retrieved_cid = obj_info_dict["cid"] + + assert retrieved_cid == object_metadata.hex_digests.get("sha256") + + data_object_path = store._get_hashstore_data_object_path(retrieved_cid) + assert data_object_path == obj_info_dict["cid_object_path"] + + cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) + assert cid_refs_path == obj_info_dict["cid_refs_path"] + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + assert pid_refs_path == obj_info_dict["pid_refs_path"] + + assert obj_info_dict["sysmeta_path"] == "Does not exist." + + +def test_find_object_sysmeta(pids, store): + """Test _find_object returns the correct content along with the sysmeta path""" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + + obj_info_dict = store._find_object(pid) + retrieved_cid = obj_info_dict["cid"] + + assert retrieved_cid == object_metadata.hex_digests.get("sha256") + + data_object_path = store._get_hashstore_data_object_path(retrieved_cid) + assert data_object_path == obj_info_dict["cid_object_path"] + + cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) + assert cid_refs_path == obj_info_dict["cid_refs_path"] + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + assert pid_refs_path == obj_info_dict["pid_refs_path"] + + assert str(obj_info_dict["sysmeta_path"]) == stored_metadata_path + + +def test_find_object_refs_exist_but_obj_not_found(pids, store): + """Test _find_object throws exception when refs file exist but the object does not.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, path) + + cid = store._find_object(pid).get("cid") + obj_path = store._get_hashstore_data_object_path(cid) + os.remove(obj_path) + + with pytest.raises(RefsFileExistsButCidObjMissing): + store._find_object(pid) + + +def test_find_object_cid_refs_not_found(pids, store): + """Test _find_object throws exception when pid refs file is found (and contains a cid) + but the cid refs file does not exist.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + _object_metadata = store.store_object(pid, path) + + # Place the wrong cid into the pid refs file that has already been created + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) + with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: + pid_ref_file.seek(0) + pid_ref_file.write("intentionally.wrong.pid") + pid_ref_file.truncate() + + with pytest.raises(OrphanPidRefsFileFound): + store._find_object(pid) + + +def test_find_object_cid_refs_does_not_contain_pid(pids, store): + """Test _find_object throws exception when pid refs file is found (and contains a cid) + but the cid refs file does not contain the pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + 
path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + + # Remove the pid from the cid refs file + cid_ref_abs_path = store._get_hashstore_cid_refs_path( + object_metadata.hex_digests.get("sha256") + ) + store._update_refs_file(cid_ref_abs_path, pid, "remove") + + with pytest.raises(PidNotFoundInCidRefsFile): + store._find_object(pid) + + +def test_find_object_pid_refs_not_found(store): + """Test _find_object throws exception when a pid refs file does not exist.""" + with pytest.raises(PidRefsDoesNotExist): + store._find_object("dou.test.1") + + +def test_find_object_pid_none(store): + """Test _find_object throws exception when pid is None.""" + with pytest.raises(ValueError): + store._find_object(None) + + +def test_find_object_pid_empty(store): + """Test _find_object throws exception when pid is empty.""" + with pytest.raises(ValueError): + store._find_object("") + + def test_move_and_get_checksums_id(pids, store): """Test _move_and_get_checksums returns correct id.""" test_dir = "tests/testdata/" @@ -1250,99 +1365,6 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi store._verify_hashstore_references(pid, cid) -def test_find_object(pids, store): - """Test _find_object returns the correct content.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - obj_info_dict = store._find_object(pid) - retrieved_cid = obj_info_dict["cid"] - - assert retrieved_cid == object_metadata.hex_digests.get("sha256") - - data_object_path = store._get_hashstore_data_object_path(retrieved_cid) - assert data_object_path == obj_info_dict["cid_object_path"] - - cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) - assert cid_refs_path == obj_info_dict["cid_refs_path"] - - pid_refs_path = store._get_hashstore_pid_refs_path(pid) - assert pid_refs_path == obj_info_dict["pid_refs_path"] - - assert obj_info_dict["sysmeta_path"] == "Does not exist." 
-
-
-def test_find_object_refs_exist_but_obj_not_found(pids, store):
-    """Test _find_object throws exception when refs file exist but the object does not."""
-    test_dir = "tests/testdata/"
-    for pid in pids.keys():
-        path = test_dir + pid.replace("/", "_")
-        store.store_object(pid, path)
-
-        cid = store._find_object(pid).get("cid")
-        obj_path = store._get_hashstore_data_object_path(cid)
-        os.remove(obj_path)
-
-        with pytest.raises(RefsFileExistsButCidObjMissing):
-            store._find_object(pid)
-
-
-def test_find_object_cid_refs_not_found(pids, store):
-    """Test _find_object throws exception when pid refs file is found (and contains a cid)
-    but the cid refs file does not exist."""
-    test_dir = "tests/testdata/"
-    for pid in pids.keys():
-        path = test_dir + pid.replace("/", "_")
-        _object_metadata = store.store_object(pid, path)
-
-        # Place the wrong cid into the pid refs file that has already been created
-        pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid)
-        with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file:
-            pid_ref_file.seek(0)
-            pid_ref_file.write("intentionally.wrong.pid")
-            pid_ref_file.truncate()
-
-        with pytest.raises(OrphanPidRefsFileFound):
-            store._find_object(pid)
-
-
-def test_find_object_cid_refs_does_not_contain_pid(pids, store):
-    """Test _find_object throws exception when pid refs file is found (and contains a cid)
-    but the cid refs file does not contain the pid."""
-    test_dir = "tests/testdata/"
-    for pid in pids.keys():
-        path = test_dir + pid.replace("/", "_")
-        object_metadata = store.store_object(pid, path)
-
-        # Remove the pid from the cid refs file
-        cid_ref_abs_path = store._get_hashstore_cid_refs_path(
-            object_metadata.hex_digests.get("sha256")
-        )
-        store._update_refs_file(cid_ref_abs_path, pid, "remove")
-
-        with pytest.raises(PidNotFoundInCidRefsFile):
-            store._find_object(pid)
-
-
-def test_find_object_pid_refs_not_found(store):
-    """Test _find_object throws exception when a pid refs file does not exist."""
-    with pytest.raises(PidRefsDoesNotExist):
-        store._find_object("dou.test.1")
-
-
-def test_find_object_pid_none(store):
-    """Test _find_object throws exception when pid is None."""
-    with pytest.raises(ValueError):
-        store._find_object(None)
-
-
-def test_find_object_pid_empty(store):
-    """Test _find_object throws exception when pid is empty."""
-    with pytest.raises(ValueError):
-        store._find_object("")
-
-
 def test_clean_algorithm(store):
     """Check that algorithm values get formatted as expected."""
     algorithm_underscore = "sha_256"
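A note on the isdir/exists fix in [PATCH 11/22] above: a sysmeta document is stored as a regular file, and os.path.isdir() returns False for a path that is a file, even when that file exists, so the sysmeta path was always reported as "Does not exist." os.path.exists() is the right check because it is True for files and directories alike. A minimal, self-contained illustration in plain Python, independent of HashStore:

    import os
    import tempfile

    # A regular file standing in for a stored sysmeta document
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        sysmeta_full_path = tmp.name

    print(os.path.isdir(sysmeta_full_path))   # False: the path is a file, not a directory
    print(os.path.exists(sysmeta_full_path))  # True: the file is present on disk

    os.remove(sysmeta_full_path)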
From 2b0346a6422dbeef82a36b9e45705b813a40328f Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Fri, 20 Sep 2024 13:31:45 -0700
Subject: [PATCH 12/22] Re-organize 'find_object' pytests and filehashstore
 code placement

---
 src/hashstore/filehashstore.py            | 104 +++++-----
 tests/filehashstore/test_filehashstore.py | 241 +++++++++++-----------
 2 files changed, 172 insertions(+), 173 deletions(-)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index 503e0236..b34a8194 100644
--- a/src/hashstore/filehashstore.py
+++ b/src/hashstore/filehashstore.py
@@ -1129,58 +1129,6 @@ def get_hex_digest(self, pid, algorithm):
 
     # FileHashStore Core Methods
 
-    def _store_and_validate_data(
-        self,
-        pid,
-        file,
-        additional_algorithm=None,
-        checksum=None,
-        checksum_algorithm=None,
-        file_size_to_validate=None,
-    ):
-        """Store contents of `file` on disk, validate the object's parameters if provided,
-        and tag/reference the object.
-
-        :param str pid: Authority-based identifier.
-        :param mixed file: Readable object or path to file.
-        :param str additional_algorithm: Optional algorithm value to include when returning
-            hex digests.
-        :param str checksum: Optional checksum to validate object against hex digest before moving
-            to permanent location.
-        :param str checksum_algorithm: Algorithm value of the given checksum.
-        :param int file_size_to_validate: Expected size of the object.
-
-        :return: ObjectMetadata - object that contains the object id, object file size,
-            and hex digest dictionary.
-        """
-        stream = Stream(file)
-
-        logging.debug(
-            "FileHashStore - put_object: Request to put object for pid: %s", pid
-        )
-        with closing(stream):
-            (
-                object_cid,
-                obj_file_size,
-                hex_digest_dict,
-            ) = self._move_and_get_checksums(
-                pid,
-                stream,
-                additional_algorithm,
-                checksum,
-                checksum_algorithm,
-                file_size_to_validate,
-            )
-
-        object_metadata = ObjectMetadata(
-            pid, object_cid, obj_file_size, hex_digest_dict
-        )
-        logging.debug(
-            "FileHashStore - put_object: Successfully put object for pid: %s",
-            pid,
-        )
-        return object_metadata
-
     def _find_object(self, pid):
         """Check if an object referenced by a pid exists and retrieve its content identifier.
         The `find_object` method validates the existence of an object based on the provided
@@ -1266,6 +1214,58 @@ def _find_object(self, pid):
             logging.error(err_msg)
             raise PidRefsDoesNotExist(err_msg)
 
+    def _store_and_validate_data(
+        self,
+        pid,
+        file,
+        additional_algorithm=None,
+        checksum=None,
+        checksum_algorithm=None,
+        file_size_to_validate=None,
+    ):
+        """Store contents of `file` on disk, validate the object's parameters if provided,
+        and tag/reference the object.
+
+        :param str pid: Authority-based identifier.
+        :param mixed file: Readable object or path to file.
+        :param str additional_algorithm: Optional algorithm value to include when returning
+            hex digests.
+        :param str checksum: Optional checksum to validate object against hex digest before moving
+            to permanent location.
+        :param str checksum_algorithm: Algorithm value of the given checksum.
+        :param int file_size_to_validate: Expected size of the object.
+
+        :return: ObjectMetadata - object that contains the object id, object file size,
+            and hex digest dictionary.
+        """
+        stream = Stream(file)
+
+        logging.debug(
+            "FileHashStore - put_object: Request to put object for pid: %s", pid
+        )
+        with closing(stream):
+            (
+                object_cid,
+                obj_file_size,
+                hex_digest_dict,
+            ) = self._move_and_get_checksums(
+                pid,
+                stream,
+                additional_algorithm,
+                checksum,
+                checksum_algorithm,
+                file_size_to_validate,
+            )
+
+        object_metadata = ObjectMetadata(
+            pid, object_cid, obj_file_size, hex_digest_dict
+        )
+        logging.debug(
+            "FileHashStore - put_object: Successfully put object for pid: %s",
+            pid,
+        )
+        return object_metadata
+
     def _store_data_only(self, data):
         """Store an object to HashStore and return a metadata object containing the content
         identifier, object file size and hex digests dictionary of the default algorithms.
This diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 3bcf5a70..ed77839b 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -231,6 +231,126 @@ def test_set_default_algorithms_missing_yaml(store, pids): # Tests for FileHashStore Core Methods +def test_find_object_no_sysmeta(pids, store): + """Test _find_object returns the correct content and expected value for non-existent sysmeta.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + obj_info_dict = store._find_object(pid) + retrieved_cid = obj_info_dict["cid"] + + assert retrieved_cid == object_metadata.hex_digests.get("sha256") + + data_object_path = store._get_hashstore_data_object_path(retrieved_cid) + assert data_object_path == obj_info_dict["cid_object_path"] + + cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) + assert cid_refs_path == obj_info_dict["cid_refs_path"] + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + assert pid_refs_path == obj_info_dict["pid_refs_path"] + + assert obj_info_dict["sysmeta_path"] == "Does not exist." + + +def test_find_object_sysmeta(pids, store): + """Test _find_object returns the correct content along with the sysmeta path""" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + stored_metadata_path = store.store_metadata(pid, syspath, format_id) + + obj_info_dict = store._find_object(pid) + retrieved_cid = obj_info_dict["cid"] + + assert retrieved_cid == object_metadata.hex_digests.get("sha256") + + data_object_path = store._get_hashstore_data_object_path(retrieved_cid) + assert data_object_path == obj_info_dict["cid_object_path"] + + cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) + assert cid_refs_path == obj_info_dict["cid_refs_path"] + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + assert pid_refs_path == obj_info_dict["pid_refs_path"] + + assert str(obj_info_dict["sysmeta_path"]) == stored_metadata_path + + +def test_find_object_refs_exist_but_obj_not_found(pids, store): + """Test _find_object throws exception when refs file exist but the object does not.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + store.store_object(pid, path) + + cid = store._find_object(pid).get("cid") + obj_path = store._get_hashstore_data_object_path(cid) + os.remove(obj_path) + + with pytest.raises(RefsFileExistsButCidObjMissing): + store._find_object(pid) + + +def test_find_object_cid_refs_not_found(pids, store): + """Test _find_object throws exception when pid refs file is found (and contains a cid) + but the cid refs file does not exist.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + _object_metadata = store.store_object(pid, path) + + # Place the wrong cid into the pid refs file that has already been created + pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) + with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: + pid_ref_file.seek(0) + pid_ref_file.write("intentionally.wrong.pid") + pid_ref_file.truncate() + + with pytest.raises(OrphanPidRefsFileFound): + 
store._find_object(pid) + + +def test_find_object_cid_refs_does_not_contain_pid(pids, store): + """Test _find_object throws exception when pid refs file is found (and contains a cid) + but the cid refs file does not contain the pid.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid, path) + + # Remove the pid from the cid refs file + cid_ref_abs_path = store._get_hashstore_cid_refs_path( + object_metadata.hex_digests.get("sha256") + ) + store._update_refs_file(cid_ref_abs_path, pid, "remove") + + with pytest.raises(PidNotFoundInCidRefsFile): + store._find_object(pid) + + +def test_find_object_pid_refs_not_found(store): + """Test _find_object throws exception when a pid refs file does not exist.""" + with pytest.raises(PidRefsDoesNotExist): + store._find_object("dou.test.1") + + +def test_find_object_pid_none(store): + """Test _find_object throws exception when pid is None.""" + with pytest.raises(ValueError): + store._find_object(None) + + +def test_find_object_pid_empty(store): + """Test _find_object throws exception when pid is empty.""" + with pytest.raises(ValueError): + store._find_object("") + def test_store_and_validate_data_files_path(pids, store): """Test _store_and_validate_data accepts path object for the path arg.""" test_dir = "tests/testdata/" @@ -367,127 +487,6 @@ def test_store_data_only_hex_digests(pids, store): assert object_metadata.hex_digests.get("sha512") == pids[pid]["sha512"] -def test_find_object_no_sysmeta(pids, store): - """Test _find_object returns the correct content and expected value for non-existent sysmeta.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - obj_info_dict = store._find_object(pid) - retrieved_cid = obj_info_dict["cid"] - - assert retrieved_cid == object_metadata.hex_digests.get("sha256") - - data_object_path = store._get_hashstore_data_object_path(retrieved_cid) - assert data_object_path == obj_info_dict["cid_object_path"] - - cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) - assert cid_refs_path == obj_info_dict["cid_refs_path"] - - pid_refs_path = store._get_hashstore_pid_refs_path(pid) - assert pid_refs_path == obj_info_dict["pid_refs_path"] - - assert obj_info_dict["sysmeta_path"] == "Does not exist." 
- - -def test_find_object_sysmeta(pids, store): - """Test _find_object returns the correct content along with the sysmeta path""" - test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - stored_metadata_path = store.store_metadata(pid, syspath, format_id) - - obj_info_dict = store._find_object(pid) - retrieved_cid = obj_info_dict["cid"] - - assert retrieved_cid == object_metadata.hex_digests.get("sha256") - - data_object_path = store._get_hashstore_data_object_path(retrieved_cid) - assert data_object_path == obj_info_dict["cid_object_path"] - - cid_refs_path = store._get_hashstore_cid_refs_path(retrieved_cid) - assert cid_refs_path == obj_info_dict["cid_refs_path"] - - pid_refs_path = store._get_hashstore_pid_refs_path(pid) - assert pid_refs_path == obj_info_dict["pid_refs_path"] - - assert str(obj_info_dict["sysmeta_path"]) == stored_metadata_path - - -def test_find_object_refs_exist_but_obj_not_found(pids, store): - """Test _find_object throws exception when refs file exist but the object does not.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - store.store_object(pid, path) - - cid = store._find_object(pid).get("cid") - obj_path = store._get_hashstore_data_object_path(cid) - os.remove(obj_path) - - with pytest.raises(RefsFileExistsButCidObjMissing): - store._find_object(pid) - - -def test_find_object_cid_refs_not_found(pids, store): - """Test _find_object throws exception when pid refs file is found (and contains a cid) - but the cid refs file does not exist.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - _object_metadata = store.store_object(pid, path) - - # Place the wrong cid into the pid refs file that has already been created - pid_ref_abs_path = store._get_hashstore_pid_refs_path(pid) - with open(pid_ref_abs_path, "w", encoding="utf8") as pid_ref_file: - pid_ref_file.seek(0) - pid_ref_file.write("intentionally.wrong.pid") - pid_ref_file.truncate() - - with pytest.raises(OrphanPidRefsFileFound): - store._find_object(pid) - - -def test_find_object_cid_refs_does_not_contain_pid(pids, store): - """Test _find_object throws exception when pid refs file is found (and contains a cid) - but the cid refs file does not contain the pid.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid, path) - - # Remove the pid from the cid refs file - cid_ref_abs_path = store._get_hashstore_cid_refs_path( - object_metadata.hex_digests.get("sha256") - ) - store._update_refs_file(cid_ref_abs_path, pid, "remove") - - with pytest.raises(PidNotFoundInCidRefsFile): - store._find_object(pid) - - -def test_find_object_pid_refs_not_found(store): - """Test _find_object throws exception when a pid refs file does not exist.""" - with pytest.raises(PidRefsDoesNotExist): - store._find_object("dou.test.1") - - -def test_find_object_pid_none(store): - """Test _find_object throws exception when pid is None.""" - with pytest.raises(ValueError): - store._find_object(None) - - -def test_find_object_pid_empty(store): - """Test _find_object throws exception when pid is empty.""" - with pytest.raises(ValueError): - store._find_object("") - - def 
test_move_and_get_checksums_id(pids, store): """Test _move_and_get_checksums returns correct id.""" test_dir = "tests/testdata/" From cdf377b356fc894fe00a49772c0a7b7c44eb29a7 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:49:08 -0700 Subject: [PATCH 13/22] Cleanup 'filehashstore' pytests module part 1 via reorganization and deleting redundant comments and code --- src/hashstore/filehashstore.py | 7 +- tests/filehashstore/test_filehashstore.py | 403 +++++++++++----------- 2 files changed, 196 insertions(+), 214 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index b34a8194..4cfe9ee0 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1327,12 +1327,11 @@ def _move_and_get_checksums( checksum_algorithm=None, file_size_to_validate=None, ): - """Copy the contents of `stream` onto disk with an optional file - extension appended. The copy process uses a temporary file to store the - initial contents and returns a dictionary of algorithms and their + """Copy the contents of the `Stream` object onto disk. The copy process uses a temporary + file to store the initial contents and returns a dictionary of algorithms and their hex digest values. If the file already exists, the method will immediately raise an exception. If an algorithm and checksum are provided, it will proceed to - validate the object (and delete the tmpFile if the hex digest stored does + validate the object (and delete the temporary file created if the hex digest stored does not match what is provided). :param Optional[str] pid: Authority-based identifier. diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index ed77839b..a457d01e 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -351,6 +351,7 @@ def test_find_object_pid_empty(store): with pytest.raises(ValueError): store._find_object("") + def test_store_and_validate_data_files_path(pids, store): """Test _store_and_validate_data accepts path object for the path arg.""" test_dir = "tests/testdata/" @@ -493,7 +494,6 @@ def test_move_and_get_checksums_id(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 ( move_id, _, @@ -509,7 +509,6 @@ def test_move_and_get_checksums_file_size(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 ( _, tmp_file_size, @@ -525,7 +524,6 @@ def test_move_and_get_checksums_hex_digests(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 ( _, _, @@ -542,26 +540,22 @@ def test_move_and_get_checksums_hex_digests(pids, store): def test_move_and_get_checksums_does_not_store_duplicate(pids, store): """Test _move_and_get_checksums does not store duplicate objects.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 store._move_and_get_checksums(pid, input_stream) input_stream.close() for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 store._move_and_get_checksums(pid, input_stream) input_stream.close() - assert store._count(entity) == 3 + assert store._count("objects") == 3 def 
test_move_and_get_checksums_raises_error_with_nonmatching_checksum(pids, store): """Test _move_and_get_checksums raises error when incorrect checksum supplied.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") @@ -574,18 +568,17 @@ def test_move_and_get_checksums_raises_error_with_nonmatching_checksum(pids, sto checksum_algorithm="sha256", ) input_stream.close() - assert store._count(entity) == 0 + assert store._count("objects") == 0 def test_move_and_get_checksums_incorrect_file_size(pids, store): - """Test move and get checksum raises error with an incorrect file size.""" + """Test _move_and_get_checksums raises error with an incorrect file size.""" test_dir = "tests/testdata/" for pid in pids.keys(): with pytest.raises(NonMatchingObjSize): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") incorrect_file_size = 1000 - # pylint: disable=W0212 (_, _, _, _,) = store._move_and_get_checksums( pid, input_stream, file_size_to_validate=incorrect_file_size ) @@ -602,7 +595,6 @@ def test_write_to_tmp_file_and_get_hex_digests_additional_algo(store): checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=checksum_algo ) @@ -621,7 +613,6 @@ def test_write_to_tmp_file_and_get_hex_digests_checksum_algo(store): checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, checksum_algorithm=checksum_algo ) @@ -644,7 +635,6 @@ def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo(stor checksum_correct = ( "b748069cd0116ba59638e5f3500bbff79b41d6184bc242bd71f5cbbb8cf484cf" ) - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=additional_algo, @@ -666,7 +656,6 @@ def test_write_to_tmp_file_and_get_hex_digests_checksum_and_additional_algo_dupl additional_algo = "sha224" checksum_algo = "sha224" checksum_correct = "9b3a96f434f3c894359193a63437ef86fbd5a1a1a6cc37f1d5013ac1" - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=additional_algo, @@ -682,7 +671,6 @@ def test_write_to_tmp_file_and_get_hex_digests_file_size(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 _, _, tmp_file_size = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert tmp_file_size == pids[pid]["file_size_bytes"] @@ -694,7 +682,6 @@ def test_write_to_tmp_file_and_get_hex_digests_hex_digests(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 hex_digests, _, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert hex_digests.get("md5") == pids[pid]["md5"] @@ -710,7 +697,6 @@ def test_write_to_tmp_file_and_get_hex_digests_tmpfile_object(pids, store): for pid in pids.keys(): path = test_dir + pid.replace("/", "_") input_stream = io.open(path, "rb") - # pylint: disable=W0212 _, tmp_file_name, _ = store._write_to_tmp_file_and_get_hex_digests(input_stream) input_stream.close() assert os.path.isfile(tmp_file_name) is True 
@@ -724,12 +710,10 @@ def test_write_to_tmp_file_and_get_hex_digests_with_unsupported_algorithm(pids, input_stream = io.open(path, "rb") algo = "md2" with pytest.raises(UnsupportedAlgorithm): - # pylint: disable=W0212 _, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, additional_algorithm=algo ) with pytest.raises(UnsupportedAlgorithm): - # pylint: disable=W0212 _, _, _ = store._write_to_tmp_file_and_get_hex_digests( input_stream, checksum_algorithm=algo ) @@ -740,7 +724,6 @@ def test_mktmpfile(store): """Test that _mktmpfile creates and returns a tmp file.""" path = store.root + "/doutest/tmp/" store._create_path(path) - # pylint: disable=W0212 tmp = store._mktmpfile(path) assert os.path.exists(tmp.name) @@ -849,6 +832,195 @@ def test_store_hashstore_refs_files_refs_not_found_cid_refs_found(store): assert store._count("cid") == 1 +def test_untag_object(pids, store): + """Test _untag_object untags successfully.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + assert store._count("cid") == 0 + assert store._count("objects") == 3 + + +def test_untag_object_pid_not_locked(pids, store): + """Test _untag_object throws exception when pid is not locked""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + with pytest.raises(IdentifierNotLocked): + store._untag_object(pid, cid) + + +def test_untag_object_cid_not_locked(pids, store): + """Test _untag_object throws exception with cid is not locked""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + with pytest.raises(IdentifierNotLocked): + store._synchronize_referenced_locked_pids(pid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + + +def test_untag_object_orphan_pid_refs_file_found(store): + """Test _untag_object removes an orphan pid refs file""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + # Remove cid refs file + cid_refs_abs_path = store._get_hashstore_cid_refs_path(cid) + os.remove(cid_refs_abs_path) + + with pytest.raises(OrphanPidRefsFileFound): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + + +def test_untag_object_orphan_refs_exist_but_data_object_not_found(store): + """Test _untag_object removes orphaned pid and cid refs files""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + # Remove cid refs file + data_obj_path = store._get_hashstore_data_object_path(cid) + os.remove(data_obj_path) + + with 
pytest.raises(RefsFileExistsButCidObjMissing): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + assert store._count("cid") == 0 + + +def test_untag_object_refs_found_but_pid_not_in_cid_refs(store): + """Test _untag_object removes pid refs file whose pid is not found in the cid refs file.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + pid_two = pid + ".dou" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + _object_metadata_two = store.store_object(pid_two, path) + cid = object_metadata.cid + + assert store._count("pid") == 2 + assert store._count("cid") == 1 + + # Remove pid from cid refs + cid_refs_file = store._get_hashstore_cid_refs_path(cid) + # First remove the pid + store._update_refs_file(cid_refs_file, pid, "remove") + + with pytest.raises(PidNotFoundInCidRefsFile): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + +def test_untag_object_pid_refs_file_does_not_exist(store): + """Test _untag_object removes pid from cid refs file since the pid refs file does not exist, + and does not delete the cid refs file because a reference is still present.""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + pid_two = pid + ".dou" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + _object_metadata_two = store.store_object(pid_two, path) + cid = object_metadata.cid + + assert store._count("pid") == 2 + assert store._count("cid") == 1 + + # Remove pid from cid refs + pid_refs_file = store._get_hashstore_pid_refs_path(pid) + os.remove(pid_refs_file) + + with pytest.raises(PidRefsDoesNotExist): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + +def test_untag_object_pid_refs_file_does_not_exist_and_cid_refs_is_empty(store): + """Test '_untag_object' removes pid from cid refs file since the pid refs file does not exist, + and deletes the cid refs file because it contains no more references (after the pid called + with '_untag_object' is removed from the cid refs).""" + test_dir = "tests/testdata/" + pid = "jtao.1700.1" + path = test_dir + pid + object_metadata = store.store_object(pid, path) + cid = object_metadata.cid + + assert store._count("pid") == 1 + assert store._count("cid") == 1 + + # Remove pid from cid refs + pid_refs_file = store._get_hashstore_pid_refs_path(pid) + os.remove(pid_refs_file) + + with pytest.raises(PidRefsDoesNotExist): + store._find_object(pid) + + store._synchronize_referenced_locked_pids(pid) + store._synchronize_object_locked_cids(cid) + store._untag_object(pid, cid) + store._release_reference_locked_pids(pid) + store._release_object_locked_cids(cid) + + assert store._count("pid") == 0 + assert store._count("cid") == 0 + + def test_delete_marked_files(store): """Test that _delete_marked_files removes all items from a given list""" pid = "jtao.1700.1" @@ -1527,195 
+1699,6 @@ def test_delete_with_object_metadata_id(pids, store): assert store._count(entity) == 0 -def test_untag_object(pids, store): - """Test _untag_object untags successfully.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = Path(test_dir + pid.replace("/", "_")) - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 0 - assert store._count("cid") == 0 - assert store._count("objects") == 3 - - -def test_untag_object_pid_not_locked(pids, store): - """Test _untag_object throws exception when pid is not locked""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = Path(test_dir + pid.replace("/", "_")) - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - with pytest.raises(IdentifierNotLocked): - store._untag_object(pid, cid) - - -def test_untag_object_cid_not_locked(pids, store): - """Test _untag_object throws exception with cid is not locked""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = Path(test_dir + pid.replace("/", "_")) - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - with pytest.raises(IdentifierNotLocked): - store._synchronize_referenced_locked_pids(pid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - - -def test_untag_object_orphan_pid_refs_file_found(store): - """Test _untag_object removes an orphan pid refs file""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - # Remove cid refs file - cid_refs_abs_path = store._get_hashstore_cid_refs_path(cid) - os.remove(cid_refs_abs_path) - - with pytest.raises(OrphanPidRefsFileFound): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 0 - - -def test_untag_object_orphan_refs_exist_but_data_object_not_found(store): - """Test _untag_object removes orphaned pid and cid refs files""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - # Remove cid refs file - data_obj_path = store._get_hashstore_data_object_path(cid) - os.remove(data_obj_path) - - with pytest.raises(RefsFileExistsButCidObjMissing): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 0 - assert store._count("cid") == 0 - - -def test_untag_object_refs_found_but_pid_not_in_cid_refs(store): - """Test _untag_object removes pid refs file whose pid is not found in the cid refs file.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - pid_two = pid + ".dou" - path = test_dir + pid - object_metadata = store.store_object(pid, path) - _object_metadata_two = store.store_object(pid_two, path) - cid = object_metadata.cid - - assert store._count("pid") == 2 
- assert store._count("cid") == 1 - - # Remove pid from cid refs - cid_refs_file = store._get_hashstore_cid_refs_path(cid) - # First remove the pid - store._update_refs_file(cid_refs_file, pid, "remove") - - with pytest.raises(PidNotFoundInCidRefsFile): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - -def test_untag_object_pid_refs_file_does_not_exist(store): - """Test _untag_object removes pid from cid refs file since the pid refs file does not exist, - and does not delete the cid refs file because a reference is still present.""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - pid_two = pid + ".dou" - path = test_dir + pid - object_metadata = store.store_object(pid, path) - _object_metadata_two = store.store_object(pid_two, path) - cid = object_metadata.cid - - assert store._count("pid") == 2 - assert store._count("cid") == 1 - - # Remove pid from cid refs - pid_refs_file = store._get_hashstore_pid_refs_path(pid) - os.remove(pid_refs_file) - - with pytest.raises(PidRefsDoesNotExist): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - -def test_untag_object_pid_refs_file_does_not_exist_and_cid_refs_is_empty(store): - """Test '_untag_object' removes pid from cid refs file since the pid refs file does not exist, - and deletes the cid refs file because it contains no more references (after the pid called - with '_untag_object' is removed from the cid refs).""" - test_dir = "tests/testdata/" - pid = "jtao.1700.1" - path = test_dir + pid - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - assert store._count("pid") == 1 - assert store._count("cid") == 1 - - # Remove pid from cid refs - pid_refs_file = store._get_hashstore_pid_refs_path(pid) - os.remove(pid_refs_file) - - with pytest.raises(PidRefsDoesNotExist): - store._find_object(pid) - - store._synchronize_referenced_locked_pids(pid) - store._synchronize_object_locked_cids(cid) - store._untag_object(pid, cid) - store._release_reference_locked_pids(pid) - store._release_object_locked_cids(cid) - - assert store._count("pid") == 0 - assert store._count("cid") == 0 - - def test_create_path(pids, store): """Test makepath creates folder successfully.""" for pid in pids: From b9d216cb735c1a7b75441db52e41f3ef4a2938ca Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 13:54:07 -0700 Subject: [PATCH 14/22] Cleanup remaining 'filehashstore' pytests module for core methods --- tests/filehashstore/test_filehashstore.py | 128 +++++++++++----------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index a457d01e..304f3d06 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1021,6 +1021,64 @@ def test_untag_object_pid_refs_file_does_not_exist_and_cid_refs_is_empty(store): assert store._count("cid") == 0 +def test_put_metadata_with_path(pids, store): + """Test _put_metadata with path object for the path arg.""" + entity = "metadata" + 
test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + metadata_stored_path = store._put_metadata(syspath, pid, format_id) + assert store._exists(entity, metadata_stored_path) + assert store._count(entity) == 3 + + +def test_put_metadata_with_string(pids, store): + """Test_put metadata with string for the path arg.""" + entity = "metadata" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = str(Path(test_dir) / filename) + metadata_stored_path = store._put_metadata(syspath, pid, format_id) + assert store._exists(entity, metadata_stored_path) + assert store._count(entity) == 3 + + +def test_put_metadata_stored_path(pids, store): + """Test put metadata returns correct path to the metadata stored.""" + test_dir = "tests/testdata/" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + metadata_document_name = store._computehash(pid + format_id) + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + metadata_stored_path = store._put_metadata(syspath, pid, metadata_document_name) + + # Manually calculate expected path + metadata_directory = store._computehash(pid) + rel_path = "/".join(store._shard(metadata_directory)) + full_path = ( + store._get_store_path("metadata") / rel_path / metadata_document_name + ) + assert metadata_stored_path == full_path + + +def test_mktmpmetadata(pids, store): + """Test mktmpmetadata creates tmpFile.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + sys_stream = io.open(syspath, "rb") + # pylint: disable=W0212 + tmp_name = store._mktmpmetadata(sys_stream) + sys_stream.close() + assert os.path.exists(tmp_name) + + def test_delete_marked_files(store): """Test that _delete_marked_files removes all items from a given list""" pid = "jtao.1700.1" @@ -1251,64 +1309,6 @@ def test_update_refs_file_empty_file(pids, store): assert os.path.getsize(tmp_cid_refs_file) == 0 -def test_put_metadata_with_path(pids, store): - """Test _put_metadata with path object for the path arg.""" - entity = "metadata" - test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - metadata_cid = store._put_metadata(syspath, pid, format_id) - assert store._exists(entity, metadata_cid) - assert store._count(entity) == 3 - - -def test_put_metadata_with_string(pids, store): - """Test_put metadata with string for the path arg.""" - entity = "metadata" - test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - filename = pid.replace("/", "_") + ".xml" - syspath = str(Path(test_dir) / filename) - metadata_cid = store._put_metadata(syspath, pid, format_id) - assert store._exists(entity, metadata_cid) - assert store._count(entity) == 3 - - -def test_put_metadata_cid(pids, store): - """Test put metadata returns correct id.""" - test_dir = "tests/testdata/" - format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" - for pid in pids.keys(): - metadata_document_name = store._computehash(pid + format_id) - filename = 
pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - metadata_cid = store._put_metadata(syspath, pid, metadata_document_name) - - # Manually calculate expected path - metadata_directory = store._computehash(pid) - rel_path = "/".join(store._shard(metadata_directory)) - full_path = ( - store._get_store_path("metadata") / rel_path / metadata_document_name - ) - assert metadata_cid == full_path - - -def test_mktmpmetadata(pids, store): - """Test mktmpmetadata creates tmpFile.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - filename = pid.replace("/", "_") + ".xml" - syspath = Path(test_dir) / filename - sys_stream = io.open(syspath, "rb") - # pylint: disable=W0212 - tmp_name = store._mktmpmetadata(sys_stream) - sys_stream.close() - assert os.path.exists(tmp_name) - - # Tests for FileHashStore Utility & Supporting Methods @@ -1621,8 +1621,8 @@ def test_exists_metadata_files_path(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) - assert store._exists(entity, metadata_cid) + metadata_stored_path = store.store_metadata(pid, syspath, format_id) + assert store._exists(entity, metadata_stored_path) def test_exists_object_with_nonexistent_file(store): @@ -1680,7 +1680,7 @@ def test_delete_object_only_cid_refs_file_exists(pids, store): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename object_metadata = store.store_object(pid, path) - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _metadata_stored_path = store.store_metadata(pid, syspath, format_id) store._delete_object_only(object_metadata.cid) assert store._count(entity) == 3 assert store._count("pid") == 3 @@ -1745,8 +1745,8 @@ def test_get_real_path_with_metadata_id(store, pids): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - metadata_cid = store.store_metadata(pid, syspath, format_id) - metadata_abs_path = store._get_hashstore_metadata_path(metadata_cid) + metadata_stored_path = store.store_metadata(pid, syspath, format_id) + metadata_abs_path = store._get_hashstore_metadata_path(metadata_stored_path) assert os.path.exists(metadata_abs_path) @@ -1800,7 +1800,7 @@ def test_get_hashstore_metadata_path_metadata(pids, store): for pid in pids.keys(): filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - _metadata_cid = store.store_metadata(pid, syspath, format_id) + _metadata_stored_path = store.store_metadata(pid, syspath, format_id) metadata_directory = store._computehash(pid) metadata_document_name = store._computehash(pid + format_id) From 15825050ef06fe3262432f182e7aaf151e17ad8e Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Fri, 20 Sep 2024 14:08:31 -0700 Subject: [PATCH 15/22] Cleanuo 'filehashstore' utility and supporting methods pytests part 1 and add new pytest for '_is_string_in_refs_file' --- tests/filehashstore/test_filehashstore.py | 30 +++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 304f3d06..2d34ccf0 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1079,6 +1079,9 @@ def test_mktmpmetadata(pids, store): assert os.path.exists(tmp_name) +# Tests for FileHashStore Utility & Supporting Methods + + def test_delete_marked_files(store): """Test that _delete_marked_files removes 
From d58d5e85b7bc1efcf1411247d6bbc9dd72c0df5f Mon Sep 17 00:00:00 2001
From: Dou Mok
Date: Mon, 23 Sep 2024 16:07:57 -0700
Subject: [PATCH 16/22] Revise exception messaging in
 '_verify_object_information', review pytests and add new pytest

---
 src/hashstore/filehashstore.py            |  5 ++--
 tests/conftest.py                         |  3 +++
 tests/filehashstore/test_filehashstore.py | 31 ++++++++++++++++++-----
 3 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py
index 4cfe9ee0..c66aa686 100644
---
a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2169,8 +2169,9 @@ def _verify_object_information( exception_string = ( "FileHashStore - _verify_object_information: checksum_algorithm" + f" ({checksum_algorithm}) cannot be found in the default hex digests" - + " dict, but is supported. New checksum calculated but does not match" - + " what has been provided." + + f" dict, but is supported. New checksum calculated: " + f"{hex_digest_calculated}, does not match what has been provided: " + + checksum ) logging.debug(exception_string) raise NonMatchingChecksum(exception_string) diff --git a/tests/conftest.py b/tests/conftest.py index 27b1c8fa..e10a83e0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -55,6 +55,7 @@ def init_pids(): "sha256": "4d198171eef969d553d4c9537b1811a7b078f9a3804fc978a761bc014c05972c", "sha384": "d5953bd802fa74edea72eb941ead7a27639e62792fedc065d6c81de6c613b5b8739ab1f90e7f24a7500d154a727ed7c2", "sha512": "e9bcd6b91b102ef5803d1bd60c7a5d2dbec1a2baf5f62f7da60de07607ad6797d6a9b740d97a257fd2774f2c26503d455d8f2a03a128773477dfa96ab96a2e54", + "blake2s": "5895fa29c17f8768d613984bb86791e5fcade7643c15e84663c03be89205d81e", }, "jtao.1700.1": { "file_size_bytes": 8724, @@ -65,6 +66,7 @@ def init_pids(): "sha256": "94f9b6c88f1f458e410c30c351c6384ea42ac1b5ee1f8430d3e365e43b78a38a", "sha384": "a204678330fcdc04980c9327d4e5daf01ab7541e8a351d49a7e9c5005439dce749ada39c4c35f573dd7d307cca11bea8", "sha512": "bf9e7f4d4e66bd082817d87659d1d57c2220c376cd032ed97cadd481cf40d78dd479cbed14d34d98bae8cebc603b40c633d088751f07155a94468aa59e2ad109", + "blake2s": "8978c46ee4cc5d1d79698752fd663c60c817d58d6aea901843bf4fc2cb173bef", }, "urn:uuid:1b35d0a5-b17a-423b-a2ed-de2b18dc367a": { "file_size_bytes": 18699, @@ -75,6 +77,7 @@ def init_pids(): "sha256": "4473516a592209cbcd3a7ba4edeebbdb374ee8e4a49d19896fafb8f278dc25fa", "sha384": "b1023a9be5aa23a102be9bce66e71f1f1c7a6b6b03e3fc603e9cd36b4265671e94f9cc5ce3786879740536994489bc26", "sha512": "c7fac7e8aacde8546ddb44c640ad127df82830bba6794aea9952f737c13a81d69095865ab3018ed2a807bf9222f80657faf31cfde6c853d7b91e617e148fec76", + "blake2s": "c8c9aea2f7ddcfaf8db93ce95f18e467b6293660d1a0b08137636a3c92896765", }, } return test_pids diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 2d34ccf0..719e8325 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1336,7 +1336,6 @@ def test_verify_object_information(pids, store): checksum = object_metadata.hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm expected_file_size = object_metadata.obj_size - # pylint: disable=W0212 store._verify_object_information( None, checksum, @@ -1359,7 +1358,6 @@ def test_verify_object_information_incorrect_size(pids, store): checksum = hex_digests.get(store.algorithm) checksum_algorithm = store.algorithm with pytest.raises(NonMatchingObjSize): - # pylint: disable=W0212 store._verify_object_information( None, checksum, @@ -1385,7 +1383,6 @@ def test_verify_object_information_incorrect_size_with_pid(pids, store): expected_file_size = object_metadata.obj_size objects_tmp_folder = store.objects + "/tmp" - # pylint: disable=W0212 tmp_file = store._mktmpfile(objects_tmp_folder) assert os.path.isfile(tmp_file.name) with pytest.raises(NonMatchingObjSize): @@ -1415,7 +1412,6 @@ def test_verify_object_information_missing_key_in_hex_digests_unsupported_algo( checksum_algorithm = "md10" expected_file_size = object_metadata.obj_size with 
pytest.raises(UnsupportedAlgorithm): - # pylint: disable=W0212 store._verify_object_information( None, checksum, @@ -1432,7 +1428,7 @@ def test_verify_object_information_missing_key_in_hex_digests_supported_algo( pids, store ): """Test _verify_object_information throws exception when algorithm is not found - in hex digests but is supported, however the checksum calculated does not match.""" + in hex digests but is supported, and the checksum calculated does not match.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -1441,7 +1437,6 @@ def test_verify_object_information_missing_key_in_hex_digests_supported_algo( checksum_algorithm = "blake2s" expected_file_size = object_metadata.obj_size with pytest.raises(NonMatchingChecksum): - # pylint: disable=W0212 store._verify_object_information( None, checksum, @@ -1454,6 +1449,30 @@ def test_verify_object_information_missing_key_in_hex_digests_supported_algo( ) +def test_verify_object_information_missing_key_in_hex_digests_matching_checksum( + pids, store +): + """Test _verify_object_information does not throw exception when algorithm is not found + in hex digests but is supported, and the checksum calculated matches.""" + test_dir = "tests/testdata/" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(data=path) + checksum_algorithm = "blake2s" + checksum = pids[pid][checksum_algorithm] + expected_file_size = object_metadata.obj_size + store._verify_object_information( + None, + checksum, + checksum_algorithm, + "objects", + object_metadata.hex_digests, + None, + expected_file_size, + expected_file_size, + ) + + def test_verify_hashstore_references_pid_refs_file_missing(pids, store): """Test _verify_hashstore_references throws exception when pid refs file is missing.""" for pid in pids.keys(): From d49cdfac8b5204e601d47bf672e6a858d09b5fa0 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Mon, 23 Sep 2024 17:51:20 -0700 Subject: [PATCH 17/22] Refactor private '_delete' method and clean-up remaining pytests --- src/hashstore/filehashstore.py | 61 +++--- tests/filehashstore/test_filehashstore.py | 241 +++++++++++----------- 2 files changed, 150 insertions(+), 152 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index c66aa686..3fabd431 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2572,7 +2572,11 @@ def _delete(self, entity, file): elif entity == "objects": realpath = self._get_hashstore_data_object_path(file) elif entity == "metadata": - realpath = self._get_hashstore_metadata_path(file) + try: + realpath = self._get_hashstore_metadata_path(file) + except FileNotFoundError: + # Swallow file not found exceptions for metadata + realpath = None elif os.path.exists(file): # Check if the given path is an absolute path realpath = file @@ -2580,13 +2584,10 @@ def _delete(self, entity, file): raise IOError( f"FileHashStore - delete(): Could not locate file: {file}" ) - except FileNotFoundError: - realpath = None - - try: if realpath is not None: os.remove(realpath) - except OSError as err: + + except Exception as err: exception_string = ( f"FileHashStore - delete(): Unexpected {err=}, {type(err)=}" ) @@ -2604,6 +2605,30 @@ def _create_path(self, path): except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" + def _get_store_path(self, entity): + """Return a path object to the root directory of the requested hashstore directory type + + :param str 
entity: Desired entity type: "objects", "metadata", "refs", "cid" and "pid". + Note, "cid" and "pid" are refs specific directories. + + :return: Path to requested store entity type + :rtype: Path + """ + if entity == "objects": + return Path(self.objects) + elif entity == "metadata": + return Path(self.metadata) + elif entity == "refs": + return Path(self.refs) + elif entity == "cid": + return Path(self.cids) + elif entity == "pid": + return Path(self.pids) + else: + raise ValueError( + f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" + ) + def _build_hashstore_data_object_path(self, hash_id): """Build the absolute file path for a given content identifier @@ -2699,30 +2724,6 @@ def _get_hashstore_cid_refs_path(self, cid): cid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) return cid_ref_file_abs_path - def _get_store_path(self, entity): - """Return a path object to the root directory of the requested hashstore directory type - - :param str entity: Desired entity type: "objects", "metadata", "refs", "cid" and "pid". - Note, "cid" and "pid" are refs specific directories. - - :return: Path to requested store entity type - :rtype: Path - """ - if entity == "objects": - return Path(self.objects) - elif entity == "metadata": - return Path(self.metadata) - elif entity == "refs": - return Path(self.refs) - elif entity == "cid": - return Path(self.cids) - elif entity == "pid": - return Path(self.pids) - else: - raise ValueError( - f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" - ) - # Synchronization Methods def _release_object_locked_pids(self, pid): diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 719e8325..4c150b7f 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1560,15 +1560,41 @@ def test_verify_hashstore_references_cid_refs_file_with_multiple_refs_missing_pi tmp_pid_refs_file = store._write_refs_file(tmp_root_path, cid, "pid") shutil.move(tmp_pid_refs_file, pid_ref_abs_path) - cid_reference_list = [pid] for i in range(0, 5): store._update_refs_file(cid_ref_abs_path, f"dou.test.{i}", "add") - cid_reference_list.append(f"dou.test.{i}") with pytest.raises(CidRefsContentError): store._verify_hashstore_references(pid, cid) +def test_delete_object_only(pids, store): + """Test _delete_object_only successfully deletes an object that has no references.""" + test_dir = "tests/testdata/" + entity = "objects" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + object_metadata = store.store_object(pid=None, data=path) + store._delete_object_only(object_metadata.cid) + assert store._count(entity) == 0 + + +def test_delete_object_only_cid_refs_file_exists(pids, store): + """Test _delete_object_only does not delete the object if a cid refs file still exists.""" + test_dir = "tests/testdata/" + entity = "objects" + format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" + for pid in pids.keys(): + path = test_dir + pid.replace("/", "_") + filename = pid.replace("/", "_") + ".xml" + syspath = Path(test_dir) / filename + object_metadata = store.store_object(pid, path) + _metadata_stored_path = store.store_metadata(pid, syspath, format_id) + store._delete_object_only(object_metadata.cid) + assert store._count(entity) == 3 + assert store._count("pid") == 3 + assert store._count("cid") == 3 + + def test_clean_algorithm(store): """Check that algorithm values get formatted as expected.""" algorithm_underscore = "sha_256" 
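The reference-file pytests revised in patch 15 above pin down simple, line-oriented semantics for `_update_refs_file`: a refs file holds one ref per line, "add" skips refs that are already present, and "remove" may leave an empty file behind for the caller to clean up. A toy model of just those semantics follows; it is not FileHashStore's implementation (which routes writes through temporary files), only a sketch of the behaviour the tests assert:

```python
from pathlib import Path

def update_refs_file(refs_file: Path, ref_id: str, update_type: str) -> None:
    """Toy model: one ref per line; 'add' deduplicates, 'remove' filters."""
    lines = refs_file.read_text().splitlines() if refs_file.exists() else []
    if update_type == "add" and ref_id not in lines:
        lines.append(ref_id)
    elif update_type == "remove":
        lines = [line for line in lines if line != ref_id]
    # An empty list yields an empty file, mirroring test_update_refs_file_empty_file
    refs_file.write_text("".join(line + "\n" for line in lines))
```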
@@ -1600,28 +1626,27 @@ def test_computehash(pids, store): assert pids[pid]["sha256"] == obj_sha256_hash -def test_get_store_path_object(store): - """Check get_store_path for object path.""" - # pylint: disable=W0212 - path_objects = store._get_store_path("objects") - path_objects_string = str(path_objects) - assert path_objects_string.endswith("/metacat/hashstore/objects") - - -def test_get_store_path_metadata(store): - """Check get_store_path for metadata path.""" - # pylint: disable=W0212 - path_metadata = store._get_store_path("metadata") - path_metadata_string = str(path_metadata) - assert path_metadata_string.endswith("/metacat/hashstore/metadata") +def test_shard(store): + """Test shard creates list.""" + hash_id = "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e" + predefined_list = [ + "0d", + "55", + "5e", + "d77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", + ] + sharded_list = store._shard(hash_id) + assert predefined_list == sharded_list -def test_get_store_path_refs(store): - """Check get_store_path for refs path.""" - # pylint: disable=W0212 - path_metadata = store._get_store_path("refs") - path_metadata_string = str(path_metadata) - assert path_metadata_string.endswith("/metacat/hashstore/refs") +def test_count(pids, store): + """Check that count returns expected number of objects.""" + test_dir = "tests/testdata/" + entity = "objects" + for pid in pids.keys(): + path_string = test_dir + pid.replace("/", "_") + store._store_and_validate_data(pid, path_string) + assert store._count(entity) == 3 def test_exists_object_with_object_metadata_id(pids, store): @@ -1666,19 +1691,6 @@ def test_exists_object_with_nonexistent_file(store): assert does_not_exist is False -def test_shard(store): - """Test shard creates list.""" - hash_id = "0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e" - predefined_list = [ - "0d", - "55", - "5e", - "d77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e", - ] - sharded_list = store._shard(hash_id) - assert predefined_list == sharded_list - - def test_open_objects(pids, store): """Test open returns a stream.""" test_dir = "tests/testdata/" @@ -1692,44 +1704,52 @@ def test_open_objects(pids, store): io_buffer.close() -def test_delete_object_only(pids, store): - """Test _delete_object successfully deletes only object.""" +def test_private_delete_objects(pids, store): + """Confirm _delete deletes for entity type 'objects'""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store.store_object(pid=None, data=path) - store._delete_object_only(object_metadata.cid) - assert store._count(entity) == 0 + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + store._delete("objects", object_metadata.cid) + assert store._count("objects") == 0 -def test_delete_object_only_cid_refs_file_exists(pids, store): - """Test _delete_object does not delete object if a cid refs file still exists.""" + +def test_private_delete_metadata(pids, store): + """Confirm _delete deletes for entity type 'metadata'""" test_dir = "tests/testdata/" - entity = "objects" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") filename = pid.replace("/", "_") + ".xml" syspath = Path(test_dir) / filename - object_metadata = store.store_object(pid, path) - _metadata_stored_path = store.store_metadata(pid, syspath, format_id) - 
store._delete_object_only(object_metadata.cid) - assert store._count(entity) == 3 - assert store._count("pid") == 3 - assert store._count("cid") == 3 + store.store_metadata(pid, syspath, format_id) + # Manually calculate expected path + metadata_directory = store._computehash(pid) + metadata_document_name = store._computehash(pid + format_id) + rel_path = ( + Path("/".join(store._shard(metadata_directory))) / metadata_document_name + ) + + store._delete("metadata", rel_path) + + assert store._count("metadata") == 0 -def test_delete_with_object_metadata_id(pids, store): - """Check objects are deleted after calling delete with object id.""" + +def test_private_delete_absolute_path(pids, store): + """Confirm _delete deletes when given an absolute path.""" test_dir = "tests/testdata/" - entity = "objects" for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = store._store_and_validate_data(pid, path) - object_metadata_id = object_metadata.cid - store._delete(entity, object_metadata_id) - assert store._count(entity) == 0 + path = Path(test_dir + pid.replace("/", "_")) + object_metadata = store.store_object(pid, path) + + cid_refs_path = store._get_hashstore_cid_refs_path(object_metadata.cid) + store._delete("other", cid_refs_path) + assert store._count("cid") == 0 + + pid_refs_path = store._get_hashstore_pid_refs_path(pid) + store._delete("other", pid_refs_path) + assert store._count("pid") == 0 def test_create_path(pids, store): @@ -1742,15 +1762,39 @@ def test_create_path(pids, store): assert os.path.isdir(pid_directory) -def test_get_real_path_file_does_not_exist(store): - """Test get_real_path returns None when object does not exist.""" +def test_get_store_path_object(store): + """Check get_store_path for object path.""" + # pylint: disable=W0212 + path_objects = store._get_store_path("objects") + path_objects_string = str(path_objects) + assert path_objects_string.endswith("/metacat/hashstore/objects") + + +def test_get_store_path_metadata(store): + """Check get_store_path for metadata path.""" + # pylint: disable=W0212 + path_metadata = store._get_store_path("metadata") + path_metadata_string = str(path_metadata) + assert path_metadata_string.endswith("/metacat/hashstore/metadata") + + +def test_get_store_path_refs(store): + """Check get_store_path for refs path.""" + # pylint: disable=W0212 + path_metadata = store._get_store_path("refs") + path_metadata_string = str(path_metadata) + assert path_metadata_string.endswith("/metacat/hashstore/refs") + + +def test_get_hashstore_data_object_path_file_does_not_exist(store): + """Test _get_hashstore_data_object_path raises FileNotFoundError when object does not exist.""" test_path = "tests/testdata/helloworld.txt" with pytest.raises(FileNotFoundError): store._get_hashstore_data_object_path(test_path) -def test_get_real_path_with_object_id(store, pids): - """Test get_real_path returns absolute path given an object id.""" +def test_get_hashstore_data_object_path_with_object_id(store, pids): + """Test _get_hashstore_data_object_path returns absolute path given an object id.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -1759,20 +1803,8 @@ def test_get_real_path_with_object_id(store, pids): assert os.path.exists(obj_abs_path) -def test_get_real_path_with_object_id_sharded(pids, store): - """Test exists method with a sharded path (relative path).""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - object_metadata = 
store._store_and_validate_data(pid, path) - object_metadata_shard = store._shard(object_metadata.cid) - object_metadata_shard_path = "/".join(object_metadata_shard) - obj_abs_path = store._get_hashstore_data_object_path(object_metadata_shard_path) - assert os.path.exists(obj_abs_path) - - -def test_get_real_path_with_metadata_id(store, pids): - """Test get_real_path returns absolute path given a metadata id.""" +def test_get_hashstore_metadata_path_absolute_path(store, pids): + """Test _get_hashstore_metadata_path returns absolute path given a metadata id.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" for pid in pids.keys(): @@ -1783,50 +1815,7 @@ def test_get_real_path_with_metadata_id(store, pids): assert os.path.exists(metadata_abs_path) -def test_build_hashstore_data_object_path(store, pids): - """Test _build_hashstore_data_object_path builds the hashstore data object file path.""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = test_dir + pid.replace("/", "_") - _ = store._store_and_validate_data(pid, path) - # pylint: disable=W0212 - abs_path = store._build_hashstore_data_object_path(pids[pid][store.algorithm]) - assert os.path.exists(abs_path) - - -def test_count(pids, store): - """Check that count returns expected number of objects.""" - test_dir = "tests/testdata/" - entity = "objects" - for pid in pids.keys(): - path_string = test_dir + pid.replace("/", "_") - store._store_and_validate_data(pid, path_string) - assert store._count(entity) == 3 - - -def test_cast_to_bytes(store): - """Test _to_bytes returns bytes.""" - string = "teststring" - # pylint: disable=W0212 - string_bytes = store._cast_to_bytes(string) - assert isinstance(string_bytes, bytes) - - -def test_get_hashstore_data_object_path(pids, store): - """Confirm resolve path returns correct object path""" - test_dir = "tests/testdata/" - for pid in pids.keys(): - path = Path(test_dir + pid.replace("/", "_")) - object_metadata = store.store_object(pid, path) - cid = object_metadata.cid - - obj_resolved_path = store._get_hashstore_data_object_path(cid) - calculated_obj_path = store.objects + "/" + "/".join(store._shard(cid)) - - assert calculated_obj_path == obj_resolved_path - - -def test_get_hashstore_metadata_path_metadata(pids, store): +def test_get_hashstore_metadata_path_relative_path(pids, store): """Confirm resolve path returns correct metadata path.""" test_dir = "tests/testdata/" format_id = "https://ns.dataone.org/service/types/v2.0#SystemMetadata" @@ -1904,6 +1893,14 @@ def test_check_string(store): store._check_string(tab_line, "tab_line") +def test_cast_to_bytes(store): + """Test _to_bytes returns bytes.""" + string = "teststring" + # pylint: disable=W0212 + string_bytes = store._cast_to_bytes(string) + assert isinstance(string_bytes, bytes) + + def test_stream_reads_file(pids): """Test that a stream can read a file and yield its contents.""" test_dir = "tests/testdata/" From 3ab23098beff6e87e3e4847f644fb8ab9a157d28 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 09:27:01 -0700 Subject: [PATCH 18/22] Revert 'pyproject.toml' version and add version to 'hashstore/__init__.py' to prepare use of poetry bump version --- pyproject.toml | 2 +- src/hashstore/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 92d608f9..7aa46011 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hashstore" -version = "1.1.0" +version = "1.0.0" 
description = "HashStore, an object storage system using content identifiers." authors = ["Dou Mok ", "Matt Jones ", "Matthew Brooke", "Jing Tao", "Jeanette Clark", "Ian M. Nesbitt"] diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index a841efa3..dcf03937 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -19,3 +19,4 @@ from hashstore.hashstore import HashStore, HashStoreFactory __all__ = ("HashStore", "HashStoreFactory") +__version__ = "1.0.0" From 22624db56d237c69043e6d60b621a20c5aec46b8 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 09:28:03 -0700 Subject: [PATCH 19/22] Add poetry bump version code to 'pyproject.toml' --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7aa46011..41e45a0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,8 @@ python = ">=3.9" pathlib = ">=1.0.1" pyyaml = ">=6.0" +[tool.poetry_bumpversion.file."src/hashstore/__init__.py"] + [tool.poetry.group.dev.dependencies] pytest = ">=7.2.0" black = ">=22.10.0" From 50b8f8d044fc463b568e6288e6bfe7d6b867f9d5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 09:31:15 -0700 Subject: [PATCH 20/22] Update version to '1.1.0' via 'poetry version minor' --- pyproject.toml | 2 +- src/hashstore/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 41e45a0a..13d2e42b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hashstore" -version = "1.0.0" +version = "1.1.0" description = "HashStore, an object storage system using content identifiers." authors = ["Dou Mok ", "Matt Jones ", "Matthew Brooke", "Jing Tao", "Jeanette Clark", "Ian M. Nesbitt"] diff --git a/src/hashstore/__init__.py b/src/hashstore/__init__.py index dcf03937..be656f5e 100644 --- a/src/hashstore/__init__.py +++ b/src/hashstore/__init__.py @@ -19,4 +19,4 @@ from hashstore.hashstore import HashStore, HashStoreFactory __all__ = ("HashStore", "HashStoreFactory") -__version__ = "1.0.0" +__version__ = "1.1.0" From e78071195661616b455bb91953c840b4cf4e53f2 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 10:13:51 -0700 Subject: [PATCH 21/22] Update 'README.md' --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 163fa1c4..eeff4e9d 100644 --- a/README.md +++ b/README.md @@ -311,9 +311,9 @@ To install `hashstore` locally, create a virtual environment for python 3.9+, install poetry, and then install or build the package with `poetry install` or `poetry build`, respectively. -To run tests, navigate to the root directory and run `pytest -s`. The test suite contains tests that +To run tests, navigate to the root directory and run `pytest`. The test suite contains tests that take a longer time to run (relating to the storage of large files) - to execute all tests, run -`pytest --run-slow`. To see detailed +`pytest --run-slow`. 
## HashStore Client From 44b1f79675fa75f876b8cdc310235a337528cd55 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Tue, 24 Sep 2024 12:15:33 -0700 Subject: [PATCH 22/22] Add type hints to all method signatures, fix minor issues related to typing inconsistencies and revise affected pytests --- src/hashstore/filehashstore.py | 280 ++++++++++-------- tests/filehashstore/test_filehashstore.py | 9 +- .../test_filehashstore_interface.py | 2 +- 3 files changed, 166 insertions(+), 125 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 3fabd431..357e8774 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -11,6 +11,7 @@ import inspect import fcntl import yaml +from typing import List, Dict, Union, Optional, IO, Tuple, Set, Any from dataclasses import dataclass from pathlib import Path from contextlib import closing @@ -190,7 +191,9 @@ def __init__(self, properties=None): # Configuration and Related Methods @staticmethod - def _load_properties(hashstore_yaml_path, hashstore_required_prop_keys): + def _load_properties( + hashstore_yaml_path: str, hashstore_required_prop_keys: List[str] + ) -> Dict[str, Union[str, int]]: """Get and return the contents of the current HashStore configuration. :return: HashStore properties with the following keys (and values): @@ -222,7 +225,7 @@ def _load_properties(hashstore_yaml_path, hashstore_required_prop_keys): ) return hashstore_yaml_dict - def _write_properties(self, properties): + def _write_properties(self, properties: Dict[str, Union[str, int]]) -> None: """Writes 'hashstore.yaml' to FileHashStore's root directory with the respective properties object supplied. @@ -290,8 +293,11 @@ def _write_properties(self, properties): @staticmethod def _build_hashstore_yaml_string( - store_depth, store_width, store_algorithm, store_metadata_namespace - ): + store_depth: int, + store_width: int, + store_algorithm: str, + store_metadata_namespace: str, + ) -> str: """Build a YAML string representing the configuration for a HashStore. :param int store_depth: Depth when sharding an object's hex digest. @@ -338,7 +344,9 @@ def _build_hashstore_yaml_string( """ return hashstore_configuration_yaml - def _verify_hashstore_properties(self, properties, prop_store_path): + def _verify_hashstore_properties( + self, properties: Dict[str, Union[str, int]], prop_store_path: str + ) -> None: """Determines whether FileHashStore can instantiate by validating a set of arguments and throwing exceptions. HashStore will not instantiate if an existing configuration file's properties (`hashstore.yaml`) are different from what is supplied - or if an @@ -392,7 +400,9 @@ def _verify_hashstore_properties(self, properties, prop_store_path): logging.critical(exception_string) raise RuntimeError(exception_string) - def _validate_properties(self, properties): + def _validate_properties( + self, properties: Dict[str, Union[str, int]] + ) -> Dict[str, Union[str, int]]: """Validate a properties dictionary by checking if it contains all the required keys and non-None values. 
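The `_validate_properties` hunk above only adds type hints, but the guard it annotates is central to instantiation. A minimal sketch of the check its docstring describes, assuming the five property keys used by the configuration helpers elsewhere in this module (`store_path`, `store_depth`, `store_width`, `store_algorithm`, `store_metadata_namespace`):

```python
from typing import Dict, Union

# Assumed key set, inferred from this module's configuration helpers.
REQUIRED_PROPERTY_KEYS = [
    "store_path",
    "store_depth",
    "store_width",
    "store_algorithm",
    "store_metadata_namespace",
]

def validate_properties(
    properties: Dict[str, Union[str, int]]
) -> Dict[str, Union[str, int]]:
    """Require a dict containing every expected key with a non-None value."""
    if properties is None:
        raise ValueError("properties cannot be None")
    if not isinstance(properties, dict):
        raise ValueError("properties must be a dictionary")
    for key in REQUIRED_PROPERTY_KEYS:
        if key not in properties or properties[key] is None:
            raise KeyError(f"missing or None required property: {key}")
    return properties
```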
@@ -496,13 +506,13 @@ def lookup_algo(algo_to_translate): def store_object( self, - pid=None, - data=None, - additional_algorithm=None, - checksum=None, - checksum_algorithm=None, - expected_object_size=None, - ): + pid: Optional[str] = None, + data: Optional[Union[str, bytes]] = None, + additional_algorithm: Optional[str] = None, + checksum: Optional[str] = None, + checksum_algorithm: Optional[str] = None, + expected_object_size: Optional[int] = None, + ) -> "ObjectMetadata": if pid is None and self._check_arg_data(data): # If no pid is supplied, store the object only without tagging logging.debug("FileHashStore - store_object: Request to store data only.") @@ -587,7 +597,7 @@ def store_object( return object_metadata - def tag_object(self, pid, cid): + def tag_object(self, pid: str, cid: str) -> None: logging.debug( "FileHashStore - tag_object: Tagging object cid: %s with pid: %s.", cid, @@ -612,8 +622,12 @@ def tag_object(self, pid, cid): raise PidRefsAlreadyExistsError(err_msg) def delete_if_invalid_object( - self, object_metadata, checksum, checksum_algorithm, expected_file_size - ): + self, + object_metadata: "ObjectMetadata", + checksum: str, + checksum_algorithm: str, + expected_file_size: int, + ) -> None: self._check_string(checksum, "checksum") self._check_string(checksum_algorithm, "checksum_algorithm") self._check_integer(expected_file_size) @@ -657,7 +671,9 @@ def delete_if_invalid_object( object_metadata.cid, ) - def store_metadata(self, pid, metadata, format_id=None): + def store_metadata( + self, pid: str, metadata: Union[str, bytes], format_id: Optional[str] = None + ) -> str: logging.debug( "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) @@ -717,7 +733,7 @@ def store_metadata(self, pid, metadata, format_id=None): self.metadata_locked_docs_th.remove(pid_doc) self.metadata_condition_th.notify() - def retrieve_object(self, pid): + def retrieve_object(self, pid: str) -> IO[bytes]: logging.debug( "FileHashStore - retrieve_object: Request to retrieve object for pid: %s", pid, @@ -746,7 +762,7 @@ def retrieve_object(self, pid): return obj_stream - def retrieve_metadata(self, pid, format_id=None): + def retrieve_metadata(self, pid: str, format_id: Optional[str] = None) -> IO[bytes]: logging.debug( "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, @@ -777,7 +793,7 @@ def retrieve_metadata(self, pid, format_id=None): logging.error(exception_string) raise ValueError(exception_string) - def delete_object(self, pid): + def delete_object(self, pid: str) -> None: logging.debug( "FileHashStore - delete_object: Request to delete object for id: %s", pid ) @@ -854,7 +870,7 @@ def delete_object(self, pid): self._rename_path_for_deletion(pid_ref_abs_path) ) # Remove pid from cid reference file - self._update_refs_file(cid_ref_abs_path, pid, "remove") + self._update_refs_file(Path(cid_ref_abs_path), pid, "remove") # Delete cid reference file and object only if the cid refs file is empty if os.path.getsize(cid_ref_abs_path) == 0: debug_msg = ( @@ -926,7 +942,7 @@ def delete_object(self, pid): return except RefsFileExistsButCidObjMissing: # Add pid refs file to be permanently deleted - pid_ref_abs_path = str(self._get_hashstore_pid_refs_path(pid)) + pid_ref_abs_path = self._get_hashstore_pid_refs_path(pid) objects_to_delete.append( self._rename_path_for_deletion(pid_ref_abs_path) ) @@ -934,8 +950,8 @@ def delete_object(self, pid): pid_refs_cid = self._read_small_file_content(pid_ref_abs_path) cid_ref_abs_str = 
str(self._get_hashstore_cid_refs_path(pid_refs_cid)) # Remove if the pid refs is found - if self._is_string_in_refs_file(pid, cid_ref_abs_str): - self._update_refs_file(cid_ref_abs_str, pid, "remove") + if self._is_string_in_refs_file(pid, Path(cid_ref_abs_str)): + self._update_refs_file(Path(cid_ref_abs_str), pid, "remove") # Remove metadata files if they exist self.delete_metadata(pid) # Remove all files confirmed for deletion @@ -970,7 +986,7 @@ def delete_object(self, pid): self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() - def delete_metadata(self, pid, format_id=None): + def delete_metadata(self, pid: str, format_id: Optional[str] = None) -> None: logging.debug( "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, @@ -1100,7 +1116,7 @@ def delete_metadata(self, pid, format_id=None): self.metadata_locked_docs_th.remove(pid_doc) self.metadata_condition_th.notify() - def get_hex_digest(self, pid, algorithm): + def get_hex_digest(self, pid: str, algorithm: str) -> str: logging.debug( "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, @@ -1129,7 +1145,7 @@ def get_hex_digest(self, pid, algorithm): # FileHashStore Core Methods - def _find_object(self, pid): + def _find_object(self, pid: str) -> Dict[str, str]: """Check if an object referenced by a pid exists and retrieve its content identifier. The `find_object` method validates the existence of an object based on the provided pid and returns the associated content identifier. @@ -1157,7 +1173,7 @@ def _find_object(self, pid): cid_ref_abs_path = self._get_hashstore_cid_refs_path(pid_refs_cid) if os.path.exists(cid_ref_abs_path): # Check that the pid is actually found in the cid reference file - if self._is_string_in_refs_file(pid, str(cid_ref_abs_path)): + if self._is_string_in_refs_file(pid, cid_ref_abs_path): # Object must also exist in order to return the cid retrieved if not self._exists("objects", pid_refs_cid): err_msg = ( @@ -1216,13 +1232,13 @@ def _find_object(self, pid): def _store_and_validate_data( self, - pid, - file, - additional_algorithm=None, - checksum=None, - checksum_algorithm=None, - file_size_to_validate=None, - ): + pid: str, + file: Union[str, bytes], + additional_algorithm: Optional[str] = None, + checksum: Optional[str] = None, + checksum_algorithm: Optional[str] = None, + file_size_to_validate: Optional[int] = None, + ) -> "ObjectMetadata": """Store contents of `file` on disk, validate the object's parameters if provided, and tag/reference the object. @@ -1266,7 +1282,7 @@ def _store_and_validate_data( ) return object_metadata - def _store_data_only(self, data): + def _store_data_only(self, data: Union[str, bytes]) -> "ObjectMetadata": """Store an object to HashStore and return a metadata object containing the content identifier, object file size and hex digests dictionary of the default algorithms. This method does not validate the object and writes directly to `/objects` after the hex @@ -1320,13 +1336,13 @@ def _store_data_only(self, data): def _move_and_get_checksums( self, - pid, - stream, - additional_algorithm=None, - checksum=None, - checksum_algorithm=None, - file_size_to_validate=None, - ): + pid: Optional[str], + stream: "Stream", + additional_algorithm: Optional[str] = None, + checksum: Optional[str] = None, + checksum_algorithm: Optional[str] = None, + file_size_to_validate: Optional[int] = None, + ) -> Tuple[str, int, Dict[str, str]]: """Copy the contents of the `Stream` object onto disk. 
The copy process uses a temporary file to store the initial contents and returns a dictionary of algorithms and their hex digest values. If the file already exists, the method will immediately @@ -1477,8 +1493,11 @@ def _move_and_get_checksums( return object_cid, tmp_file_size, hex_digests def _write_to_tmp_file_and_get_hex_digests( - self, stream, additional_algorithm=None, checksum_algorithm=None - ): + self, + stream: "Stream", + additional_algorithm: Optional[str] = None, + checksum_algorithm: Optional[str] = None, + ) -> Tuple[Dict[str, str], str, int]: """Create a named temporary file from a `Stream` object and return its filename and a dictionary of its algorithms and hex digests. If an additional and/or checksum algorithm is provided, it will add the respective hex digest to the dictionary if @@ -1491,6 +1510,7 @@ def _write_to_tmp_file_and_get_hex_digests( :return: tuple - hex_digest_dict, tmp.name - hex_digest_dict (dict): Algorithms and their hex digests. - tmp.name (str): Name of the temporary file created and written into. + - tmp_file_size (int): Size of the data object """ # Review additional hash object to digest and create new list algorithm_list_to_calculate = self._refine_algorithm_list( @@ -1567,7 +1587,7 @@ def _write_to_tmp_file_and_get_hex_digests( ) logging.error(exception_string) - def _mktmpfile(self, path): + def _mktmpfile(self, path: Path) -> IO[bytes]: """Create a temporary file at the given path ready to be written. :param Path path: Path to the file location. @@ -1596,7 +1616,7 @@ def delete_tmp_file(): os.umask(oldmask) return tmp - def _store_hashstore_refs_files(self, pid, cid): + def _store_hashstore_refs_files(self, pid: str, cid: str) -> None: """Create the pid refs file and create/update cid refs files in HashStore to establish the relationship between a 'pid' and a 'cid'. @@ -1711,7 +1731,7 @@ def _store_hashstore_refs_files(self, pid, cid): self._release_object_locked_cids(cid) self._release_reference_locked_pids(pid) - def _untag_object(self, pid, cid): + def _untag_object(self, pid: str, cid: str) -> None: """Untags a data object in HashStore by deleting the 'pid reference file' and removing the 'pid' from the 'cid reference file'. This method will never delete a data object. `_untag_object` will attempt to proceed with as much of the untagging process as @@ -1837,7 +1857,9 @@ def _untag_object(self, pid, cid): ) logging.warning(warn_msg) - def _put_metadata(self, metadata, pid, metadata_doc_name): + def _put_metadata( + self, metadata: Union[str, bytes], pid: str, metadata_doc_name: str + ) -> Path: """Store contents of metadata to `[self.root]/metadata` using the hash of the given PID and format ID as the permanent address. @@ -1895,7 +1917,7 @@ def _put_metadata(self, metadata, pid, metadata_doc_name): logging.error(exception_string) raise FileNotFoundError(exception_string) - def _mktmpmetadata(self, stream): + def _mktmpmetadata(self, stream: "Stream") -> str: """Create a named temporary file with `stream` (metadata). :param Stream stream: Metadata stream. @@ -1925,7 +1947,7 @@ def _mktmpmetadata(self, stream): # FileHashStore Utility & Supporting Methods @staticmethod - def _delete_marked_files(delete_list): + def _delete_marked_files(delete_list: list[str]) -> None: """Delete all the file paths in a given delete list. :param list delete_list: Persistent or authority-based identifier. 
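`_delete_marked_files` (above) is the second half of a two-phase delete: paths are first moved aside with `_rename_path_for_deletion` (typed later in this patch), and only removed once every related refs and metadata check has passed. A condensed sketch of that pattern; the real methods add logging and per-file error handling that is elided here:

```python
import os
import shutil
from pathlib import Path
from typing import List, Union

def rename_for_deletion(path: Union[Path, str]) -> str:
    """Phase 1: move the file aside by appending '_delete' to its name."""
    path = Path(path)
    delete_path = path.with_name(path.stem + "_delete" + path.suffix)
    shutil.move(path, delete_path)
    return str(delete_path)

def delete_marked_files(delete_list: List[str]) -> None:
    """Phase 2: permanently remove every path that was marked."""
    if delete_list is None:
        raise ValueError("delete_marked_files: list cannot be None")
    for marked_path in delete_list:
        os.remove(marked_path)
```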
@@ -1940,7 +1962,9 @@ def _delete_marked_files(delete_list): else: raise ValueError("delete_marked_files: list cannot be None") - def _mark_pid_refs_file_for_deletion(self, pid, delete_list, pid_refs_path): + def _mark_pid_refs_file_for_deletion( + self, pid: str, delete_list: List[str], pid_refs_path: Path + ) -> None: """Attempt to rename a pid refs file and add the renamed file to a provided list. :param str pid: Persistent or authority-based identifier. @@ -1957,7 +1981,9 @@ def _mark_pid_refs_file_for_deletion(self, pid, delete_list, pid_refs_path): ) logging.error(err_msg) - def _remove_pid_and_handle_cid_refs_deletion(self, pid, delete_list, cid_refs_path): + def _remove_pid_and_handle_cid_refs_deletion( + self, pid: str, delete_list: List[str], cid_refs_path: Path + ) -> None: """Attempt to remove a pid from a 'cid refs file' and add the 'cid refs file' to the delete list if it is empty. @@ -1979,7 +2005,9 @@ def _remove_pid_and_handle_cid_refs_deletion(self, pid, delete_list, cid_refs_pa ) logging.error(err_msg) - def _validate_and_check_cid_lock(self, pid, cid, cid_to_check): + def _validate_and_check_cid_lock( + self, pid: str, cid: str, cid_to_check: str + ) -> None: """Confirm that the two content identifiers provided are equal and is locked to ensure thread safety. @@ -1998,7 +2026,7 @@ def _validate_and_check_cid_lock(self, pid, cid, cid_to_check): raise ValueError(err_msg) self._check_object_locked_cids(cid) - def _write_refs_file(self, path, ref_id, ref_type): + def _write_refs_file(self, path: Path, ref_id: str, ref_type: str) -> str: """Write a reference file in the supplied path into a temporary file. All `pid` or `cid` reference files begin with a single identifier, with the difference being that a cid reference file can potentially contain multiple @@ -2034,7 +2062,9 @@ def _write_refs_file(self, path, ref_id, ref_type): logging.error(exception_string) raise err - def _update_refs_file(self, refs_file_path, ref_id, update_type): + def _update_refs_file( + self, refs_file_path: Path, ref_id: str, update_type: str + ) -> None: """Add or remove an existing ref from a refs file. :param path refs_file_path: Absolute path to the refs file. @@ -2090,7 +2120,7 @@ def _update_refs_file(self, refs_file_path, ref_id, update_type): raise err @staticmethod - def _is_string_in_refs_file(ref_id, refs_file_path): + def _is_string_in_refs_file(ref_id: str, refs_file_path: Path) -> bool: """Check a reference file for a ref_id (`cid` or `pid`). :param str ref_id: Authority-based, persistent identifier or content identifier @@ -2109,15 +2139,15 @@ def _is_string_in_refs_file(ref_id, refs_file_path): def _verify_object_information( self, - pid, - checksum, - checksum_algorithm, - entity, - hex_digests, - tmp_file_name, - tmp_file_size, - file_size_to_validate, - ): + pid: Optional[str], + checksum: str, + checksum_algorithm: str, + entity: str, + hex_digests: Dict[str, str], + tmp_file_name: Optional[str], + tmp_file_size: int, + file_size_to_validate: int, + ) -> None: """Evaluates an object's integrity - if there is a mismatch, deletes the object in question and raises an exception. 
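`_verify_object_information` (just above) checks size first, then the checksum, recomputing with the requested algorithm when it is absent from the default hex digests dict; that recomputation is exactly the case whose error message patch 16 revised. A condensed sketch of the control flow, with the typed exceptions and tmp-file cleanup of the real method elided:

```python
import hashlib
from typing import Dict

def verify_object(path: str, file_size: int, expected_size: int,
                  checksum: str, checksum_algorithm: str,
                  hex_digests: Dict[str, str]) -> None:
    # Cheapest check first: compare byte counts.
    if file_size != expected_size:
        raise ValueError(f"size mismatch: {file_size} != {expected_size}")
    # Prefer the digest computed during the initial write ...
    digest = hex_digests.get(checksum_algorithm)
    if digest is None:
        # ... otherwise recompute with the requested, supported algorithm.
        hasher = hashlib.new(checksum_algorithm)
        with open(path, "rb") as data:
            for chunk in iter(lambda: data.read(8192), b""):
                hasher.update(chunk)
        digest = hasher.hexdigest()
    if digest != checksum:
        raise ValueError(f"checksum mismatch: {digest} != {checksum}")
```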
@@ -2198,12 +2228,12 @@ def _verify_object_information( def _verify_hashstore_references( self, - pid, - cid, - pid_refs_path=None, - cid_refs_path=None, - additional_log_string=None, - ): + pid: str, + cid: str, + pid_refs_path: Optional[Path] = None, + cid_refs_path: Optional[Path] = None, + additional_log_string: Optional[str] = None, + ) -> None: """Verifies that the supplied pid and pid reference file and content have been written successfully. @@ -2227,7 +2257,7 @@ def _verify_hashstore_references( if not os.path.exists(pid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Pid refs file missing: " - + pid_refs_path + + str(pid_refs_path) + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) @@ -2235,7 +2265,7 @@ def _verify_hashstore_references( if not os.path.exists(cid_refs_path): exception_string = ( "FileHashStore - _verify_hashstore_references: Cid refs file missing: " - + cid_refs_path + + str(cid_refs_path) + f" . Additional Context: {additional_log_string}" ) logging.error(exception_string) @@ -2262,7 +2292,7 @@ def _verify_hashstore_references( logging.error(exception_string) raise CidRefsContentError(exception_string) - def _delete_object_only(self, cid): + def _delete_object_only(self, cid: str) -> None: """Attempt to delete an object based on the given content identifier (cid). If the object has any pids references and/or a cid refs file exists, the object will not be deleted. @@ -2314,8 +2344,11 @@ def _delete_object_only(self, cid): self.object_cid_condition_th.notify() def _check_arg_algorithms_and_checksum( - self, additional_algorithm, checksum, checksum_algorithm - ): + self, + additional_algorithm: Optional[str], + checksum: Optional[str], + checksum_algorithm: Optional[str], + ) -> Tuple[Optional[str], Optional[str]]: """Determines whether the caller has supplied the necessary arguments to validate an object with a checksum value. @@ -2343,7 +2376,7 @@ def _check_arg_algorithms_and_checksum( checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) return additional_algorithm_checked, checksum_algorithm_checked - def _check_arg_format_id(self, format_id, method): + def _check_arg_format_id(self, format_id: str, method: str) -> str: """Determines the metadata namespace (format_id) to use for storing, retrieving, and deleting metadata. @@ -2364,7 +2397,9 @@ def _check_arg_format_id(self, format_id, method): checked_format_id = format_id return checked_format_id - def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): + def _refine_algorithm_list( + self, additional_algorithm: Optional[str], checksum_algorithm: Optional[str] + ) -> Set[str]: """Create the final list of hash algorithms to calculate. :param str additional_algorithm: Additional algorithm. @@ -2397,7 +2432,7 @@ def _refine_algorithm_list(self, additional_algorithm, checksum_algorithm): algorithm_list_to_calculate = set(algorithm_list_to_calculate) return algorithm_list_to_calculate - def _clean_algorithm(self, algorithm_string): + def _clean_algorithm(self, algorithm_string: str) -> str: """Format a string and ensure that it is supported and compatible with the Python `hashlib` library. 
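`_clean_algorithm` (the last hunk above) normalizes caller-supplied algorithm names. A minimal sketch of the behaviour the interface pytests exercise ("SHA-384", "sha3-256", "sha3_256" and "sha_256" are all accepted); the membership test against `hashlib` stands in here for the store's own supported-algorithm list:

```python
import hashlib

def clean_algorithm(algorithm_string: str) -> str:
    """Normalize e.g. 'SHA-384' -> 'sha384' and 'sha3-256' -> 'sha3_256'."""
    cleaned = algorithm_string.lower().replace("-", "_")
    if not cleaned.startswith("sha3_"):
        # hashlib spells non-sha3 names without separators.
        cleaned = cleaned.replace("_", "")
    if cleaned not in hashlib.algorithms_available:
        raise ValueError(f"Algorithm not supported: {algorithm_string}")
    return cleaned

assert clean_algorithm("SHA-384") == "sha384"
assert clean_algorithm("sha3_256") == "sha3_256"
```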
@@ -2427,7 +2462,9 @@ def _clean_algorithm(self, algorithm_string): raise UnsupportedAlgorithm(exception_string) return cleaned_string - def _computehash(self, stream, algorithm=None): + def _computehash( + self, stream: Union["Stream", str, IO[bytes]], algorithm: Optional[str] = None + ) -> str: """Compute the hash of a file-like object (or string) using the store algorithm by default or with an optional supported algorithm. @@ -2448,7 +2485,7 @@ def _computehash(self, stream, algorithm=None): hex_digest = hashobj.hexdigest() return hex_digest - def _shard(self, checksum): + def _shard(self, checksum: str) -> List[str]: """Splits the given checksum into a list of tokens of length `self.width`, followed by the remainder. @@ -2468,7 +2505,7 @@ def _shard(self, checksum): :rtype: list """ - def compact(items): + def compact(items: List[Any]) -> List[Any]: """Return only truthy elements of `items`.""" # truthy_items = [] # for item in items: @@ -2486,7 +2523,7 @@ def compact(items): return hierarchical_list - def _count(self, entity): + def _count(self, entity: str) -> int: """Return the count of the number of files in the `root` directory. :param str entity: Desired entity type (ex. "objects", "metadata"). @@ -2515,7 +2552,7 @@ def _count(self, entity): count += 1 return count - def _exists(self, entity, file): + def _exists(self, entity: str, file: str) -> bool: """Check whether a given file id or path exists on disk. :param str entity: Desired entity type (e.g., "objects", "metadata"). @@ -2535,7 +2572,9 @@ def _exists(self, entity, file): except FileNotFoundError: return False - def _open(self, entity, file, mode="rb"): + def _open( + self, entity: str, file: str, mode: str = "rb" + ) -> Union[IO[bytes], IO[str]]: """Return open buffer object from given id or path. Caller is responsible for closing the stream. @@ -2559,7 +2598,7 @@ def _open(self, entity, file, mode="rb"): buffer = io.open(realpath, mode) return buffer - def _delete(self, entity, file): + def _delete(self, entity: str, file: Union[str, Path]) -> None: """Delete file using id or path. Remove any empty directories after deleting. No exception is raised if file doesn't exist. @@ -2594,7 +2633,7 @@ def _delete(self, entity, file): logging.error(exception_string) raise err - def _create_path(self, path): + def _create_path(self, path: Path) -> None: """Physically create the folder path (and all intermediate ones) on disk. :param Path path: The path to create. @@ -2605,7 +2644,7 @@ def _create_path(self, path): except FileExistsError: assert os.path.isdir(path), f"expected {path} to be a directory" - def _get_store_path(self, entity): + def _get_store_path(self, entity: str) -> Path: """Return a path object to the root directory of the requested hashstore directory type :param str entity: Desired entity type: "objects", "metadata", "refs", "cid" and "pid". @@ -2629,7 +2668,7 @@ def _get_store_path(self, entity): f"entity: {entity} does not exist. Do you mean 'objects', 'metadata' or 'refs'?" ) - def _build_hashstore_data_object_path(self, hash_id): + def _build_hashstore_data_object_path(self, hash_id: str) -> str: """Build the absolute file path for a given content identifier :param str hash_id: A hash ID to build a file path for. 
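`_shard` (annotated above) is what turns a digest into the nested directory layout used by the path helpers that follow. A minimal sketch, assuming the default depth of 3 and width of 2 implied by the `test_shard` expectation earlier in this series:

```python
from typing import List

def shard(digest: str, depth: int = 3, width: int = 2) -> List[str]:
    """Split a digest into `depth` tokens of `width` chars, then the remainder."""
    tokens = [digest[i * width:(i + 1) * width] for i in range(depth)]
    tokens.append(digest[depth * width:])
    return [token for token in tokens if token]  # drop empty trailing tokens

assert shard("0d555ed77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e") == [
    "0d",
    "55",
    "5e",
    "d77052d7e166017f779cbc193357c3a5006ee8b8457230bcf7abcef65e",
]
```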
@@ -2642,7 +2681,7 @@ def _build_hashstore_data_object_path(self, hash_id): absolute_path = os.path.join(root_dir, *paths) return absolute_path - def _get_hashstore_data_object_path(self, cid_or_relative_path): + def _get_hashstore_data_object_path(self, cid_or_relative_path: str) -> Path: """Get the expected path to a hashstore data object that exists using a content identifier. :param str cid_or_relative_path: Content identifier or relative path in '/objects' to check @@ -2654,16 +2693,16 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path): cid_or_relative_path ) if os.path.isfile(expected_abs_data_obj_path): - return expected_abs_data_obj_path + return Path(expected_abs_data_obj_path) else: if os.path.isfile(cid_or_relative_path): # Check whether the supplied arg is an abs path that exists or not for convenience - return cid_or_relative_path + return Path(cid_or_relative_path) else: # Check the relative path relpath = os.path.join(self.objects, cid_or_relative_path) if os.path.isfile(relpath): - return relpath + return Path(relpath) else: raise FileNotFoundError( "FileHashStore - _get_hashstore_data_object_path: could not locate a" @@ -2671,7 +2710,7 @@ def _get_hashstore_data_object_path(self, cid_or_relative_path): + cid_or_relative_path ) - def _get_hashstore_metadata_path(self, metadata_relative_path): + def _get_hashstore_metadata_path(self, metadata_relative_path: str) -> Path: """Return the expected metadata path to a hashstore metadata object that exists. :param str metadata_relative_path: Metadata path to check or relative path in @@ -2683,11 +2722,11 @@ def _get_hashstore_metadata_path(self, metadata_relative_path): # Form the absolute path to the metadata file expected_abs_metadata_path = os.path.join(self.metadata, metadata_relative_path) if os.path.isfile(expected_abs_metadata_path): - return expected_abs_metadata_path + return Path(expected_abs_metadata_path) else: if os.path.isfile(metadata_relative_path): # Check whether the supplied arg is an abs path that exists or not for convenience - return metadata_relative_path + return Path(metadata_relative_path) else: raise FileNotFoundError( "FileHashStore - _get_hashstore_metadata_path: could not locate a" @@ -2695,7 +2734,7 @@ def _get_hashstore_metadata_path(self, metadata_relative_path): + metadata_relative_path ) - def _get_hashstore_pid_refs_path(self, pid): + def _get_hashstore_pid_refs_path(self, pid: str) -> Path: """Return the expected path to a pid reference file. The path may or may not exist. :param str pid: Persistent or authority-based identifier @@ -2708,9 +2747,9 @@ def _get_hashstore_pid_refs_path(self, pid): root_dir = self._get_store_path("pid") directories_and_path = self._shard(hash_id) pid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) - return pid_ref_file_abs_path + return Path(pid_ref_file_abs_path) - def _get_hashstore_cid_refs_path(self, cid): + def _get_hashstore_cid_refs_path(self, cid: str) -> Path: """Return the expected path to a cid reference file. The path may or may not exist. 
:param str cid: Content identifier @@ -2722,11 +2761,11 @@ def _get_hashstore_cid_refs_path(self, cid): # The content identifier is to be split into directories as is supplied directories_and_path = self._shard(cid) cid_ref_file_abs_path = os.path.join(root_dir, *directories_and_path) - return cid_ref_file_abs_path + return Path(cid_ref_file_abs_path) # Synchronization Methods - def _release_object_locked_pids(self, pid): + def _release_object_locked_pids(self, pid: str) -> None: """Remove the given persistent identifier from 'object_locked_pids' and notify other waiting threads or processes. @@ -2742,7 +2781,7 @@ def _release_object_locked_pids(self, pid): self.object_locked_pids_th.remove(pid) self.object_pid_condition_th.notify() - def _synchronize_object_locked_cids(self, cid): + def _synchronize_object_locked_cids(self, cid: str) -> None: """Multiple threads may access a data object via its 'cid' or the respective 'cid reference file' (which contains a list of 'pid's that reference a 'cid') and this needs to be coordinated. @@ -2776,7 +2815,7 @@ def _synchronize_object_locked_cids(self, cid): + f" cid: {cid}" ) - def _check_object_locked_cids(self, cid): + def _check_object_locked_cids(self, cid: str) -> None: """Check that a given content identifier is currently locked (found in the 'object_locked_cids' array). If it is not, an exception will be thrown. @@ -2793,7 +2832,7 @@ def _check_object_locked_cids(self, cid): logging.error(err_msg) raise IdentifierNotLocked(err_msg) - def _release_object_locked_cids(self, cid): + def _release_object_locked_cids(self, cid: str) -> None: """Remove the given content identifier from 'object_locked_cids' and notify other waiting threads or processes. @@ -2818,7 +2857,7 @@ def _release_object_locked_cids(self, cid): ) logging.debug(end_sync_debug_msg) - def _synchronize_referenced_locked_pids(self, pid): + def _synchronize_referenced_locked_pids(self, pid: str) -> None: """Multiple threads may interact with a pid (to tag, untag, delete) and these actions must be coordinated to prevent unexpected behaviour/race conditions that cause chaos. @@ -2851,7 +2890,7 @@ def _synchronize_referenced_locked_pids(self, pid): + f" for pid: {pid}" ) - def _check_reference_locked_pids(self, pid): + def _check_reference_locked_pids(self, pid: str) -> None: """Check that a given persistent identifier is currently locked (found in the 'reference_locked_pids' array). If it is not, an exception will be thrown. @@ -2868,7 +2907,7 @@ def _check_reference_locked_pids(self, pid): logging.error(err_msg) raise IdentifierNotLocked(err_msg) - def _release_reference_locked_pids(self, pid): + def _release_reference_locked_pids(self, pid: str) -> None: """Remove the given persistent identifier from 'reference_locked_pids' and notify other waiting threads or processes. @@ -2896,7 +2935,7 @@ def _release_reference_locked_pids(self, pid): # Other Static Methods @staticmethod - def _read_small_file_content(path_to_file): + def _read_small_file_content(path_to_file: Path): """Read the contents of a file with the given path. This method is not optimized for large files - so it should only be used for small files (like reference files). @@ -2910,10 +2949,10 @@ def _read_small_file_content(path_to_file): return content @staticmethod - def _rename_path_for_deletion(path): + def _rename_path_for_deletion(path: Union[Path, str]) -> str: """Rename a given path by appending '_delete' and move it to the renamed path. 
- :param string path: Path to file to rename + :param Path path: Path to file to rename :return: Path to the renamed file :rtype: str @@ -2922,10 +2961,11 @@ def _rename_path_for_deletion(path): path = Path(path) delete_path = path.with_name(path.stem + "_delete" + path.suffix) shutil.move(path, delete_path) - return delete_path + # TODO: Adjust all code for constructing paths to use path and revise accordingly + return str(delete_path) @staticmethod - def _get_file_paths(directory): + def _get_file_paths(directory: Union[str, Path]) -> Optional[List[Path]]: """Get the file paths of a given directory if it exists :param mixed directory: String or path to directory. @@ -2945,7 +2985,7 @@ def _get_file_paths(directory): return None @staticmethod - def _check_arg_data(data): + def _check_arg_data(data: Union[str, os.PathLike, io.BufferedReader]) -> bool: """Checks a data argument to ensure that it is either a string, path, or stream object. @@ -2976,7 +3016,7 @@ def _check_arg_data(data): return True @staticmethod - def _check_integer(file_size): + def _check_integer(file_size: int) -> None: """Check whether a given argument is an integer and greater than 0; throw an exception if not. @@ -2998,7 +3038,7 @@ def _check_integer(file_size): raise ValueError(exception_string) @staticmethod - def _check_string(string, arg): + def _check_string(string: str, arg: str) -> None: """Check whether a string is None or empty - or if it contains an illegal character; throws an exception if so. @@ -3015,7 +3055,7 @@ def _check_string(string, arg): raise ValueError(exception_string) @staticmethod - def _cast_to_bytes(text): + def _cast_to_bytes(text: Any) -> bytes: """Convert text to a sequence of bytes using utf-8 encoding. :param Any text: String to convert. @@ -3027,7 +3067,7 @@ def _cast_to_bytes(text): return text -class Stream(object): +class Stream: """Common interface for file-like objects. The input `obj` can be a file-like object or a path to a file. If `obj` is @@ -3041,7 +3081,7 @@ class Stream: set its position back to ``0``. """ - def __init__(self, obj): + def __init__(self, obj: Union[IO[bytes], str]): if hasattr(obj, "read"): pos = obj.tell() elif os.path.isfile(obj): @@ -3099,7 +3139,7 @@ class ObjectMetadata: :param str pid: An authority-based or persistent identifier :param str cid: A unique identifier for the object (Hash ID, hex digest). :param int obj_size: The size of the object in bytes. - :param list hex_digests: A list of hex digests to validate objects + :param dict hex_digests: A dictionary of hex digests to validate objects (md5, sha1, sha256, sha384, sha512) (optional). 
""" diff --git a/tests/filehashstore/test_filehashstore.py b/tests/filehashstore/test_filehashstore.py index 4c150b7f..ead33e2c 100644 --- a/tests/filehashstore/test_filehashstore.py +++ b/tests/filehashstore/test_filehashstore.py @@ -1160,9 +1160,10 @@ def test_remove_pid_and_handle_cid_refs_deletion_cid_refs_empty(store): cid_refs_path = store._get_hashstore_cid_refs_path(cid) store._remove_pid_and_handle_cid_refs_deletion(pid, list_to_check, cid_refs_path) + delete_path = cid_refs_path.with_name(cid_refs_path.name + "_delete") assert not os.path.exists(cid_refs_path) - assert os.path.exists(cid_refs_path + "_delete") + assert os.path.exists(delete_path) assert len(list_to_check) == 1 @@ -1836,7 +1837,7 @@ def test_get_hashstore_metadata_path_relative_path(pids, store): store.metadata + "/" + rel_path + "/" + metadata_document_name ) - assert calculated_metadata_path == metadata_resolved_path + assert Path(calculated_metadata_path) == metadata_resolved_path def test_get_hashstore_pid_refs_path(pids, store): @@ -1852,7 +1853,7 @@ def test_get_hashstore_pid_refs_path(pids, store): store.pids + "/" + "/".join(store._shard(pid_refs_metadata_hashid)) ) - assert resolved_pid_ref_abs_path == calculated_pid_ref_path + assert resolved_pid_ref_abs_path == Path(calculated_pid_ref_path) def test_get_hashstore_cid_refs_path(pids, store): @@ -1866,7 +1867,7 @@ def test_get_hashstore_cid_refs_path(pids, store): resolved_cid_ref_abs_path = store._get_hashstore_cid_refs_path(cid) calculated_cid_ref_path = store.cids + "/" + "/".join(store._shard(cid)) - assert resolved_cid_ref_abs_path == calculated_cid_ref_path + assert resolved_cid_ref_abs_path == Path(calculated_cid_ref_path) def test_check_string(store): diff --git a/tests/filehashstore/test_filehashstore_interface.py b/tests/filehashstore/test_filehashstore_interface.py index 791d9d82..0f93a64a 100644 --- a/tests/filehashstore/test_filehashstore_interface.py +++ b/tests/filehashstore/test_filehashstore_interface.py @@ -1104,7 +1104,7 @@ def test_store_metadata_metadata_path(pids, store): _object_metadata = store.store_object(pid, path) stored_metadata_path = store.store_metadata(pid, syspath, format_id) metadata_path = store._get_hashstore_metadata_path(stored_metadata_path) - assert stored_metadata_path == metadata_path + assert Path(stored_metadata_path) == metadata_path def test_store_metadata_thread_lock(store):