diff --git a/src/optimake/convert.py b/src/optimake/convert.py index 026dfd1..7afcab6 100644 --- a/src/optimake/convert.py +++ b/src/optimake/convert.py @@ -7,6 +7,7 @@ import os import warnings from collections import defaultdict +import datetime from pathlib import Path from typing import Any, Callable @@ -258,9 +259,51 @@ def _parse_entries( f"None of the provided parsers {ENTRY_PARSERS[entry_type]} could parse {_path}. Errors: {exceptions}" ) + if len(set(entry_ids)) != len(entry_ids): + raise RuntimeError("Duplicate entry IDs found even when generated directly from filepaths. This should not be possible.") + return parsed_entries, entry_ids +def _set_unique_entry_ids(entry_ids: list[str]) -> list[str]: + """Attempt to make the simplest unique set of entry IDs possible, + following a series of deterministic rules. + + Parameters: + entry_ids: A list of entry IDs derived from file paths. + + Returns: + A list of unique entry IDs. + + """ + + new_ids: list[str] = list(entry_ids) + target_num_ids = len(entry_ids) + depth: int = 0 + max_depth: int = 10 # somewhat arbitrary upper limit + # Loop through each filename and try to ablate directories until a unique set arises + while len(set(new_ids)) != target_num_ids and depth < max_depth: + for i, id in enumerate(entry_ids): + new_ids[i] = "/".join(id.split("/")[-1 - depth:]) + depth += 1 + + # Now try to ablate any common file names, e.g,. 
subfolders of POSCARs (1/POSCAR, 2/POSCAR) + # Loop through each filename and try to ablate directories until a unique set arises + new_ids_sans_common_filenames = ["/".join(new_id.split("/")[0:-2]) for new_id in new_ids] + if len(set(new_ids_sans_common_filenames)) == target_num_ids: + new_ids = new_ids_sans_common_filenames + + # Now try to ablate any file extensions + new_ids_sans_extensions = [id.split(".")[0] for id in new_ids] + if len(set(new_ids_sans_extensions)) == target_num_ids: + return new_ids_sans_extensions + + if len(set(new_ids)) != target_num_ids: + return entry_ids + + return new_ids + + def _parse_and_assign_properties( optimade_entries: dict[str, EntryResource], property_matches_by_file: dict[str | None, list[Path]], @@ -369,22 +412,27 @@ def construct_entries( _check_missing(entry_matches_by_file) # Parse into intermediate format - parsed_entries, entry_ids = _parse_entries( + parsed_entries, file_path_entry_ids = _parse_entries( archive_path, entry_matches_by_file, entry_config.entry_type, ) + # Generate a better set of entry IDs + unique_entry_ids = _set_unique_entry_ids(file_path_entry_ids) + # Parse properties property_matches_by_file: dict[str | None, list[Path]] = _get_matches( archive_path, entry_config.property_paths ) _check_missing(property_matches_by_file) + timestamp = datetime.datetime.now().isoformat() + # Construct OPTIMADE entries from intermediate format optimade_entries: dict[str, EntryResource] = {} - for entry_id, entry in tqdm.tqdm( - zip(entry_ids, parsed_entries), + for file_path_entry_id, unique_entry_id, entry in tqdm.tqdm( + zip(file_path_entry_ids, unique_entry_ids, parsed_entries), desc=f"Constructing OPTIMADE {entry_config.entry_type} entries", ): exceptions = {} @@ -406,12 +454,16 @@ def construct_entries( entry = entry.dict() if not entry["id"]: - entry["id"] = entry_id + entry["id"] = unique_entry_id + + if entry["id"] in optimade_entries: + raise RuntimeError(f"Duplicate entry ID found: {entry['id']}") - if 
def test_unique_id_generator():
    """Check the unique ID generator against a table of common archive layouts."""

    from optimake.convert import _set_unique_entry_ids

    simple_ids = ["1", "2", "3"]

    # Entries that only differ in their top-level directory: nothing can be
    # ablated, so the generator is expected to hand the input back as-is.
    ambiguous_ids = [
        "data1/structures/1/POSCAR",
        "data2/structures/1/POSCAR",
        "data3/structures/1/POSCAR",
    ]

    # (input entry IDs, expected unique IDs), checked in order.
    cases = [
        (
            [
                "data/structures/1.cif",
                "data/structures/2.cif",
                "data/structures/3.cif",
            ],
            simple_ids,
        ),
        (
            ["data/structures/1", "data/structures/2", "data/structures/3"],
            simple_ids,
        ),
        (
            [
                "data/structures/1/POSCAR",
                "data/structures/2/POSCAR",
                "data/structures/3/POSCAR",
            ],
            simple_ids,
        ),
        (ambiguous_ids, ambiguous_ids),
        (
            [
                "data.zip/data/structures/1.cif",
                "data.zip/data/structures/2.cif",
                "data.zip/data/structures/3.cif",
            ],
            simple_ids,
        ),
        (
            [
                "data.tar.gz/data/structures/1.cif",
                "data.tar.gz/data/structures/2.cif",
                "data.tar.gz/data/structures/3.cif",
            ],
            simple_ids,
        ),
        (
            [
                "data.tar.gz/data/structures/1.cif.gz",
                "data.tar.gz/data/structures/2.cif.gz",
                "data.tar.gz/data/structures/3.cif.gz",
            ],
            simple_ids,
        ),
    ]

    for given_ids, expected_ids in cases:
        assert _set_unique_entry_ids(given_ids) == expected_ids