Define new scheme for ID generation and set immutable_id and `last_modified`

ml-evs committed Mar 31, 2024
1 parent f1de282 commit 4a74d77
Showing 2 changed files with 111 additions and 8 deletions.
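In short, the new `_set_unique_entry_ids` helper derives the shortest entry IDs that remain unique across an archive, ablating leading directories, any shared trailing filename, and file extensions, and falling back to the full file path when nothing shorter is unique. A quick sketch of the intended behaviour, mirroring the unit tests added below:

    from optimake.convert import _set_unique_entry_ids

    _set_unique_entry_ids(
        ["data/structures/1.cif", "data/structures/2.cif", "data/structures/3.cif"]
    )
    # -> ["1", "2", "3"]

    _set_unique_entry_ids(
        ["data1/structures/1/POSCAR", "data2/structures/1/POSCAR", "data3/structures/1/POSCAR"]
    )
    # -> unchanged: no shorter suffix is unique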
66 changes: 59 additions & 7 deletions src/optimake/convert.py
@@ -7,6 +7,7 @@
 import os
 import warnings
 from collections import defaultdict
+import datetime
 from pathlib import Path
 from typing import Any, Callable

@@ -258,9 +259,51 @@ def _parse_entries(
             f"None of the provided parsers {ENTRY_PARSERS[entry_type]} could parse {_path}. Errors: {exceptions}"
         )
 
+    if len(set(entry_ids)) != len(entry_ids):
+        raise RuntimeError(
+            "Duplicate entry IDs found even when generated directly from filepaths. This should not be possible."
+        )
+
     return parsed_entries, entry_ids


+def _set_unique_entry_ids(entry_ids: list[str]) -> list[str]:
+    """Attempt to make the simplest unique set of entry IDs possible,
+    following a series of deterministic rules.
+
+    Parameters:
+        entry_ids: A list of entry IDs derived from file paths.
+
+    Returns:
+        A list of unique entry IDs.
+
+    """
+    new_ids: list[str] = []
+    target_num_ids = len(entry_ids)
+    depth: int = 0
+    max_depth: int = 10  # somewhat arbitrary upper limit
+
+    # Ablate leading directories from each file path until a unique set arises
+    while len(set(new_ids)) != target_num_ids and depth < max_depth:
+        new_ids = [
+            "/".join(entry_id.split("/")[-1 - depth :]) for entry_id in entry_ids
+        ]
+        depth += 1
+
+    # Now try to ablate any common file names, e.g., subfolders of POSCARs
+    # (1/POSCAR, 2/POSCAR), but only if some directories were ablated above
+    if new_ids != entry_ids:
+        new_ids_sans_common_filenames = [
+            "/".join(new_id.split("/")[0:-1]) for new_id in new_ids
+        ]
+        if (
+            all(new_ids_sans_common_filenames)
+            and len(set(new_ids_sans_common_filenames)) == target_num_ids
+        ):
+            new_ids = new_ids_sans_common_filenames
+
+    # Now try to ablate any file extensions
+    new_ids_sans_extensions = [new_id.split(".")[0] for new_id in new_ids]
+    if len(set(new_ids_sans_extensions)) == target_num_ids:
+        return new_ids_sans_extensions
+
+    # If no unique set was found, fall back to the full file paths
+    if len(set(new_ids)) != target_num_ids:
+        return entry_ids
+
+    return new_ids


 def _parse_and_assign_properties(
     optimade_entries: dict[str, EntryResource],
     property_matches_by_file: dict[str | None, list[Path]],

@@ -369,22 +412,27 @@ def construct_entries(
     _check_missing(entry_matches_by_file)
 
     # Parse into intermediate format
-    parsed_entries, entry_ids = _parse_entries(
+    parsed_entries, file_path_entry_ids = _parse_entries(
         archive_path,
         entry_matches_by_file,
         entry_config.entry_type,
     )
 
+    # Generate a better set of entry IDs
+    unique_entry_ids = _set_unique_entry_ids(file_path_entry_ids)
+
     # Parse properties
     property_matches_by_file: dict[str | None, list[Path]] = _get_matches(
         archive_path, entry_config.property_paths
     )
     _check_missing(property_matches_by_file)
 
+    timestamp = datetime.datetime.now().isoformat()
+
     # Construct OPTIMADE entries from intermediate format
     optimade_entries: dict[str, EntryResource] = {}
-    for entry_id, entry in tqdm.tqdm(
-        zip(entry_ids, parsed_entries),
+    for file_path_entry_id, unique_entry_id, entry in tqdm.tqdm(
+        zip(file_path_entry_ids, unique_entry_ids, parsed_entries),
         desc=f"Constructing OPTIMADE {entry_config.entry_type} entries",
     ):
         exceptions = {}

@@ -406,12 +454,16 @@
             entry = entry.dict()
 
         if not entry["id"]:
-            entry["id"] = entry_id
+            entry["id"] = unique_entry_id
 
-        if entry_id in optimade_entries:
-            raise RuntimeError(f"Duplicate entry ID found: {entry_id}")
+        if entry["id"] in optimade_entries:
+            raise RuntimeError(f"Duplicate entry ID found: {entry['id']}")
 
-        optimade_entries[entry_id] = entry
+        optimade_entries[entry["id"]] = entry
+
+        if not entry["attributes"].get("immutable_id"):
+            entry["attributes"]["immutable_id"] = file_path_entry_id
+        entry["attributes"]["last_modified"] = timestamp
 
     # Now try to parse the properties and assign them to OPTIMADE entries
     _parse_and_assign_properties(
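The combined effect on each constructed entry: the simplified unique ID becomes the OPTIMADE `id`, the file-path-derived ID is preserved as `immutable_id` (unless the parser already set one), and every entry in a conversion run shares a single `last_modified` timestamp. A hypothetical resulting entry, with illustrative values only:

    entry = {
        "id": "1",  # from _set_unique_entry_ids
        "attributes": {
            "immutable_id": "data/structures/1.cif",  # file-path-derived ID
            "last_modified": "2024-03-31T12:00:00",  # datetime.datetime.now().isoformat()
        },
    }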
53 changes: 52 additions & 1 deletion tests/test_convert.py
@@ -25,7 +25,7 @@ def test_convert_example_archives(archive_path, tmp_path):
 
     jsonl_path = convert_archive(tmp_path)
     assert jsonl_path.exists()
 
     jsonl_path_custom = convert_archive(tmp_path, jsonl_path=tmp_path / "test.jsonl")
     assert jsonl_path_custom.exists()
 

@@ -73,3 +73,54 @@ def test_convert_example_archives(archive_path, tmp_path):
     assert json.dumps(first_entry["attributes"]) == json.dumps(
         next_entry["attributes"]
     )


+def test_unique_id_generator():
+    """Unit tests for some common cases of the unique ID generator."""
+
+    from optimake.convert import _set_unique_entry_ids
+
+    entry_ids = [
+        "data/structures/1.cif",
+        "data/structures/2.cif",
+        "data/structures/3.cif",
+    ]
+    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]
+
+    entry_ids = ["data/structures/1", "data/structures/2", "data/structures/3"]
+    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]
+
+    entry_ids = [
+        "data/structures/1/POSCAR",
+        "data/structures/2/POSCAR",
+        "data/structures/3/POSCAR",
+    ]
+    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]
+
+    entry_ids = [
+        "data1/structures/1/POSCAR",
+        "data2/structures/1/POSCAR",
+        "data3/structures/1/POSCAR",
+    ]
+    assert _set_unique_entry_ids(entry_ids) == entry_ids
+
+    entry_ids = [
+        "data.zip/data/structures/1.cif",
+        "data.zip/data/structures/2.cif",
+        "data.zip/data/structures/3.cif",
+    ]
+    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]
+
+    entry_ids = [
+        "data.tar.gz/data/structures/1.cif",
+        "data.tar.gz/data/structures/2.cif",
+        "data.tar.gz/data/structures/3.cif",
+    ]
+    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]
+
+    entry_ids = [
+        "data.tar.gz/data/structures/1.cif.gz",
+        "data.tar.gz/data/structures/2.cif.gz",
+        "data.tar.gz/data/structures/3.cif.gz",
+    ]
+    assert _set_unique_entry_ids(entry_ids) == ["1", "2", "3"]
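
These cases can be run in isolation with, e.g.:

    pytest tests/test_convert.py::test_unique_id_generator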
