Fixed NumpyVectorStore.__eq__'s NotImplemented case (#613)

Future-House · Oct 19, 2024 · b2faca3
1 parent 701e6a9
commit b2faca3
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 5 deletions.
diff --git a/paperqa/llms.py b/paperqa/llms.py
@@ -845,13 +845,21 @@ class NumpyVectorStore(VectorStore):
     _embeddings_matrix: np.ndarray | None = None
 
     def __eq__(self, other) -> bool:
-        if isinstance(other, type(self)):
-            raise NotImplementedError
+        if not isinstance(other, type(self)):
+            return NotImplemented
         return (
             self.texts == other.texts
             and self.texts_hashes == other.texts_hashes
             and self.mmr_lambda == other.mmr_lambda
-            and self._embeddings_matrix == other._embeddings_matrix
+            and (
+                other._embeddings_matrix is None
+                if self._embeddings_matrix is None
+                else (
+                    False
+                    if other._embeddings_matrix is None
+                    else np.allclose(self._embeddings_matrix, other._embeddings_matrix)
+                )
+            )
         )
 
     def clear(self) -> None:

diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -3,6 +3,7 @@
 import pickle
 import textwrap
 from collections.abc import AsyncIterable
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 
@@ -610,7 +611,7 @@ def test_duplicate(stub_data_dir: Path) -> None:
     ), "Unique documents should be hashed as unique"
 
 
-def test_custom_embedding(stub_data_dir: Path) -> None:
+def test_docs_with_custom_embedding(subtests: SubTests, stub_data_dir: Path) -> None:
     class MyEmbeds(EmbeddingModel):
         name: str = "my_embed"
 
@@ -625,7 +626,35 @@ async def embed_documents(self, texts):
         citation="WikiMedia Foundation, 2023, Accessed now",
         embedding_model=MyEmbeds(),
     )
-    assert docs.texts[0].embedding == [1, 2, 3]
+    with subtests.test(msg="confirm-embedding"):
+        assert docs.texts[0].embedding == [1, 2, 3]
+
+    with subtests.test(msg="copying-before-get-evidence"):
+        # Before getting evidence, shallow and deep copies are the same
+        docs_shallow_copy = Docs(
+            texts_index=type(docs.texts_index)(**docs.texts_index.model_dump()),
+            **docs.model_dump(exclude={"texts_index"}),
+        )
+        docs_deep_copy = deepcopy(docs)
+        assert (
+            docs.texts_index
+            == docs_shallow_copy.texts_index
+            == docs_deep_copy.texts_index
+        )
+
+    with subtests.test(msg="copying-after-get-evidence"):
+        # After getting evidence, a shallow copy of Docs is not the same because its
+        # texts index gets lazily populated, while a deep copy should preserve it
+        docs.get_evidence(
+            "What country is Frederick Bates from?", embedding_model=MyEmbeds()
+        )
+        docs_shallow_copy = Docs(
+            texts_index=type(docs.texts_index)(**docs.texts_index.model_dump()),
+            **docs.model_dump(exclude={"texts_index"}),
+        )
+        docs_deep_copy = deepcopy(docs)
+        assert docs.texts_index != docs_shallow_copy.texts_index
+        assert docs.texts_index == docs_deep_copy.texts_index
 
 
 def test_sparse_embedding(stub_data_dir: Path) -> None: