Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix metadata deserialization in async mode for PGVector #125

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 31 additions & 10 deletions langchain_postgres/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -1058,17 +1058,38 @@ async def asimilarity_search_with_score_by_vector(

def _results_to_docs_and_scores(self, results: Any) -> List[Tuple[Document, float]]:
    """Convert raw result rows into ``(Document, score)`` pairs.

    In async mode the JSONB ``cmetadata`` column may not arrive as a plain
    ``dict``: depending on the driver it can be a JSON string, a byte-like
    object, or an asyncpg ``Fragment`` exposing the raw bytes via ``.buf``.
    Each value is normalized to a ``dict`` before building the Document.

    Args:
        results: Iterable of rows, each exposing ``EmbeddingStore`` (with
            ``id``, ``document``, ``cmetadata``) and a ``distance`` column.

    Returns:
        List of ``(Document, score)`` tuples. The score is ``None`` when no
        embedding function is configured (distance is meaningless then).
    """
    docs: List[Tuple[Document, float]] = []
    for result in results:
        metadata = result.EmbeddingStore.cmetadata
        try:
            if isinstance(metadata, dict):
                pass  # Already deserialized by the driver.
            elif isinstance(metadata, str):
                metadata = json.loads(metadata)
            elif hasattr(metadata, "buf"):
                # asyncpg can return JSONB as a Fragment wrapping raw bytes.
                metadata = json.loads(metadata.buf.decode("utf-8"))
            elif hasattr(metadata, "decode"):
                # bytes / bytearray / other byte-like payloads.
                metadata = json.loads(metadata.decode("utf-8"))
            else:
                metadata = {}  # Unknown type: fall back to empty metadata.
        # Narrow exceptions: malformed JSON raises ValueError
        # (json.JSONDecodeError), bad bytes raise UnicodeDecodeError, and an
        # unexpected object shape raises AttributeError.
        except (ValueError, UnicodeDecodeError, AttributeError) as e:
            # Lazy %-formatting so the message is only built when emitted.
            self.logger.warning("Failed to deserialize metadata: %s", e)
            metadata = {}
        doc = Document(
            id=str(result.EmbeddingStore.id),
            page_content=result.EmbeddingStore.document,
            metadata=metadata,
        )
        score = result.distance if self.embeddings is not None else None
        docs.append((doc, score))
    return docs

def _handle_field_filter(
Expand Down
56 changes: 45 additions & 11 deletions tests/unit_tests/test_vectorstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,12 @@ def test_pgvector_with_metadatas_with_scores() -> None:

@pytest.mark.asyncio
async def test_async_pgvector_with_metadatas_with_scores() -> None:
    """Test construction and search with metadata deserialization.

    Uses nested metadata so the test fails if the async path returns the
    JSONB column as a raw string/bytes instead of a deserialized dict.
    """
    texts = ["foo", "bar", "baz"]
    metadatas = [
        {"page": str(i), "info": {"nested": f"value{i}"}}
        for i in range(len(texts))
    ]
    docsearch = await PGVector.afrom_texts(
        texts=texts,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    output = await docsearch.asimilarity_search_with_score("foo", k=3)
    # NOTE(review): assumes results come back in insertion order for the fake
    # embeddings — confirm this holds for FakeEmbeddingsWithAdaDimension.
    for i, (doc, score) in enumerate(output):
        assert doc.page_content == texts[i]
        assert doc.metadata == metadatas[i]
        assert isinstance(doc.metadata, dict)
        assert score is not None


def test_pgvector_with_filter_match() -> None:
Expand All @@ -196,9 +202,12 @@ def test_pgvector_with_filter_match() -> None:

@pytest.mark.asyncio
async def test_async_pgvector_with_filter_match() -> None:
    """Test search with filter and metadata deserialization.

    Filters on a top-level metadata key and verifies the single matching
    document comes back with its metadata fully deserialized to a dict.
    """
    texts = ["foo", "bar", "baz"]
    metadatas = [
        {"page": str(i), "info": {"nested": f"value{i}"}}
        for i in range(len(texts))
    ]
    docsearch = await PGVector.afrom_texts(
        texts=texts,
        collection_name="test_collection_filter",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    # k=3 but the filter should narrow the result set to exactly one row.
    output = await docsearch.asimilarity_search_with_score(
        "foo", k=3, filter={"page": "0"}
    )
    assert len(output) == 1
    doc, score = output[0]
    assert doc.page_content == "foo"
    assert doc.metadata == metadatas[0]
    assert isinstance(doc.metadata, dict)
    assert score is not None


@pytest.mark.asyncio
async def test_async_pgvector_metadata_deserialization() -> None:
    """Test that metadata is correctly deserialized in async operations."""
    texts = ["foo", "bar", "baz"]
    # Build nested metadata per text; nesting exercises full JSON decoding.
    metadatas = []
    for idx in range(len(texts)):
        metadatas.append({"page": str(idx), "info": {"nested": f"value{idx}"}})
    store = await PGVector.afrom_texts(
        texts=texts,
        collection_name="test_collection_metadata",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=metadatas,
        connection=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    results = await store.asimilarity_search("foo", k=3)
    for position, doc in enumerate(results):
        assert isinstance(doc.metadata, dict)
        assert doc.metadata == metadatas[position]


def test_pgvector_with_filter_distant_match() -> None:
Expand Down