Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed type problems from llmclient #770

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ repos:
- fhaviary[llm]>=0.10.2 # Match pyproject.toml
- ldp>=0.14.5 # Match pyproject.toml
- html2text
- fh-llm-client
- fh-llm-client>=0.0.7
- httpx
- pybtex
- numpy
Expand Down
6 changes: 4 additions & 2 deletions paperqa/agents/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from rich.table import Table

from paperqa.docs import Docs
from paperqa.types import DocDetails

from .models import AnswerResponse

Expand Down Expand Up @@ -91,10 +92,11 @@ def table_formatter(
table.add_column("Title", style="cyan")
table.add_column("File", style="magenta")
for obj, filename in objects:
doc = cast(DocDetails, cast(Docs, obj).texts[0].doc)
try:
display_name = cast(Docs, obj).texts[0].doc.title
display_name = cast(str, doc.title)
except AttributeError:
display_name = cast(Docs, obj).texts[0].doc.formatted_citation
display_name = doc.formatted_citation
table.add_row(display_name[:max_chars_per_column], filename)
return table
raise NotImplementedError(
Expand Down
6 changes: 2 additions & 4 deletions paperqa/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,16 +164,14 @@ async def query(self, **kwargs) -> DocDetails | None:
sum(
await gather_with_concurrency(
len(task.processors),
task.processor_queries(
cast(DocDetails, doc_details), session
),
task.processor_queries(doc_details, session),
)
)
or None
)

if doc_details:
doc_details = cast(DocDetails, doc_details)
# doc_details = cast(DocDetails, doc_details)
maykcaldas marked this conversation as resolved.
Show resolved Hide resolved
# abuse int handling in __add__ for empty all_doc_details, None types won't work
all_doc_details = doc_details + (all_doc_details or 0)

Expand Down
15 changes: 9 additions & 6 deletions paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,10 +477,12 @@ async def aadd_texts(
# 3. Update self
# NOTE: we defer adding texts to the texts index to retrieval time
# (e.g. `self.texts_index.add_texts_and_embeddings(texts)`)
self.docs[doc.dockey] = doc
self.texts += texts
self.docnames.add(doc.docname)
return True
if doc.docname and doc.dockey:
self.docs[doc.dockey] = doc
self.texts += texts
self.docnames.add(doc.docname)
return True
return False

def delete(
self,
Expand All @@ -496,8 +498,9 @@ def delete(
doc = next((doc for doc in self.docs.values() if doc.docname == name), None)
if doc is None:
return
self.docnames.remove(doc.docname)
dockey = doc.dockey
if doc.docname and doc.dockey:
self.docnames.remove(doc.docname)
dockey = doc.dockey
del self.docs[dockey]
self.deleted_dockeys.add(dockey)
self.texts = list(filter(lambda x: x.doc.dockey != dockey, self.texts))
Expand Down
6 changes: 4 additions & 2 deletions paperqa/llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
Iterable,
Sequence,
)
from typing import Any
from typing import Any, cast

import numpy as np
from llmclient import (
Expand Down Expand Up @@ -334,8 +334,10 @@ def add_texts_and_embeddings(self, texts: Iterable[Embeddable]) -> None:
texts_list = list(texts)

if texts_list and not self.client.collection_exists(self.collection_name):

params = models.VectorParams(
size=len(texts_list[0].embedding), distance=models.Distance.COSINE
size=len(cast(list, texts_list[0].embedding)),
distance=models.Distance.COSINE,
)
self.client.create_collection(
self.collection_name,
Expand Down
6 changes: 3 additions & 3 deletions paperqa/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ def set_llm_session_ids(session_id: UUID):


class Doc(Embeddable):
docname: str
docname: str | None = None
dockey: DocKey | None = None
Comment on lines +57 to +58
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why make these optional and defaulted to None?

I am not against it; I'm just wondering why. Can you add a description stating what these fields should be and what `None` means?

citation: str
dockey: DocKey
overwrite_fields_from_metadata: bool = Field(
default=True,
description=(
Expand Down Expand Up @@ -94,7 +94,7 @@ def matches_filter_criteria(self, filter_criteria: dict) -> bool:
class Text(Embeddable):
text: str
name: str
doc: Doc
doc: Doc | DocDetails

def __hash__(self) -> int:
return hash(self.text)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies = [
"PyMuPDF>=1.24.12", # For pymupdf.set_messages addition
"aiohttp>=3.10.6", # TODO: remove in favor of httpx, pin for aiohttp.ClientConnectionResetError
"anyio",
"fh-llm-client",
"fh-llm-client>=0.0.7",
"fhaviary[llm]>=0.10.2", # For tool execution concurrency
"html2text", # TODO: evaluate moving to an opt-in dependency
"httpx",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ async def llm_model_call(*args, **kwargs):
# NOTE: "required" will not lead to thoughts being emitted, it has to be "auto"
# https://docs.anthropic.com/en/docs/build-with-claude/tool-use#chain-of-thought
kwargs.pop("tool_choice", MultipleCompletionLLMModel.TOOL_CHOICE_REQUIRED)
return await orig_llm_model_call(*args, tool_choice="auto", **kwargs)
return await orig_llm_model_call(*args, tool_choice="auto", **kwargs) # type: ignore [misc]

with patch.object(
MultipleCompletionLLMModel, "call", side_effect=llm_model_call, autospec=True
Expand Down
4 changes: 2 additions & 2 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,7 +683,7 @@ def test_sparse_embedding(stub_data_dir: Path, vector_store: type[VectorStore])
citation="WikiMedia Foundation, 2023, Accessed now",
embedding_model=SparseEmbeddingModel(),
)
assert any(docs.texts[0].embedding)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we keep the any assertions around? Why move to list?

If you think the new assertion has the same behavior, can you add an assertion failure message to make clear what it's checking for?

--- assert any(docs.texts[0].embedding), "We require the embedding to be populated"
+++ assert isinstance(docs.texts[0].embedding, list), "We require the embedding to be populated"

assert any(cast(list, docs.texts[0].embedding))
maykcaldas marked this conversation as resolved.
Show resolved Hide resolved
assert all(
len(np.array(x.embedding).shape) == 1 for x in docs.texts
), "Embeddings should be 1D"
Expand All @@ -705,7 +705,7 @@ def test_hybrid_embedding(stub_data_dir: Path, vector_store: type[VectorStore])
citation="WikiMedia Foundation, 2023, Accessed now",
embedding_model=emb_model,
)
assert any(docs.texts[0].embedding)
assert any(cast(list, docs.texts[0].embedding))

# check the embeddings are the same size
assert docs.texts[0].embedding is not None
Expand Down
36 changes: 30 additions & 6 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading