Future-House · maykcaldas · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -87,7 +87,7 @@ repos:
           - fhaviary[llm]>=0.10.2 # Match pyproject.toml
           - ldp>=0.14.5 # Match pyproject.toml
           - html2text
-          - fh-llm-client
+          - fh-llm-client>=0.0.7 # Match pyproject.toml
           - httpx
           - pybtex
           - numpy

diff --git a/paperqa/agents/helpers.py b/paperqa/agents/helpers.py
@@ -9,6 +9,7 @@
 from rich.table import Table
 
 from paperqa.docs import Docs
+from paperqa.types import DocDetails
 
 from .models import AnswerResponse
 
@@ -91,10 +92,11 @@ def table_formatter(
         table.add_column("Title", style="cyan")
         table.add_column("File", style="magenta")
         for obj, filename in objects:
+            doc = cast(DocDetails, cast(Docs, obj).texts[0].doc)
             try:
-                display_name = cast(Docs, obj).texts[0].doc.title
+                display_name = cast(str, doc.title)
             except AttributeError:
-                display_name = cast(Docs, obj).texts[0].doc.formatted_citation
+                display_name = doc.formatted_citation
             table.add_row(display_name[:max_chars_per_column], filename)
         return table
     raise NotImplementedError(

diff --git a/paperqa/clients/__init__.py b/paperqa/clients/__init__.py
@@ -164,16 +164,13 @@ async def query(self, **kwargs) -> DocDetails | None:
                     sum(
                         await gather_with_concurrency(
                             len(task.processors),
-                            task.processor_queries(
-                                cast(DocDetails, doc_details), session
-                            ),
+                            task.processor_queries(doc_details, session),
                         )
                     )
                     or None
                 )
 
             if doc_details:
-                doc_details = cast(DocDetails, doc_details)
                 # abuse int handling in __add__ for empty all_doc_details, None types won't work
                 all_doc_details = doc_details + (all_doc_details or 0)
 

diff --git a/paperqa/docs.py b/paperqa/docs.py
@@ -477,10 +477,12 @@ async def aadd_texts(
         # 3. Update self
         # NOTE: we defer adding texts to the texts index to retrieval time
         # (e.g. `self.texts_index.add_texts_and_embeddings(texts)`)
-        self.docs[doc.dockey] = doc
-        self.texts += texts
-        self.docnames.add(doc.docname)
-        return True
+        if doc.docname and doc.dockey:
+            self.docs[doc.dockey] = doc
+            self.texts += texts
+            self.docnames.add(doc.docname)
+            return True
+        return False
 
     def delete(
         self,
@@ -496,8 +498,9 @@ def delete(
             doc = next((doc for doc in self.docs.values() if doc.docname == name), None)
             if doc is None:
                 return
-            self.docnames.remove(doc.docname)
-            dockey = doc.dockey
+            if doc.docname and doc.dockey:
+                self.docnames.remove(doc.docname)
+                dockey = doc.dockey
         del self.docs[dockey]
         self.deleted_dockeys.add(dockey)
         self.texts = list(filter(lambda x: x.doc.dockey != dockey, self.texts))

diff --git a/paperqa/llms.py b/paperqa/llms.py
@@ -10,7 +10,7 @@
     Iterable,
     Sequence,
 )
-from typing import Any
+from typing import Any, cast
 
 import numpy as np
 from llmclient import (
@@ -334,8 +334,10 @@ def add_texts_and_embeddings(self, texts: Iterable[Embeddable]) -> None:
         texts_list = list(texts)
 
         if texts_list and not self.client.collection_exists(self.collection_name):
+
             params = models.VectorParams(
-                size=len(texts_list[0].embedding), distance=models.Distance.COSINE
+                size=len(cast(list, texts_list[0].embedding)),
+                distance=models.Distance.COSINE,
             )
             self.client.create_collection(
                 self.collection_name,

diff --git a/paperqa/types.py b/paperqa/types.py
@@ -54,9 +54,9 @@ def set_llm_session_ids(session_id: UUID):
 
 
 class Doc(Embeddable):
-    docname: str
+    docname: str | None = None
+    dockey: DocKey | None = None
     citation: str
-    dockey: DocKey
     overwrite_fields_from_metadata: bool = Field(
         default=True,
         description=(
@@ -94,7 +94,7 @@ def matches_filter_criteria(self, filter_criteria: dict) -> bool:
 class Text(Embeddable):
     text: str
     name: str
-    doc: Doc
+    doc: Doc | DocDetails
 
     def __hash__(self) -> int:
         return hash(self.text)

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ dependencies = [
     "PyMuPDF>=1.24.12",  # For pymupdf.set_messages addition
     "aiohttp>=3.10.6",  # TODO: remove in favor of httpx, pin for aiohttp.ClientConnectionResetError
     "anyio",
-    "fh-llm-client",
+    "fh-llm-client>=0.0.7",  # For py.typed (PEP 561 typing marker)
     "fhaviary[llm]>=0.10.2",  # For tool execution concurrency
     "html2text",  # TODO: evaluate moving to an opt-in dependency
     "httpx",

diff --git a/tests/test_agents.py b/tests/test_agents.py
@@ -335,7 +335,7 @@ async def llm_model_call(*args, **kwargs):
         # NOTE: "required" will not lead to thoughts being emitted, it has to be "auto"
         # https://docs.anthropic.com/en/docs/build-with-claude/tool-use#chain-of-thought
         kwargs.pop("tool_choice", MultipleCompletionLLMModel.TOOL_CHOICE_REQUIRED)
-        return await orig_llm_model_call(*args, tool_choice="auto", **kwargs)
+        return await orig_llm_model_call(*args, tool_choice="auto", **kwargs)  # type: ignore [misc]
 
     with patch.object(
         MultipleCompletionLLMModel, "call", side_effect=llm_model_call, autospec=True

diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -683,7 +683,7 @@ def test_sparse_embedding(stub_data_dir: Path, vector_store: type[VectorStore])
         citation="WikiMedia Foundation, 2023, Accessed now",
         embedding_model=SparseEmbeddingModel(),
     )
-    assert any(docs.texts[0].embedding)
+    assert isinstance(docs.texts[0].embedding, list)
     assert all(
         len(np.array(x.embedding).shape) == 1 for x in docs.texts
     ), "Embeddings should be 1D"
@@ -705,7 +705,7 @@ def test_hybrid_embedding(stub_data_dir: Path, vector_store: type[VectorStore])
         citation="WikiMedia Foundation, 2023, Accessed now",
         embedding_model=emb_model,
     )
-    assert any(docs.texts[0].embedding)
+    assert isinstance(docs.texts[0].embedding, list)
 
     # check the embeddings are the same size
     assert docs.texts[0].embedding is not None

diff --git a/uv.lock b/uv.lock