Skip to content

Commit

Permalink
Added option to strip indirect citations (#203)
Browse files Browse the repository at this point in the history
* Added stripping

* Update paperqa/docs.py

Co-authored-by: Jakub Lála <[email protected]>

* Fixed pre-commit

* Update setup.py

---------

Co-authored-by: Jakub Lála <[email protected]>
  • Loading branch information
whitead and jakublala authored Nov 6, 2023
1 parent fb1ab43 commit 1923614
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 3 deletions.
6 changes: 6 additions & 0 deletions paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
maybe_is_text,
md5sum,
name_in_text,
strip_citations,
)


Expand All @@ -59,6 +60,8 @@ class Docs(BaseModel, arbitrary_types_allowed=True, smart_union=True):
memory: bool = False
memory_model: Optional[BaseChatMemory] = None
jit_texts_index: bool = False
# Whether to strip indirect citations that come up from the summary LLM
strip_citations: bool = True

# TODO: Not sure how to get this to work
# while also passing mypy checks
Expand Down Expand Up @@ -505,6 +508,9 @@ async def process(match):
raise e
if "not applicable" in context.lower() or "not relevant" in context.lower():
return None
if self.strip_citations:
# remove citations that collide with our grounded citations (for the answer LLM)
context = strip_citations(context)
c = Context(
context=context,
text=Text(
Expand Down
8 changes: 8 additions & 0 deletions paperqa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,11 @@ def get_llm_name(llm: BaseLanguageModel) -> str:
return llm.model_name # type: ignore
except AttributeError:
return llm.model # type: ignore


def strip_citations(text: str) -> str:
    """Return ``text`` with inline academic citations removed.

    Two citation shapes are recognised:

    * narrative citations such as ``Smith et al. (1999)``;
    * parenthetical groups ``(...)`` that contain an ASCII letter followed,
      somewhere later inside the same parentheses, by four consecutive
      digits, e.g. ``(Smith 1999)`` or ``(see Smith, 1999, for a review)``.

    Only the matched citation is deleted. Whitespace around it is left
    untouched, so removing a mid-sentence citation leaves two adjacent
    spaces behind.
    """
    # Single combined pattern: narrative form first, parenthetical form second.
    citation_pattern = re.compile(
        r"\b[\w\-]+\set\sal\.\s\([0-9]{4}\)|\((?:[^\)]*?[a-zA-Z][^\)]*?[0-9]{4}[^\)]*?)\)",
        flags=re.MULTILINE,
    )
    return citation_pattern.sub("", text)
2 changes: 1 addition & 1 deletion paperqa/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "3.11.2"
__version__ = "3.12.0"
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"pypdf",
"pydantic<2",
"langchain>=0.0.303",
"openai >= 0.27.8",
"openai <1",
"faiss-cpu",
"PyCryptodome",
"html2text",
Expand Down
76 changes: 75 additions & 1 deletion tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,88 @@
from paperqa.chains import get_score
from paperqa.readers import read_doc
from paperqa.types import Doc
from paperqa.utils import maybe_is_html, maybe_is_text, name_in_text, strings_similarity
from paperqa.utils import (
maybe_is_html,
maybe_is_text,
name_in_text,
strings_similarity,
strip_citations,
)


class TestHandler(AsyncCallbackHandler):
    """Async callback handler that prints every token it is given."""

    async def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Print ``token``; additional keyword arguments are ignored."""
        print(token)


# Unit tests for strip_citations (imported from paperqa.utils above).


def test_single_author():
    """A lone ``(Author Year)`` citation is removed; the space before it stays."""
    cited = "This was first proposed by (Smith 1999)."
    expected = "This was first proposed by ."
    assert strip_citations(cited) == expected


def test_multiple_authors():
    """A mid-sentence ``et al.`` citation is removed.

    ``strip_citations`` deletes only the citation itself, so the spaces that
    surrounded it both remain: the expected string therefore contains two
    consecutive spaces between "studies" and "show".
    """
    text = "Recent studies (Smith et al. 1999) show that this is true."
    assert strip_citations(text) == "Recent studies  show that this is true."


def test_multiple_citations():
    """Several semicolon-separated citations in one parenthesis go together."""
    cited = "As discussed by several authors (Smith et al. 1999; Johnson 2001; Lee et al. 2003)."
    expected = "As discussed by several authors ."
    assert strip_citations(cited) == expected


def test_citations_with_pages():
    """A page reference after the year does not prevent removal."""
    cited = "This is shown in (Smith et al. 1999, p. 150)."
    expected = "This is shown in ."
    assert strip_citations(cited) == expected


def test_citations_without_space():
    """A citation glued to its neighbouring words is still removed."""
    cited = "Findings by(Smith et al. 1999)were significant."
    expected = "Findings bywere significant."
    assert strip_citations(cited) == expected


def test_citations_with_commas():
    """Comma-separated authors and multiple years are removed as one group."""
    cited = "The method was adopted by (Smith, 1999, 2001; Johnson, 2002)."
    expected = "The method was adopted by ."
    assert strip_citations(cited) == expected


def test_citations_with_text():
    """Extra words around the citation inside the parentheses are removed too."""
    cited = "This was noted (see Smith, 1999, for a review)."
    expected = "This was noted ."
    assert strip_citations(cited) == expected


def test_no_citations():
    """Text without any citation is returned unchanged."""
    plain = "There are no references in this text."
    assert strip_citations(plain) == plain


def test_malformed_citations():
    """A parenthesis with only a three-digit year is not treated as a citation."""
    malformed = "This is a malformed citation (Smith 199)."
    assert strip_citations(malformed) == malformed


def test_edge_case_citations():
    """A citation with no space between ``al.`` and the year is removed.

    ``strip_citations`` deletes only the citation itself, so the spaces on
    either side of it both remain: the expected string therefore contains two
    consecutive spaces between "like" and "should".
    """
    text = "Edge cases like (Smith et al.1999) should be handled."
    assert strip_citations(text) == "Edge cases like  should be handled."


def test_citations_with_special_characters():
    """Apostrophes and hyphens in author names do not prevent removal."""
    cited = "Some names have dashes (O'Neil et al. 2000; Smith-Jones 1998)."
    expected = "Some names have dashes ."
    assert strip_citations(cited) == expected


def test_citations_with_nonstandard_chars():
    """An author name containing a non-ASCII letter is still removed."""
    cited = (
        "In non-English languages, citations might look different (Müller et al. 1999)."
    )
    expected = "In non-English languages, citations might look different ."
    assert strip_citations(cited) == expected


def test_ablations():
tests_dir = os.path.dirname(os.path.abspath(__file__))
doc_path = os.path.join(tests_dir, "paper.pdf")
Expand Down

0 comments on commit 1923614

Please sign in to comment.