Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose Tantivy's PhraseQuery #234

Merged
merged 5 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,48 @@ impl Query {
})
}

/// Construct a Tantivy `PhraseQuery` with custom offsets and slop.
///
/// # Arguments
///
/// * `schema` - Schema of the target index.
/// * `field_name` - Field name to be searched.
/// * `words` - Word list that constructs the phrase. Each word is either a term text, or a pair `(offset, term text)` giving the term's offset in the phrase. Words without an explicit offset use their index in the list as the offset.
/// * `slop` - (Optional) The number of gaps permitted between the words in the query phrase. Default is 0.
///
/// # Errors
///
/// Returns a `ValueError` if `words` is empty or holds a single word, and
/// propagates any error raised while building a term for `field_name`.
#[staticmethod]
#[pyo3(signature = (schema, field_name, words, slop = 0))]
pub(crate) fn phrase_query(
    schema: &Schema,
    field_name: &str,
    words: Vec<&PyAny>,
    slop: u32,
) -> PyResult<Query> {
    let mut terms_with_offset = Vec::with_capacity(words.len());
    for (idx, word) in words.into_iter().enumerate() {
        if let Ok((offset, value)) = word.extract() {
            // Custom offset is provided as an `(offset, value)` pair.
            let term = make_term(&schema.inner, field_name, value)?;
            terms_with_offset.push((offset, term));
        } else {
            // Custom offset is not provided. Use the list index as the offset.
            let term = make_term(&schema.inner, field_name, word)?;
            terms_with_offset.push((idx, term));
        }
    }
    if terms_with_offset.is_empty() {
        return Err(exceptions::PyValueError::new_err(
            "words must not be empty.",
        ));
    }
    // Tantivy's PhraseQuery constructors assert that there is more than one
    // term; report that as a Python error instead of letting the assertion
    // panic across the FFI boundary.
    if terms_with_offset.len() < 2 {
        return Err(exceptions::PyValueError::new_err(
            "words must contain at least two terms.",
        ));
    }
    let inner = tv::query::PhraseQuery::new_with_offset_and_slop(
        terms_with_offset,
        slop,
    );
    Ok(Query {
        inner: Box::new(inner),
    })
}

/// Construct a Tantivy's BooleanQuery
#[staticmethod]
#[pyo3(signature = (subqueries))]
Expand Down
12 changes: 3 additions & 9 deletions tantivy/tantivy.pyi
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import datetime
from enum import Enum
from typing import Any, Optional, Sequence
from typing import Any, Optional, Sequence, Union

class Schema:
pass
Expand Down Expand Up @@ -206,16 +206,10 @@ class Query:
pass

@staticmethod
def fuzzy_term_query(
schema: Schema,
field_name: str,
text: str,
distance: int = 1,
transposition_cost_one: bool = True,
prefix=False,
) -> Query:
# Build a phrase query on `field_name`. Each entry of `words` is either a
# term text or an `(offset, term text)` pair; entries without an offset use
# their list index as the offset. `slop` is the number of gaps permitted
# between the words in the phrase (default 0).
def phrase_query(schema: Schema, field_name: str, words: list[Union[str, tuple[int, str]]], slop: int = 0) -> Query:
pass


@staticmethod
def boolean_query(subqueries: Sequence[tuple[Occur, Query]]) -> Query:
pass
Expand Down
32 changes: 32 additions & 0 deletions tests/tantivy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,38 @@ def test_all_query(self, ram_index):
result = index.searcher().search(query, 10)
assert len(result.hits) == 3

def test_phrase_query(self, ram_index):
    """Check phrase queries against the "title" field of the RAM index.

    Covers word order, explicit offsets, the default slop of 0, a custom
    slop, and the error raised for an empty word list.
    """
    index = ram_index
    searcher = index.searcher()

    # Each case: (words, extra keyword arguments, expected number of hits).
    cases = [
        # Should match the title "The Old Man and the Sea".
        (["old", "man"], {}, 1),
        # Reversed order shouldn't match any document.
        (["man", "old"], {}, 0),
        # Should match "The Old Man and the Sea" with the given offsets.
        ([(1, "man"), (0, "old")], {}, 1),
        # Shouldn't match any document with the default slop of 0.
        (["man", "sea"], {}, 0),
        # Should match "The Old Man and the Sea" with slop 2.
        (["man", "sea"], {"slop": 2}, 1),
    ]
    for words, extra_kwargs, expected_hits in cases:
        query = Query.phrase_query(index.schema, "title", words, **extra_kwargs)
        result = searcher.search(query, 10)
        assert len(result.hits) == expected_hits

    # An empty word list is rejected.
    with pytest.raises(ValueError, match="words must not be empty."):
        Query.phrase_query(index.schema, "title", [])

def test_fuzzy_term_query(self, ram_index):
index = ram_index
query = Query.fuzzy_term_query(index.schema, "title", "ice")
Expand Down
Loading