Skip to content

Commit

Permalink
Support missing and empty values in search (#3231)
Browse files Browse the repository at this point in the history
Add support for indexing and searching missing and empty values.

Currently there are some limitations on the server side; for example,
empty values are supported only for TEXT and TAG fields.
  • Loading branch information
gerzse authored Jun 13, 2024
1 parent 29b861b commit 5115145
Show file tree
Hide file tree
Showing 3 changed files with 205 additions and 44 deletions.
67 changes: 34 additions & 33 deletions redis/commands/search/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from ._util import to_string
from .aggregation import AggregateRequest, AggregateResult, Cursor
from .document import Document
from .field import Field
from .indexDefinition import IndexDefinition
from .query import Query
from .result import Result
from .suggestion import SuggestionParser
Expand Down Expand Up @@ -151,44 +153,43 @@ def batch_indexer(self, chunk_size=100):

def create_index(
self,
fields,
no_term_offsets=False,
no_field_flags=False,
stopwords=None,
definition=None,
fields: List[Field],
no_term_offsets: bool = False,
no_field_flags: bool = False,
stopwords: Optional[List[str]] = None,
definition: Optional[IndexDefinition] = None,
max_text_fields=False,
temporary=None,
no_highlight=False,
no_term_frequencies=False,
skip_initial_scan=False,
no_highlight: bool = False,
no_term_frequencies: bool = False,
skip_initial_scan: bool = False,
):
"""
Create the search index. The index must not already exist.
### Parameters:
- **fields**: a list of TextField or NumericField objects
- **no_term_offsets**: If true, we will not save term offsets in
the index
- **no_field_flags**: If true, we will not save field flags that
allow searching in specific fields
- **stopwords**: If not None, we create the index with this custom
stopword list. The list can be empty
- **max_text_fields**: If true, we will encode indexes as if there
were more than 32 text fields which allows you to add additional
fields (beyond 32).
- **temporary**: Create a lightweight temporary index which will
expire after the specified period of inactivity (in seconds). The
internal idle timer is reset whenever the index is searched or added to.
- **no_highlight**: If true, disabling highlighting support.
Also implied by no_term_offsets.
- **no_term_frequencies**: If true, we avoid saving the term frequencies
in the index.
- **skip_initial_scan**: If true, we do not scan and index.
For more information see `FT.CREATE <https://redis.io/commands/ft.create>`_.
""" # noqa
Creates the search index. The index must not already exist.
For more information, see https://redis.io/commands/ft.create/
Args:
fields: A list of Field objects.
no_term_offsets: If `true`, term offsets will not be saved in the index.
no_field_flags: If true, field flags that allow searching in specific fields
will not be saved.
stopwords: If provided, the index will be created with this custom stopword
list. The list can be empty.
definition: If provided, the index will be created with this custom index
definition.
max_text_fields: If true, indexes will be encoded as if there were more than
32 text fields, allowing for additional fields beyond 32.
temporary: Creates a lightweight temporary index which will expire after the
specified period of inactivity. The internal idle timer is reset
whenever the index is searched or added to.
no_highlight: If true, disables highlighting support. Also implied by
`no_term_offsets`.
no_term_frequencies: If true, term frequencies will not be saved in the
index.
skip_initial_scan: If true, the initial scan and indexing will be skipped.
"""
args = [CREATE_CMD, self.index_name]
if definition is not None:
args += definition.args
Expand Down
26 changes: 26 additions & 0 deletions redis/commands/search/field.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@


class Field:
"""
A class representing a field in a document.
"""

NUMERIC = "NUMERIC"
TEXT = "TEXT"
WEIGHT = "WEIGHT"
Expand All @@ -14,15 +18,33 @@ class Field:
NOINDEX = "NOINDEX"
AS = "AS"
GEOSHAPE = "GEOSHAPE"
INDEX_MISSING = "INDEXMISSING"
INDEX_EMPTY = "INDEXEMPTY"

def __init__(
self,
name: str,
args: List[str] = None,
sortable: bool = False,
no_index: bool = False,
index_missing: bool = False,
index_empty: bool = False,
as_name: str = None,
):
"""
Create a new field object.
Args:
name: The name of the field.
args:
sortable: If `True`, the field will be sortable.
no_index: If `True`, the field will not be indexed.
index_missing: If `True`, it will be possible to search for documents that
have this field missing.
index_empty: If `True`, it will be possible to search for documents that
have this field empty.
as_name: If provided, this alias will be used for the field.
"""
if args is None:
args = []
self.name = name
Expand All @@ -34,6 +56,10 @@ def __init__(
self.args_suffix.append(Field.SORTABLE)
if no_index:
self.args_suffix.append(Field.NOINDEX)
if index_missing:
self.args_suffix.append(Field.INDEX_MISSING)
if index_empty:
self.args_suffix.append(Field.INDEX_EMPTY)

if no_index and not sortable:
raise ValueError("Non-Sortable non-Indexable fields are ignored")
Expand Down
156 changes: 145 additions & 11 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2105,7 +2105,7 @@ def test_geo_params(client):
params_dict = {"lat": "34.95126", "lon": "29.69465", "radius": 1000, "units": "km"}
q = Query("@g:[$lon $lat $radius $units]").dialect(2)
res = client.ft().search(q, query_params=params_dict)
_assert_geosearch_result(client, res, ["doc1", "doc2", "doc3"])
_assert_search_result(client, res, ["doc1", "doc2", "doc3"])


@pytest.mark.redismod
Expand All @@ -2122,13 +2122,13 @@ def test_geoshapes_query_intersects_and_disjoint(client):
Query("@g:[intersects $shape]").dialect(3),
query_params={"shape": "POLYGON((15 15, 75 15, 50 70, 20 40, 15 15))"},
)
_assert_geosearch_result(client, intersection, ["doc_point2", "doc_polygon1"])
_assert_search_result(client, intersection, ["doc_point2", "doc_polygon1"])

disjunction = client.ft().search(
Query("@g:[disjoint $shape]").dialect(3),
query_params={"shape": "POLYGON((15 15, 75 15, 50 70, 20 40, 15 15))"},
)
_assert_geosearch_result(client, disjunction, ["doc_point1", "doc_polygon2"])
_assert_search_result(client, disjunction, ["doc_point1", "doc_polygon2"])


@pytest.mark.redismod
Expand All @@ -2146,19 +2146,19 @@ def test_geoshapes_query_contains_and_within(client):
Query("@g:[contains $shape]").dialect(3),
query_params={"shape": "POINT(25 25)"},
)
_assert_geosearch_result(client, contains_a, ["doc_polygon1"])
_assert_search_result(client, contains_a, ["doc_polygon1"])

contains_b = client.ft().search(
Query("@g:[contains $shape]").dialect(3),
query_params={"shape": "POLYGON((24 24, 24 26, 25 25, 24 24))"},
)
_assert_geosearch_result(client, contains_b, ["doc_polygon1"])
_assert_search_result(client, contains_b, ["doc_polygon1"])

within = client.ft().search(
Query("@g:[within $shape]").dialect(3),
query_params={"shape": "POLYGON((15 15, 75 15, 50 70, 20 40, 15 15))"},
)
_assert_geosearch_result(client, within, ["doc_point2", "doc_polygon1"])
_assert_search_result(client, within, ["doc_point2", "doc_polygon1"])


@pytest.mark.redismod
Expand Down Expand Up @@ -2322,19 +2322,153 @@ def test_geoshape(client: redis.Redis):
q2 = Query("@geom:[CONTAINS $poly]").dialect(3)
qp2 = {"poly": "POLYGON((2 2, 2 50, 50 50, 50 2, 2 2))"}
result = client.ft().search(q1, query_params=qp1)
_assert_geosearch_result(client, result, ["small"])
_assert_search_result(client, result, ["small"])
result = client.ft().search(q2, query_params=qp2)
_assert_geosearch_result(client, result, ["small", "large"])
_assert_search_result(client, result, ["small", "large"])


def _assert_geosearch_result(client, result, expected_doc_ids):
@pytest.mark.redismod
def test_search_missing_fields(client):
    """
    Check `ismissing(@field)` queries against fields created with
    `index_missing=True`, and that the same query is rejected for a field
    created without it.
    """
    definition = IndexDefinition(prefix=["property:"], index_type=IndexType.HASH)
    schema = [
        TextField("title", sortable=True),
        TagField("features", index_missing=True),
        TextField("description", index_missing=True),
    ]
    client.ft().create_index(schema, definition=definition)

    documents = {
        # All fields present
        "property:1": {
            "title": "Luxury Villa in Malibu",
            "features": "pool,sea view,modern",
            "description": "A stunning modern villa overlooking the Pacific Ocean.",
        },
        # Missing features
        "property:2": {
            "title": "Downtown Flat",
            "description": "Modern flat in central Paris with easy access to metro.",
        },
        # Missing description
        "property:3": {
            "title": "Beachfront Bungalow",
            "features": "beachfront,sun deck",
        },
    }
    for key, mapping in documents.items():
        client.hset(key, mapping=mapping)

    def ids_only(expression):
        # Every query in this test uses the same dialect and projection.
        return Query(expression).dialect(5).return_field("id").no_content()

    # "title" was not declared with index_missing, so the query must fail.
    with pytest.raises(redis.exceptions.ResponseError) as exc_info:
        client.ft().search(ids_only("ismissing(@title)"))
    assert "to be defined with 'INDEXMISSING'" in exc_info.value.args[0]

    expectations = [
        ("ismissing(@features)", ["property:2"]),
        ("-ismissing(@features)", ["property:1", "property:3"]),
        ("ismissing(@description)", ["property:3"]),
        ("-ismissing(@description)", ["property:1", "property:2"]),
    ]
    for expression, expected_ids in expectations:
        res = client.ft().search(ids_only(expression))
        _assert_search_result(client, res, expected_ids)


@pytest.mark.redismod
def test_search_empty_fields(client):
    """
    Check empty-value queries against fields created with
    `index_empty=True`, and that the same kind of query is rejected for a
    field created without it.
    """
    definition = IndexDefinition(prefix=["property:"], index_type=IndexType.HASH)
    schema = [
        TextField("title", sortable=True),
        TagField("features", index_empty=True),
        TextField("description", index_empty=True),
    ]
    client.ft().create_index(schema, definition=definition)

    documents = {
        # All fields present
        "property:1": {
            "title": "Luxury Villa in Malibu",
            "features": "pool,sea view,modern",
            "description": "A stunning modern villa overlooking the Pacific Ocean.",
        },
        # Empty features
        "property:2": {
            "title": "Downtown Flat",
            "features": "",
            "description": "Modern flat in central Paris with easy access to metro.",
        },
        # Empty description
        "property:3": {
            "title": "Beachfront Bungalow",
            "features": "beachfront,sun deck",
            "description": "",
        },
    }
    for key, mapping in documents.items():
        client.hset(key, mapping=mapping)

    def ids_only(expression):
        # Every query in this test uses the same dialect and projection.
        return Query(expression).dialect(5).return_field("id").no_content()

    # "title" was not declared with index_empty, so the query must fail.
    with pytest.raises(redis.exceptions.ResponseError) as exc_info:
        client.ft().search(ids_only("@title:''"))
    assert "to be defined with `INDEXEMPTY`" in exc_info.value.args[0]

    expectations = [
        ("@features:{ }", ["property:2"]),
        ("-@features:{ }", ["property:1", "property:3"]),
        ("@description:''", ["property:3"]),
        ("-@description:''", ["property:1", "property:2"]),
    ]
    for expression, expected_ids in expectations:
        res = client.ft().search(ids_only(expression))
        _assert_search_result(client, res, expected_ids)


def _assert_search_result(client, result, expected_doc_ids):
    """
    Make sure the result of a search is as expected, taking into account the
    RESP version being used.

    Document order is ignored: the returned ids are compared to the expected
    ids as sets, and the reported total must equal the number of expected ids.

    Args:
        client: The client the search was run on; passed to
            `is_resp2_connection` to decide how `result` is read.
        result: The search result. On a RESP2 connection it is read through
            `result.docs` and `result.total`; otherwise through
            `result["results"]` and `result["total_results"]`.
        expected_doc_ids: The ids of the documents the search should return.
    """
    if is_resp2_connection(client):
        assert {doc.id for doc in result.docs} == set(expected_doc_ids)
        assert result.total == len(expected_doc_ids)
    else:
        assert {doc["id"] for doc in result["results"]} == set(expected_doc_ids)
        assert result["total_results"] == len(expected_doc_ids)

0 comments on commit 5115145

Please sign in to comment.