diff --git a/isic/core/api/image.py b/isic/core/api/image.py index a7093b6c..20fc4f1c 100644 --- a/isic/core/api/image.py +++ b/isic/core/api/image.py @@ -1,3 +1,5 @@ +from typing import cast + from django.conf import settings from django.http.request import HttpRequest from django.shortcuts import get_object_or_404 @@ -7,7 +9,7 @@ from ninja.pagination import paginate from pyparsing.exceptions import ParseException -from isic.core.dsl import parse_query +from isic.core.dsl import es_parser, parse_query from isic.core.models import Collection, Image from isic.core.pagination import CursorPagination from isic.core.permissions import get_visible_objects @@ -109,14 +111,16 @@ def search_images(request: HttpRequest, search: SearchQueryIn = Query(...)): @router.get("/facets/", response=dict, include_in_schema=False) def get_facets(request: HttpRequest, search: SearchQueryIn = Query(...)): + es_query: dict | None = None if search.query: try: - parse_query(search.query) + # we know it can't be a Q object because we're using es_parser + es_query = cast(dict | None, parse_query(es_parser, search.query)) except ParseException: raise ImageSearchParseError() query = build_elasticsearch_query( - search.query or "", + es_query or {}, request.user, search.collections, ) diff --git a/isic/core/dsl.py b/isic/core/dsl.py index 82d27dec..9f2f6e40 100644 --- a/isic/core/dsl.py +++ b/isic/core/dsl.py @@ -1,6 +1,7 @@ from __future__ import barry_as_FLUFL from dataclasses import dataclass +from typing import Any, Callable from django.db.models.query_utils import Q from isic_metadata import FIELD_REGISTRY @@ -20,12 +21,29 @@ class SearchTermKey: class Value: + value: Any + def to_q(self, key: SearchTermKey) -> Q: if self.value == "*": return Q(**{f"{key.field_lookup}__isnull": False}, _negated=key.negated) else: + if key.negated: + return ~Q(**{key.field_lookup: self.value}) | Q( + **{f"{key.field_lookup}__isnull": True} + ) return Q(**{key.field_lookup: self.value}, _negated=key.negated) + def to_es(self, key: SearchTermKey) -> dict: + if self.value == "*": + term = {"exists": {"field": key.field_lookup}} + else: + term = {"term": {key.field_lookup: self.value}} + + if key.negated: + return {"bool": {"must_not": term}} + else: + return term + class BoolValue(Value): def __init__(self, toks) -> None: @@ -52,12 +70,47 @@ def to_q(self, key: SearchTermKey) -> Q: if self.value == "*": return Q(**{f"{key.field_lookup}__isnull": False}, _negated=key.negated) if self.value.startswith("*"): - return Q(**{f"{key.field_lookup}__endswith": self.value[1:]}, _negated=key.negated) + if key.negated: + return ~Q(**{f"{key.field_lookup}__startswith": self.value[1:]}) | Q( + **{f"{key.field_lookup}__isnull": True} + ) + else: + return Q(**{f"{key.field_lookup}__endswith": self.value[1:]}, _negated=key.negated) elif self.value.endswith("*"): - return Q(**{f"{key.field_lookup}__startswith": self.value[:-1]}, _negated=key.negated) + if key.negated: + return ~Q(**{f"{key.field_lookup}__startswith": self.value[:-1]}) | Q( + **{f"{key.field_lookup}__isnull": True} + ) + else: + return Q( + **{f"{key.field_lookup}__startswith": self.value[:-1]}, _negated=key.negated + ) else: return super().to_q(key) + def to_es(self, key: SearchTermKey) -> dict: + # Special casing for image type renaming, see + # https://linear.app/isic/issue/ISIC-138#comment-93029f64 + # TODO: Remove this once better error messages are put in place. + if key.field_lookup == "image_type" and self.value == "clinical": + self.value = "clinical: close-up" + elif key.field_lookup == "image_type" and self.value == "overview": + self.value = "clinical: overview" + + if self.value == "*": + term = {"exists": {"field": key.field_lookup}} + elif self.value.startswith("*"): + term = {"wildcard": {key.field_lookup: {"value": f"*{self.value[1:]}"}}} + elif self.value.endswith("*"): + term = {"wildcard": {key.field_lookup: {"value": f"{self.value[:-1]}*"}}} + else: + term = {"term": {key.field_lookup: self.value}} + + if key.negated: + return {"bool": {"must_not": term}} + else: + return term + class NumberValue(Value): def __init__(self, toks) -> None: @@ -79,7 +132,62 @@ def to_q(self, key: SearchTermKey) -> Q: f"{key.field_lookup}__{self.upper_lookup}", ) start_value, end_value = self.value - return Q(**{start_key: start_value}, **{end_key: end_value}, _negated=key.negated) + if key.negated: + return ~Q(**{start_key: start_value, end_key: end_value}) | Q( + **{f"{key.field_lookup}__isnull": True} + ) + else: + return Q(**{start_key: start_value}, **{end_key: end_value}, _negated=key.negated) + + def to_es(self, key: SearchTermKey) -> dict: + start_value, end_value = self.value + term = { + "range": { + key.field_lookup: { + self.lower_lookup: start_value, + self.upper_lookup: end_value, + } + } + } + + if key.negated: + return {"bool": {"must_not": term}} + else: + return term + + +def es_query(s, loc, toks): + key = toks[0] + value = toks[1] + return value.to_es(key) + + +def es_query_and(s, loc, toks): + ret = {"bool": {"filter": []}} + if isinstance(toks, ParseResults): + # Explicit ANDs come in as parse results + q_objects = toks.asList() + elif isinstance(toks[0], Q): + # Single search queries come in as a single Q object + q_objects = [toks[0]] + else: + raise Exception("Something went wrong") + + # Results can be nested one level + if isinstance(q_objects, list) and isinstance(q_objects[0], list): + q_objects = q_objects[0] + + for q in q_objects: + ret["bool"]["filter"].append(q) + + return ret + + +def es_query_or(s, loc, toks): + ret = {"bool": {"should": []}} + for tok in toks[0]: + ret["bool"]["should"].append(tok) + toks[0] = ret def q(s, loc, toks): @@ -167,63 +275,82 @@ def convert_term(s, loc, toks): return SearchTermKey(f"accession__metadata__{toks[0]}", negate) -def make_term_keyword(name): - return (Optional("-") + Keyword(name)).add_parse_action(convert_term) +def es_convert_term(s, loc, toks): + negate = False + + if len(toks) == 2 and toks[0] == "-": + negate = True + toks = toks[1:] + + if len(toks) > 1: + raise Exception("Something went wrong") + return SearchTermKey(toks[0], negate) -def make_term(name, values): - term = make_term_keyword(name) - term = term + Suppress(Literal(":")) + values - term.add_parse_action(q) - return term +def make_parser( + element=q, conjunctive=q_and, disjunctive=q_or, term_converter: Callable = convert_term +) -> ParserElement: + def make_term_keyword(name): + term = Optional("-") + Keyword(name) + if term_converter: + term.add_parse_action(term_converter) + return term -def make_number_term(keyword_name): - return make_term(keyword_name, number_range_value | number_value) + def make_term(name, values): + term = make_term_keyword(name) + term = term + Suppress(Literal(":")) + values + term.add_parse_action(element) + return term + def make_number_term(keyword_name): + return make_term(keyword_name, number_range_value | number_value) -def make_str_term(keyword_name): - return make_term(keyword_name, str_value) + def make_str_term(keyword_name): + return make_term(keyword_name, str_value) + def make_bool_term(keyword_name): + return make_term(keyword_name, bool_value) -def make_bool_term(keyword_name): - return make_term(keyword_name, bool_value) + # First setup reserved (special) search terms + TERMS = { + "isic_id": make_str_term("isic_id"), + "public": make_bool_term("public"), + "age_approx": make_number_term("age_approx"), + # TODO: use one_of to get this validating maybe? it's hard to get this to work + # with QuotedString + "copyright_license": make_str_term("copyright_license"), + } + for key, definition in FIELD_REGISTRY.items(): + if definition.get("search"): + es_property_type = definition["search"]["es_property"]["type"] -# First setup reserved (special) search terms -TERMS = { - "isic_id": make_str_term("isic_id"), - "public": make_bool_term("public"), - "age_approx": make_number_term("age_approx"), - # TODO: use one_of to get this validating maybe? it's hard to get this to work with QuotedString - "copyright_license": make_str_term("copyright_license"), -} + if es_property_type == "keyword": + term = make_str_term(key) + elif es_property_type == "boolean": + term = make_bool_term(key) + elif es_property_type in ["integer", "float"]: + term = make_number_term(key) + else: + raise Exception("Found unknown es property type") -for key, definition in FIELD_REGISTRY.items(): - if definition.get("search"): - es_property_type = definition["search"]["es_property"]["type"] + TERMS[key] = term - if es_property_type == "keyword": - term = make_str_term(key) - elif es_property_type == "boolean": - term = make_bool_term(key) - elif es_property_type in ["integer", "float"]: - term = make_number_term(key) - else: - raise Exception("Found unknown es property type") + parser = OneOrMore(Or(TERMS.values())).add_parse_action(conjunctive) - TERMS[key] = term + # TODO: ZeroOrMore? + return infixNotation( + parser, [(AND, 2, opAssoc.LEFT, conjunctive), (OR, 2, opAssoc.LEFT, disjunctive)] + ) -parser = OneOrMore(Or(TERMS.values())).add_parse_action(q_and) -# TODO: ZeroOrMore? -e = infixNotation(parser, [(AND, 2, opAssoc.LEFT, q_and), (OR, 2, opAssoc.LEFT, q_or)]) +django_parser = make_parser() +es_parser = make_parser(es_query, es_query_and, es_query_or, es_convert_term) # Takes ~16ms to parse a fairly complex query -def parse_query(query) -> Q: - parse_results = e.parse_string(query, parse_all=True) +def parse_query(parser, query) -> Q | dict | None: + parse_results = parser.parse_string(query, parse_all=True) if parse_results: return parse_results[0] - else: - return Q() diff --git a/isic/core/models/image.py b/isic/core/models/image.py index cdf88453..954547fe 100644 --- a/isic/core/models/image.py +++ b/isic/core/models/image.py @@ -12,7 +12,7 @@ from django.urls import reverse from django_extensions.db.models import TimeStampedModel -from isic.core.dsl import parse_query +from isic.core.dsl import django_parser, parse_query from isic.core.models.base import CreationSortedTimeStampedModel from isic.ingest.models import Accession @@ -24,7 +24,7 @@ def from_search_query(self, query: str): if query == "": return self else: - return self.filter(parse_query(query)) + return self.filter(parse_query(django_parser, query) or Q()) def with_elasticsearch_properties(self): return self.select_related("accession__cohort").annotate( diff --git a/isic/core/search.py b/isic/core/search.py index a993166a..514e8f00 100644 --- a/isic/core/search.py +++ b/isic/core/search.py @@ -170,10 +170,10 @@ def facets(query: dict | None = None, collections: list[int] | None = None) -> d def build_elasticsearch_query( - query: str, user: User, collection_pks: list[int] | None = None + query: dict, user: User, collection_pks: list[int] | None = None ) -> dict: """ - Build an elasticsearch query from a DSL query string, a user, and collection ids. + Build an elasticsearch query from an elasticsearch query body, a user, and collection ids. collection_pks is the confusing bit here. None indicates the user doesn't want to do any filtering of collections. An empty list would instead indicate that the user wants images that @@ -190,15 +190,14 @@ def build_elasticsearch_query( else: visible_collection_pks = None - query_dict = {"bool": {}} + if query: + query_dict = {"bool": {"filter": [query]}} + else: + query_dict = {"bool": {}} if visible_collection_pks is not None: - query_dict["bool"].setdefault("filter", {}) - query_dict["bool"]["filter"]["terms"] = {"collections": visible_collection_pks} - - if query: - query_dict["bool"].setdefault("must", {}) - query_dict["bool"]["must"]["query_string"] = {"query": query} + query_dict["bool"].setdefault("filter", []) + query_dict["bool"]["filter"].append({"terms": {"collections": visible_collection_pks}}) # Note: permissions here must be also modified in ImagePermissions.view_image_list if user.is_anonymous: diff --git a/isic/core/tests/test_dsl.py b/isic/core/tests/test_dsl.py index 74558687..b3dbac47 100644 --- a/isic/core/tests/test_dsl.py +++ b/isic/core/tests/test_dsl.py @@ -2,7 +2,9 @@ from pyparsing.exceptions import ParseException import pytest -from isic.core.dsl import parse_query +from isic.core.dsl import django_parser, es_parser, parse_query + +# test null problem @pytest.mark.parametrize( @@ -22,19 +24,34 @@ ["age_approx:[50 TO *]", ParseException], ["-melanocytic:*", ~Q(accession__metadata__melanocytic__isnull=False)], ["melanocytic:*", Q(accession__metadata__melanocytic__isnull=False)], - ["-age_approx:50", ~Q(accession__metadata__age__approx=50)], - ["-diagnosis:foo*", ~Q(accession__metadata__diagnosis__startswith="foo")], + [ + "-age_approx:50", + ~Q(accession__metadata__age__approx=50) + | Q(accession__metadata__age__approx__isnull=True), + ], + [ + "-diagnosis:foo*", + ~Q(accession__metadata__diagnosis__startswith="foo") + | Q(accession__metadata__diagnosis__isnull=True), + ], [ "-age_approx:[50 TO 70]", - ~Q(accession__metadata__age__approx__gte=50, accession__metadata__age__approx__lte=70), + ~Q(accession__metadata__age__approx__gte=50, accession__metadata__age__approx__lte=70) + | Q(accession__metadata__age__approx__isnull=True), ], [ "-diagnosis:foobar OR (diagnosis:foobaz AND (-diagnosis:foo* OR age_approx:50))", - ~Q(accession__metadata__diagnosis="foobar") + ( + ~Q(accession__metadata__diagnosis="foobar") + | Q(accession__metadata__diagnosis__isnull=True) + ) | ( Q(accession__metadata__diagnosis="foobaz") & ( - ~Q(accession__metadata__diagnosis__startswith="foo") + ( + ~Q(accession__metadata__diagnosis__startswith="foo") + | Q(accession__metadata__diagnosis__isnull=True) + ) | Q(accession__metadata__age__approx=50) ) ), @@ -106,12 +123,68 @@ ], ], ) -def test_dsl_parser(query, filter_or_exception): +def test_dsl_django_parser(query, filter_or_exception): if isinstance(filter_or_exception, Q): - assert parse_query(query) == filter_or_exception + assert parse_query(django_parser, query) == filter_or_exception else: with pytest.raises(filter_or_exception): - parse_query(query) + parse_query(django_parser, query) + + +@pytest.mark.parametrize( + "query,filter", + [ + [ + "isic_id:ISIC_123*", + {"bool": {"filter": [{"wildcard": {"isic_id": {"value": "ISIC_123*"}}}]}}, + ], + [ + "-isic_id:*", + {"bool": {"filter": [{"bool": {"must_not": {"exists": {"field": "isic_id"}}}}]}}, + ], + [ + "diagnosis:foobar OR (diagnosis:foobaz AND (diagnosis:foo* OR age_approx:50))", + { + "bool": { + "should": [ + {"bool": {"filter": [{"term": {"diagnosis": "foobar"}}]}}, + { + "bool": { + "filter": [ + {"bool": {"filter": [{"term": {"diagnosis": "foobaz"}}]}}, + { + "bool": { + "should": [ + { + "bool": { + "filter": [ + { + "wildcard": { + "diagnosis": {"value": "foo*"} + } + } + ] + } + }, + { + "bool": { + "filter": [{"term": {"age_approx": 50}}] + } + }, + ] + } + }, + ] + } + }, + ] + } + }, + ], + ], +) +def test_dsl_es_parser(query, filter): + assert parse_query(es_parser, query) == filter @pytest.mark.parametrize( @@ -122,4 +195,4 @@ def test_dsl_parser(query, filter_or_exception): ], ) def test_dsl_image_type_backwards_compatible(query, filter): - assert parse_query(query) == filter + assert parse_query(django_parser, query) == filter diff --git a/isic/settings.py b/isic/settings.py index a8a035f2..33cb499c 100644 --- a/isic/settings.py +++ b/isic/settings.py @@ -155,7 +155,7 @@ class DevelopmentConfiguration(IsicMixin, DevelopmentBaseConfiguration): # Development-only settings SHELL_PLUS_IMPORTS = [ "from django.core.files.uploadedfile import UploadedFile", - "from isic.core.dsl import parse_query", + "from isic.core.dsl import *", "from isic.core.search import *", "from isic.core.tasks import *", "from isic.ingest.services.cohort import *",