From 18aa5afa7c94333bbc7e8bd2de83101f59336e6f Mon Sep 17 00:00:00 2001 From: AndyKilmory Date: Tue, 14 Jan 2025 17:02:12 +0000 Subject: [PATCH 1/4] basic fuzziness support --- dev/script/generate-config/service-config.js | 3 +++ media-api/app/lib/MediaApiConfig.scala | 8 ++++++++ .../app/lib/elasticsearch/QueryBuilder.scala | 18 +++++++++++++++--- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/dev/script/generate-config/service-config.js b/dev/script/generate-config/service-config.js index e9d58e72c7..d969844af3 100644 --- a/dev/script/generate-config/service-config.js +++ b/dev/script/generate-config/service-config.js @@ -142,6 +142,9 @@ function getMediaApiConfig(config) { |es6.replicas=${config.es6.replicas} |quota.store.key="rcs-quota.json" |security.cors.allowedOrigins="${getCorsAllowedOriginString(config)}" + |search.fuzziness={ + | enabled=true + |} |metrics.request.enabled=false |syndication.review.useRuntimeFieldsFix=true |`; diff --git a/media-api/app/lib/MediaApiConfig.scala b/media-api/app/lib/MediaApiConfig.scala index f6c36232a2..3bd7dc0d8a 100644 --- a/media-api/app/lib/MediaApiConfig.scala +++ b/media-api/app/lib/MediaApiConfig.scala @@ -31,6 +31,12 @@ class MediaApiConfig(resources: GridConfigResources) extends CommonConfigWithEla val cloudFrontPrivateKeyBucketKey: Option[String] = stringOpt("cloudfront.private-key.key") val cloudFrontKeyPairId: Option[String] = stringOpt("cloudfront.keypair.id") + val fuzzySearchEnabled: Boolean = boolean("search.fuzziness.enabled") + val fuzzySearchEditDistance: String = stringOpt("search.fuzziness.editDistance") match { + case Some(editDistance) if convertToInt(editDistance).isDefined => editDistance + case None => "AUTO" + } + val rootUri: String = services.apiBaseUri val kahunaUri: String = services.kahunaBaseUri val cropperUri: String = services.cropperBaseUri @@ -63,4 +69,6 @@ class MediaApiConfig(resources: GridConfigResources) extends CommonConfigWithEla val restrictDownload: Boolean = boolean("restrictDownload") + private def convertToInt(s: String): Option[Int] = Try { s.toInt }.toOption + } diff --git a/media-api/app/lib/elasticsearch/QueryBuilder.scala b/media-api/app/lib/elasticsearch/QueryBuilder.scala index 96fd6971ff..5a711f921a 100644 --- a/media-api/app/lib/elasticsearch/QueryBuilder.scala +++ b/media-api/app/lib/elasticsearch/QueryBuilder.scala @@ -26,10 +26,22 @@ class QueryBuilder(matchFields: Seq[String], overQuotaAgencies: () => List[Agenc private def multiMatchPhraseQuery(value: String, fields: Seq[String]): MultiMatchQuery = ElasticDsl.multiMatchQuery(value).fields(fields).matchType(MultiMatchQueryBuilderType.PHRASE) + private def multiMatchWordQuery(value: String, fields: Seq[String]): MultiMatchQuery = { + val multiMatchQuery = ElasticDsl.multiMatchQuery(value).fields(fields).operator(Operator.AND) + + if (config.fuzzySearchEnabled) { + multiMatchQuery.matchType(MultiMatchQueryBuilderType.BEST_FIELDS).fuzziness(config.fuzzySearchEditDistance) + } else { + multiMatchQuery.matchType(MultiMatchQueryBuilderType.CROSS_FIELDS) + } + } + private def makeMultiQuery(value: Value, fields: Seq[String]): MultiMatchQuery = value match { - case Words(value) => ElasticDsl.multiMatchQuery(value).fields(fields). - operator(Operator.AND). - matchType(MultiMatchQueryBuilderType.CROSS_FIELDS) + //*** moving to possibility of fuzzy search *** + //case Words(value) => ElasticDsl.multiMatchQuery(value).fields(fields). + // operator(Operator.AND). + // matchType(MultiMatchQueryBuilderType.CROSS_FIELDS) + case Words(value) => multiMatchWordQuery(value, fields) case Phrase(string) => multiMatchPhraseQuery(string, fields) // That's OK, we only do date queries on a single field at a time case e => throw InvalidQuery(s"Cannot do multiQuery on $e") From 00d570da923b4ff20b7afb5b014dbafc6e4ffaf3 Mon Sep 17 00:00:00 2001 From: AndyKilmory Date: Tue, 21 Jan 2025 16:36:29 +0000 Subject: [PATCH 2/4] extend range of fuzzy parameters --- media-api/app/lib/MediaApiConfig.scala | 11 ++++++++++- media-api/app/lib/elasticsearch/QueryBuilder.scala | 5 ++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/media-api/app/lib/MediaApiConfig.scala b/media-api/app/lib/MediaApiConfig.scala index 3bd7dc0d8a..86dbea6658 100644 --- a/media-api/app/lib/MediaApiConfig.scala +++ b/media-api/app/lib/MediaApiConfig.scala @@ -32,9 +32,18 @@ class MediaApiConfig(resources: GridConfigResources) extends CommonConfigWithEla val cloudFrontKeyPairId: Option[String] = stringOpt("cloudfront.keypair.id") val fuzzySearchEnabled: Boolean = boolean("search.fuzziness.enabled") + val fuzzySearchPrefixLength: Int = stringOpt("search.fuzziness.prefixLength") match { + case Some(prefixLength) if convertToInt(prefixLength).isDefined => prefixLength.toInt + case _ => 1 + } val fuzzySearchEditDistance: String = stringOpt("search.fuzziness.editDistance") match { case Some(editDistance) if convertToInt(editDistance).isDefined => editDistance - case None => "AUTO" + case Some(editDistance) if editDistance.contains("AUTO:") => editDistance //<- for non-default AUTO word boundaries + case _ => "AUTO" + } + val fuzzyMaxExpansions: Int = stringOpt("search.fuzziness.maxExpansions") match { + case Some(maxExpansions) if convertToInt(maxExpansions).isDefined => maxExpansions.toInt + case _ => 50 } val rootUri: String = services.apiBaseUri diff --git a/media-api/app/lib/elasticsearch/QueryBuilder.scala b/media-api/app/lib/elasticsearch/QueryBuilder.scala index 5a711f921a..bc5c490d8c 100644 --- a/media-api/app/lib/elasticsearch/QueryBuilder.scala +++ b/media-api/app/lib/elasticsearch/QueryBuilder.scala @@ -30,7 +30,10 @@ class QueryBuilder(matchFields: Seq[String], overQuotaAgencies: () => List[Agenc val multiMatchQuery = ElasticDsl.multiMatchQuery(value).fields(fields).operator(Operator.AND) if (config.fuzzySearchEnabled) { - multiMatchQuery.matchType(MultiMatchQueryBuilderType.BEST_FIELDS).fuzziness(config.fuzzySearchEditDistance) + multiMatchQuery.matchType(MultiMatchQueryBuilderType.BEST_FIELDS) + .fuzziness(config.fuzzySearchEditDistance) + .maxExpansions(config.fuzzyMaxExpansions) + .prefixLength(config.fuzzySearchPrefixLength) } else { multiMatchQuery.matchType(MultiMatchQueryBuilderType.CROSS_FIELDS) } From 303626c4cdcbbf5bd261ce548d0f74be33e01605 Mon Sep 17 00:00:00 2001 From: AndyKilmory Date: Wed, 29 Jan 2025 17:38:14 +0000 Subject: [PATCH 3/4] remove unwanted comments --- media-api/app/lib/elasticsearch/QueryBuilder.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/media-api/app/lib/elasticsearch/QueryBuilder.scala b/media-api/app/lib/elasticsearch/QueryBuilder.scala index f390ac2537..e4acb1492f 100644 --- a/media-api/app/lib/elasticsearch/QueryBuilder.scala +++ b/media-api/app/lib/elasticsearch/QueryBuilder.scala @@ -40,10 +40,6 @@ class QueryBuilder(matchFields: Seq[String], overQuotaAgencies: () => List[Agenc } private def makeMultiQuery(value: Value, fields: Seq[String]): MultiMatchQuery = value match { - //*** moving to possibility of fuzzy search *** - //case Words(value) => ElasticDsl.multiMatchQuery(value).fields(fields). - // operator(Operator.AND). - // matchType(MultiMatchQueryBuilderType.CROSS_FIELDS) case Words(value) => multiMatchWordQuery(value, fields) case Phrase(string) => multiMatchPhraseQuery(string, fields) // That's OK, we only do date queries on a single field at a time From c43513a02766daf31dae6e18a47dc46a5e81e178 Mon Sep 17 00:00:00 2001 From: AndyKilmory Date: Thu, 30 Jan 2025 11:29:42 +0000 Subject: [PATCH 4/4] add unit test for basic fuzzy search config --- .../lib/elasticsearch/QueryBuilderTest.scala | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/media-api/test/lib/elasticsearch/QueryBuilderTest.scala b/media-api/test/lib/elasticsearch/QueryBuilderTest.scala index c562b5a2a9..ace13ce300 100644 --- a/media-api/test/lib/elasticsearch/QueryBuilderTest.scala +++ b/media-api/test/lib/elasticsearch/QueryBuilderTest.scala @@ -23,8 +23,10 @@ class QueryBuilderTest extends AnyFunSpec with Matchers with ConditionFixtures w val matchFields: Seq[String] = Seq("afield", "anothermatchfield") + private val commonConfigurations = USED_CONFIGS_IN_TEST ++ MOCK_CONFIG_KEYS.map(_ -> NOT_USED_IN_TEST).toMap + private val mediaApiConfig = new MediaApiConfig(GridConfigResources( - Configuration.from(USED_CONFIGS_IN_TEST ++ MOCK_CONFIG_KEYS.map(_ -> NOT_USED_IN_TEST).toMap), + Configuration.from(commonConfigurations), null, new ApplicationLifecycle { override def addStopHook(hook: () => Future[_]): Unit = {} @@ -132,6 +134,28 @@ class QueryBuilderTest extends AnyFunSpec with Matchers with ConditionFixtures w multiMatchClause.`type` shouldBe Some(MultiMatchQueryBuilderType.CROSS_FIELDS) } + it("any field words queries should be applied to all of the match fields with best fields type and fuzziness, operator and analyzers set") { + val mediaApiConfigWithFuzzySearch = new MediaApiConfig(GridConfigResources( + Configuration.from(commonConfigurations ++ Map("search.fuzziness.enabled" -> true)), + null, + new ApplicationLifecycle { + override def addStopHook(hook: () => Future[_]): Unit = {} + override def stop(): Future[_] = Future.successful(()) + } + )) + val queryBuilder = new QueryBuilder(matchFields, () => Nil, mediaApiConfigWithFuzzySearch) + val query = queryBuilder.makeQuery(List(anyFieldWordsCondition)).asInstanceOf[BoolQuery] + + query.must.size shouldBe 1 + val multiMatchClause = query.must.head.asInstanceOf[MultiMatchQuery] + multiMatchClause.text shouldBe "cats dogs" + multiMatchClause.fields.map(_.field) shouldBe matchFields + multiMatchClause.operator shouldBe Some(Operator.AND) + multiMatchClause.`type` shouldBe Some(MultiMatchQueryBuilderType.BEST_FIELDS) + multiMatchClause.fuzziness shouldBe defined + multiMatchClause.fuzziness shouldBe Some("AUTO") + } + it("multiple field queries should query against the requested fields only") { val query = queryBuilder.makeQuery(List(multipleFieldWordsCondition)).asInstanceOf[BoolQuery]