From 4b44592811b834f3f35c535c3a7778d0aebf483a Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 12:28:34 -0700 Subject: [PATCH] [Backport 2.x] [Derived Fields] Add aggregation support for derived fields (#15009) * [Derived Fields] Add aggregation support for derived fields (#14618) * Add aggregation support for derived fields Signed-off-by: Marc Handalian * add unit test for a terms agg with derived fields Signed-off-by: Marc Handalian * Fix license header and add changelog entry Signed-off-by: Marc Handalian * move matrix_stats tests to aggs-matrix-stats module Signed-off-by: Marc Handalian * Move matrix tests back and add dependency to painless module Signed-off-by: Marc Handalian * add tests for all aggregations types and support ip_range Signed-off-by: Marc Handalian * Add tests for agg script returned from DerivedFieldType Signed-off-by: Marc Handalian * remove children aggs test as its not yet supported Signed-off-by: Marc Handalian * Add more tests Signed-off-by: Marc Handalian * fix changelog Signed-off-by: Marc Handalian --------- Signed-off-by: Marc Handalian (cherry picked from commit e26608b1492a8c1dcc61b8f3965564a40b3c0401) Signed-off-by: github-actions[bot] * Fix derived field tests for percentile ranks. (#15015) These tests fail to backport to 2.x becuase 2.x uses a different branch of tdigest that computes percentiles differently. Rather than chase these over time, change the assertions to check for the length of results returned instead of their values. Signed-off-by: Marc Handalian (cherry picked from commit 0cde7baf438e6ab994114b71dd9fadcacac4e443) Signed-off-by: Marc Handalian --------- Signed-off-by: Marc Handalian Signed-off-by: github-actions[bot] Co-authored-by: github-actions[bot] Co-authored-by: Marc Handalian --- CHANGELOG.md | 1 + modules/lang-painless/build.gradle | 1 + .../derived_fields/60_derived_field_aggs.yml | 1515 +++++++++++++++++ .../index/mapper/DerivedFieldType.java | 75 +- .../index/mapper/ObjectDerivedFieldType.java | 31 +- .../support/ValuesSourceConfig.java | 9 +- .../support/values/ScriptBytesValues.java | 7 +- .../index/mapper/DerivedFieldTypeTests.java | 69 + .../terms/DerivedFieldAggregationTests.java | 146 ++ .../support/ValuesSourceConfigTests.java | 39 + 10 files changed, 1883 insertions(+), 10 deletions(-) create mode 100644 modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/derived_fields/60_derived_field_aggs.yml create mode 100644 server/src/test/java/org/opensearch/search/aggregations/bucket/terms/DerivedFieldAggregationTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cd0dec96b836..6284bea89c335 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased 2.x] ### Added - Fix for hasInitiatedFetching to fix allocation explain and manual reroute APIs (([#14972](https://github.com/opensearch-project/OpenSearch/pull/14972)) +- Add basic aggregation support for derived fields ([#14618](https://github.com/opensearch-project/OpenSearch/pull/14618)) ### Dependencies - Bump `org.apache.commons:commons-lang3` from 3.14.0 to 3.15.0 ([#14861](https://github.com/opensearch-project/OpenSearch/pull/14861)) diff --git a/modules/lang-painless/build.gradle b/modules/lang-painless/build.gradle index a8e95911bbd7b..71e922392e39d 100644 --- a/modules/lang-painless/build.gradle +++ b/modules/lang-painless/build.gradle @@ -46,6 +46,7 @@ ext { testClusters.all { module ':modules:mapper-extras' + module ':modules:aggs-matrix-stats' systemProperty 'opensearch.scripting.update.ctx_in_params', 'false' // TODO: remove this once cname is prepended to transport.publish_address by default in 8.0 systemProperty 'opensearch.transport.cname_in_publish_address', 'true' diff --git a/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/derived_fields/60_derived_field_aggs.yml b/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/derived_fields/60_derived_field_aggs.yml new file mode 100644 index 0000000000000..87c260ce5f308 --- /dev/null +++ b/modules/lang-painless/src/yamlRestTest/resources/rest-api-spec/test/painless/derived_fields/60_derived_field_aggs.yml @@ -0,0 +1,1515 @@ +--- +setup: +- skip: + version: " - 2.14.99" + reason: "derived_field feature was added in 2.15" + +# -- NOT SUPPORTED: -- +# geobounds +# scripted metric +# -- NOT SUPPORTED: -- +# Any geo agg +# sig terms/text + +- do: + indices.create: + index: test + body: + mappings: + properties: + text: + type: text + keyword: + type: keyword + os: + type: keyword + long: + type: long + float: + type: float + double: + type: double + date: + type: date + geo: + type: geo_point + ip: + type: ip + boolean: + type: boolean + array_of_long: + type: long + json_field: + type: text + derived: + derived_text: + type: text + script: "emit(params._source[\"text\"])" + derived_text_prefilter_field: + type: text + script: "emit(params._source[\"text\"])" + prefilter_field: "text" + derived_keyword: + type: keyword + script: "emit(params._source[\"keyword\"])" + derived_os: + type: keyword + script: "emit(params._source[\"os\"])" + derived_long: + type: long + script: "emit(params._source[\"long\"])" + derived_float: + type: float + script: "emit(params._source[\"float\"])" + derived_double: + type: double + script: "emit(params._source[\"double\"])" + derived_date: + type: date + script: "emit(ZonedDateTime.parse(params._source[\"date\"]).toInstant().toEpochMilli())" + derived_geo: + type: geo_point + script: "emit(params._source[\"geo\"][0], params._source[\"geo\"][1])" + derived_ip: + type: ip + script: "emit(params._source[\"ip\"])" + derived_boolean: + type: boolean + script: "emit(params._source[\"boolean\"])" + derived_array_of_long: + type: long + script: "emit(params._source[\"array_of_long\"][0]);emit(params._source[\"array_of_long\"][1]);" + derived_object: + type: object + properties: + keyword: keyword + ip: ip + os: keyword + script: "emit(params._source[\"json_field\"])" + prefilter_field: "json_field" + +- do: + bulk: + refresh: true + body: + - index: + _index: test + _id: 1 + - text: "peter piper" + keyword: "foo" + os: "mac" + long: 1 + float: 1.0 + double: 1.0 + date: "2017-01-01T00:00:00Z" + geo: [ -74.0060, 40.7128 ] + ip: "192.168.0.1" + boolean: true + array_of_long: [ 1, 2 ] + json_field: "{\"text\":\"peter piper\",\"keyword\":\"foo\",\"os\":\"mac\",\"long\":1,\"float\":1.0,\"double\":1.0,\"date\":\"2017-01-01T00:00:00Z\",\"ip\":\"192.168.0.1\",\"boolean\":true, \"array_of_long\": [1, 2]}" + - index: + _index: test + _id: 2 + - text: "piper picked a peck" + keyword: "bar" + os: "windows" + long: 2 + float: 2.0 + double: 2.0 + date: "2017-01-02T00:00:00Z" + geo: [ -118.2437, 34.0522 ] + ip: "10.0.0.1" + boolean: false + array_of_long: [ 2, 3 ] + json_field: "{\"keyword\":\"bar\",\"long\":2,\"float\":2.0,\"os\":\"windows\",\"double\":2.0,\"date\":\"2017-01-02T00:00:00Z\",\"ip\":\"10.0.0.1\",\"boolean\":false, \"array_of_long\": [2, 3]}" + - index: + _index: test + _id: 3 + - text: "peck of pickled peppers" + keyword: "baz" + os: "mac" + long: -3 + float: -3.0 + double: -3.0 + date: "2017-01-03T00:00:00Z" + geo: [ -87.6298, 41.87 ] + ip: "172.16.0.1" + boolean: true + array_of_long: [ 3, 4 ] + json_field: "{\"keyword\":\"baz\",\"long\":-3,\"float\":-3.0,\"os\":\"mac\",\"double\":-3.0,\"date\":\"2017-01-03T00:00:00Z\",\"ip\":\"172.16.0.1\",\"boolean\":true, \"array_of_long\": [3, 4]}" + - index: + _index: test + _id: 4 + - text: "pickled peppers" + keyword: "qux" + os: "windows" + long: 4 + float: 4.0 + double: 4.0 + date: "2017-01-04T00:00:00Z" + geo: [ -74.0060, 40.7128 ] + ip: "192.168.0.2" + boolean: false + array_of_long: [ 4, 5 ] + json_field: "{\"keyword\":\"qux\",\"long\":4,\"float\":4.0,\"os\":\"windows\",\"double\":4.0,\"date\":\"2017-01-04T00:00:00Z\",\"ip\":\"192.168.0.2\",\"boolean\":false, \"array_of_long\": [4, 5]}" + - index: + _index: test + _id: 5 + - text: "peppers" + keyword: "quux" + os: "mac" + long: 5 + float: 5.0 + double: 5.0 + date: "2017-01-05T00:00:00Z" + geo: [ -87.6298, 41.87 ] + ip: "10.0.0.2" + boolean: true + array_of_long: [ 5, 6 ] + json_field: "{\"keyword\":\"quux\",\"long\":5,\"float\":5.0,\"os\":\"mac\",\"double\":5.0,\"date\":\"2017-01-05T00:00:00Z\",\"ip\":\"10.0.0.2\",\"boolean\":true, \"array_of_long\": [5, 6]}" + +- do: + indices.refresh: + index: [test] + +### BUCKET AGGS +--- +"Test terms aggregation on derived_keyword from search definition": +- do: + search: + index: test + body: + derived: + derived_keyword_search_definition: + type: keyword + script: "emit(params._source[\"keyword\"])" + size: 0 + aggs: + keywords: + terms: + field: derived_keyword_search_definition + +- match: { hits.total.value: 5 } +- length: { aggregations.keywords.buckets: 5 } +- match: { aggregations.keywords.buckets.0.key: "bar" } +- match: { aggregations.keywords.buckets.0.doc_count: 1 } +- match: { aggregations.keywords.buckets.1.key: "baz" } +- match: { aggregations.keywords.buckets.1.doc_count: 1 } +- match: { aggregations.keywords.buckets.2.key: "foo" } +- match: { aggregations.keywords.buckets.2.doc_count: 1 } +- match: { aggregations.keywords.buckets.3.key: "quux" } +- match: { aggregations.keywords.buckets.3.doc_count: 1 } +- match: { aggregations.keywords.buckets.4.key: "qux" } +- match: { aggregations.keywords.buckets.4.doc_count: 1 } + +--- +"Test terms aggregation on derived_keyword": +- do: + search: + index: test + body: + size: 0 + aggs: + keywords: + terms: + field: derived_keyword + +- match: { hits.total.value: 5 } +- length: { aggregations.keywords.buckets: 5 } +- match: { aggregations.keywords.buckets.0.key: "bar" } +- match: { aggregations.keywords.buckets.0.doc_count: 1 } +- match: { aggregations.keywords.buckets.1.key: "baz" } +- match: { aggregations.keywords.buckets.1.doc_count: 1 } +- match: { aggregations.keywords.buckets.2.key: "foo" } +- match: { aggregations.keywords.buckets.2.doc_count: 1 } +- match: { aggregations.keywords.buckets.3.key: "quux" } +- match: { aggregations.keywords.buckets.3.doc_count: 1 } +- match: { aggregations.keywords.buckets.4.key: "qux" } +- match: { aggregations.keywords.buckets.4.doc_count: 1 } + +--- +"Test range aggregation on derived_long": +- do: + search: + index: test + body: + size: 0 + aggs: + long_ranges: + range: + field: derived_long + ranges: + - to: 0 + - from: 0 + to: 3 + - from: 3 + +- match: { hits.total.value: 5 } +- length: { aggregations.long_ranges.buckets: 3 } +- match: { aggregations.long_ranges.buckets.0.doc_count: 1 } +- match: { aggregations.long_ranges.buckets.1.doc_count: 2 } +- match: { aggregations.long_ranges.buckets.2.doc_count: 2 } + +--- +"Test histogram aggregation on derived_float": +- do: + search: + index: test + body: + size: 0 + aggs: + float_histogram: + histogram: + field: derived_float + interval: 2 + +- match: { hits.total.value: 5 } +- length: { aggregations.float_histogram.buckets: 5 } +- match: { aggregations.float_histogram.buckets.0.key: -4.0 } +- match: { aggregations.float_histogram.buckets.0.doc_count: 1 } + +--- +"Test date_histogram aggregation on derived_date": +- do: + search: + index: test + body: + size: 0 + aggs: + date_histogram: + date_histogram: + field: derived_date + calendar_interval: day + +- match: { hits.total.value: 5 } +- length: { aggregations.date_histogram.buckets: 5 } +- match: { aggregations.date_histogram.buckets.0.key_as_string: "2017-01-01T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.0.doc_count: 1 } +- match: { aggregations.date_histogram.buckets.1.key_as_string: "2017-01-02T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.1.doc_count: 1 } +- match: { aggregations.date_histogram.buckets.2.key_as_string: "2017-01-03T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.2.doc_count: 1 } +- match: { aggregations.date_histogram.buckets.3.key_as_string: "2017-01-04T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.3.doc_count: 1 } +- match: { aggregations.date_histogram.buckets.4.key_as_string: "2017-01-05T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.4.doc_count: 1 } + +--- +"Test date_range aggregation on derived_date": +- do: + search: + index: test + body: + size: 0 + aggs: + date_range: + date_range: + field: derived_date + ranges: + - to: "2017-01-03T00:00:00Z" + - from: "2017-01-03T00:00:00Z" + +- match: { hits.total.value: 5 } +- match: { aggregations.date_range.buckets.0.key: "*-2017-01-03T00:00:00.000Z" } +- match: { aggregations.date_range.buckets.0.doc_count: 2 } +- match: { aggregations.date_range.buckets.1.key: "2017-01-03T00:00:00.000Z-*" } +- match: { aggregations.date_range.buckets.1.doc_count: 3 } + +--- +"Test filters aggregation on derived_boolean": +- do: + search: + index: test + body: + size: 0 + aggs: + boolean_filters: + filters: + filters: + true_values: + term: + derived_boolean: true + false_values: + term: + derived_boolean: false + +- match: { hits.total.value: 5 } +- match: { aggregations.boolean_filters.buckets.true_values.doc_count: 3 } +- match: { aggregations.boolean_filters.buckets.false_values.doc_count: 2 } + +--- +"Test adjacency matrix aggregation on derived_long": +- do: + search: + index: test + body: + size: 0 + aggs: + adj_matrix: + adjacency_matrix: + filters: + high_num: + range: + derived_long: + gte: 3 + low_num: + range: + derived_long: + lt: 3 +- match: { hits.total.value: 5 } +- length: { aggregations.adj_matrix.buckets: 2 } +- match: { aggregations.adj_matrix.buckets.0.key: "high_num" } +- match: { aggregations.adj_matrix.buckets.0.doc_count: 2 } +- match: { aggregations.adj_matrix.buckets.1.key: "low_num" } +- match: { aggregations.adj_matrix.buckets.1.doc_count: 3 } + +### METRIC AGGS + +--- +"Test stats aggregation on derived_array_of_long": +- do: + search: + index: test + body: + size: 0 + aggs: + long_array_stats: + stats: + field: derived_array_of_long + +- match: { hits.total.value: 5 } +- match: { aggregations.long_array_stats.count: 10 } +- match: { aggregations.long_array_stats.min: 1 } +- match: { aggregations.long_array_stats.max: 6 } +- match: { aggregations.long_array_stats.avg: 3.5 } +- match: { aggregations.long_array_stats.sum: 35 } + +--- +"Test cardinality aggregation on derived_keyword": +- do: + search: + index: test + body: + size: 0 + aggs: + unique_keywords: + cardinality: + field: derived_keyword + +- match: { hits.total.value: 5 } +- match: { aggregations.unique_keywords.value: 5 } + +--- +"Test percentiles aggregation on derived_double": +- do: + search: + index: test + body: + size: 0 + aggs: + double_percentiles: + percentiles: + field: derived_double + percents: [ 25, 50, 75 ] + +- match: { hits.total.value: 5 } +- length: { aggregations.double_percentiles.values: 3} + +--- +"Test percentile ranks aggregation on derived_long": +- do: + search: + index: test + body: + size: 0 + aggs: + long_percentile_ranks: + percentile_ranks: + field: derived_long + values: [ 2, 4 ] + +- match: { hits.total.value: 5 } +- length: { aggregations.long_percentile_ranks.values: 2} + +--- +"Test top hits aggregation on derived_keyword": +- do: + search: + index: test + body: + size: 0 + aggs: + top_keywords: + terms: + field: derived_keyword + aggs: + top_hits: + top_hits: + size: 1 +- match: { hits.total.value: 5 } +- length: { aggregations.top_keywords.buckets: 5 } +- match: { aggregations.top_keywords.buckets.0.key: "bar" } +- match: { aggregations.top_keywords.buckets.0.doc_count: 1 } +- length: { aggregations.top_keywords.buckets.0.top_hits.hits.hits: 1 } + +--- +"Test matrix stats aggregation on derived_long and float": +- do: + search: + index: test + body: + size: 0 + aggs: + matrix_stats: + matrix_stats: + fields: [ derived_long, derived_float ] +- match: { hits.total.value: 5 } +- length: { aggregations.matrix_stats.fields: 2 } +- match: { aggregations.matrix_stats.fields.0.name: "derived_float" } +- match: { aggregations.matrix_stats.fields.0.count: 5 } +- match: { aggregations.matrix_stats.fields.1.name: "derived_long" } +- match: { aggregations.matrix_stats.fields.1.count: 5 } + +--- +"Test median absolute deviation aggregation on derived_long": +- do: + search: + index: test + body: + size: 0 + aggs: + mad_long: + median_absolute_deviation: + field: derived_long +- match: { hits.total.value: 5 } +- match: { aggregations.mad_long.value: 2.0 } + +## Pipeline agg +--- +"Test simple pipeline agg with derived_keyword and long": +- do: + search: + index: test + body: + size: 0 + aggs: + keywords: + terms: + field: derived_keyword + aggs: + sum_derived_longs: + sum: + field: derived_long + sum_total: + sum_bucket: + buckets_path: "keywords>sum_derived_longs" +- match: { hits.total.value: 5 } +- match: { aggregations.keywords.buckets.0.key: "bar" } +- match: { aggregations.keywords.buckets.0.sum_derived_longs.value: 2 } +- match: { aggregations.keywords.buckets.1.key: "baz" } +- match: { aggregations.keywords.buckets.1.sum_derived_longs.value: -3 } +- match: { aggregations.keywords.buckets.2.key: "foo" } +- match: { aggregations.keywords.buckets.2.sum_derived_longs.value: 1 } +- match: { aggregations.keywords.buckets.3.key: "quux" } +- match: { aggregations.keywords.buckets.3.sum_derived_longs.value: 5 } +- match: { aggregations.keywords.buckets.4.key: "qux" } +- match: { aggregations.keywords.buckets.4.sum_derived_longs.value: 4 } +- match: { aggregations.sum_total.value: 9 } + + +--- +"Test terms aggregation on derived_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + ip_terms: + terms: + field: derived_ip + +- match: { hits.total.value: 5 } +- length: { aggregations.ip_terms.buckets: 5 } +- match: { aggregations.ip_terms.buckets.0.key: "10.0.0.1" } +- match: { aggregations.ip_terms.buckets.0.doc_count: 1 } + +--- +"Test range aggregation on derived_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + ip_ranges: + ip_range: + field: derived_ip + ranges: + - to: "10.0.0.0" + - from: "10.0.0.0" + to: "172.16.0.0" + - from: "172.16.0.0" + +- match: { hits.total.value: 5 } +- length: { aggregations.ip_ranges.buckets: 3 } +- match: { aggregations.ip_ranges.buckets.0.doc_count: 0 } +- match: { aggregations.ip_ranges.buckets.1.doc_count: 2 } +- match: { aggregations.ip_ranges.buckets.2.doc_count: 3 } + +--- +"Test cardinality aggregation on derived_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + unique_ips: + cardinality: + field: derived_ip + +- match: { hits.total.value: 5 } +- match: { aggregations.unique_ips.value: 5 } + +--- +"Test missing aggregation on derived_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + missing_ips: + missing: + field: derived_ip + +- match: { hits.total.value: 5 } +- match: { aggregations.missing_ips.doc_count: 0 } + +--- +"Test value count aggregation on derived_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + ip_count: + value_count: + field: derived_ip + +- match: { hits.total.value: 5 } +- match: { aggregations.ip_count.value: 5 } + +--- +"Test composite agg": +- do: + search: + index: test + body: + size: 0 + aggs: + test_composite_agg: + composite: + size: 10 + sources: + - os: + terms: + field: derived_os + - keyword: + terms: + field: derived_keyword + - is_true: + terms: + field: derived_boolean + aggs: + avg_long: + avg: + field: derived_long +- match: { aggregations.test_composite_agg.buckets.0.key.os: "mac" } +- match: { aggregations.test_composite_agg.buckets.0.key.keyword: "baz" } +- match: { aggregations.test_composite_agg.buckets.0.key.is_true: true } +- match: { aggregations.test_composite_agg.buckets.0.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.0.avg_long.value: -3.0 } +- match: { aggregations.test_composite_agg.buckets.1.key.os: "mac" } +- match: { aggregations.test_composite_agg.buckets.1.key.keyword: "foo" } +- match: { aggregations.test_composite_agg.buckets.1.key.is_true: true } +- match: { aggregations.test_composite_agg.buckets.1.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.1.avg_long.value: 1.0 } +- match: { aggregations.test_composite_agg.buckets.2.key.os: "mac" } +- match: { aggregations.test_composite_agg.buckets.2.key.keyword: "quux" } +- match: { aggregations.test_composite_agg.buckets.2.key.is_true: true } +- match: { aggregations.test_composite_agg.buckets.2.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.2.avg_long.value: 5.0 } +- match: { aggregations.test_composite_agg.buckets.3.key.os: "windows" } +- match: { aggregations.test_composite_agg.buckets.3.key.keyword: "bar" } +- match: { aggregations.test_composite_agg.buckets.3.key.is_true: false } +- match: { aggregations.test_composite_agg.buckets.3.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.3.avg_long.value: 2.0 } +- match: { aggregations.test_composite_agg.buckets.4.key.os: "windows" } +- match: { aggregations.test_composite_agg.buckets.4.key.keyword: "qux" } +- match: { aggregations.test_composite_agg.buckets.4.key.is_true: false } +- match: { aggregations.test_composite_agg.buckets.4.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.4.avg_long.value: 4.0 } + +--- +"Test auto date histogram": +- do: + search: + rest_total_hits_as_int: true + index: test + body: + size: 0 + aggs: + test_auto_date_histogram: + auto_date_histogram: + field: "derived_date" + buckets: 10 + format: "yyyy-MM-dd" + aggs: + avg_long: + avg: + field: derived_long +- match: { hits.total: 5 } +- length: { aggregations.test_auto_date_histogram.buckets: 9 } +- match: { aggregations.test_auto_date_histogram.buckets.0.key_as_string: "2017-01-01"} +- match: { aggregations.test_auto_date_histogram.buckets.0.avg_long.value: 1.0} + +--- +"Test variable_width_histogram aggregation": +- do: + search: + index: test + body: + size: 0 + aggs: + var_width_hist: + variable_width_histogram: + field: derived_long + buckets: 3 + +- match: { hits.total.value: 5 } +- length: { aggregations.var_width_hist.buckets: 3 } + +--- +"Test extended_stats aggregation": +- do: + search: + index: test + body: + size: 0 + aggs: + extended_stats_agg: + extended_stats: + field: derived_long + +- match: { hits.total.value: 5 } +- match: { aggregations.extended_stats_agg.count: 5 } +- match: { aggregations.extended_stats_agg.min: -3 } +- match: { aggregations.extended_stats_agg.max: 5 } +- is_true: aggregations.extended_stats_agg.avg +- is_true: aggregations.extended_stats_agg.sum +- is_true: aggregations.extended_stats_agg.sum_of_squares +- is_true: aggregations.extended_stats_agg.variance +- is_true: aggregations.extended_stats_agg.std_deviation + +--- +"Test rare_terms aggregation": +- do: + search: + index: test + body: + size: 0 + aggs: + rare_terms_agg: + rare_terms: + field: derived_keyword + max_doc_count: 1 + +- match: { hits.total.value: 5 } +- length: { aggregations.rare_terms_agg.buckets: 5 } + +--- +"Test global aggregation": +- do: + search: + index: test + body: + query: + term: + derived_keyword: "foo" + aggs: + all_docs: + global: {} + aggs: + avg_long: + avg: + field: derived_long + +- match: { hits.total.value: 1 } +- match: { aggregations.all_docs.doc_count: 5 } +- match: { aggregations.all_docs.avg_long.value: 1.8 } + +--- +"Test missing aggregation": +- do: + search: + index: test + body: + size: 0 + aggs: + missing_agg: + missing: + field: derived_keyword + +- match: { hits.total.value: 5 } +- match: { aggregations.missing_agg.doc_count: 0 } + +--- +"Test value_count aggregation": +- do: + search: + index: test + body: + size: 0 + aggs: + value_count_agg: + value_count: + field: derived_long + +- match: { hits.total.value: 5 } +- match: { aggregations.value_count_agg.value: 5 } + +--- +"Test weighted_avg aggregation": +- do: + search: + index: test + body: + size: 0 + aggs: + weighted_avg_agg: + weighted_avg: + value: + field: derived_long + weight: + field: derived_float + +- match: { hits.total.value: 5 } +- is_true: aggregations.weighted_avg_agg.value + +--- +"Test diversified_sampler aggregation": +- do: + search: + index: test + body: + size: 0 + aggs: + diversified_sampler_agg: + diversified_sampler: + field: derived_keyword + max_docs_per_value: 1 + aggs: + avg_long: + avg: + field: derived_long + +- match: { hits.total.value: 5 } +- match: { aggregations.diversified_sampler_agg.doc_count: 5 } +- match: { aggregations.diversified_sampler_agg.avg_long.value: 1.8 } + +--- +"Test sampler aggregation": +- do: + search: + index: test + body: + size: 0 + aggs: + sampler_agg: + sampler: + shard_size: 2 + aggs: + avg_long: + avg: + field: derived_long + +- match: { hits.total.value: 5 } +- is_true: aggregations.sampler_agg.doc_count +- is_true: aggregations.sampler_agg.avg_long.value + +--- +"Test multi_terms aggregation": +- do: + search: + index: test + body: + size: 0 + aggs: + multi_terms_agg: + multi_terms: + terms: + - field: derived_keyword + - field: derived_os + size: 10 + +- match: { hits.total.value: 5 } +- length: { aggregations.multi_terms_agg.buckets: 5 } + +#### SAME TESTS WITH DERIVED_OBJECT +--- +"Test terms aggregation on derived_object.keyword": +- do: + search: + index: test + body: + size: 0 + aggs: + keywords: + terms: + field: derived_object.keyword + +- match: { hits.total.value: 5 } +- length: { aggregations.keywords.buckets: 5 } +- match: { aggregations.keywords.buckets.0.key: "bar" } +- match: { aggregations.keywords.buckets.0.doc_count: 1 } +- match: { aggregations.keywords.buckets.1.key: "baz" } +- match: { aggregations.keywords.buckets.1.doc_count: 1 } +- match: { aggregations.keywords.buckets.2.key: "foo" } +- match: { aggregations.keywords.buckets.2.doc_count: 1 } +- match: { aggregations.keywords.buckets.3.key: "quux" } +- match: { aggregations.keywords.buckets.3.doc_count: 1 } +- match: { aggregations.keywords.buckets.4.key: "qux" } +- match: { aggregations.keywords.buckets.4.doc_count: 1 } + +--- +"Test range aggregation on derived_object.long": +- do: + search: + index: test + body: + size: 0 + aggs: + long_ranges: + range: + field: derived_object.long + ranges: + - to: 0 + - from: 0 + to: 3 + - from: 3 + +- match: { hits.total.value: 5 } +- length: { aggregations.long_ranges.buckets: 3 } +- match: { aggregations.long_ranges.buckets.0.doc_count: 1 } +- match: { aggregations.long_ranges.buckets.1.doc_count: 2 } +- match: { aggregations.long_ranges.buckets.2.doc_count: 2 } + +--- +"Test histogram aggregation on derived_object.float": +- do: + search: + index: test + body: + size: 0 + aggs: + float_histogram: + histogram: + field: derived_object.float + interval: 2 + +- match: { hits.total.value: 5 } +- length: { aggregations.float_histogram.buckets: 5 } +- match: { aggregations.float_histogram.buckets.0.key: -4.0 } +- match: { aggregations.float_histogram.buckets.0.doc_count: 1 } + +--- +"Test date_histogram aggregation on derived_object.date": +- do: + search: + index: test + body: + size: 0 + aggs: + date_histogram: + date_histogram: + field: derived_object.date + calendar_interval: day + +- match: { hits.total.value: 5 } +- length: { aggregations.date_histogram.buckets: 5 } +- match: { aggregations.date_histogram.buckets.0.key_as_string: "2017-01-01T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.0.doc_count: 1 } +- match: { aggregations.date_histogram.buckets.1.key_as_string: "2017-01-02T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.1.doc_count: 1 } +- match: { aggregations.date_histogram.buckets.2.key_as_string: "2017-01-03T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.2.doc_count: 1 } +- match: { aggregations.date_histogram.buckets.3.key_as_string: "2017-01-04T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.3.doc_count: 1 } +- match: { aggregations.date_histogram.buckets.4.key_as_string: "2017-01-05T00:00:00.000Z" } +- match: { aggregations.date_histogram.buckets.4.doc_count: 1 } + +--- +"Test date_range aggregation on derived_object.date": +- do: + search: + index: test + body: + size: 0 + aggs: + date_range: + date_range: + field: derived_object.date + ranges: + - to: "2017-01-03T00:00:00Z" + - from: "2017-01-03T00:00:00Z" + +- match: { hits.total.value: 5 } +- match: { aggregations.date_range.buckets.0.key: "*-2017-01-03T00:00:00.000Z" } +- match: { aggregations.date_range.buckets.0.doc_count: 2 } +- match: { aggregations.date_range.buckets.1.key: "2017-01-03T00:00:00.000Z-*" } +- match: { aggregations.date_range.buckets.1.doc_count: 3 } + +--- +"Test filters aggregation on derived_object.boolean": +- do: + search: + index: test + body: + size: 0 + aggs: + boolean_filters: + filters: + filters: + true_values: + term: + derived_object.boolean: true + false_values: + term: + derived_object.boolean: false + +- match: { hits.total.value: 5 } +- match: { aggregations.boolean_filters.buckets.true_values.doc_count: 3 } +- match: { aggregations.boolean_filters.buckets.false_values.doc_count: 2 } + +--- +"Test adjacency matrix aggregation on derived_object.long": +- do: + search: + index: test + body: + size: 0 + aggs: + adj_matrix: + adjacency_matrix: + filters: + high_num: + range: + derived_object.long: + gte: 3 + low_num: + range: + derived_object.long: + lt: 3 +- match: { hits.total.value: 5 } +- length: { aggregations.adj_matrix.buckets: 2 } +- match: { aggregations.adj_matrix.buckets.0.key: "high_num" } +- match: { aggregations.adj_matrix.buckets.0.doc_count: 2 } +- match: { aggregations.adj_matrix.buckets.1.key: "low_num" } +- match: { aggregations.adj_matrix.buckets.1.doc_count: 3 } + +--- +"Test stats aggregation on derived_object.array_of_long": +- do: + search: + index: test + body: + size: 0 + aggs: + long_array_stats: + stats: + field: derived_object.array_of_long + +- match: { hits.total.value: 5 } +- match: { aggregations.long_array_stats.count: 10 } +- match: { aggregations.long_array_stats.min: 1 } +- match: { aggregations.long_array_stats.max: 6 } +- match: { aggregations.long_array_stats.avg: 3.5 } +- match: { aggregations.long_array_stats.sum: 35 } + +--- +"Test cardinality aggregation on derived_object_keyword": +- do: + search: + index: test + body: + size: 0 + aggs: + unique_keywords: + cardinality: + field: derived_object.keyword + +- match: { hits.total.value: 5 } +- match: { aggregations.unique_keywords.value: 5 } + +--- +"Test percentiles aggregation on derived_object.double": +- do: + search: + index: test + body: + size: 0 + aggs: + double_percentiles: + percentiles: + field: derived_object.double + percents: [ 25, 50, 75 ] + +- match: { hits.total.value: 5 } +- length: { aggregations.double_percentiles.values: 3} + +--- +"Test percentile ranks aggregation on derived_object.long": +- do: + search: + index: test + body: + size: 0 + aggs: + long_percentile_ranks: + percentile_ranks: + field: derived_object.long + values: [ 2, 4 ] + +- match: { hits.total.value: 5 } +- length: { aggregations.long_percentile_ranks.values: 2} + +--- +"Test top hits aggregation on derived_object.keyword": +- do: + search: + index: test + body: + size: 0 + aggs: + top_keywords: + terms: + field: derived_object.keyword + aggs: + top_hits: + top_hits: + size: 1 +- match: { hits.total.value: 5 } +- length: { aggregations.top_keywords.buckets: 5 } +- match: { aggregations.top_keywords.buckets.0.key: "bar" } +- match: { aggregations.top_keywords.buckets.0.doc_count: 1 } +- length: { aggregations.top_keywords.buckets.0.top_hits.hits.hits: 1 } + +--- +"Test matrix stats aggregation on derived_object.long and float": +- do: + search: + index: test + body: + size: 0 + aggs: + matrix_stats: + matrix_stats: + fields: [ derived_object.long, derived_object.float ] +- match: { hits.total.value: 5 } +- length: { aggregations.matrix_stats.fields: 2 } +- match: { aggregations.matrix_stats.fields.0.name: "derived_object.long" } +- match: { aggregations.matrix_stats.fields.0.count: 5 } +- match: { aggregations.matrix_stats.fields.1.name: "derived_object.float" } +- match: { aggregations.matrix_stats.fields.1.count: 5 } + +--- +"Test median absolute deviation aggregation on derived_object.long": +- do: + search: + index: test + body: + size: 0 + aggs: + mad_long: + median_absolute_deviation: + field: derived_object.long +- match: { hits.total.value: 5 } +- match: { aggregations.mad_long.value: 2.0 } + +--- +"Test simple pipeline agg derived_object": +- do: + search: + index: test + body: + size: 0 + aggs: + keywords: + terms: + field: derived_object.keyword + aggs: + sum_derived_longs: + sum: + field: derived_object.long + sum_total: + sum_bucket: + buckets_path: "keywords>sum_derived_longs" +- match: { hits.total.value: 5 } +- match: { aggregations.keywords.buckets.0.key: "bar" } +- match: { aggregations.keywords.buckets.0.sum_derived_longs.value: 2 } +- match: { aggregations.keywords.buckets.1.key: "baz" } +- match: { aggregations.keywords.buckets.1.sum_derived_longs.value: -3 } +- match: { aggregations.keywords.buckets.2.key: "foo" } +- match: { aggregations.keywords.buckets.2.sum_derived_longs.value: 1 } +- match: { aggregations.keywords.buckets.3.key: "quux" } +- match: { aggregations.keywords.buckets.3.sum_derived_longs.value: 5 } +- match: { aggregations.keywords.buckets.4.key: "qux" } +- match: { aggregations.keywords.buckets.4.sum_derived_longs.value: 4 } +- match: { aggregations.sum_total.value: 9 } + + +--- +"Test composite agg on derived_object": +- do: + search: + index: test + body: + size: 0 + aggs: + test_composite_agg: + composite: + size: 10 + sources: + - os: + terms: + field: derived_object.os + - keyword: + terms: + field: derived_object.keyword + - is_true: + terms: + field: derived_object.boolean + aggs: + avg_long: + avg: + field: derived_object.long +- length: { aggregations.test_composite_agg.buckets: 5 } +- match: { aggregations.test_composite_agg.buckets.0.key.os: "mac" } +- match: { aggregations.test_composite_agg.buckets.0.key.keyword: "baz" } +- match: { aggregations.test_composite_agg.buckets.0.key.is_true: true } +- match: { aggregations.test_composite_agg.buckets.0.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.0.avg_long.value: -3.0 } +- match: { aggregations.test_composite_agg.buckets.1.key.os: "mac" } +- match: { aggregations.test_composite_agg.buckets.1.key.keyword: "foo" } +- match: { aggregations.test_composite_agg.buckets.1.key.is_true: true } +- match: { aggregations.test_composite_agg.buckets.1.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.1.avg_long.value: 1.0 } +- match: { aggregations.test_composite_agg.buckets.2.key.os: "mac" } +- match: { aggregations.test_composite_agg.buckets.2.key.keyword: "quux" } +- match: { aggregations.test_composite_agg.buckets.2.key.is_true: true } +- match: { aggregations.test_composite_agg.buckets.2.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.2.avg_long.value: 5.0 } +- match: { aggregations.test_composite_agg.buckets.3.key.os: "windows" } +- match: { aggregations.test_composite_agg.buckets.3.key.keyword: "bar" } +- match: { aggregations.test_composite_agg.buckets.3.key.is_true: false } +- match: { aggregations.test_composite_agg.buckets.3.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.3.avg_long.value: 2.0 } +- match: { aggregations.test_composite_agg.buckets.4.key.os: "windows" } +- match: { aggregations.test_composite_agg.buckets.4.key.keyword: "qux" } +- match: { aggregations.test_composite_agg.buckets.4.key.is_true: false } +- match: { aggregations.test_composite_agg.buckets.4.doc_count: 1 } +- match: { aggregations.test_composite_agg.buckets.4.avg_long.value: 4.0 } + +--- +"Test auto date histogram on derived_object": +- do: + search: + rest_total_hits_as_int: true + index: test + body: + size: 0 + aggs: + test_auto_date_histogram: + auto_date_histogram: + field: "derived_object.date" + buckets: 10 + format: "yyyy-MM-dd" + aggs: + avg_long: + avg: + field: derived_object.long +- match: { hits.total: 5 } +- length: { aggregations.test_auto_date_histogram.buckets: 9 } +- match: { aggregations.test_auto_date_histogram.buckets.0.key_as_string: "2017-01-01"} +- match: { aggregations.test_auto_date_histogram.buckets.0.avg_long.value: 1.0} + +--- +"Test variable_width_histogram aggregation on derived_object": +- do: + search: + index: test + body: + size: 0 + aggs: + var_width_hist: + variable_width_histogram: + field: derived_object.long + buckets: 3 + +- match: { hits.total.value: 5 } +- length: { aggregations.var_width_hist.buckets: 3 } + +--- +"Test extended_stats aggregation on derived_object": +- do: + search: + index: test + body: + size: 0 + aggs: + extended_stats_agg: + extended_stats: + field: derived_object.long + +- match: { hits.total.value: 5 } +- match: { aggregations.extended_stats_agg.count: 5 } +- match: { aggregations.extended_stats_agg.min: -3 } +- match: { aggregations.extended_stats_agg.max: 5 } +- is_true: aggregations.extended_stats_agg.avg +- is_true: aggregations.extended_stats_agg.sum +- is_true: aggregations.extended_stats_agg.sum_of_squares +- is_true: aggregations.extended_stats_agg.variance +- is_true: aggregations.extended_stats_agg.std_deviation + +--- +"Test rare_terms aggregation on derived_object": +- do: + search: + index: test + body: + size: 0 + aggs: + rare_terms_agg: + rare_terms: + field: derived_object.keyword + max_doc_count: 1 + +- match: { hits.total.value: 5 } +- length: { aggregations.rare_terms_agg.buckets: 5 } + +--- +"Test global aggregation on derived_object": +- do: + search: + index: test + body: + query: + term: + derived_object.keyword: "foo" + aggs: + all_docs: + global: {} + aggs: + avg_long: + avg: + field: derived_object.long + +- match: { hits.total.value: 1 } +- match: { aggregations.all_docs.doc_count: 5 } +- match: { aggregations.all_docs.avg_long.value: 1.8 } + +--- +"Test value_count aggregation on derived_object": +- do: + search: + index: test + body: + size: 0 + aggs: + value_count_agg: + value_count: + field: derived_object.long + +- match: { hits.total.value: 5 } +- match: { aggregations.value_count_agg.value: 5 } + +--- +"Test multi_terms aggregation on derived_object": +- do: + search: + index: test + body: + size: 0 + aggs: + multi_terms_agg: + multi_terms: + terms: + - field: derived_object.keyword + - field: derived_object.os + size: 10 + +- match: { hits.total.value: 5 } +- length: { aggregations.multi_terms_agg.buckets: 5 } + + +### IP specific tests +--- +"Test terms aggregation on derived_object_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + ip_terms: + terms: + field: derived_object.ip + +- match: { hits.total.value: 5 } +- length: { aggregations.ip_terms.buckets: 5 } +- match: { aggregations.ip_terms.buckets.0.key: "10.0.0.1" } +- match: { aggregations.ip_terms.buckets.0.doc_count: 1 } + +--- +"Test range aggregation on derived_object_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + ip_ranges: + ip_range: + field: derived_object.ip + ranges: + - to: "10.0.0.0" + - from: "10.0.0.0" + to: "172.16.0.0" + - from: "172.16.0.0" + +- match: { hits.total.value: 5 } +- length: { aggregations.ip_ranges.buckets: 3 } +- match: { aggregations.ip_ranges.buckets.0.doc_count: 0 } +- match: { aggregations.ip_ranges.buckets.1.doc_count: 2 } +- match: { aggregations.ip_ranges.buckets.2.doc_count: 3 } + +--- +"Test cardinality aggregation on derived_object_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + unique_ips: + cardinality: + field: derived_object.ip + +- match: { hits.total.value: 5 } +- match: { aggregations.unique_ips.value: 5 } + +--- +"Test missing aggregation on derived_object_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + missing_ips: + missing: + field: derived_object.ip + +- match: { hits.total.value: 5 } +- match: { aggregations.missing_ips.doc_count: 0 } + +--- +"Test value count aggregation on derived_object_ip": +- do: + search: + index: test + body: + size: 0 + aggs: + ip_count: + value_count: + field: derived_object.ip + +- match: { hits.total.value: 5 } +- match: { aggregations.ip_count.value: 5 } + +### TEST UNSUPPORTED AGG TYPES +--- +"Test sig terms not supported": +- do: + catch: /illegal_argument_exception/ + search: + rest_total_hits_as_int: true + index: test + body: + query: + terms: + derived_keyword: ["foo"] + aggs: + significant_os: + significant_terms: + field: "derived_os" + min_doc_count: 1 + size: 10 + +--- +"Test significant text": +- do: + catch: /illegal_argument_exception/ + search: + rest_total_hits_as_int: true + index: test + body: + query: + terms: + derived_keyword: ["foo"] + aggs: + significant_words: + significant_text: + field: "derived_text" + size: 10 + min_doc_count: 1 + +--- +"Test scripted_metric aggregation": +- do: + catch: /A document doesn't have a value for a field/ + search: + index: test + body: + size: 0 + aggs: + scripted_metric_agg: + scripted_metric: + init_script: "state.arr = []" + map_script: "state.arr.add(doc.derived_long.value)" + combine_script: "return 0" + reduce_script: "return 0" + +--- +"Test geo_distance aggregation on derived_geo": +- do: + catch: /aggregation_execution_exception/ + search: + index: test + rest_total_hits_as_int: true + body: + size: 0 + aggs: + distance: + geo_distance: + field: derived_geo + origin: "35.7796, -78.6382" + ranges: + - to: 1000000 + - from: 1000000 + to: 5000000 + - from: 5000000 diff --git a/server/src/main/java/org/opensearch/index/mapper/DerivedFieldType.java b/server/src/main/java/org/opensearch/index/mapper/DerivedFieldType.java index f0200e72c3bc2..e230e37e6d826 100644 --- a/server/src/main/java/org/opensearch/index/mapper/DerivedFieldType.java +++ b/server/src/main/java/org/opensearch/index/mapper/DerivedFieldType.java @@ -9,38 +9,50 @@ package org.opensearch.index.mapper; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.InetAddressPoint; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper; import org.apache.lucene.queries.spans.SpanQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.util.BytesRef; import org.opensearch.common.Nullable; import org.opensearch.common.geo.ShapeRelation; +import org.opensearch.common.network.InetAddresses; import org.opensearch.common.time.DateFormatter; import org.opensearch.common.time.DateMathParser; import org.opensearch.common.unit.Fuzziness; import org.opensearch.geometry.Geometry; import org.opensearch.index.analysis.IndexAnalyzers; import org.opensearch.index.analysis.NamedAnalyzer; +import org.opensearch.index.fielddata.IndexFieldData; import org.opensearch.index.query.DerivedFieldQuery; import org.opensearch.index.query.QueryShardContext; +import org.opensearch.script.AggregationScript; import org.opensearch.script.DerivedFieldScript; import org.opensearch.script.Script; +import org.opensearch.search.DocValueFormat; +import org.opensearch.search.lookup.LeafSearchLookup; import org.opensearch.search.lookup.SearchLookup; import java.io.IOException; +import java.net.InetAddress; import java.time.ZoneId; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; /** * MappedFieldType for Derived Fields * Contains logic to execute different type of queries on a derived field of given type. + * * @opensearch.internal */ @@ -49,6 +61,11 @@ public class DerivedFieldType extends MappedFieldType implements GeoShapeQueryab final FieldMapper typeFieldMapper; final Function indexableFieldGenerator; + @Override + public DocValueFormat docValueFormat(String format, ZoneId timeZone) { + return typeFieldMapper.mappedFieldType.docValueFormat(format, timeZone); + } + public DerivedFieldType( DerivedField derivedField, boolean isIndexed, @@ -134,6 +151,11 @@ public DerivedFieldValueFetcher valueFetcher(QueryShardContext context, SearchLo ); } + @Override + public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName, Supplier searchLookup) { + return getFieldMapper().mappedFieldType.fielddataBuilder(fullyQualifiedIndexName, searchLookup); + } + @Override public Query termQuery(Object value, QueryShardContext context) { Query query = typeFieldMapper.mappedFieldType.termQuery(value, context); @@ -503,7 +525,7 @@ public Query existsQuery(QueryShardContext context) { @Override public boolean isAggregatable() { - return false; + return true; } private Query createConjuctionQuery(Query filterQuery, DerivedFieldQuery derivedFieldQuery) { @@ -529,4 +551,55 @@ public static DerivedFieldScript.LeafFactory getDerivedFieldLeafFactory( DerivedFieldScript.Factory factory = context.compile(script, DerivedFieldScript.CONTEXT); return factory.newFactory(script.getParams(), searchLookup); } + + public AggregationScript.LeafFactory getAggregationScript(QueryShardContext context) { + return new AggregationScript.LeafFactory() { + @Override + public AggregationScript newInstance(LeafReaderContext ctx) throws IOException { + final DerivedFieldValueFetcher derivedFieldValueFetcher = valueFetcher(context, context.lookup(), null); + derivedFieldValueFetcher.setNextReader(ctx); + final LeafSearchLookup leafSearchLookup = context.lookup().getLeafSearchLookup(ctx); + + return new AggregationScript(derivedField.getScript().getParams(), context.lookup(), ctx) { + @Override + public Object execute() { + return formatValues(derivedFieldValueFetcher.fetchValuesInternal(leafSearchLookup.source())); + } + + @Override + public void setDocument(int docid) { + super.setDocument(docid); + leafSearchLookup.source().setSegmentAndDocument(ctx, docid); + } + }; + } + + @Override + public boolean needs_score() { + return false; + } + }; + } + + // perform any formatting on the returned Object before passing to + // any values source. + private Object formatValues(List objects) { + // ips are returned as raw strings, format them as BytesRefs + // This ensures that ip_range aggs compare the bytesRef against ranges computed in the + // same way. + if (typeFieldMapper instanceof IpFieldMapper) { + return objects.stream().map(o -> (String) o).map(this::toBytesRef).collect(Collectors.toList()); + } + return objects; + } + + // format the ip string as BytesRef. + private BytesRef toBytesRef(String ip) { + if (ip == null) { + return null; + } + InetAddress address = InetAddresses.forString(ip); + byte[] bytes = InetAddressPoint.encode(address); + return new BytesRef(bytes); + } } diff --git a/server/src/main/java/org/opensearch/index/mapper/ObjectDerivedFieldType.java b/server/src/main/java/org/opensearch/index/mapper/ObjectDerivedFieldType.java index 7e5c9a3f3da93..3d0165f702fda 100644 --- a/server/src/main/java/org/opensearch/index/mapper/ObjectDerivedFieldType.java +++ b/server/src/main/java/org/opensearch/index/mapper/ObjectDerivedFieldType.java @@ -91,21 +91,22 @@ public DerivedFieldValueFetcher valueFetcher(QueryShardContext context, SearchLo derivedField.getFormat() != null ? DateFormatter.forPattern(derivedField.getFormat()) : null ); - Function valueForDisplayUpdated = derivedField.getType().equals(DerivedFieldSupportedTypes.DATE.getName()) ? (o -> { + Function dateFormatter = derivedField.getType().equals(DerivedFieldSupportedTypes.DATE.getName()) ? (o -> { // this is needed to support date type for nested fields as they are required to be converted to long if (o instanceof String) { - return valueForDisplay.apply(((DateFieldMapper) typeFieldMapper).fieldType().parse((String) o)); + return ((DateFieldMapper) typeFieldMapper).fieldType().parse((String) o); } else { - return valueForDisplay.apply(o); + return o; } - }) : valueForDisplay; + }) : null; String subFieldName = name().substring(name().indexOf(".") + 1); return new ObjectDerivedFieldValueFetcher( subFieldName, getDerivedFieldLeafFactory(derivedField.getScript(), context, searchLookup == null ? context.lookup() : searchLookup), - valueForDisplayUpdated, - derivedField.getIgnoreMalformed() + valueForDisplay, + derivedField.getIgnoreMalformed(), + dateFormatter ); } @@ -115,6 +116,8 @@ static class ObjectDerivedFieldValueFetcher extends DerivedFieldValueFetcher { // TODO add it as part of index setting? private final boolean ignoreOnMalFormed; + private final Function dateFormatter; + ObjectDerivedFieldValueFetcher( String subField, DerivedFieldScript.LeafFactory derivedFieldScriptFactory, @@ -124,6 +127,20 @@ static class ObjectDerivedFieldValueFetcher extends DerivedFieldValueFetcher { super(derivedFieldScriptFactory, valueForDisplay); this.subField = subField; this.ignoreOnMalFormed = ignoreOnMalFormed; + this.dateFormatter = null; + } + + ObjectDerivedFieldValueFetcher( + String subField, + DerivedFieldScript.LeafFactory derivedFieldScriptFactory, + Function valueForDisplay, + boolean ignoreOnMalFormed, + Function dateFormatter + ) { + super(derivedFieldScriptFactory, valueForDisplay); + this.subField = subField; + this.ignoreOnMalFormed = ignoreOnMalFormed; + this.dateFormatter = dateFormatter; } @Override @@ -140,7 +157,7 @@ public List fetchValuesInternal(SourceLookup lookup) { if (nestedFieldObj instanceof List) { result.addAll((List) nestedFieldObj); } else { - result.add(nestedFieldObj); + result.add(dateFormatter != null ? dateFormatter.apply(nestedFieldObj) : nestedFieldObj); } } catch (OpenSearchParseException e) { if (!ignoreOnMalFormed) { diff --git a/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSourceConfig.java b/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSourceConfig.java index d006b15df327c..b6c8fe5d4802c 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSourceConfig.java +++ b/server/src/main/java/org/opensearch/search/aggregations/support/ValuesSourceConfig.java @@ -36,6 +36,7 @@ import org.opensearch.index.fielddata.IndexFieldData; import org.opensearch.index.fielddata.IndexGeoPointFieldData; import org.opensearch.index.fielddata.IndexNumericFieldData; +import org.opensearch.index.mapper.DerivedFieldType; import org.opensearch.index.mapper.MappedFieldType; import org.opensearch.index.mapper.RangeFieldMapper; import org.opensearch.index.query.QueryShardContext; @@ -183,6 +184,12 @@ private static ValuesSourceConfig internalResolve( valuesSourceType = defaultValueSourceType; } DocValueFormat docValueFormat = resolveFormat(format, valuesSourceType, timeZone, fieldType); + + // If we are aggregating on derived field set the agg script. + if (fieldType instanceof DerivedFieldType) { + aggregationScript = ((DerivedFieldType) fieldType).getAggregationScript(context); + } + config = new ValuesSourceConfig( valuesSourceType, fieldContext, @@ -336,7 +343,7 @@ private ValuesSource ConstructValuesSource(Object missing, DocValueFormat format if (this.unmapped) { vs = valueSourceType().getEmpty(); } else { - if (fieldContext() == null) { + if (fieldContext() == null || fieldType() instanceof DerivedFieldType) { // Script case vs = valueSourceType().getScript(script(), scriptValueType()); } else { diff --git a/server/src/main/java/org/opensearch/search/aggregations/support/values/ScriptBytesValues.java b/server/src/main/java/org/opensearch/search/aggregations/support/values/ScriptBytesValues.java index 349bd8e14edf6..30f7494ea2d18 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/support/values/ScriptBytesValues.java +++ b/server/src/main/java/org/opensearch/search/aggregations/support/values/ScriptBytesValues.java @@ -32,6 +32,7 @@ package org.opensearch.search.aggregations.support.values; import org.apache.lucene.search.Scorable; +import org.apache.lucene.util.BytesRef; import org.opensearch.common.lucene.ScorerAware; import org.opensearch.core.common.util.CollectionUtils; import org.opensearch.index.fielddata.SortedBinaryDocValues; @@ -61,7 +62,11 @@ private void set(int i, Object o) { values[i].clear(); } else { CollectionUtils.ensureNoSelfReferences(o, "ScriptBytesValues value"); - values[i].copyChars(o.toString()); + if (o instanceof BytesRef) { + values[i].copyBytes((BytesRef) o); + } else { + values[i].copyChars(o.toString()); + } } } diff --git a/server/src/test/java/org/opensearch/index/mapper/DerivedFieldTypeTests.java b/server/src/test/java/org/opensearch/index/mapper/DerivedFieldTypeTests.java index f65acd0db0627..fe9db24f494ad 100644 --- a/server/src/test/java/org/opensearch/index/mapper/DerivedFieldTypeTests.java +++ b/server/src/test/java/org/opensearch/index/mapper/DerivedFieldTypeTests.java @@ -15,14 +15,30 @@ import org.apache.lucene.document.LatLonPoint; import org.apache.lucene.document.LongField; import org.apache.lucene.document.LongPoint; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.util.BytesRef; import org.opensearch.OpenSearchException; import org.opensearch.common.collect.Tuple; +import org.opensearch.common.network.InetAddresses; +import org.opensearch.index.query.QueryShardContext; +import org.opensearch.script.AggregationScript; import org.opensearch.script.Script; +import org.opensearch.search.lookup.LeafSearchLookup; +import org.opensearch.search.lookup.SearchLookup; +import org.opensearch.search.lookup.SourceLookup; +import java.io.IOException; import java.util.List; import static org.apache.lucene.index.IndexOptions.NONE; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.eq; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; public class DerivedFieldTypeTests extends FieldTypeTestCase { @@ -100,4 +116,57 @@ public void testObjectType() { public void testUnsupportedType() { expectThrows(IllegalArgumentException.class, () -> createDerivedFieldType("match_only_text")); } + + public void testGetAggregationScript_keyword() throws IOException { + DerivedFieldType dft = spy(createDerivedFieldType("keyword")); + assertTrue(dft.isAggregatable()); + QueryShardContext mockContext = mock(QueryShardContext.class); + List expected = List.of("foo"); + mockValueFetcherForAggs(mockContext, dft, expected); + + AggregationScript.LeafFactory aggregationScript = dft.getAggregationScript(mockContext); + // have to use a memoryIndex because we can't mock leafReaderContext + MemoryIndex index = new MemoryIndex(); + LeafReaderContext leafReaderContext = index.createSearcher().getIndexReader().leaves().get(0); + AggregationScript script = aggregationScript.newInstance(leafReaderContext); + + Object result = script.execute(); + assertEquals(expected, result); + } + + public void testGetAggregationScript_ip() throws IOException { + DerivedFieldType dft = spy(createDerivedFieldType("ip")); + assertTrue(dft.isAggregatable()); + QueryShardContext mockContext = mock(QueryShardContext.class); + List expected = List.of("192.168.0.1"); + LeafSearchLookup leafSearchLookup = mockValueFetcherForAggs(mockContext, dft, expected); + SourceLookup sourceLookup = mock(SourceLookup.class); + when(leafSearchLookup.source()).thenReturn(sourceLookup); + AggregationScript.LeafFactory aggregationScript = dft.getAggregationScript(mockContext); + assertFalse(aggregationScript.needs_score()); + // have to use a memoryIndex because we can't mock leafReaderContext + MemoryIndex index = new MemoryIndex(); + LeafReaderContext leafReaderContext = index.createSearcher().getIndexReader().leaves().get(0); + AggregationScript script = aggregationScript.newInstance(leafReaderContext); + + // test setDocument + int docid = 1; + script.setDocument(docid); + verify(sourceLookup, times(1)).setSegmentAndDocument(any(), eq(docid)); + + // test execute + List result = (List) script.execute(); + assertEquals(new BytesRef(InetAddressPoint.encode(InetAddresses.forString((String) expected.get(0)))), result.get(0)); + } + + private static LeafSearchLookup mockValueFetcherForAggs(QueryShardContext mockContext, DerivedFieldType dft, List expected) { + SearchLookup searchLookup = mock(SearchLookup.class); + LeafSearchLookup leafLookup = mock(LeafSearchLookup.class); + when(searchLookup.getLeafSearchLookup(any())).thenReturn(leafLookup); + when(mockContext.lookup()).thenReturn(searchLookup); + DerivedFieldValueFetcher valueFetcher = mock(DerivedFieldValueFetcher.class); + when(valueFetcher.fetchValuesInternal(any())).thenReturn(expected); + doReturn(valueFetcher).when(dft).valueFetcher(any(), any(), any()); + return leafLookup; + } } diff --git a/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/DerivedFieldAggregationTests.java b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/DerivedFieldAggregationTests.java new file mode 100644 index 0000000000000..2fb65d7fe3c46 --- /dev/null +++ b/server/src/test/java/org/opensearch/search/aggregations/bucket/terms/DerivedFieldAggregationTests.java @@ -0,0 +1,146 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.search.aggregations.bucket.terms; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.KeywordField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.opensearch.Version; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.index.Index; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.mapper.DerivedField; +import org.opensearch.index.mapper.DerivedFieldResolver; +import org.opensearch.index.mapper.DerivedFieldResolverFactory; +import org.opensearch.index.mapper.DerivedFieldType; +import org.opensearch.index.mapper.DerivedFieldValueFetcher; +import org.opensearch.index.mapper.KeywordFieldMapper; +import org.opensearch.index.mapper.MappedFieldType; +import org.opensearch.index.mapper.MapperService; +import org.opensearch.index.query.QueryShardContext; +import org.opensearch.script.DerivedFieldScript; +import org.opensearch.script.Script; +import org.opensearch.search.aggregations.AggregatorTestCase; +import org.opensearch.search.lookup.LeafSearchLookup; +import org.opensearch.search.lookup.SearchLookup; +import org.opensearch.search.lookup.SourceLookup; +import org.junit.Before; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.when; + +public class DerivedFieldAggregationTests extends AggregatorTestCase { + + private QueryShardContext mockContext; + private List docs; + + private static final String[][] raw_requests = new String[][] { + { "40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0", "200", "40.135.0.0" }, + { "232.0.0.0 GET /images/hm_bg.jpg HTTP/1.0", "400", "232.0.0.0" }, + { "26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0", "200", "26.1.0.0" }, + { "247.37.0.0 GET /french/splash_inet.html HTTP/1.0", "400", "247.37.0.0" }, + { "247.37.0.0 GET /french/splash_inet.html HTTP/1.0", "400", "247.37.0.0" }, + { "247.37.0.0 GET /french/splash_inet.html HTTP/1.0", "200", "247.37.0.0" } }; + + @Before + public void init() { + super.initValuesSourceRegistry(); + // Create a mock QueryShardContext + mockContext = mock(QueryShardContext.class); + when(mockContext.index()).thenReturn(new Index("test_index", "uuid")); + when(mockContext.allowExpensiveQueries()).thenReturn(true); + + MapperService mockMapperService = mock(MapperService.class); + when(mockContext.getMapperService()).thenReturn(mockMapperService); + Settings indexSettings = Settings.builder() + .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + .build(); + // Mock IndexSettings + IndexSettings mockIndexSettings = new IndexSettings( + IndexMetadata.builder("test_index").settings(indexSettings).build(), + Settings.EMPTY + ); + when(mockMapperService.getIndexSettings()).thenReturn(mockIndexSettings); + when(mockContext.getIndexSettings()).thenReturn(mockIndexSettings); + docs = new ArrayList<>(); + for (String[] request : raw_requests) { + Document document = new Document(); + document.add(new TextField("raw_request", request[0], Field.Store.YES)); + document.add(new KeywordField("status", request[1], Field.Store.YES)); + docs.add(document); + } + } + + public void testSimpleTermsAggregationWithDerivedField() throws IOException { + MappedFieldType keywordFieldType = new KeywordFieldMapper.KeywordFieldType("status"); + + SearchLookup searchLookup = mock(SearchLookup.class); + SourceLookup sourceLookup = new SourceLookup(); + LeafSearchLookup leafLookup = mock(LeafSearchLookup.class); + when(leafLookup.source()).thenReturn(sourceLookup); + + // Mock DerivedFieldScript.Factory + DerivedFieldScript.Factory factory = (params, lookup) -> (DerivedFieldScript.LeafFactory) ctx -> { + when(searchLookup.getLeafSearchLookup(any())).thenReturn(leafLookup); + return new DerivedFieldScript(params, lookup, ctx) { + @Override + public void execute() { + addEmittedValue(raw_requests[sourceLookup.docId()][1]); + } + + @Override + public void setDocument(int docid) { + sourceLookup.setSegmentAndDocument(ctx, docid); + } + }; + }; + + DerivedField derivedField = new DerivedField("derived_field", "keyword", new Script("")); + DerivedFieldResolver resolver = DerivedFieldResolverFactory.createResolver( + mockContext, + Collections.emptyMap(), + Collections.singletonList(derivedField), + true + ); + + // spy on the resolved type so we can mock the valuefetcher + DerivedFieldType derivedFieldType = spy((DerivedFieldType) resolver.resolve("derived_field")); + DerivedFieldScript.LeafFactory leafFactory = factory.newFactory((new Script("")).getParams(), searchLookup); + DerivedFieldValueFetcher valueFetcher = new DerivedFieldValueFetcher(leafFactory, null); + doReturn(valueFetcher).when(derivedFieldType).valueFetcher(any(), any(), any()); + + TermsAggregationBuilder aggregationBuilder = new TermsAggregationBuilder("derived_terms").field("status").size(10); + + testCase(aggregationBuilder, new MatchAllDocsQuery(), iw -> { + for (Document d : docs) { + iw.addDocument(d); + } + }, (InternalTerms result) -> { + assertEquals(2, result.getBuckets().size()); + List buckets = result.getBuckets(); + assertEquals("200", buckets.get(0).getKey()); + assertEquals(3, buckets.get(0).getDocCount()); + assertEquals("400", buckets.get(1).getKey()); + assertEquals(3, buckets.get(1).getDocCount()); + }, keywordFieldType, derivedFieldType); + } +} diff --git a/server/src/test/java/org/opensearch/search/aggregations/support/ValuesSourceConfigTests.java b/server/src/test/java/org/opensearch/search/aggregations/support/ValuesSourceConfigTests.java index 33d9a63f61a35..568c3c950f588 100644 --- a/server/src/test/java/org/opensearch/search/aggregations/support/ValuesSourceConfigTests.java +++ b/server/src/test/java/org/opensearch/search/aggregations/support/ValuesSourceConfigTests.java @@ -37,9 +37,12 @@ import org.apache.lucene.util.BytesRef; import org.opensearch.action.support.WriteRequest; import org.opensearch.common.settings.Settings; +import org.opensearch.common.xcontent.XContentFactory; +import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.index.IndexService; import org.opensearch.index.engine.Engine; import org.opensearch.index.fielddata.SortedBinaryDocValues; +import org.opensearch.index.mapper.MapperService; import org.opensearch.index.query.QueryShardContext; import org.opensearch.test.OpenSearchSingleNodeTestCase; @@ -334,4 +337,40 @@ public void testFieldAlias() throws Exception { assertEquals(new BytesRef("value"), values.nextValue()); } } + + public void testDerivedField() throws Exception { + String script = "derived_field_script"; + String derived_field = "derived_keyword"; + + XContentBuilder mapping = XContentFactory.jsonBuilder() + .startObject() + .startObject("derived") + .startObject(derived_field) + .field("type", "keyword") + .startObject("script") + .field("source", script) + .field("lang", "mockscript") + .endObject() + .endObject() + .endObject() + .endObject(); + IndexService indexService = createIndex("index", Settings.EMPTY, MapperService.SINGLE_MAPPING_NAME, mapping); + client().prepareIndex("index").setId("1").setSource("field", "value").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE).get(); + + try (Engine.Searcher searcher = indexService.getShard(0).acquireSearcher("test")) { + QueryShardContext context = indexService.newQueryShardContext(0, searcher, () -> 42L, null); + ValuesSourceConfig config = ValuesSourceConfig.resolve( + context, + null, + derived_field, + null, + null, + null, + null, + CoreValuesSourceType.BYTES + ); + assertNotNull(script); + assertEquals(ValuesSource.Bytes.Script.class, config.getValuesSource().getClass()); + } + } }