From 0332408c9240af8864cc5ba19340a73d0db718fb Mon Sep 17 00:00:00 2001 From: Prudhvi Godithi Date: Sat, 26 Oct 2024 11:53:11 -0700 Subject: [PATCH 1/2] Add custom synonym_analyzer Signed-off-by: Prudhvi Godithi --- .../gradle/testclusters/OpenSearchNode.java | 12 ++- .../common/CommonAnalysisModulePlugin.java | 33 ++++++- .../SynonymGraphTokenFilterFactory.java | 11 ++- .../common/SynonymTokenFilterFactory.java | 28 +++++- .../common/CommonAnalysisFactoryTests.java | 42 ++++++++ .../common/SynonymsAnalysisTests.java | 97 +++++++++++++++++-- .../indices/analysis/AnalysisModule.java | 7 +- .../opensearch/plugins/AnalysisPlugin.java | 9 ++ .../indices/analysis/AnalysisModuleTests.java | 56 +++++++++++ .../analysis/AnalysisFactoryTestCase.java | 11 +++ 10 files changed, 288 insertions(+), 18 deletions(-) diff --git a/buildSrc/src/main/java/org/opensearch/gradle/testclusters/OpenSearchNode.java b/buildSrc/src/main/java/org/opensearch/gradle/testclusters/OpenSearchNode.java index cd22560af9a96..bb409c2afd871 100644 --- a/buildSrc/src/main/java/org/opensearch/gradle/testclusters/OpenSearchNode.java +++ b/buildSrc/src/main/java/org/opensearch/gradle/testclusters/OpenSearchNode.java @@ -1216,14 +1216,18 @@ private void createConfiguration() { ); final List configFiles; - try (Stream stream = Files.list(getDistroDir().resolve("config"))) { + try (Stream stream = Files.walk(getDistroDir().resolve("config"))) { configFiles = stream.collect(Collectors.toList()); } logToProcessStdout("Copying additional config files from distro " + configFiles); for (Path file : configFiles) { - Path dest = configFile.getParent().resolve(file.getFileName()); - if (Files.exists(dest) == false) { - Files.copy(file, dest); + Path relativePath = getDistroDir().resolve("config").relativize(file); + Path dest = configFile.getParent().resolve(relativePath); + if (Files.isDirectory(file)) { + Files.createDirectories(dest); + } else { + Files.createDirectories(dest.getParent()); + Files.copy(file, dest, StandardCopyOption.REPLACE_EXISTING); } } } catch (IOException e) { diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index f14e499081ce9..763c1783c2d28 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -146,6 +146,7 @@ import org.opensearch.index.analysis.PreConfiguredTokenizer; import org.opensearch.index.analysis.TokenFilterFactory; import org.opensearch.index.analysis.TokenizerFactory; +import org.opensearch.indices.analysis.AnalysisModule; import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.opensearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; import org.opensearch.plugins.AnalysisPlugin; @@ -332,8 +333,6 @@ public Map> getTokenFilters() { filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new); filters.put("stemmer_override", requiresAnalysisSettings(StemmerOverrideTokenFilterFactory::new)); filters.put("stemmer", StemmerTokenFilterFactory::new); - filters.put("synonym", requiresAnalysisSettings(SynonymTokenFilterFactory::new)); - filters.put("synonym_graph", requiresAnalysisSettings(SynonymGraphTokenFilterFactory::new)); filters.put("trim", TrimTokenFilterFactory::new); filters.put("truncate", requiresAnalysisSettings(TruncateTokenFilterFactory::new)); filters.put("unique", UniqueTokenFilterFactory::new); @@ -343,6 +342,36 @@ public Map> getTokenFilters() { return filters; } + @Override + public Map> getTokenFilters(AnalysisModule analysisModule) { + Map> filters = getTokenFilters(); + filters.put( + "synonym", + requiresAnalysisSettings( + (indexSettings, environment, name, settings) -> new SynonymTokenFilterFactory( + indexSettings, + environment, + name, + settings, + analysisModule.getAnalysisRegistry() + ) + ) + ); + filters.put( + "synonym_graph", + requiresAnalysisSettings( + (indexSettings, environment, name, settings) -> new SynonymGraphTokenFilterFactory( + indexSettings, + environment, + name, + settings, + analysisModule.getAnalysisRegistry() + ) + ) + ); + return filters; + } + @Override public Map> getCharFilters() { Map> filters = new TreeMap<>(); diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java index fed959108c411..c2e20e99473de 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymGraphTokenFilterFactory.java @@ -40,6 +40,7 @@ import org.opensearch.env.Environment; import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.AnalysisMode; +import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.index.analysis.CharFilterFactory; import org.opensearch.index.analysis.TokenFilterFactory; import org.opensearch.index.analysis.TokenizerFactory; @@ -49,8 +50,14 @@ public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory { - SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { - super(indexSettings, env, name, settings); + SynonymGraphTokenFilterFactory( + IndexSettings indexSettings, + Environment env, + String name, + Settings settings, + AnalysisRegistry analysisRegistry + ) { + super(indexSettings, env, name, settings, analysisRegistry); } @Override diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java index 01a65e87d7466..86417a579628c 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java @@ -44,11 +44,13 @@ import org.opensearch.index.analysis.AbstractTokenFilterFactory; import org.opensearch.index.analysis.Analysis; import org.opensearch.index.analysis.AnalysisMode; +import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.index.analysis.CharFilterFactory; import org.opensearch.index.analysis.CustomAnalyzer; import org.opensearch.index.analysis.TokenFilterFactory; import org.opensearch.index.analysis.TokenizerFactory; +import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.List; @@ -64,8 +66,16 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { protected final Settings settings; protected final Environment environment; protected final AnalysisMode analysisMode; - - SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + private final String synonymAnalyzer; + private final AnalysisRegistry analysisRegistry; + + SynonymTokenFilterFactory( + IndexSettings indexSettings, + Environment env, + String name, + Settings settings, + AnalysisRegistry analysisRegistry + ) { super(indexSettings, name, settings); this.settings = settings; @@ -83,6 +93,8 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { boolean updateable = settings.getAsBoolean("updateable", false); this.analysisMode = updateable ? AnalysisMode.SEARCH_TIME : AnalysisMode.ALL; this.environment = env; + this.synonymAnalyzer = settings.get("synonym_analyzer", null); + this.analysisRegistry = analysisRegistry; } @Override @@ -137,6 +149,17 @@ Analyzer buildSynonymAnalyzer( List tokenFilters, Function allFilters ) { + if (synonymAnalyzer != null) { + Analyzer customSynonymAnalyzer; + try { + customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzer); + } catch (IOException e) { + throw new RuntimeException(e); + } + if (customSynonymAnalyzer != null) { + return customSynonymAnalyzer; + } + } return new CustomAnalyzer( tokenizer, charFilters.toArray(new CharFilterFactory[0]), @@ -177,5 +200,4 @@ Reader getRulesFromSettings(Environment env) { } return rulesReader; } - } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index 7e3140f8bcba3..baad489d33649 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -39,12 +39,16 @@ import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.apache.lucene.analysis.te.TeluguNormalizationFilterFactory; import org.apache.lucene.analysis.te.TeluguStemFilterFactory; +import org.opensearch.index.analysis.TokenFilterFactory; import org.opensearch.indices.analysis.AnalysisFactoryTestCase; +import org.opensearch.indices.analysis.AnalysisModule; import java.util.List; import java.util.Map; import java.util.TreeMap; +import org.mockito.Mock; + import static java.util.Collections.emptyList; import static java.util.stream.Collectors.toList; @@ -53,6 +57,9 @@ public CommonAnalysisFactoryTests() { super(new CommonAnalysisModulePlugin()); } + @Mock + private AnalysisModule analysisModule; + @Override protected Map> getTokenizers() { Map> tokenizers = new TreeMap<>(super.getTokenizers()); @@ -302,4 +309,39 @@ private void markedTestCase(String name, Map> map) { unmarked ); } + + /** + * Tests the getTokenFilters(AnalysisModule) method to verify: + * 1. All token filters are properly loaded + * 2. Basic filters remain available + * 3. Synonym filters are added when AnalysisModule is provided + * 4. The total number of filters is correct (base filters + synonym filters) + */ + public void testGetTokenFiltersWithAnalysisModule() { + CommonAnalysisModulePlugin plugin = (CommonAnalysisModulePlugin) getAnalysisPlugin(); + Map> filters = plugin.getTokenFilters(analysisModule); + assertNotNull("Token filters should not be null", filters); + assertTrue("Should contain basic filters", filters.containsKey("lowercase")); + assertTrue("Should contain synonym filter", filters.containsKey("synonym")); + assertTrue("Should contain synonym_graph filter", filters.containsKey("synonym_graph")); + Map> baseFilters = plugin.getTokenFilters(); + assertEquals("Should contain additional synonym filters", baseFilters.size() + 2, filters.size()); + } + + /** + * Tests that synonym-related token filters are only available when an AnalysisModule is provided. + * This test verifies that: + * 1. Base getTokenFilters() does not include synonym filters + * 2. Extended getTokenFilters(AnalysisModule) includes synonym filters + * 3. Both synonym and synonym_graph filters require AnalysisModule + */ + public void testSynonymFiltersRequireAnalysisModule() { + CommonAnalysisModulePlugin plugin = (CommonAnalysisModulePlugin) getAnalysisPlugin(); + Map> baseFilters = plugin.getTokenFilters(); + Map> extendedFilters = plugin.getTokenFilters(analysisModule); + assertFalse("Base filters should not contain synonym filter", baseFilters.containsKey("synonym")); + assertTrue("Extended filters should contain synonym filter", extendedFilters.containsKey("synonym")); + assertFalse("Base filters should not contain synonym_graph filter", baseFilters.containsKey("synonym_graph")); + assertTrue("Extended filters should contain synonym_graph filter", extendedFilters.containsKey("synonym_graph")); + } } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java index 8c8b8ac7f61c0..e7b378701c166 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java @@ -41,11 +41,14 @@ import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.settings.Settings; import org.opensearch.env.Environment; +import org.opensearch.env.TestEnvironment; import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AnalysisRegistry; import org.opensearch.index.analysis.IndexAnalyzers; import org.opensearch.index.analysis.PreConfiguredTokenFilter; import org.opensearch.index.analysis.TokenFilterFactory; import org.opensearch.index.analysis.TokenizerFactory; +import org.opensearch.indices.analysis.AnalysisModule; import org.opensearch.test.IndexSettingsModule; import org.opensearch.test.OpenSearchTestCase; import org.opensearch.test.VersionUtils; @@ -63,6 +66,7 @@ import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.startsWith; +import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents; public class SynonymsAnalysisTests extends OpenSearchTestCase { private IndexAnalyzers indexAnalyzers; @@ -255,14 +259,18 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException { .put("hyphenation_patterns_path", "foo") .build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + Environment environment = TestEnvironment.newEnvironment(settings); + // Initialize analysis registry + AnalysisModule module = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin())); + AnalysisRegistry analysisRegistry = module.getAnalysisRegistry(); String[] bypassingFactories = new String[] { "dictionary_decompounder" }; CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin(); for (String factory : bypassingFactories) { - TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings); - TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); - SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, environment, factory, settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry); Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null); try (TokenStream ts = analyzer.tokenStream("field", "text")) { @@ -319,7 +327,13 @@ public void testDisallowedTokenFilters() throws IOException { .putList("common_words", "a", "b") .put("output_unigrams", "true") .build(); + + Environment environment = TestEnvironment.newEnvironment(settings); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); + + // Create analysis module + AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin())); + AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry(); CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin(); String[] disallowedFactories = new String[] { @@ -333,9 +347,9 @@ public void testDisallowedTokenFilters() throws IOException { "fingerprint" }; for (String factory : disallowedFactories) { - TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings); - TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings); - SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings); + TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, environment, factory, settings); + TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings); + SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry); IllegalArgumentException e = expectThrows( IllegalArgumentException.class, @@ -362,4 +376,75 @@ private void match(String analyzerName, String source, String target) throws IOE MatcherAssert.assertThat(target, equalTo(sb.toString().trim())); } + /** + * Tests the integration of word delimiter and synonym graph filters with synonym_analyzer based on issue #16263. + * This test verifies the correct handling of: + * 1. Hyphenated words with word delimiter (e.g., "note-book" → ["notebook", "note", "book"]) + * 2. Multi-word synonyms (e.g., "mobile phone" → ["smartphone"]) + * 3. Single word synonyms (e.g., "laptop" → ["notebook"]) + * + * @see Issue #16263 + */ + public void testSynonymAnalyzerWithWordDelimiter() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) + .put("path.home", createTempDir().toString()) + .put("index.analysis.filter.custom_word_delimiter.type", "word_delimiter_graph") + .put("index.analysis.filter.custom_word_delimiter.generate_word_parts", true) + .put("index.analysis.filter.custom_word_delimiter.catenate_all", true) + .put("index.analysis.filter.custom_word_delimiter.split_on_numerics", false) + .put("index.analysis.filter.custom_word_delimiter.split_on_case_change", false) + .put("index.analysis.filter.custom_pattern_replace_filter.type", "pattern_replace") + .put("index.analysis.filter.custom_pattern_replace_filter.pattern", "(-)") + .put("index.analysis.filter.custom_pattern_replace_filter.replacement", " ") + .put("index.analysis.filter.custom_pattern_replace_filter.all", true) + .put("index.analysis.filter.custom_synonym_graph_filter.type", "synonym_graph") + .putList( + "index.analysis.filter.custom_synonym_graph_filter.synonyms", + "laptop => notebook", + "smartphone, mobile phone, cell phone => smartphone", + "tv, television => television" + ) + .put("index.analysis.filter.custom_synonym_graph_filter.synonym_analyzer", "standard") + .put("index.analysis.analyzer.text_en_index.type", "custom") + .put("index.analysis.analyzer.text_en_index.tokenizer", "whitespace") + .putList( + "index.analysis.analyzer.text_en_index.filter", + "lowercase", + "custom_word_delimiter", + "custom_synonym_graph_filter", + "custom_pattern_replace_filter", + "flatten_graph" + ) + .build(); + Environment environment = TestEnvironment.newEnvironment(settings); + IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings); + AnalysisModule module = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin())); + IndexAnalyzers analyzers = module.getAnalysisRegistry().build(indexSettings); + try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "note-book")) { + assertTokenStreamContents( + ts, + new String[] { "notebook", "note", "book" }, + new int[] { 0, 0, 5 }, + new int[] { 9, 4, 9 }, + new String[] { "word", "word", "word" }, + new int[] { 1, 0, 1 }, + new int[] { 2, 1, 1 } + ); + } + try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "mobile phone")) { + assertTokenStreamContents( + ts, + new String[] { "smartphone" }, + new int[] { 0 }, + new int[] { 12 }, + new String[] { "SYNONYM" }, + new int[] { 1 }, + new int[] { 1 } + ); + } + try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "laptop")) { + assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 }); + } + } } diff --git a/server/src/main/java/org/opensearch/indices/analysis/AnalysisModule.java b/server/src/main/java/org/opensearch/indices/analysis/AnalysisModule.java index 0926d497087d1..dbb3035a18f74 100644 --- a/server/src/main/java/org/opensearch/indices/analysis/AnalysisModule.java +++ b/server/src/main/java/org/opensearch/indices/analysis/AnalysisModule.java @@ -165,7 +165,12 @@ public boolean requiresAnalysisSettings() { ) ); - tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters); + for (AnalysisPlugin plugin : plugins) { + Map> filters = plugin.getTokenFilters(this); + for (Map.Entry> entry : filters.entrySet()) { + tokenFilters.register(entry.getKey(), entry.getValue()); + } + } return tokenFilters; } diff --git a/server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java b/server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java index 53dcc916b244f..a7c4604a30553 100644 --- a/server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java +++ b/server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java @@ -47,6 +47,7 @@ import org.opensearch.index.analysis.PreConfiguredTokenizer; import org.opensearch.index.analysis.TokenFilterFactory; import org.opensearch.index.analysis.TokenizerFactory; +import org.opensearch.indices.analysis.AnalysisModule; import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider; import java.io.IOException; @@ -84,6 +85,14 @@ default Map> getCharFilters() { return emptyMap(); } + /** + * Override to add additional {@link TokenFilter}s that need access to the AnalysisModule. + * The default implementation calls the existing getTokenFilters() method for backward compatibility. + */ + default Map> getTokenFilters(AnalysisModule analysisModule) { + return getTokenFilters(); + } + /** * Override to add additional {@link TokenFilter}s. See {@link #requiresAnalysisSettings(AnalysisProvider)} * how to on get the configuration from the index. diff --git a/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java b/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java index c9e26d6d6159a..625f79b26b01b 100644 --- a/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java +++ b/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java @@ -56,6 +56,8 @@ import org.opensearch.index.analysis.CustomAnalyzer; import org.opensearch.index.analysis.IndexAnalyzers; import org.opensearch.index.analysis.MyFilterTokenFilterFactory; +import org.opensearch.index.analysis.NameOrDefinition; +import org.opensearch.index.analysis.NamedAnalyzer; import org.opensearch.index.analysis.PreConfiguredCharFilter; import org.opensearch.index.analysis.PreConfiguredTokenFilter; import org.opensearch.index.analysis.PreConfiguredTokenizer; @@ -80,6 +82,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; @@ -521,4 +524,57 @@ public boolean incrementToken() throws IOException { } } + /** + * Tests registration and functionality of token filters that require access to the AnalysisModule. + * This test verifies: + * 1. Token filter registration using the extended getTokenFilters(AnalysisModule) method + * 2. Filter functionality in both predefined and custom analyzers + * 3. Proper access to AnalysisModule reference within filter factory creation + */ + public void testTokenFilterRegistrationWithModuleReference() throws IOException { + class TestPlugin implements AnalysisPlugin { + @Override + public Map> getTokenFilters(AnalysisModule module) { + return Map.of( + "test_filter", + (indexSettings, env, name, settings) -> AppendTokenFilter.factoryForSuffix("_" + module.hashCode()) + ); + } + } + Settings settings = Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT) + .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard") + .put("index.analysis.analyzer.my_analyzer.filter", "test_filter") + .build(); + Environment environment = TestEnvironment.newEnvironment(settings); + AnalysisModule module = new AnalysisModule(environment, singletonList(new TestPlugin())); + AnalysisRegistry registry = module.getAnalysisRegistry(); + IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", Settings.builder().put(settings).build()); + Map tokenFilterFactories = registry.buildTokenFilterFactories(indexSettings); + assertTrue("Token filter 'test_filter' should be registered", tokenFilterFactories.containsKey("test_filter")); + IndexAnalyzers analyzers = registry.build(indexSettings); + String testText = "test"; + TokenStream tokenStream = analyzers.get("my_analyzer").tokenStream("", testText); + CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + assertTrue("Should have found a token", tokenStream.incrementToken()); + assertEquals("Token should have expected suffix", "test_" + module.hashCode(), charTermAttribute.toString()); + assertFalse("Should not have additional tokens", tokenStream.incrementToken()); + tokenStream.close(); + NamedAnalyzer customAnalyzer = registry.buildCustomAnalyzer( + indexSettings, + false, + new NameOrDefinition("standard"), + Collections.emptyList(), + Collections.singletonList(new NameOrDefinition("test_filter")) + ); + tokenStream = customAnalyzer.tokenStream("", testText); + charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); + tokenStream.reset(); + assertTrue("Custom analyzer should produce a token", tokenStream.incrementToken()); + assertEquals("Custom analyzer token should have expected suffix", "test_" + module.hashCode(), charTermAttribute.toString()); + assertFalse("Custom analyzer should not produce additional tokens", tokenStream.incrementToken()); + tokenStream.close(); + } } diff --git a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java index 23cf4d47a49d9..ca23f67215f3d 100644 --- a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java @@ -248,6 +248,17 @@ public AnalysisFactoryTestCase(AnalysisPlugin plugin) { this.plugin = Objects.requireNonNull(plugin, "plugin is required. use an empty plugin for core"); } + /** + * Returns the AnalysisPlugin instance that was passed to this test case. + * This protected method allows subclasses to access the plugin for testing + * specific analysis components. + * + * @return The AnalysisPlugin instance used by this test case + */ + protected AnalysisPlugin getAnalysisPlugin() { + return plugin; + } + protected Map> getCharFilters() { return KNOWN_CHARFILTERS; } From 7d6fc4398907c5a172c7f7fe6b70a685a5dc6c26 Mon Sep 17 00:00:00 2001 From: Prudhvi Godithi Date: Sat, 2 Nov 2024 12:46:07 -0700 Subject: [PATCH 2/2] synonym_analyzer configuration setting Signed-off-by: Prudhvi Godithi --- CHANGELOG.md | 1 + .../gradle/testclusters/OpenSearchNode.java | 12 ++++------ .../common/CommonAnalysisModulePlugin.java | 8 +------ .../common/SynonymTokenFilterFactory.java | 8 +++---- .../common/CommonAnalysisFactoryTests.java | 22 +------------------ .../common/SynonymsAnalysisTests.java | 12 ++++------ .../opensearch/plugins/AnalysisPlugin.java | 2 +- .../indices/analysis/AnalysisModuleTests.java | 5 +---- 8 files changed, 17 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index edbf7c8ed065c..c875f440ad7f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add Setting to adjust the primary constraint weights ([#16471](https://github.com/opensearch-project/OpenSearch/pull/16471)) - Switch from `buildSrc/version.properties` to Gradle version catalog (`gradle/libs.versions.toml`) to enable dependabot to perform automated upgrades on common libs ([#16284](https://github.com/opensearch-project/OpenSearch/pull/16284)) - Add dynamic setting allowing size > 0 requests to be cached in the request cache ([#16483](https://github.com/opensearch-project/OpenSearch/pull/16483/files)) +- Add new configuration setting `synonym_analyzer`, to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)). ### Dependencies - Bump `com.azure:azure-storage-common` from 12.25.1 to 12.27.1 ([#16521](https://github.com/opensearch-project/OpenSearch/pull/16521)) diff --git a/buildSrc/src/main/java/org/opensearch/gradle/testclusters/OpenSearchNode.java b/buildSrc/src/main/java/org/opensearch/gradle/testclusters/OpenSearchNode.java index bb409c2afd871..cd22560af9a96 100644 --- a/buildSrc/src/main/java/org/opensearch/gradle/testclusters/OpenSearchNode.java +++ b/buildSrc/src/main/java/org/opensearch/gradle/testclusters/OpenSearchNode.java @@ -1216,18 +1216,14 @@ private void createConfiguration() { ); final List configFiles; - try (Stream stream = Files.walk(getDistroDir().resolve("config"))) { + try (Stream stream = Files.list(getDistroDir().resolve("config"))) { configFiles = stream.collect(Collectors.toList()); } logToProcessStdout("Copying additional config files from distro " + configFiles); for (Path file : configFiles) { - Path relativePath = getDistroDir().resolve("config").relativize(file); - Path dest = configFile.getParent().resolve(relativePath); - if (Files.isDirectory(file)) { - Files.createDirectories(dest); - } else { - Files.createDirectories(dest.getParent()); - Files.copy(file, dest, StandardCopyOption.REPLACE_EXISTING); + Path dest = configFile.getParent().resolve(file.getFileName()); + if (Files.exists(dest) == false) { + Files.copy(file, dest); } } } catch (IOException e) { diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index 763c1783c2d28..7f9437972a358 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -248,7 +248,7 @@ public Map>> getAn } @Override - public Map> getTokenFilters() { + public Map> getTokenFilters(AnalysisModule analysisModule) { Map> filters = new TreeMap<>(); filters.put("apostrophe", ApostropheFilterFactory::new); filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new); @@ -339,12 +339,6 @@ public Map> getTokenFilters() { filters.put("uppercase", UpperCaseTokenFilterFactory::new); filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new); - return filters; - } - - @Override - public Map> getTokenFilters(AnalysisModule analysisModule) { - Map> filters = getTokenFilters(); filters.put( "synonym", requiresAnalysisSettings( diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java index 86417a579628c..1cd78170e66c8 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/SynonymTokenFilterFactory.java @@ -66,7 +66,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { protected final Settings settings; protected final Environment environment; protected final AnalysisMode analysisMode; - private final String synonymAnalyzer; + private final String synonymAnalyzerName; private final AnalysisRegistry analysisRegistry; SynonymTokenFilterFactory( @@ -93,7 +93,7 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory { boolean updateable = settings.getAsBoolean("updateable", false); this.analysisMode = updateable ? AnalysisMode.SEARCH_TIME : AnalysisMode.ALL; this.environment = env; - this.synonymAnalyzer = settings.get("synonym_analyzer", null); + this.synonymAnalyzerName = settings.get("synonym_analyzer", null); this.analysisRegistry = analysisRegistry; } @@ -149,10 +149,10 @@ Analyzer buildSynonymAnalyzer( List tokenFilters, Function allFilters ) { - if (synonymAnalyzer != null) { + if (synonymAnalyzerName != null) { Analyzer customSynonymAnalyzer; try { - customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzer); + customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index baad489d33649..1f4faf53dced5 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -314,8 +314,7 @@ private void markedTestCase(String name, Map> map) { * Tests the getTokenFilters(AnalysisModule) method to verify: * 1. All token filters are properly loaded * 2. Basic filters remain available - * 3. Synonym filters are added when AnalysisModule is provided - * 4. The total number of filters is correct (base filters + synonym filters) + * 3. Synonym filters remain available when AnalysisModule is provided */ public void testGetTokenFiltersWithAnalysisModule() { CommonAnalysisModulePlugin plugin = (CommonAnalysisModulePlugin) getAnalysisPlugin(); @@ -324,24 +323,5 @@ public void testGetTokenFiltersWithAnalysisModule() { assertTrue("Should contain basic filters", filters.containsKey("lowercase")); assertTrue("Should contain synonym filter", filters.containsKey("synonym")); assertTrue("Should contain synonym_graph filter", filters.containsKey("synonym_graph")); - Map> baseFilters = plugin.getTokenFilters(); - assertEquals("Should contain additional synonym filters", baseFilters.size() + 2, filters.size()); - } - - /** - * Tests that synonym-related token filters are only available when an AnalysisModule is provided. - * This test verifies that: - * 1. Base getTokenFilters() does not include synonym filters - * 2. Extended getTokenFilters(AnalysisModule) includes synonym filters - * 3. Both synonym and synonym_graph filters require AnalysisModule - */ - public void testSynonymFiltersRequireAnalysisModule() { - CommonAnalysisModulePlugin plugin = (CommonAnalysisModulePlugin) getAnalysisPlugin(); - Map> baseFilters = plugin.getTokenFilters(); - Map> extendedFilters = plugin.getTokenFilters(analysisModule); - assertFalse("Base filters should not contain synonym filter", baseFilters.containsKey("synonym")); - assertTrue("Extended filters should contain synonym filter", extendedFilters.containsKey("synonym")); - assertFalse("Base filters should not contain synonym_graph filter", baseFilters.containsKey("synonym_graph")); - assertTrue("Extended filters should contain synonym_graph filter", extendedFilters.containsKey("synonym_graph")); } } diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java index e7b378701c166..33d92e01a85b1 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/SynonymsAnalysisTests.java @@ -260,15 +260,13 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException { .build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); Environment environment = TestEnvironment.newEnvironment(settings); - - // Initialize analysis registry - AnalysisModule module = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin())); - AnalysisRegistry analysisRegistry = module.getAnalysisRegistry(); + AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin())); + AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry(); String[] bypassingFactories = new String[] { "dictionary_decompounder" }; CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin(); for (String factory : bypassingFactories) { - TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, environment, factory, settings); + TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings); TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings); SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry); Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null); @@ -330,8 +328,6 @@ public void testDisallowedTokenFilters() throws IOException { Environment environment = TestEnvironment.newEnvironment(settings); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings); - - // Create analysis module AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin())); AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry(); CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin(); @@ -347,7 +343,7 @@ public void testDisallowedTokenFilters() throws IOException { "fingerprint" }; for (String factory : disallowedFactories) { - TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, environment, factory, settings); + TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings); TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings); SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry); diff --git a/server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java b/server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java index a7c4604a30553..58e43633777c9 100644 --- a/server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java +++ b/server/src/main/java/org/opensearch/plugins/AnalysisPlugin.java @@ -87,7 +87,7 @@ default Map> getCharFilters() { /** * Override to add additional {@link TokenFilter}s that need access to the AnalysisModule. - * The default implementation calls the existing getTokenFilters() method for backward compatibility. + * The default implementation for plugins that don't need AnalysisModule calls the existing getTokenFilters() method. */ default Map> getTokenFilters(AnalysisModule analysisModule) { return getTokenFilters(); diff --git a/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java b/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java index 625f79b26b01b..74bc987c44b15 100644 --- a/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java +++ b/server/src/test/java/org/opensearch/indices/analysis/AnalysisModuleTests.java @@ -526,10 +526,7 @@ public boolean incrementToken() throws IOException { /** * Tests registration and functionality of token filters that require access to the AnalysisModule. - * This test verifies: - * 1. Token filter registration using the extended getTokenFilters(AnalysisModule) method - * 2. Filter functionality in both predefined and custom analyzers - * 3. Proper access to AnalysisModule reference within filter factory creation + * This test verifies the token filter registration using the extended getTokenFilters(AnalysisModule) method */ public void testTokenFilterRegistrationWithModuleReference() throws IOException { class TestPlugin implements AnalysisPlugin {