
[BUG] Token Filter Order: word_delimiter_graph and synonym_graph #16263

Open · aswad1 opened this issue Oct 10, 2024 · 13 comments
Labels: bug, Other, v2.19.0

aswad1 commented Oct 10, 2024

Describe the bug

I am migrating from Solr to OpenSearch and need to maintain the same analyzer behavior that I had in Solr, where the word_delimiter_graph filter is applied before the synonym expansion.

In Solr, this order worked without issues, and I used FlattenGraphFilterFactory to handle token graph flattening after synonym processing. I need to maintain this specific order to keep consistent search behavior during the migration. Any guidance or suggestions would be greatly appreciated.

Solr schema:

  <analyzer type="query">
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="[({.,\[\]\“\”/})]" replacement=" " />
    <tokenizer class="solr.WhitespaceTokenizerFactory" />
    <filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict.txt" ignoreCase="true" />
    <!-- Move ASCII folding, lowercase, and Hunspell before synonyms so that clean singular terms are sent to the synonyms -->
    <filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true" />
    <filter class="solr.LowerCaseFilterFactory" />
    <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" preserveOriginal="0" catenateAll="1" splitOnCaseChange="0" />
    <filter class="solr.HunspellStemFilterFactory" dictionary="en-US.dic" affix="en-US.aff" ignoreCase="true" />
    <filter class="solr.ManagedSynonymGraphFilterFactory" managed="english" />
    <filter class="solr.PatternReplaceFilterFactory" pattern="(-)" replacement=" " replace="all" />
    <filter class="solr.FlattenGraphFilterFactory" /> <!-- required on index analyzers after graph filters -->
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
  </analyzer>
</fieldType>

OpenSearch mapping, where I am facing the error with the analyzers "text_en_index" and "text_en_query":

PUT /my-index-10
{
  "settings": {
    "analysis": {
      "char_filter": {
        "custom_pattern_replace": {
          "type": "pattern_replace",
          "pattern": "[({.,\\[\\]“”/})]",
          "replacement": " "
        }
      },
      "filter": {
        "custom_ascii_folding": {
          "type": "asciifolding",
          "preserve_original": true
        },
        "custom_word_delimiter": {
          "type": "word_delimiter_graph",
          "generate_word_parts": true,
          "preserve_original": false,
          "catenate_all": true,
          "split_on_numerics": false,
          "split_on_case_change": false
        },
          "custom_synonym_filter": {
            "type": "synonym",
            "synonyms_path": "analyzers/F198001551",
            "updateable": true
          },
         "custom_hunspell_stemmer": {
          "type": "hunspell",
          "locale": "en_US"
        },
        "custom_pattern_replace_filter":{
          "type": "pattern_replace",
          "pattern": "(-)",
          "replacement": " ",
          "all":true
        },
        "custom_stemmer_override":{
          "type": "keyword_marker",
          "keywords_path":"analyzers/F225794029",
          "ignore_case": true
        },
		    "custom_synonym_graph_filter":{
			   "type": "synonym_graph",
         "synonyms_path": "analyzers/F3495229"
		  }
      },
      "analyzer": {
        "text_en_index": {
          "type":"custom",
          "char_filter": ["custom_pattern_replace"],
          "tokenizer": "whitespace",
          "filter": [
            "custom_ascii_folding",
            "lowercase",
            "custom_word_delimiter",
            "custom_hunspell_stemmer",
            "custom_synonym_graph_filter",
            "custom_pattern_replace_filter",
            "flatten_graph"
          ]
        },
        "text_en_query": {
          "type":"custom",
          "char_filter": ["custom_pattern_replace"],
          "tokenizer": "whitespace",
          "filter": [
            "custom_stemmer_override",
            "custom_ascii_folding",
            "lowercase",
            "custom_word_delimiter",
            "custom_hunspell_stemmer",
            "custom_synonym_graph_filter",
            "custom_pattern_replace_filter",
            "flatten_graph"
          ]
        },
        "text_id_tx_class_id":{
          "tokenizer": "whitespace"
        },
         "text_general_index_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        },
        "text_general_query_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "custom_synonym_filter"
          ]
        },
        "en": {
          "tokenizer": "standard",
          "filter": [ "custom_hunspell_stemmer" ]
        },
		    "managed_synonym_analyzer":{
		      "tokenizer": "standard",
          "filter": [ "custom_synonym_graph_filter" ]
		}
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "id_tx":{
        "type": "text",
        "analyzer": "text_id_tx_class_id",
        "store": true,
        "index" :true
      },
      "description": {
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "store": true,
        "index" :true
      },
      "TM5":{
        "type":"keyword",
        "store": true,
        "index" :true
      },
      "status":{
        "type":"keyword",
        "store": true,
        "index" :true
      },
       "version":{
        "type":"keyword",
        "store": true,
        "index" :true
      },
      "class": {
        "type": "text",
        "analyzer": "text_general_index_analyzer",
        "search_analyzer": "text_general_query_analyzer",
        "store": true
      },
      "long class":{
        "type": "text",
        "analyzer": "text_general_index_analyzer",
        "search_analyzer": "text_general_query_analyzer",
        "store": true
      },
      "class_id":{
        "type":"text",
        "analyzer":"text_id_tx_class_id",
        "index":true,
        "store":true
      },
      "notes":{
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "store": true,
        "index" :true
      },
       "employee_notes":{
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "store": true,
        "index" :true
      },
      "editor_notes":{
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "store": true,
        "index" :true
      },
      "begin_effective_date":{
        "type":"date",
        "store":true,
        "index": true,
        "format":"strict_date_optional_time||epoch_millis"
      },
      "end_effective_date":{
        "type":"date",
        "store":true,
        "index": true,
        "format":"strict_date_optional_time||epoch_millis"
      },
      "goods_services":{
        "type":"keyword",
        "store":true,
        "index":true
      },
      "record_state":{
        "type":"keyword",
        "store":true,
        "index":true
      },
      "action_flag":{
        "type":"keyword",
        "store":true,
        "index":true
      },
      "creation_date":{
        "type":"date",
        "store":true,
        "index": true,
        "format":"strict_date_optional_time||epoch_millis"
      },
      "created_by":{
        "type":"keyword",
        "store":true,
        "index":true
      },
      "last_updated_date":{
        "type":"date",
        "store":true,
        "index": true,
        "format":"strict_date_optional_time||epoch_millis"
      },
      "last_updated_by":{
        "type":"keyword",
        "store":true,
        "index":true
      }
    }
  }
}

In OpenSearch, when I try to apply the word_delimiter_graph filter before the synonym_graph filter (as required by my use case), I receive the following error:

{
  "error": {
    "root_cause": [
      {
        "type": "illegal_argument_exception",
        "reason": "Token filter [custom_word_delimiter] cannot be used to parse synonyms"
      }
    ],
    "type": "illegal_argument_exception",
    "reason": "Token filter [custom_word_delimiter] cannot be used to parse synonyms"
  },
  "status": 400
}
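
For reference, the error can be reproduced with a much smaller settings body. A minimal sketch (the index name and the inline synonym rule below are made up for illustration; any chain that places word_delimiter_graph ahead of synonym_graph fails the same way, since the synonym rules are parsed with the filters preceding the synonym filter):

PUT /wdg-synonym-repro
{
  "settings": {
    "analysis": {
      "filter": {
        "custom_word_delimiter": {
          "type": "word_delimiter_graph",
          "generate_word_parts": true,
          "catenate_all": true
        },
        "custom_synonym_graph_filter": {
          "type": "synonym_graph",
          "synonyms": ["note book => notebook"]
        }
      },
      "analyzer": {
        "repro_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "custom_word_delimiter",
            "custom_synonym_graph_filter"
          ]
        }
      }
    }
  }
}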

Questions:

  1. Is there a known limitation in OpenSearch that prevents the word_delimiter_graph filter from being applied before the synonym_graph filter?
  2. Is there any recommended workaround or configuration that would allow me to maintain the same filter order while avoiding the token graph parsing error?

Related component

Other

To Reproduce

N/A

Expected behavior

N/A


@aswad1 aswad1 added bug Something isn't working untriaged labels Oct 10, 2024
@github-actions github-actions bot added the Other label Oct 10, 2024
@nupurjaiswal

Hi,
I tried the below approaches recommended by AWS support:

  1. Tried the flag "lenient": true with the synonym graph filter and word delimiter filter.
     Error: "Token filter [custom_word_delimiter] cannot be used to parse synonyms"

  2. Tried the multiplexer filter as below:

"multiplexer_filter": {
  "type": "multiplexer",
  "filters": ["custom_word_delimiter", "custom_hunspell_stemmer", "custom_synonym_graph_filter"]
}
I was able to create the index, but while indexing a document I got the error below:

  raise HTTP_EXCEPTIONS.get(status_code, TransportError)(
opensearchpy.exceptions.RequestError: RequestError(400, 'illegal_argument_exception', 'Increment must be zero or greater: -1')

which is already mentioned at https://discuss.elastic.co/t/word-delimiter-graph-token-synonym-graph-token/278942

  3. Tried the multiplexer and added "lenient": true to the synonym graph filter as below:

"multiplexer_filter": {
  "type": "multiplexer",
  "filters": ["custom_word_delimiter", "custom_hunspell_stemmer", "custom_synonym_graph_filter"]
},
"custom_synonym_graph_filter": {
  "type": "synonym_graph",
  "synonyms_path": "analyzers/F3495229",
  "lenient": true
}

I was able to create the index and finish indexing, but when I tried querying I wasn't getting the correct result.

Result:

POST /my-index-12/_analyze
{
  "analyzer": "text_en_index",
  "text": "note-book"
}

{
  "tokens": [
    {
      "token": "note book",
      "start_offset": 0,
      "end_offset": 9,
      "type": "word",
      "position": 0
    }
  ]
}

Expected result:

{
  "tokens": [
    {
      "token": "notebook",
      "start_offset": 0,
      "end_offset": 9,
      "type": "word",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "note",
      "start_offset": 0,
      "end_offset": 4,
      "type": "word",
      "position": 0
    },
    {
      "token": "book",
      "start_offset": 5,
      "end_offset": 9,
      "type": "word",
      "position": 1
    }
  ]
}

Please let me know if you have any other recommendation.

Thanks,
Nupur

@msfroh
Collaborator

msfroh commented Oct 21, 2024

Hi there -- the issue here is that the synonym_graph filter will use the analyzer chain defined up to that point to tokenize the synonym file. You can't use a word_delimiter filter to read a synonym file, because it outputs multiple tokens in the same position.

We could add a configuration setting to synonym and synonym_graph filters to allow you to specify a custom analyzer to use when reading the synonym file.
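
As a sketch of what that could look like for the filter in this issue (the synonym_analyzer parameter here is the proposed setting, not something available in a released version yet):

"custom_synonym_graph_filter": {
  "type": "synonym_graph",
  "synonyms_path": "analyzers/F3495229",
  "synonym_analyzer": "standard"
}

The synonym file would then be tokenized by the standard analyzer rather than by the surrounding chain, so word_delimiter_graph would never be asked to parse the synonym rules.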

@nupurjaiswal

Hi Michael,

Do you know how soon this can be implemented? Will I be able to specify the word_delimiter filter before the synonym graph filter with this approach?

Thanks,
Nupur

@prudhvigodithi
Member

prudhvigodithi commented Oct 26, 2024

Hey, I was experimenting with the issue to understand the error better, and as suggested by @msfroh I've added a new configuration setting synonym_analyzer for synonym_graph and synonym (Draft PR with the change #16488).

Here’s the output I’ve observed.

Query

curl -X PUT "localhost:9200/my-index-15" -H "Content-Type: application/json" -d '{
  "settings": {
    "analysis": {
      "char_filter": {
        "custom_pattern_replace": {
          "type": "pattern_replace",
          "pattern": "[({.,\\[\\]“”/})]",
          "replacement": " "
        }
      },
      "filter": {
        "custom_ascii_folding": {
          "type": "asciifolding",
          "preserve_original": true
        },
        "custom_word_delimiter": {
          "type": "word_delimiter_graph",
          "generate_word_parts": true,
          "catenate_all": true,
          "split_on_numerics": false,
          "split_on_case_change": false
        },
        "custom_synonym_filter": {
          "type": "synonym",
          "synonyms": [
            "laptop, notebook",
            "smartphone, mobile phone, cell phone",
            "tv, television"
          ],
          "updateable": true,
          "synonym_analyzer": "standard"
        },
        "custom_pattern_replace_filter": {
          "type": "pattern_replace",
          "pattern": "(-)",
          "replacement": " ",
          "all": true
        },
        "custom_stemmer_override": {
          "type": "keyword_marker",
          "keywords": [
            "run",
            "running",
            "jumps",
            "quick"
          ],
          "ignore_case": true
        },
        "custom_synonym_graph_filter": {
          "type": "synonym_graph",
          "synonyms": [
            "laptop => notebook",
            "smartphone, mobile phone, cell phone => smartphone",
            "tv, television => television",
            "word processor => text editor"
          ],
          "synonym_analyzer": "standard"
        }
      },
      "analyzer": {
        "text_en_index": {
          "type": "custom",
          "char_filter": ["custom_pattern_replace"],
          "tokenizer": "whitespace",
          "filter": [
            "custom_ascii_folding",
            "lowercase",
            "custom_word_delimiter",
            "custom_synonym_graph_filter",
            "custom_pattern_replace_filter",
            "flatten_graph"
          ]
        },
        "text_en_query": {
          "type": "custom",
          "char_filter": ["custom_pattern_replace"],
          "tokenizer": "whitespace",
          "filter": [
            "custom_stemmer_override",
            "custom_ascii_folding",
            "lowercase",
            "custom_word_delimiter",
            "custom_synonym_graph_filter",
            "custom_pattern_replace_filter",
            "flatten_graph"
          ]
        },
        "text_id_tx_class_id": {
          "tokenizer": "whitespace"
        },
        "text_general_index_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase"
          ]
        },
        "text_general_query_analyzer": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "custom_synonym_filter"
          ]
        },
        "en": {
          "tokenizer": "standard"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "keyword",
        "index": true,
        "store": true
      },
      "id_tx": {
        "type": "text",
        "analyzer": "text_id_tx_class_id",
        "store": true,
        "index": true
      },
      "description": {
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "store": true,
        "index": true
      },
      "TM5": {
        "type": "keyword",
        "store": true,
        "index": true
      },
      "status": {
        "type": "keyword",
        "store": true,
        "index": true
      },
      "version": {
        "type": "keyword",
        "store": true,
        "index": true
      },
      "class": {
        "type": "text",
        "analyzer": "text_general_index_analyzer",
        "search_analyzer": "text_general_query_analyzer",
        "store": true
      },
      "long class": {
        "type": "text",
        "analyzer": "text_general_index_analyzer",
        "search_analyzer": "text_general_query_analyzer",
        "store": true
      },
      "class_id": {
        "type": "text",
        "analyzer": "text_id_tx_class_id",
        "index": true,
        "store": true
      },
      "notes": {
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "store": true,
        "index": true
      },
      "employee_notes": {
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "store": true,
        "index": true
      },
      "editor_notes": {
        "type": "text",
        "analyzer": "text_en_index",
        "search_analyzer": "text_en_query",
        "store": true,
        "index": true
      }
    }
  }
}'

Output

{"acknowledged":true,"shards_acknowledged":true,"index":"my-index-15"}

Query

curl -X POST "localhost:9200/my-index-15/_analyze" -H "Content-Type: application/json" -d '{
  "analyzer": "text_en_index",
  "text": "note-book"
}'  | jq '.'

Output

{
  "tokens": [
    {
      "token": "notebook",
      "start_offset": 0,
      "end_offset": 9,
      "type": "word",
      "position": 0,
      "positionLength": 2
    },
    {
      "token": "note",
      "start_offset": 0,
      "end_offset": 4,
      "type": "word",
      "position": 0
    },
    {
      "token": "book",
      "start_offset": 5,
      "end_offset": 9,
      "type": "word",
      "position": 1
    }
  ]
}

@nupurjaiswal @aswad1 can you please confirm whether the query and the output are as expected? If you have any additional queries to test this behaviour, please share.
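
For example, one extra _analyze call that should exercise the smartphone rules defined in the index above (a suggested query, not one from the original report):

curl -X POST "localhost:9200/my-index-15/_analyze" -H "Content-Type: application/json" -d '{
  "analyzer": "text_en_index",
  "text": "mobile-phone"
}' | jq '.'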

Thank you
@getsaurabh02

@prudhvigodithi
Member

prudhvigodithi commented Oct 28, 2024

Some more tests that work as expected.

curl -X PUT "localhost:9200/no-synonym-analyzer-test" -H "Content-Type: application/json" -d '{
  "settings": {
    "analysis": {
      "filter": {
        "custom_synonym_graph_filter": {
          "type": "synonym_graph",
          "synonyms": [
            "note book, note-book => notebook",
            "mobile phone, mobile-phone => smartphone"
          ]
        }
      },
      "analyzer": {
        "test_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "custom_synonym_graph_filter"
          ]
        }
      }
    }
  }
}'

 curl -X POST "localhost:9200/no-synonym-analyzer-test/_analyze" -H "Content-Type: application/json" -d '{
  "analyzer": "test_analyzer",
  "text": "note-book"
}' | jq '.'
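
Since no synonym_analyzer is set here, the rules are parsed with the same chain (whitespace tokenizer + lowercase), so "note-book" in the rule stays a single token and should match the input directly. Assuming that, the output would be roughly the following (token type and metadata may differ by version):

{
  "tokens": [
    {
      "token": "notebook",
      "start_offset": 0,
      "end_offset": 9,
      "type": "SYNONYM",
      "position": 0
    }
  ]
}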

When using a custom synonym_analyzer:

curl -X PUT "localhost:9200/synonym-test-19" -H "Content-Type: application/json" -d '{
  "settings": {
    "analysis": {
      "analyzer": {
        "keyword_synonym_analyzer": {
          "type": "custom",
          "tokenizer": "keyword",
          "filter": ["lowercase"]
        },
        "standard_synonym_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase"]
        },
        "whitespace_synonym_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": ["lowercase"]
        },
        "keyword_test": {
          "tokenizer": "keyword",
          "filter": ["lowercase", "keyword_syn_filter"]
        },
        "standard_test": {
          "tokenizer": "standard",
          "filter": ["lowercase", "standard_syn_filter"]
        },
        "whitespace_test": {
          "tokenizer": "whitespace",
          "filter": ["lowercase", "whitespace_syn_filter"]
        }
      },
      "filter": {
        "keyword_syn_filter": {
          "type": "synonym_graph",
          "synonyms": ["note book, notebook => notebook"],
          "synonym_analyzer": "keyword_synonym_analyzer"
        },
        "standard_syn_filter": {
          "type": "synonym_graph",
          "synonyms": ["note book, notebook => notebook"],
          "synonym_analyzer": "standard_synonym_analyzer"
        },
        "whitespace_syn_filter": {
          "type": "synonym_graph",
          "synonyms": ["note book, notebook => notebook"],
          "synonym_analyzer": "whitespace_synonym_analyzer"
        }
      }
    }
  }
}'

curl -X POST "localhost:9200/synonym-test-19/_analyze" -H "Content-Type: application/json" -d '{
  "analyzer": "whitespace_test",
  "text": "note book"
}' | jq '.'

Scenario: a custom synonym_analyzer used alongside an analyzer chain that has word_delimiter_graph and synonym_graph.

When "synonym_analyzer": "custom_synonym_analyzer" is used, i.e. a custom analyzer is passed as the synonym_analyzer, rule parsing again goes through a user-configured analyzer chain. The expectation is that this chain does not run word_delimiter_graph ahead of the synonym parsing (or handles it as required); otherwise it will throw the same error. In the example below, notice the custom_synonym_analyzer.

Adding @msfroh for input here: should this be handled by the user when configuring the analyzer chain of the custom analyzer that will be used as the synonym_analyzer?

curl -X PUT "localhost:9200/custom-test-11" -H "Content-Type: application/json" -d '{
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_synonym_analyzer": {       
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase"
          ]
        },
        "test_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "custom_word_delimiter",
            "custom_advanced_synonym"
          ]
        }
      },
      "filter": {
        "custom_word_delimiter": {
          "type": "word_delimiter_graph",
          "generate_word_parts": true,
          "catenate_all": true,
          "preserve_original": false,
          "split_on_numerics": false,
          "split_on_case_change": false
        },
        "custom_advanced_synonym": {
          "type": "synonym_graph",
          "synonyms": [
            "note-book => notebook",
            "note book => notebook",
            "mobile-phone => smartphone",
            "mobile phone => smartphone"
          ],
          "synonym_analyzer": "custom_synonym_analyzer"  
        }
      }
    }
  }
}'
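
A verification call in the same style as the earlier tests (added here for illustration; not part of the original comment) could be:

curl -X POST "localhost:9200/custom-test-11/_analyze" -H "Content-Type: application/json" -d '{
  "analyzer": "test_analyzer",
  "text": "note-book"
}' | jq '.'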

@dblock dblock removed the untriaged label Oct 28, 2024
@dblock
Member

dblock commented Oct 28, 2024

[Catch All Triage - 1, 2, 3]

@nupurjaiswal

@prudhvigodithi The mapping you posted earlier with "synonym_analyzer": "standard" is not working. I am getting the error "Token filter [custom_word_delimiter] cannot be used to parse synonyms"

@prudhvigodithi
Member

Hey @nupurjaiswal, did you try building OpenSearch from my fork and testing it? I have created a draft PR #16488 with the proposal to use synonym_analyzer; it is not yet released.
Thanks

@nupurjaiswal

@prudhvigodithi I was using OpenSearch on AWS; I am not running OpenSearch locally. I cloned your project from https://github.com/prudhvigodithi/OpenSearch/tree/bug-fix-synonym and tried "gradlew assemble", but I am facing errors while building it. Can you please share instructions on how to build and run OpenSearch with your fix? That would be helpful.

@prudhvigodithi
Member

Hey @nupurjaiswal sure,

  • ./gradlew :distribution:archives:linux-tar:assemble -x :distribution:docker:buildArm64DockerImage -x :distribution:docker:buildDockerImage -x :distribution:docker:buildPpc64leDockerImage -x :distribution:docker:buildS390xDockerImage -x distribution:docker:docker-arm64-export:exportArm64DockerImage -x :distribution:docker:docker-export:exportDockerImage -x :distribution:docker:docker-ppc64le-export:exportDockerImage -x :distribution:docker:docker-s390x-export:exportDockerImage
  • ./gradlew run -PnumNodes=3 (this should start OpenSearch on port 9200)

Once started, you can run your PUT query: #16263 (comment)

Also, FYI, I got some feedback on the PR; I will take a look and address the comments.

@nupurjaiswal

Hi @prudhvigodithi, I followed your instructions: cloned the repo https://github.com/prudhvigodithi/OpenSearch.git, installed Java 21, and used your commands to build and run OpenSearch. The build was successful. I used Postman to run the PUT query, but I am still getting the issue below:

{
    "error": {
        "root_cause": [
            {
                "type": "illegal_argument_exception",
                "reason": "Token filter [custom_word_delimiter] cannot be used to parse synonyms"
            }
        ],
        "type": "illegal_argument_exception",
        "reason": "Token filter [custom_word_delimiter] cannot be used to parse synonyms"
    },
    "status": 400
}

@prudhvigodithi
Member

prudhvigodithi commented Oct 29, 2024

Hey @nupurjaiswal, can you please confirm you tried the bug-fix-synonym branch from my fork (git clone https://github.com/prudhvigodithi/OpenSearch.git -b bug-fix-synonym)?

@nupurjaiswal

Thanks a lot, @prudhvigodithi. I realized I was on the wrong branch, but I was able to create the index and test queries once I switched to your branch. Your fix solved the issue. I will continue with the remaining testing and keep you updated.
