From d44ff501fe054e3d38d5bd422d0ba904ed5c9770 Mon Sep 17 00:00:00 2001 From: tohidemyname Date: Wed, 5 Jun 2024 08:45:08 +0800 Subject: [PATCH 1/2] ShingleFilter produces invalid queries https://github.com/apache/lucenenet/issues/943 --- .../Analysis/Shingle/ShingleFilter.cs | 11 ++- .../Analysis/Shingle/ShingleFilterTest.cs | 99 ++++++++++++++++++- 2 files changed, 107 insertions(+), 3 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs index 9cadafd2d2..760fdc2548 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs @@ -366,7 +366,16 @@ public override bool IncrementToken() noShingleOutput = false; } offsetAtt.SetOffset(offsetAtt.StartOffset, nextToken.offsetAtt.EndOffset); - posLenAtt.PositionLength = builtGramSize; + // posLenAtt.PositionLength = builtGramSize; + if (outputUnigrams) + { + posLenAtt.PositionLength = builtGramSize; + } + else + { + // position length for this token is the number of position created by shingles of smaller size. 
+ posLenAtt.PositionLength = Math.Max(1, (builtGramSize - minShingleSize) + 1); + } isOutputHere = true; gramSize.Advance(); tokenAvailable = true; diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs index 1cf5ad6878..91a576c389 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.8.1 using Lucene.Net.Analysis.Core; using Lucene.Net.Analysis.TokenAttributes; using NUnit.Framework; @@ -408,6 +408,101 @@ public virtual void TestPositionIncrementGreaterThanNWithoutUnigrams() this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false); } + + + [Test] + public void testPositionLength() + { + Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); + Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); + ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4); + filter.SetOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + }); + + AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), + new String[] {"to be or not", "be or not to", "or not to be"}, + new int[] {0, 3, 6}, + new int[] { 12, 15, 18 }, + null, + new int[] { 1, 1, 1 }, + new int[] { 1, 1, 1 }, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + + + a = 
Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); + Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); + ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4); + filter.SetOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + }); + + AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), + new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to", + "or not to be", "not to", "not to be", "to be"}, + new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 }, + new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 }, + null, + new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 }, + new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 }, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + + a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); + Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); + ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4); + filter.SetOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + }); + + + AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), + new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to", + "or not to be", "not to be"}, + new int[] { 0, 0, 3, 3, 6, 6, 9 }, + new int[] { 8, 12, 12, 15, 15, 18, 18 }, + null, + new int[] { 1, 0, 1, 0, 1, 0, 1, 0 }, + new int[] { 1, 2, 1, 2, 1, 2, 1, 2 }, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same 
position + false); + + a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); + Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); + ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5); + filter.SetOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + }); + + AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), + new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to", + "be or not to be", "or not to", "or not to be", "not to be"}, + new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 }, + new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 }, + null, + new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 }, + new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 }, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + } + [Test] public virtual void TestReset() { @@ -618,4 +713,4 @@ public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller() AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20); } } -} \ No newline at end of file +} From fd87edbb01f6f0dfd858d1d3712180249baee80e Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Mon, 28 Oct 2024 20:43:47 -0600 Subject: [PATCH 2/2] Add LUCENENET-specific backport comment, fix test name, fix test position and code style --- .../Analysis/Shingle/ShingleFilter.cs | 2 +- .../Analysis/Shingle/ShingleFilterTest.cs | 187 +++++++++--------- 2 files changed, 93 insertions(+), 96 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs index 
760fdc2548..f36f9b8a74 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs @@ -366,7 +366,7 @@ public override bool IncrementToken() noShingleOutput = false; } offsetAtt.SetOffset(offsetAtt.StartOffset, nextToken.offsetAtt.EndOffset); - // posLenAtt.PositionLength = builtGramSize; + // LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708) if (outputUnigrams) { posLenAtt.PositionLength = builtGramSize; diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs index 91a576c389..8b3fc9ee96 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Shingle/ShingleFilterTest.cs @@ -408,101 +408,6 @@ public virtual void TestPositionIncrementGreaterThanNWithoutUnigrams() this.shingleFilterTest(2, 3, TEST_TOKEN_POS_INCR_GREATER_THAN_N, TRI_GRAM_TOKENS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false); } - - - [Test] - public void testPositionLength() - { - Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => - { - MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); - Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); - ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4); - filter.SetOutputUnigrams(false); - return new TokenStreamComponents(tokenizer, filter); - }); - - AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), - new String[] {"to be or not", "be or not to", "or not to be"}, - new int[] {0, 3, 6}, - new int[] { 12, 15, 18 }, - null, - new int[] { 1, 1, 1 }, - new int[] { 1, 1, 1 }, - 18, - // offsets are 
correct but assertTokenStreamContents does not handle multiple terms with different offsets - // finishing at the same position - false); - - - a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => - { - MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); - Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); - ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4); - filter.SetOutputUnigrams(false); - return new TokenStreamComponents(tokenizer, filter); - }); - - AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), - new String[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to", - "or not to be", "not to", "not to be", "to be"}, - new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 }, - new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 }, - null, - new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 }, - new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 }, - 18, - // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets - // finishing at the same position - false); - - a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => - { - MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); - Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); - ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4); - filter.SetOutputUnigrams(false); - return new TokenStreamComponents(tokenizer, filter); - }); - - - AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), - new String[] {"to be or", "to be or not", "be or not", "be or not to", "or not to", - "or not to be", "not to be"}, - new int[] { 0, 0, 3, 3, 6, 6, 9 }, - new int[] { 8, 12, 12, 15, 15, 18, 18 }, - null, - new int[] { 1, 0, 1, 0, 1, 0, 1, 0 }, - new int[] { 1, 2, 1, 2, 1, 2, 1, 2 }, 
- 18, - // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets - // finishing at the same position - false); - - a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => - { - MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); - Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); - ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5); - filter.SetOutputUnigrams(false); - return new TokenStreamComponents(tokenizer, filter); - }); - - AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), - new String[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to", - "be or not to be", "or not to", "or not to be", "not to be"}, - new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9, 9 }, - new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 }, - null, - new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1, 0 }, - new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 }, - 18, - // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets - // finishing at the same position - false); - } - [Test] public virtual void TestReset() { @@ -712,5 +617,97 @@ public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller() AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20); } + + // LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708) + [Test] + public void TestPositionLength() + { + Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); + Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); + ShingleFilter filter = new 
ShingleFilter(tokenizer, 4, 4); + filter.SetOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + }); + + AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), + new string[] {"to be or not", "be or not to", "or not to be"}, + new int[] {0, 3, 6}, + new int[] { 12, 15, 18 }, + null, + new int[] { 1, 1, 1 }, + new int[] { 1, 1, 1 }, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + + a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); + Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); + ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4); + filter.SetOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + }); + + AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), + new string[] {"to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to", + "or not to be", "not to", "not to be", "to be"}, + new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 }, + new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 }, + null, + new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 }, + new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 }, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + + a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); + Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); + ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4); + filter.SetOutputUnigrams(false); + return new 
TokenStreamComponents(tokenizer, filter); + }); + + AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), + new string[] {"to be or", "to be or not", "be or not", "be or not to", "or not to", + "or not to be", "not to be"}, + new int[] { 0, 0, 3, 3, 6, 6, 9 }, + new int[] { 8, 12, 12, 15, 15, 18, 18 }, + null, + new int[] { 1, 0, 1, 0, 1, 0, 1 }, + new int[] { 1, 2, 1, 2, 1, 2, 1 }, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + + a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + MockBytesAttributeFactory factory = new MockBytesAttributeFactory(); + Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH); + ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5); + filter.SetOutputUnigrams(false); + return new TokenStreamComponents(tokenizer, filter); + }); + + AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"), + new string[] {"to be or", "to be or not", "to be or not to", "be or not", "be or not to", + "be or not to be", "or not to", "or not to be", "not to be"}, + new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9 }, + new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 }, + null, + new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1 }, + new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 }, + 18, + // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets + // finishing at the same position + false); + } } }