Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ShingleFilter produces invalid queries #946

Merged
merged 2 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion src/Lucene.Net.Analysis.Common/Analysis/Shingle/ShingleFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,16 @@ public override bool IncrementToken()
noShingleOutput = false;
}
offsetAtt.SetOffset(offsetAtt.StartOffset, nextToken.offsetAtt.EndOffset);
posLenAtt.PositionLength = builtGramSize;
// LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
if (outputUnigrams)
{
posLenAtt.PositionLength = builtGramSize;
}
else
{
// position length for this token is the number of positions created by shingles of smaller size.
posLenAtt.PositionLength = Math.Max(1, (builtGramSize - minShingleSize) + 1);
}
isOutputHere = true;
gramSize.Advance();
tokenAvailable = true;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Lucene version compatibility level 4.8.1
// Lucene version compatibility level 4.8.1
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using NUnit.Framework;
Expand Down Expand Up @@ -617,5 +617,97 @@ public virtual void TestTwoTrailingHolesTriShingleWithTokenFiller()

AssertTokenStreamContents(filter, new string[] { "purple", "purplewizard", "purplewizard", "wizard", "wizard", "wizard" }, new int[] { 0, 0, 0, 7, 7, 7 }, new int[] { 6, 13, 20, 13, 20, 20 }, new int[] { 1, 0, 0, 1, 0, 0 }, 20);
}

// LUCENENET-specific: backported fix from Lucene 6.5.0 (LUCENE-7708)
/// <summary>
/// Verifies that when unigram output is disabled, <c>ShingleFilter</c> sets
/// <c>PositionLengthAttribute</c> to <c>max(1, builtGramSize - minShingleSize + 1)</c>,
/// so that shingles of different sizes form a valid token graph
/// (see LUCENE-7708: "ShingleFilter produces invalid queries").
/// Each case below exercises a different (minShingleSize, maxShingleSize) pair
/// over the same input, "to be or not to be".
/// </summary>
[Test]
public void TestPositionLength()
{
    // Case 1: fixed shingle size (4, 4) - every output token spans exactly
    // one graph position, so all position lengths are 1.
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
        Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
        ShingleFilter filter = new ShingleFilter(tokenizer, 4, 4);
        filter.SetOutputUnigrams(false);
        return new TokenStreamComponents(tokenizer, filter);
    });

    AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
        new string[] { "to be or not", "be or not to", "or not to be" },
        new int[] { 0, 3, 6 },
        new int[] { 12, 15, 18 },
        null,
        new int[] { 1, 1, 1 },
        new int[] { 1, 1, 1 },
        18,
        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
        // finishing at the same position
        false);

    // Case 2: variable shingle size (2, 4) - a shingle of built size g covers
    // g - minShingleSize + 1 positions, so position lengths cycle 1, 2, 3.
    a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
        Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
        ShingleFilter filter = new ShingleFilter(tokenizer, 2, 4);
        filter.SetOutputUnigrams(false);
        return new TokenStreamComponents(tokenizer, filter);
    });

    AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
        new string[] { "to be", "to be or", "to be or not", "be or", "be or not", "be or not to", "or not", "or not to",
            "or not to be", "not to", "not to be", "to be" },
        new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 13 },
        new int[] { 5, 8, 12, 8, 12, 15, 12, 15, 18, 15, 18, 18 },
        null,
        new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1 },
        new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 1 },
        18,
        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
        // finishing at the same position
        false);

    // Case 3: variable shingle size (3, 4) - 7 output tokens.
    // NOTE: the expected-value arrays must all have exactly 7 elements to match
    // the token count (the posIncrements/posLengths arrays previously carried a
    // spurious 8th element).
    a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
        Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
        ShingleFilter filter = new ShingleFilter(tokenizer, 3, 4);
        filter.SetOutputUnigrams(false);
        return new TokenStreamComponents(tokenizer, filter);
    });

    AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
        new string[] { "to be or", "to be or not", "be or not", "be or not to", "or not to",
            "or not to be", "not to be" },
        new int[] { 0, 0, 3, 3, 6, 6, 9 },
        new int[] { 8, 12, 12, 15, 15, 18, 18 },
        null,
        new int[] { 1, 0, 1, 0, 1, 0, 1 },
        new int[] { 1, 2, 1, 2, 1, 2, 1 },
        18,
        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
        // finishing at the same position
        false);

    // Case 4: variable shingle size (3, 5) - 9 output tokens.
    // NOTE: the expected-value arrays must all have exactly 9 elements to match
    // the token count (the startOffsets/posIncrements arrays previously carried
    // a spurious 10th element).
    a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        MockBytesAttributeFactory factory = new MockBytesAttributeFactory();
        Tokenizer tokenizer = new MockTokenizer(factory, reader, MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
        ShingleFilter filter = new ShingleFilter(tokenizer, 3, 5);
        filter.SetOutputUnigrams(false);
        return new TokenStreamComponents(tokenizer, filter);
    });

    AssertTokenStreamContents(a.GetTokenStream("", "to be or not to be"),
        new string[] { "to be or", "to be or not", "to be or not to", "be or not", "be or not to",
            "be or not to be", "or not to", "or not to be", "not to be" },
        new int[] { 0, 0, 0, 3, 3, 3, 6, 6, 9 },
        new int[] { 8, 12, 15, 12, 15, 18, 15, 18, 18 },
        null,
        new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 1 },
        new int[] { 1, 2, 3, 1, 2, 3, 1, 2, 1 },
        18,
        // offsets are correct but assertTokenStreamContents does not handle multiple terms with different offsets
        // finishing at the same position
        false);
}
}
}
}
Loading