Skip to content

Commit

Permalink
FEATURE: Lucene.Net.Analysis.Miscellaneous: Added TypeAsSynonymFilter…
Browse files Browse the repository at this point in the history
… from Lucene 8.2.0 because it is called out in the docs as part of the process of configuring Lucene.Net.Analysis.OpenNLP. Changed CannedTokenStream to set ITypeAttribute.Type because it is required by the tests for TypeAsSynonymFilter.
  • Loading branch information
NightOwl888 committed Feb 1, 2024
1 parent e742bbb commit 79d4610
Show file tree
Hide file tree
Showing 4 changed files with 222 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
// Lucene version compatibility level 8.2.0
// LUCENENET NOTE: Ported because Lucene.Net.Analysis.OpenNLP requires this to be useful.
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
#nullable enable

namespace Lucene.Net.Analysis.Miscellaneous
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/// <summary>
/// A <see cref="TokenFilter"/> that follows every input token with a second token
/// whose text is the input token's <see cref="ITypeAttribute.Type"/> (optionally
/// preceded by a fixed prefix). The extra token has a position increment of zero,
/// so it occupies the same position as the token it was derived from.
/// </summary>
public sealed class TypeAsSynonymFilter : TokenFilter
{
    private readonly ICharTermAttribute termAtt;
    private readonly ITypeAttribute typeAtt;
    private readonly IPositionIncrementAttribute posIncrAtt;
    private readonly string? prefix;

    // Captured state of the most recently passed-through input token whose type
    // synonym has not been emitted yet; null when nothing is pending.
    private State? savedToken = null;

    /// <summary>
    /// Creates a <see cref="TypeAsSynonymFilter"/> over <paramref name="input"/>
    /// that emits token types without any prefix.
    /// </summary>
    /// <param name="input">Input token stream.</param>
    public TypeAsSynonymFilter(TokenStream input)
        : this(input, null)
    {
    }

    /// <summary>
    /// Creates a <see cref="TypeAsSynonymFilter"/> over <paramref name="input"/>
    /// that prepends <paramref name="prefix"/> to each emitted token type.
    /// </summary>
    /// <param name="input">Input token stream.</param>
    /// <param name="prefix">Text placed in front of every token type that is emitted as token text.
    /// Pass <c>null</c> to emit the bare type.</param>
    public TypeAsSynonymFilter(TokenStream input, string? prefix)
        : base(input)
    {
        termAtt = AddAttribute<ICharTermAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        this.prefix = prefix;
    }

    /// <summary>
    /// Advances to the next token. Calls alternate between passing an input token
    /// through unchanged (while capturing its state) and emitting that token's type
    /// as a synonym with a position increment of zero.
    /// </summary>
    /// <returns><c>true</c> if a token was produced; <c>false</c> once the input is
    /// exhausted and no synonym is pending.</returns>
    public override bool IncrementToken()
    {
        State? pending = savedToken;

        if (pending is null)
        {
            // No pending token type to emit: pass the next input token through
            // and remember it so that the following call can emit its type.
            if (!m_input.IncrementToken())
            {
                return false;
            }

            savedToken = CaptureState();
            return true;
        }

        // Emit the previous token's type at the same position. Restoring the
        // captured state first keeps all other attributes identical to that token.
        RestoreState(pending);
        savedToken = null;

        termAtt.SetEmpty();
        if (prefix != null)
        {
            termAtt.Append(prefix);
        }
        termAtt.Append(typeAtt.Type);

        posIncrAtt.PositionIncrement = 0;
        return true;
    }

    /// <summary>
    /// Resets the filter and discards any synonym that was still pending.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        savedToken = null;
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Lucene version compatibility level 8.2.0
// LUCENENET NOTE: Ported because Lucene.Net.Analysis.OpenNLP requires this to be useful.
using Lucene.Net.Analysis.Util;
using System;
using System.Collections.Generic;
#nullable enable

namespace Lucene.Net.Analysis.Miscellaneous
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/// <summary>
/// Factory for <see cref="TypeAsSynonymFilter"/>.
/// <code>
/// &lt;fieldType name="text_type_as_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
///   &lt;analyzer&gt;
///     &lt;tokenizer class="solr.UAX29URLEmailTokenizerFactory"/&gt;
///     &lt;filter class="solr.TypeAsSynonymFilterFactory" prefix="_type_" /&gt;
///   &lt;/analyzer&gt;
/// &lt;/fieldType&gt;
/// </code>
///
/// <para/>
/// If the optional <c>prefix</c> parameter is used, the specified value will be prepended
/// to the type, e.g. with prefix = "_type_", for a token "example.com" with type "&lt;URL&gt;",
/// the emitted synonym will have text "_type_&lt;URL&gt;".
/// </summary>
public class TypeAsSynonymFilterFactory : TokenFilterFactory
{
    // Nullable on purpose: the "prefix" argument is optional, and the filter's
    // constructor accepts a null prefix to mean "prepend nothing".
    private readonly string? prefix;

    /// <summary>
    /// Creates a new <see cref="TypeAsSynonymFilterFactory"/>.
    /// </summary>
    /// <param name="args">Factory arguments. The only argument read here is the
    /// optional <c>prefix</c>.</param>
    /// <exception cref="ArgumentException">Thrown if <paramref name="args"/> still
    /// contains entries after the <c>prefix</c> argument has been read.</exception>
    public TypeAsSynonymFilterFactory(IDictionary<string, string> args)
        : base(args)
    {
        prefix = Get(args, "prefix"); // default value is null
        if (args.Count > 0)
        {
            throw new ArgumentException(string.Format(J2N.Text.StringFormatter.CurrentCulture, "Unknown parameters: {0}", args));
        }
    }

    /// <summary>
    /// Wraps <paramref name="input"/> in a <see cref="TypeAsSynonymFilter"/> that
    /// uses the prefix this factory was configured with.
    /// </summary>
    /// <param name="input">Input token stream.</param>
    /// <returns>A new <see cref="TypeAsSynonymFilter"/> over <paramref name="input"/>.</returns>
    public override TokenStream Create(TokenStream input)
    {
        return new TypeAsSynonymFilter(input, prefix);
    }
}
}
10 changes: 9 additions & 1 deletion src/Lucene.Net.TestFramework/Analysis/CannedTokenStream.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.TokenAttributes;

namespace Lucene.Net.Analysis
{
Expand Down Expand Up @@ -31,6 +31,7 @@ public sealed class CannedTokenStream : TokenStream
private readonly IPositionLengthAttribute posLengthAtt;
private readonly IOffsetAttribute offsetAtt;
private readonly IPayloadAttribute payloadAtt;
private readonly ITypeAttribute typeAtt; // LUCENENET specific - See IncrementToken()
private readonly int finalOffset;
private readonly int finalPosInc;

Expand All @@ -49,6 +50,7 @@ public CannedTokenStream(int finalPosInc, int finalOffset, params Token[] tokens
posLengthAtt = AddAttribute<IPositionLengthAttribute>();
offsetAtt = AddAttribute<IOffsetAttribute>();
payloadAtt = AddAttribute<IPayloadAttribute>();
typeAtt = AddAttribute<ITypeAttribute>(); // LUCENENET specific - See IncrementToken()

this.tokens = tokens;
this.finalOffset = finalOffset;
Expand Down Expand Up @@ -76,6 +78,12 @@ public override bool IncrementToken()
posLengthAtt.PositionLength = token.PositionLength;
offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
payloadAtt.Payload = token.Payload;

// LUCENENET: This change is from https://github.com/apache/lucene/commit/72eaeab7151d421a28ecec1634b8c48599e524f5.
// We need it for the TestTypeAsSynonymFilterFactory tests to pass (from lucene 8.2.0).
// But we don't yet have all of the PackedTokenAttributeImpl plumbing it takes to do it the way they did,
// so setting it explicitly as a workaround.
typeAtt.Type = token.Type;
return true;
}
else
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
using Lucene.Net.Analysis.Util;
using NUnit.Framework;

namespace Lucene.Net.Analysis.Miscellaneous
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/// <summary>
/// Tests the "TypeAsSynonym" token filter factory, with and without the
/// optional <c>prefix</c> argument.
/// </summary>
public class TestTypeAsSynonymFilterFactory : BaseTokenStreamFactoryTestCase
{
    // Two canned input tokens with distinct types, shared by both tests.
    private static readonly Token[] TOKENS =
    {
        NewTypedToken("Visit", "<ALPHANUM>"),
        NewTypedToken("example.com", "<URL>")
    };

    /// <summary>
    /// Without a prefix, each token is followed by its bare type at the same position.
    /// </summary>
    [Test]
    public void TestBasic()
    {
        TokenStream source = new CannedTokenStream(TOKENS);
        TokenStream filtered = TokenFilterFactory("TypeAsSynonym").Create(source);

        string[] expectedTerms = new string[] { "Visit", "<ALPHANUM>", "example.com", "<URL>" };
        string[] expectedTypes = new string[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" };
        int[] expectedPosIncrements = new int[] { 1, 0, 1, 0 };

        AssertTokenStreamContents(filtered, expectedTerms, null, null, expectedTypes, expectedPosIncrements);
    }

    /// <summary>
    /// With a prefix configured, each emitted type synonym starts with that prefix.
    /// </summary>
    [Test]
    public void TestPrefix()
    {
        TokenStream source = new CannedTokenStream(TOKENS);
        TokenStream filtered = TokenFilterFactory("TypeAsSynonym", "prefix", "_type_").Create(source);

        string[] expectedTerms = new string[] { "Visit", "_type_<ALPHANUM>", "example.com", "_type_<URL>" };
        string[] expectedTypes = new string[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" };
        int[] expectedPosIncrements = new int[] { 1, 0, 1, 0 };

        AssertTokenStreamContents(filtered, expectedTerms, null, null, expectedTypes, expectedPosIncrements);
    }

    /// <summary>
    /// Builds a <see cref="Token"/> with the given term text and type.
    /// </summary>
    /// <param name="term">Text of the token.</param>
    /// <param name="type">Value assigned to the token's type.</param>
    /// <returns>The newly created token.</returns>
    private static Token NewTypedToken(string term, string type)
    {
        Token result = new Token();
        result.SetEmpty();
        result.Append(term);
        result.Type = type;
        return result;
    }
}
}

0 comments on commit 79d4610

Please sign in to comment.