diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TypeAsSynonymFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TypeAsSynonymFilter.cs
new file mode 100644
index 0000000000..b72ec02e0d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TypeAsSynonymFilter.cs
@@ -0,0 +1,107 @@
+// Lucene version compatibility level 8.2.0
+// LUCENENET NOTE: Ported because Lucene.Net.Analysis.OpenNLP requires this to be useful.
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Util;
+#nullable enable
+
+namespace Lucene.Net.Analysis.Miscellaneous
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Adds the <see cref="ITypeAttribute.Type"/> as a synonym,
+    /// i.e. another token at the same position, optionally with a specified prefix prepended.
+    /// </summary>
+    public sealed class TypeAsSynonymFilter : TokenFilter
+    {
+        private readonly ICharTermAttribute termAtt;
+        private readonly ITypeAttribute typeAtt;
+        private readonly IPositionIncrementAttribute posIncrAtt;
+        private readonly string? prefix;
+
+        // State captured from the most recent input token; non-null means its type synonym is still pending.
+        private State? savedToken = null;
+
+        /// <summary>
+        /// Initializes a new instance of <see cref="TypeAsSynonymFilter"/> with
+        /// the specified token stream.
+        /// </summary>
+        /// <param name="input">Input token stream.</param>
+        public TypeAsSynonymFilter(TokenStream input)
+            : this(input, null)
+        {
+        }
+
+        /// <summary>
+        /// Initializes a new instance of <see cref="TypeAsSynonymFilter"/> with
+        /// the specified token stream and prefix.
+        /// </summary>
+        /// <param name="input">Input token stream.</param>
+        /// <param name="prefix">Prepend this string to every token type emitted as token text.
+        /// If null, nothing will be prepended.</param>
+        public TypeAsSynonymFilter(TokenStream input, string? prefix)
+            : base(input)
+        {
+            this.prefix = prefix;
+            termAtt = AddAttribute<ICharTermAttribute>();
+            typeAtt = AddAttribute<ITypeAttribute>();
+            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+        }
+
+        /// <summary>
+        /// Advances the stream. Each token from the input is emitted unchanged first; the
+        /// following call restores that token and re-emits it with its term text replaced
+        /// by the (optionally prefixed) token type and a position increment of zero.
+        /// </summary>
+        /// <returns><c>true</c> if a token was produced; <c>false</c> once the input is exhausted.</returns>
+        public override bool IncrementToken()
+        {
+            if (savedToken != null)
+            {
+                // Emit last token's type at the same position
+                RestoreState(savedToken);
+                savedToken = null;
+                termAtt.SetEmpty();
+                if (prefix != null)
+                {
+                    termAtt.Append(prefix);
+                }
+                termAtt.Append(typeAtt.Type);
+                // A zero increment stacks the synonym on the position of the token it was derived from.
+                posIncrAtt.PositionIncrement = 0;
+                return true;
+            }
+            else if (m_input.IncrementToken())
+            {
+                // No pending token type to emit
+                savedToken = CaptureState();
+                return true;
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// Calls the base reset and discards any captured token whose type synonym was still pending.
+        /// </summary>
+        public override void Reset()
+        {
+            base.Reset();
+            savedToken = null;
+        }
+    }
+}
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TypeAsSynonymFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TypeAsSynonymFilterFactory.cs
new file mode 100644
index 0000000000..462be60bd7
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TypeAsSynonymFilterFactory.cs
@@ -0,0 +1,70 @@
+// Lucene version compatibility level 8.2.0
+// LUCENENET NOTE: Ported because Lucene.Net.Analysis.OpenNLP requires this to be useful.
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+#nullable enable
+
+namespace Lucene.Net.Analysis.Miscellaneous
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="TypeAsSynonymFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_type_as_synonym" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.UAX29URLEmailTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.TypeAsSynonymFilterFactory" prefix="_type_" /&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;
+    /// </code>
+    /// </summary>
+    /// <remarks>
+    /// If the optional prefix parameter is used, the specified value will be prepended
+    /// to the type, e.g. with prefix = "_type_", for a token "example.com" with type "&lt;URL&gt;",
+    /// the emitted synonym will have text "_type_&lt;URL&gt;".
+    /// </remarks>
+    public class TypeAsSynonymFilterFactory : TokenFilterFactory
+    {
+        private readonly string? prefix;
+
+        /// <summary>
+        /// Initializes a new instance of <see cref="TypeAsSynonymFilterFactory"/>, reading the optional "prefix" argument.
+        /// </summary>
+        /// <param name="args">Factory arguments.</param>
+        /// <exception cref="ArgumentException">Thrown if <paramref name="args"/> is non-empty after "prefix" has been read.</exception>
+        public TypeAsSynonymFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            prefix = Get(args, "prefix"); // default value is null
+            if (args.Count > 0)
+            {
+                throw new ArgumentException(string.Format(J2N.Text.StringFormatter.CurrentCulture, "Unknown parameters: {0}", args));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="TypeAsSynonymFilter"/> that uses the configured prefix.
+        /// </summary>
+        public override TokenStream Create(TokenStream input)
+        {
+            return new TypeAsSynonymFilter(input, prefix);
+        }
+    }
+}
diff --git a/src/Lucene.Net.TestFramework/Analysis/CannedTokenStream.cs b/src/Lucene.Net.TestFramework/Analysis/CannedTokenStream.cs
index e7d787d041..824eaec95d 100644
--- a/src/Lucene.Net.TestFramework/Analysis/CannedTokenStream.cs
+++ b/src/Lucene.Net.TestFramework/Analysis/CannedTokenStream.cs
@@ -1,4 +1,4 @@
-using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Analysis.TokenAttributes;
 
 namespace Lucene.Net.Analysis
 {
@@ -31,6 +31,7 @@ public sealed class CannedTokenStream : TokenStream
         private readonly IPositionLengthAttribute posLengthAtt;
         private readonly IOffsetAttribute offsetAtt;
         private readonly IPayloadAttribute payloadAtt;
+        private readonly ITypeAttribute typeAtt; // LUCENENET specific - See IncrementToken()
         private readonly int finalOffset;
         private readonly int finalPosInc;
 
@@ -49,6 +50,7 @@ public CannedTokenStream(int finalPosInc, int finalOffset, params Token[] tokens
             posLengthAtt = AddAttribute<IPositionLengthAttribute>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
             payloadAtt = AddAttribute<IPayloadAttribute>();
+            typeAtt = AddAttribute<ITypeAttribute>(); // LUCENENET specific - See IncrementToken()
 
             this.tokens = tokens;
             this.finalOffset = finalOffset;
@@ -76,6 +78,12 @@ public override bool IncrementToken()
                 posLengthAtt.PositionLength = token.PositionLength;
                 offsetAtt.SetOffset(token.StartOffset, token.EndOffset);
                 payloadAtt.Payload = token.Payload;
+
+                // LUCENENET: This change is from https://github.com/apache/lucene/commit/72eaeab7151d421a28ecec1634b8c48599e524f5.
+                // We need it for the TestTypeAsSynonymFilterFactory tests to pass (from lucene 8.2.0).
+                // But we don't yet have all of the PackedTokenAttributeImpl plumbing it takes to do it the way they did,
+                // so setting it explicitly as a workaround.
+                typeAtt.Type = token.Type;
                 return true;
             }
             else
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestTypeAsSynonymFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestTypeAsSynonymFilterFactory.cs
new file mode 100644
index 0000000000..08f7e396ef
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestTypeAsSynonymFilterFactory.cs
@@ -0,0 +1,54 @@
+using Lucene.Net.Analysis.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.Miscellaneous
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    public class TestTypeAsSynonymFilterFactory : BaseTokenStreamFactoryTestCase
+    {
+        private static readonly Token[] TOKENS = { token("Visit", "<ALPHANUM>"), token("example.com", "<URL>") };
+
+        [Test]
+        public void TestBasic()
+        {
+            TokenStream stream = new CannedTokenStream(TOKENS);
+            stream = TokenFilterFactory("TypeAsSynonym").Create(stream);
+            AssertTokenStreamContents(stream, new string[] { "Visit", "<ALPHANUM>", "example.com", "<URL>" },
+                null, null, new string[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
+        }
+
+        [Test]
+        public void TestPrefix()
+        {
+            TokenStream stream = new CannedTokenStream(TOKENS);
+            stream = TokenFilterFactory("TypeAsSynonym", "prefix", "_type_").Create(stream);
+            AssertTokenStreamContents(stream, new string[] { "Visit", "_type_<ALPHANUM>", "example.com", "_type_<URL>" },
+                null, null, new string[] { "<ALPHANUM>", "<ALPHANUM>", "<URL>", "<URL>" }, new int[] { 1, 0, 1, 0 });
+        }
+
+        private static Token token(string term, string type)
+        {
+            Token token = new Token();
+            token.SetEmpty();
+            token.Append(term);
+            token.Type = type;
+            return token;
+        }
+    }
+}