Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add stopword support. #5

Merged
merged 1 commit into from
Aug 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions src/ZoneTree.FullTextSearch/Core/Tokenizer/MemorySliceExtension.cs

This file was deleted.

31 changes: 31 additions & 0 deletions src/ZoneTree.FullTextSearch/Core/Tokenizer/SliceExtension.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
namespace ZoneTree.FullTextSearch.Core.Tokenizer;

/// <summary>
/// Extension methods that cut a <see cref="ReadOnlyMemory{T}"/> or a <see cref="ReadOnlySpan{T}"/>
/// down to the region described by a <see cref="Slice"/>.
/// </summary>
public static class SliceExtension
{
    /// <summary>
    /// Returns the part of <paramref name="memory"/> that starts at <c>slice.Offset</c>
    /// and spans <c>slice.Length</c> elements.
    /// </summary>
    /// <typeparam name="T">Element type of the memory block.</typeparam>
    /// <param name="memory">The memory block to take the segment from.</param>
    /// <param name="slice">Supplies the offset and length of the requested segment.</param>
    /// <returns>The requested segment of <paramref name="memory"/>.</returns>
    public static ReadOnlyMemory<T> Slice<T>(this ReadOnlyMemory<T> memory, Slice slice)
        => memory.Slice(slice.Offset, slice.Length);

    /// <summary>
    /// Returns the part of <paramref name="readonlySpan"/> that starts at <c>slice.Offset</c>
    /// and spans <c>slice.Length</c> elements.
    /// </summary>
    /// <typeparam name="T">Element type of the span.</typeparam>
    /// <param name="readonlySpan">The span to take the segment from.</param>
    /// <param name="slice">Supplies the offset and length of the requested segment.</param>
    /// <returns>The requested segment of <paramref name="readonlySpan"/>.</returns>
    public static ReadOnlySpan<T> Slice<T>(this ReadOnlySpan<T> readonlySpan, Slice slice)
        => readonlySpan.Slice(slice.Offset, slice.Length);
}
86 changes: 80 additions & 6 deletions src/ZoneTree.FullTextSearch/Core/Tokenizer/WordTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,65 @@ public sealed class WordTokenizer : IWordTokenizer
/// </summary>
public bool IncludeDigits { get; }

/// <summary>
/// Gets a value indicating whether stop words should be used during tokenization.
/// If true, tokens matching stop words will be excluded from the results.
/// </summary>
public bool UseStopWords { get; }

/// <summary>
/// A set of hash codes representing stop words to be excluded from tokenization results when <see cref="UseStopWords"/> is true.
/// </summary>
HashSet<ulong> StopWords { get; } = new();

/// <summary>
/// Initializes a new instance of the <see cref="WordTokenizer"/> class with the specified
/// minimum token length and an option to include digits in the tokens.
/// minimum token length, an option to include digits, and an option to use stop words.
/// </summary>
/// <param name="mimimumTokenLength">The minimum length of tokens to include in the results. Must be non-negative.</param>
/// <param name="includeDigits">Whether to include digits in the tokens. Defaults to false.</param>
/// <param name="useStopWords">Whether to filter out stop words from the tokens. Defaults to false.</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="mimimumTokenLength"/> is negative.</exception>
public WordTokenizer(int mimimumTokenLength = 3, bool includeDigits = false)
public WordTokenizer(int mimimumTokenLength = 3, bool includeDigits = false, bool useStopWords = false)
{
if (mimimumTokenLength < 0)
throw new ArgumentException($"{nameof(mimimumTokenLength)} can't be negative.");
MimimumTokenLength = mimimumTokenLength;
IncludeDigits = includeDigits;
UseStopWords = useStopWords;
if (useStopWords)
AddStopWords(DefaultStopWords);
}

/// <summary>
/// Built-in stop words that the constructor registers when it is asked to use stop words.
/// </summary>
static readonly string[] DefaultStopWords =
{
    "a", "an", "and", "are", "as", "at",
    "be", "but", "by", "for", "if", "in",
    "into", "is", "it", "no", "not", "of",
    "on", "or", "such", "that", "the", "their",
    "then", "there", "these", "they", "this", "to",
    "was", "will", "with",
};

/// <summary>
/// Adds the given words to the internal stop word set. Each word is passed through
/// <c>HashCodeGenerator.GetHashCode</c> and only the resulting hash code is stored
/// in <see cref="StopWords"/>.
/// </summary>
/// <remarks>
/// The tokenizer consults the stop word set only when <see cref="UseStopWords"/> is true.
/// </remarks>
/// <param name="stopWords">The array of stop words to add. Must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="stopWords"/> is null.</exception>
public void AddStopWords(string[] stopWords)
{
    // Fail fast with a descriptive exception instead of a bare NullReferenceException.
    if (stopWords is null)
        throw new ArgumentNullException(nameof(stopWords));

    foreach (var stopWord in stopWords)
    {
        StopWords.Add(HashCodeGenerator.GetHashCode(stopWord));
    }
}

/// <summary>
/// Splits the given text into a list of slices, where each slice represents a token.
/// Tokens are determined based on the settings for minimum token length and whether digits are included.
/// Optionally filters out tokens that match stop words.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <returns>A read-only list of <see cref="Slice"/> objects, each representing a token within the text.</returns>
Expand All @@ -48,6 +89,8 @@ public IReadOnlyList<Slice> GetSlices(ReadOnlySpan<char> text)
var tokens = new List<Slice>(len / 15);
int tokenStart = 0;
int tokenEnd = 0;
var useStopWords = UseStopWords;
var stopWords = StopWords;
for (var i = 0; i < len; i++)
{
var currentChar = text[i];
Expand All @@ -58,18 +101,33 @@ public IReadOnlyList<Slice> GetSlices(ReadOnlySpan<char> text)
continue;
}
if (tokenStart < tokenEnd - diff)
tokens.Add(new Slice(tokenStart, i - tokenStart));
{
var slice = new Slice(tokenStart, i - tokenStart);
if (!useStopWords ||
!stopWords.Contains(
HashCodeGenerator.GetHashCode(
text.Slice(slice))))
tokens.Add(slice);
}
tokenStart = i + 1;
tokenEnd = i + 1;
}
if (tokenStart < tokenEnd - diff)
tokens.Add(new Slice(tokenStart, tokenEnd - tokenStart));
{
var slice = new Slice(tokenStart, tokenEnd - tokenStart);
if (!useStopWords ||
!stopWords.Contains(
HashCodeGenerator.GetHashCode(
text.Slice(slice))))
tokens.Add(slice);
}
return tokens;
}

/// <summary>
/// Enumerates the slices of the given text, where each slice represents a token.
/// Tokens are determined based on the settings for minimum token length and whether digits are included.
/// Optionally filters out tokens that match stop words.
/// </summary>
/// <param name="text">The text to tokenize.</param>
/// <returns>An enumerable collection of <see cref="Slice"/> objects, each representing a token within the text.</returns>
Expand All @@ -81,6 +139,8 @@ public IEnumerable<Slice> EnumerateSlices(ReadOnlyMemory<char> text)
var len = text.Length;
int tokenStart = 0;
int tokenEnd = 0;
var useStopWords = UseStopWords;
var stopWords = StopWords;
for (var i = 0; i < len; i++)
{
var currentChar = text.Span[i];
Expand All @@ -91,11 +151,25 @@ public IEnumerable<Slice> EnumerateSlices(ReadOnlyMemory<char> text)
continue;
}
if (tokenStart < tokenEnd - diff)
yield return new Slice(tokenStart, i - tokenStart);
{
var slice = new Slice(tokenStart, i - tokenStart);
if (!useStopWords ||
!stopWords.Contains(
HashCodeGenerator.GetHashCode(
text.Slice(slice))))
yield return slice;
}
tokenStart = i + 1;
tokenEnd = i + 1;
}
if (tokenStart < tokenEnd - diff)
yield return new Slice(tokenStart, tokenEnd - tokenStart);
{
var slice = new Slice(tokenStart, tokenEnd - tokenStart);
if (!useStopWords ||
!stopWords.Contains(
HashCodeGenerator.GetHashCode(
text.Slice(slice))))
yield return slice;
}
}
}
4 changes: 2 additions & 2 deletions src/ZoneTree.FullTextSearch/Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
<Authors>Ahmed Yasin Koculu</Authors>
<PackageId>ZoneTree.FullTextSearch</PackageId>
<Title>ZoneTree.FullTextSearch</Title>
<ProductVersion>1.0.1.0</ProductVersion>
<Version>1.0.1.0</Version>
<ProductVersion>1.0.2.0</ProductVersion>
<Version>1.0.2.0</Version>
<Authors>Ahmed Yasin Koculu</Authors>
<AssemblyTitle>ZoneTree.FullTextSearch</AssemblyTitle>
<Description>ZoneTree.FullTextSearch is an open-source library that extends ZoneTree to provide efficient full-text search capabilities. It offers a fast, embedded search engine suitable for applications that require high performance and do not rely on external databases.</Description>
Expand Down
Loading