koculu · koculu · Aug 24, 2024 · Aug 24, 2024
diff --git a/src/ZoneTree.FullTextSearch/Core/Tokenizer/MemorySliceExtension.cs b/src/ZoneTree.FullTextSearch/Core/Tokenizer/MemorySliceExtension.cs
diff --git a/src/ZoneTree.FullTextSearch/Core/Tokenizer/SliceExtension.cs b/src/ZoneTree.FullTextSearch/Core/Tokenizer/SliceExtension.cs
@@ -0,0 +1,31 @@
+namespace ZoneTree.FullTextSearch.Core.Tokenizer;
+
+/// <summary>
+/// Extension methods that apply a Slice (offset plus length) directly to a ReadOnlyMemory or a ReadOnlySpan.
+/// </summary>
+public static class SliceExtension
+{
+    /// <summary>
+    /// Returns the portion of <paramref name="memory"/> described by <paramref name="slice"/>.
+    /// </summary>
+    /// <typeparam name="T">Element type of the memory.</typeparam>
+    /// <param name="memory">The memory region to take the segment from.</param>
+    /// <param name="slice">Supplies the start offset and the length of the segment.</param>
+    /// <returns>
+    /// The result of <c>memory.Slice(slice.Offset, slice.Length)</c>.
+    /// </returns>
+    public static ReadOnlyMemory<T> Slice<T>(this ReadOnlyMemory<T> memory, Slice slice)
+        => memory.Slice(slice.Offset, slice.Length);
+
+    /// <summary>
+    /// Returns the portion of <paramref name="readonlySpan"/> described by <paramref name="slice"/>.
+    /// </summary>
+    /// <typeparam name="T">Element type of the span.</typeparam>
+    /// <param name="readonlySpan">The span to take the segment from.</param>
+    /// <param name="slice">Supplies the start offset and the length of the segment.</param>
+    /// <returns>
+    /// The result of <c>readonlySpan.Slice(slice.Offset, slice.Length)</c>.
+    /// </returns>
+    public static ReadOnlySpan<T> Slice<T>(this ReadOnlySpan<T> readonlySpan, Slice slice)
+        => readonlySpan.Slice(slice.Offset, slice.Length);
+}
diff --git a/src/ZoneTree.FullTextSearch/Core/Tokenizer/WordTokenizer.cs b/src/ZoneTree.FullTextSearch/Core/Tokenizer/WordTokenizer.cs
@@ -18,24 +18,65 @@ public sealed class WordTokenizer : IWordTokenizer
     /// </summary>
     public bool IncludeDigits { get; }
 
+    /// <summary>
+    /// Gets a value indicating whether tokens that match a registered stop word are left out of tokenization results.
+    /// </summary>
+    public bool UseStopWords { get; }
+
+    /// <summary>
+    /// Hash codes of the registered stop words, as produced by <c>HashCodeGenerator.GetHashCode</c>.
+    /// Tokens are tested against this set by hash code (not by text), and only when <see cref="UseStopWords"/> is true.
+    /// </summary>
+    HashSet<ulong> StopWords { get; } = new();
+
     /// <summary>
     /// Initializes a new instance of the <see cref="WordTokenizer"/> class with the specified
-    /// minimum token length and an option to include digits in the tokens.
+    /// minimum token length, an option to include digits, and an option to filter out stop words.
     /// </summary>
     /// <param name="mimimumTokenLength">The minimum length of tokens to include in the results. Must be non-negative.</param>
     /// <param name="includeDigits">Whether to include digits in the tokens. Defaults to false.</param>
+    /// <param name="useStopWords">Whether to filter out stop words from the tokens. When true, the built-in default stop word list is registered. Defaults to false.</param>
     /// <exception cref="ArgumentException">Thrown when <paramref name="mimimumTokenLength"/> is negative.</exception>
-    public WordTokenizer(int mimimumTokenLength = 3, bool includeDigits = false)
+    public WordTokenizer(int mimimumTokenLength = 3, bool includeDigits = false, bool useStopWords = false)
     {
         if (mimimumTokenLength < 0)
             throw new ArgumentException($"{nameof(mimimumTokenLength)} can't be negative.");
         MimimumTokenLength = mimimumTokenLength;
         IncludeDigits = includeDigits;
+        UseStopWords = useStopWords;
+        if (useStopWords)
+            AddStopWords(DefaultStopWords); // Seed the stop word set with the built-in list.
+    }
+
+    /// <summary>
+    /// Built-in stop words that the constructor registers when <see cref="UseStopWords"/> is enabled.
+    /// </summary>
+    static readonly string[] DefaultStopWords =
+    {
+        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
+        "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that",
+        "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
+    };
+
+    /// <summary>
+    /// Hashes each of the given stop words and stores the hash codes in the <see cref="StopWords"/> set.
+    /// The set is only consulted during tokenization when <see cref="UseStopWords"/> is true.
+    /// </summary>
+    /// <param name="stopWords">The stop words to add. Must not be null.</param>
+    /// <exception cref="ArgumentNullException">Thrown when <paramref name="stopWords"/> is null.</exception>
+    public void AddStopWords(string[] stopWords)
+    {
+        // Fail with a descriptive exception instead of a NullReferenceException in the loop below.
+        if (stopWords is null)
+            throw new ArgumentNullException(nameof(stopWords));
+        foreach (var stopWord in stopWords)
+            StopWords.Add(HashCodeGenerator.GetHashCode(stopWord));
     }
 
     /// <summary>
     /// Splits the given text into a list of slices, where each slice represents a token. 
     /// Tokens are determined based on the settings for minimum token length and whether digits are included.
+    /// Optionally filters out tokens that match stop words.
     /// </summary>
     /// <param name="text">The text to tokenize.</param>
     /// <returns>A read-only list of <see cref="Slice"/> objects, each representing a token within the text.</returns>
@@ -48,6 +89,8 @@ public IReadOnlyList<Slice> GetSlices(ReadOnlySpan<char> text)
         var tokens = new List<Slice>(len / 15);
         int tokenStart = 0;
         int tokenEnd = 0;
+        var useStopWords = UseStopWords;
+        var stopWords = StopWords;
         for (var i = 0; i < len; i++)
         {
             var currentChar = text[i];
@@ -58,18 +101,33 @@ public IReadOnlyList<Slice> GetSlices(ReadOnlySpan<char> text)
                 continue;
             }
             if (tokenStart < tokenEnd - diff)
-                tokens.Add(new Slice(tokenStart, i - tokenStart));
+            {
+                var slice = new Slice(tokenStart, i - tokenStart);
+                if (!useStopWords ||
+                    !stopWords.Contains(
+                        HashCodeGenerator.GetHashCode(
+                            text.Slice(slice))))
+                    tokens.Add(slice);
+            }
             tokenStart = i + 1;
             tokenEnd = i + 1;
         }
         if (tokenStart < tokenEnd - diff)
-            tokens.Add(new Slice(tokenStart, tokenEnd - tokenStart));
+        {
+            var slice = new Slice(tokenStart, tokenEnd - tokenStart);
+            if (!useStopWords ||
+                !stopWords.Contains(
+                    HashCodeGenerator.GetHashCode(
+                        text.Slice(slice))))
+                tokens.Add(slice);
+        }
         return tokens;
     }
 
     /// <summary>
     /// Enumerates the slices of the given text, where each slice represents a token.
     /// Tokens are determined based on the settings for minimum token length and whether digits are included.
+    /// Optionally filters out tokens that match stop words.
     /// </summary>
     /// <param name="text">The text to tokenize.</param>
     /// <returns>An enumerable collection of <see cref="Slice"/> objects, each representing a token within the text.</returns>
@@ -81,6 +139,8 @@ public IEnumerable<Slice> EnumerateSlices(ReadOnlyMemory<char> text)
         var len = text.Length;
         int tokenStart = 0;
         int tokenEnd = 0;
+        var useStopWords = UseStopWords;
+        var stopWords = StopWords;
         for (var i = 0; i < len; i++)
         {
             var currentChar = text.Span[i];
@@ -91,11 +151,25 @@ public IEnumerable<Slice> EnumerateSlices(ReadOnlyMemory<char> text)
                 continue;
             }
             if (tokenStart < tokenEnd - diff)
-                yield return new Slice(tokenStart, i - tokenStart);
+            {
+                var slice = new Slice(tokenStart, i - tokenStart);
+                if (!useStopWords ||
+                    !stopWords.Contains(
+                        HashCodeGenerator.GetHashCode(
+                            text.Slice(slice))))
+                    yield return slice;
+            }
             tokenStart = i + 1;
             tokenEnd = i + 1;
         }
         if (tokenStart < tokenEnd - diff)
-            yield return new Slice(tokenStart, tokenEnd - tokenStart);
+        {
+            var slice = new Slice(tokenStart, tokenEnd - tokenStart);
+            if (!useStopWords ||
+                !stopWords.Contains(
+                    HashCodeGenerator.GetHashCode(
+                        text.Slice(slice))))
+                yield return slice;
+        }
     }
 }
diff --git a/src/ZoneTree.FullTextSearch/Directory.Build.props b/src/ZoneTree.FullTextSearch/Directory.Build.props
@@ -5,8 +5,8 @@
     <Authors>Ahmed Yasin Koculu</Authors>
     <PackageId>ZoneTree.FullTextSearch</PackageId>
     <Title>ZoneTree.FullTextSearch</Title>
-    <ProductVersion>1.0.1.0</ProductVersion>
-    <Version>1.0.1.0</Version>
+    <ProductVersion>1.0.2.0</ProductVersion>
+    <Version>1.0.2.0</Version>
     <Authors>Ahmed Yasin Koculu</Authors>
     <AssemblyTitle>ZoneTree.FullTextSearch</AssemblyTitle>
     <Description>ZoneTree.FullTextSearch is an open-source library that extends ZoneTree to provide efficient full-text search capabilities. It offers a fast, embedded search engine suitable for applications that require high performance and do not rely on external databases.</Description>