
Commit

Merge branch 'dev' into add-spelling
wannaphong authored Jan 13, 2025
2 parents f49c480 + 3adc978 commit c9d8c91
Showing 13 changed files with 207 additions and 12 deletions.
1 change: 1 addition & 0 deletions docs/api/khavee.rst
@@ -49,6 +49,7 @@ Example
Here's a basic example of how to use the `KhaveeVerifier` class to verify Thai poetry:

::

from pythainlp.khavee import KhaveeVerifier
# Initialize a KhaveeVerifier instance
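
A fuller usage sketch (hedged: the `check_klon` method and its `k_type` parameter are assumed from the PyThaiNLP khavee API, not shown in this hunk)::

    from pythainlp.khavee import KhaveeVerifier

    # Initialize a KhaveeVerifier instance
    kv = KhaveeVerifier()

    # Check a poem against the Klon 8 form (assumed signature)
    kv.check_klon("ข้อความกลอนแปด", k_type=8)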
2 changes: 2 additions & 0 deletions docs/api/tokenize.rst
@@ -44,6 +44,8 @@ Modules

The `Tokenizer` class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs.
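
A minimal usage sketch (hedged: the `custom_dict` word-list argument and the `word_tokenize` method are assumed from this class's API)::

    from pythainlp.tokenize import Tokenizer

    # Restrict tokenization to a custom vocabulary
    tokenizer = Tokenizer(custom_dict=["ทดสอบ", "ภาษาไทย"], engine="newmm")
    tokenizer.word_tokenize("ทดสอบภาษาไทย")
    # expected: ['ทดสอบ', 'ภาษาไทย']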

.. autofunction:: display_cell_tokenize

Tokenization Engines
--------------------

5 changes: 5 additions & 0 deletions docs/api/util.rst
@@ -287,6 +287,11 @@ Modules

The `Trie` class is a data structure for efficient dictionary operations. It's a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner.
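
For instance, the `dict_trie` helper (used by this commit's tests) builds a `Trie` from a word list; membership testing via `in` is assumed supported::

    from pythainlp.util import dict_trie

    trie = dict_trie(dict_source=["ปวด", "เฉียบพลัน", "ปวดเฉียบพลัน"])
    "ปวดเฉียบพลัน" in trie  # True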

.. autofunction:: longest_common_subsequence
:noindex:

The `longest_common_subsequence` function finds the longest common subsequence between two strings.
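
For example, matching the tests added in this commit::

    from pythainlp.util import longest_common_subsequence

    longest_common_subsequence("ABCBDAB", "BDCAB")
    # 'BDAB'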

.. autofunction:: pythainlp.util.morse.morse_encode
:noindex:

2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
@@ -16,6 +16,7 @@
"syllable_tokenize",
"word_detokenize",
"word_tokenize",
"display_cell_tokenize",
]

from pythainlp.corpus import thai_syllables, thai_words
@@ -38,6 +39,7 @@
syllable_tokenize,
word_detokenize,
word_tokenize,
display_cell_tokenize,
)

from pythainlp.corpus import get_corpus as _get_corpus
11 changes: 8 additions & 3 deletions pythainlp/tokenize/attacut.py
@@ -8,7 +8,7 @@
:See Also:
* `GitHub repository <https://github.com/PyThaiNLP/attacut>`_
"""
-from typing import List
+from typing import Dict, List

from attacut import Tokenizer

@@ -26,6 +26,9 @@ def tokenize(self, text: str) -> List[str]:
return self._tokenizer.tokenize(text)


_tokenizers: Dict[str, AttacutTokenizer] = {}


def segment(text: str, model: str = "attacut-sc") -> List[str]:
"""
Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
@@ -40,6 +43,8 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
if not text or not isinstance(text, str):
return []

-    _tokenizer = AttacutTokenizer(model)
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)

-    return _tokenizer.tokenize(text)
+    return _tokenizers[model].tokenize(text)
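
With this cache, repeated calls reuse one `AttacutTokenizer` per model name instead of re-loading the model every time. A usage sketch (requires the optional `attacut` package)::

    from pythainlp.tokenize.attacut import segment

    segment("ทดสอบภาษาไทย")  # loads and caches the default "attacut-sc" model
    segment("ทดสอบภาษาไทย", model="attacut-c")  # a second model is cached separately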
40 changes: 40 additions & 0 deletions pythainlp/tokenize/core.py
@@ -733,6 +733,46 @@ def syllable_tokenize(
)


def display_cell_tokenize(text: str) -> List[str]:
"""
Display cell tokenizer.
Tokenizes Thai text into display cells without splitting tone marks.
:param str text: text to be tokenized
:return: list of display cells
:rtype: List[str]
:Example:
Tokenize Thai text into display cells::
from pythainlp.tokenize import display_cell_tokenize
text = "แม่น้ำอยู่ที่ไหน"
display_cell_tokenize(text)
# output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
"""
if not text or not isinstance(text, str):
return []

display_cells = []
current_cell = ""
text = text.replace("ำ", "ํา")

for char in text:
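        # Combining marks (MAI HAN-AKAT, above/below vowels, tone marks, etc.)
        # extend the current display cell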
if re.match(r"[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]", char):
current_cell += char
else:
if current_cell:
display_cells.append(current_cell)
current_cell = char

if current_cell:
display_cells.append(current_cell)

return display_cells


class Tokenizer:
"""
Tokenizer class for a custom tokenizer.
26 changes: 20 additions & 6 deletions pythainlp/tokenize/longest.py
@@ -12,7 +12,7 @@
"""
import re
from typing import List, Union
from typing import Dict, List, Union

from pythainlp import thai_tonemarks
from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
@@ -134,16 +134,25 @@ def __segment(self, text: str):
token_statuses.append(_KNOWN)
begin_pos += len(match)

-        return tokens
+        # Group consecutive spaces into one token
+        grouped_tokens = []
+        for token in tokens:
+            if token.isspace() and grouped_tokens and grouped_tokens[-1].isspace():
+                grouped_tokens[-1] += token
+            else:
+                grouped_tokens.append(token)
+
+        return grouped_tokens

def tokenize(self, text: str) -> List[str]:
tokens = self.__segment(text)
return tokens


-def segment(
-    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
-) -> List[str]:
+_tokenizers: Dict[int, LongestMatchTokenizer] = {}
+
+
+def segment(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE) -> List[str]:
"""
Dictionary-based longest matching word segmentation.
@@ -157,4 +166,9 @@ def segment(
if not custom_dict:
custom_dict = DEFAULT_WORD_DICT_TRIE

-    return LongestMatchTokenizer(custom_dict).tokenize(text)
+    global _tokenizers
+    custom_dict_ref_id = id(custom_dict)
+    if custom_dict_ref_id not in _tokenizers:
+        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+
+    return _tokenizers[custom_dict_ref_id].tokenize(text)
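
Because the cache is keyed by `id(custom_dict)`, passing the same `Trie` object reuses its tokenizer while a different object builds a new one. A sketch::

    from pythainlp.tokenize.longest import segment
    from pythainlp.util import dict_trie

    trie = dict_trie(["ปวดเฉียบพลัน"])
    segment("ปวดเฉียบพลัน", custom_dict=trie)  # builds and caches a tokenizer
    segment("ปวดเฉียบพลัน", custom_dict=trie)  # reuses the cached tokenizer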
2 changes: 1 addition & 1 deletion pythainlp/tokenize/newmm.py
@@ -182,7 +182,7 @@ def segment(
# try to break by space first
space_idx = sample.rfind(" ")
if space_idx >= 0:
-            cut_pos = space_idx + 1
+            cut_pos = space_idx + 1 + _TEXT_SCAN_BEGIN
else:
tokens = list(_onecut(sample, custom_dict))
token_max_idx = 0
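
The added `_TEXT_SCAN_BEGIN` offset matters because `space_idx` is an index into `sample`, which (judging from the surrounding code) is a scan window of the full text starting at `_TEXT_SCAN_BEGIN`; without the offset, safe mode could cut long input at the wrong position.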
3 changes: 2 additions & 1 deletion pythainlp/tokenize/pyicu.py
@@ -15,9 +15,10 @@

from icu import BreakIterator, Locale

+bd = BreakIterator.createWordInstance(Locale("th"))

def _gen_words(text: str) -> str:
-    bd = BreakIterator.createWordInstance(Locale("th"))
+    global bd
bd.setText(text)
p = bd.first()
for q in bd:
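
Creating the ICU `BreakIterator` once at import time avoids rebuilding it on every `_gen_words` call; `bd.setText(text)` then just resets the shared instance for each new input.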
4 changes: 3 additions & 1 deletion pythainlp/util/__init__.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
@@ -26,6 +26,7 @@
"is_native_thai",
"isthai",
"isthaichar",
"longest_common_subsequence",
"nectec_to_ipa",
"normalize",
"now_reign_year",
@@ -95,6 +96,7 @@
thai_to_eng,
)
from pythainlp.util.keywords import find_keyword, rank
from pythainlp.util.lcs import longest_common_subsequence
from pythainlp.util.normalize import (
maiyamok,
normalize,
67 changes: 67 additions & 0 deletions pythainlp/util/lcs.py
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

def longest_common_subsequence(str1: str, str2: str) -> str:
"""
Find the longest common subsequence between two strings.
:param str str1: The first string.
:param str str2: The second string.
:return: The longest common subsequence.
:rtype: str
:Example:
::
from pythainlp.util.lcs import longest_common_subsequence
print(longest_common_subsequence("ABCBDAB", "BDCAB"))
# output: "BDAB"
"""
m = len(str1)
n = len(str2)

# Create a 2D array to store lengths of longest common subsequence.
dp = [[0] * (n + 1) for _ in range(m + 1)]

# Build the dp array from bottom up.
for i in range(m + 1):
for j in range(n + 1):
if i == 0 or j == 0:
dp[i][j] = 0
elif str1[i - 1] == str2[j - 1]:
dp[i][j] = dp[i - 1][j - 1] + 1
else:
dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    # Backtrack through the dp table to reconstruct the LCS
    index = dp[m][n]

    # Character array to collect the LCS characters
    lcs = [""] * index

# Start from the right-most-bottom-most corner and
# one by one store characters in lcs[]
i = m
j = n
while i > 0 and j > 0:

# If current character in str1 and str2 are same, then
# current character is part of LCS
if str1[i - 1] == str2[j - 1]:
lcs[index - 1] = str1[i - 1]
i -= 1
j -= 1
index -= 1

# If not same, then find the larger of two and
# go in the direction of larger value
elif dp[i - 1][j] > dp[i][j - 1]:
i -= 1
else:
j -= 1

return "".join(lcs)
45 changes: 45 additions & 0 deletions tests/core/test_tokenize.py
@@ -19,6 +19,7 @@
tcc_p,
word_detokenize,
word_tokenize,
display_cell_tokenize,
)
from pythainlp.util import dict_trie

@@ -390,6 +391,40 @@ def test_longest(self):
longest_tokenizer.word_tokenize("เฉียบพลัน"),
["เฉียบพลัน"],
)
self.assertEqual(
longest.segment("ทดสอบ ทดสอบ ทดสอบ"),
["ทดสอบ", " ", "ทดสอบ", " ", "ทดสอบ"],
)
self.assertEqual(
longest.segment("ทดสอบ ทดสอบ"),
["ทดสอบ", " ", "ทดสอบ"],
)
        self.assertEqual(
            longest.segment("ทดสอบ  ทดสอบ"),
            ["ทดสอบ", "  ", "ทดสอบ"],
        )

def test_longest_custom_dict(self):
"""Test switching the custom dict on longest segment function"""

self.assertEqual(
word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
["ทดสอบ", " ", "ทดสอบ"],
)
self.assertEqual(
word_tokenize(
"ปวดเฉียบพลัน", engine="longest", custom_dict=dict_trie(["ปวดเฉียบพลัน"])
),
["ปวดเฉียบพลัน"],
)
self.assertEqual(
word_tokenize("ทดสอบทดสอบ", engine="longest", custom_dict=dict_trie(["ทดสอบท"])),
["ทดสอบท", "ดสอบ"],
)
        self.assertEqual(
            word_tokenize("ทดสอบ  ทดสอบ", engine="longest"),
            ["ทดสอบ", "  ", "ทดสอบ"],
        )

def test_mm(self):
self.assertEqual(multi_cut.segment(None), [])
@@ -604,3 +639,13 @@ def test_tcc_p(self):
# )
self.assertEqual(list(tcc_p.tcc("")), [])
self.assertEqual(tcc_p.tcc_pos(""), set())

def test_display_cell_tokenize(self):
self.assertEqual(display_cell_tokenize(""), [])
self.assertEqual(
display_cell_tokenize("แม่น้ำอยู่ที่ไหน"),
["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"]
)
self.assertEqual(display_cell_tokenize("สวัสดี"), ['ส', 'วั', 'ส', 'ดี'])
self.assertEqual(display_cell_tokenize("ทดสอบ"), ["ท", "ด", "ส", "อ", "บ"])
self.assertEqual(display_cell_tokenize("ภาษาไทย"), ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"])
11 changes: 11 additions & 0 deletions tests/core/test_util.py
@@ -32,6 +32,7 @@
ipa_to_rtgs,
isthai,
isthaichar,
longest_common_subsequence,
nectec_to_ipa,
normalize,
now_reign_year,
@@ -855,3 +856,13 @@ def test_spelling(self):
self.assertEqual(
spelling("กั้น"), ['กอ', 'อะ', 'นอ', 'กัน', 'ไม้โท', 'กั้น']
)

def test_longest_common_subsequence(self):
self.assertEqual(longest_common_subsequence("ABCBDAB", "BDCAB"), "BDAB")
self.assertEqual(longest_common_subsequence("AGGTAB", "GXTXAYB"), "GTAB")
self.assertEqual(longest_common_subsequence("ABCDGH", "AEDFHR"), "ADH")
self.assertEqual(longest_common_subsequence("ABC", "AC"), "AC")
self.assertEqual(longest_common_subsequence("ABC", "DEF"), "")
self.assertEqual(longest_common_subsequence("", "ABC"), "")
self.assertEqual(longest_common_subsequence("ABC", ""), "")
self.assertEqual(longest_common_subsequence("", ""), "")
