diff --git a/docs/api/khavee.rst b/docs/api/khavee.rst
index 97e7117ea..ea6dcf134 100644
--- a/docs/api/khavee.rst
+++ b/docs/api/khavee.rst
@@ -49,6 +49,7 @@ Example
 Here's a basic example of how to use the `KhaveeVerifier` class to verify Thai poetry:
 
 ::
+
     from pythainlp.khavee import KhaveeVerifier
 
     # Initialize a KhaveeVerifier instance
diff --git a/docs/api/tokenize.rst b/docs/api/tokenize.rst
index 41952d748..5fe02fdc2 100644
--- a/docs/api/tokenize.rst
+++ b/docs/api/tokenize.rst
@@ -44,6 +44,8 @@ Modules
     The `Tokenizer` class is a versatile tool for customizing tokenization processes and
     managing tokenization models. It provides various methods and attributes to fine-tune
     tokenization according to your specific needs.
 
+.. autofunction:: display_cell_tokenize
+
 Tokenization Engines
 --------------------
diff --git a/docs/api/util.rst b/docs/api/util.rst
index e8b354641..a21ccaf48 100644
--- a/docs/api/util.rst
+++ b/docs/api/util.rst
@@ -287,6 +287,11 @@ Modules
     The `Trie` class is a data structure for efficient dictionary operations. It's a
     valuable resource for managing and searching word lists and dictionaries in a
     structured and efficient manner.
 
+.. autofunction:: longest_common_subsequence
+    :noindex:
+
+    The `longest_common_subsequence` function finds the longest common subsequence between two strings.
+
 .. autofunction:: pythainlp.util.morse.morse_encode
     :noindex:
diff --git a/pythainlp/tokenize/__init__.py b/pythainlp/tokenize/__init__.py
index 40a83d369..e9727a351 100644
--- a/pythainlp/tokenize/__init__.py
+++ b/pythainlp/tokenize/__init__.py
@@ -16,6 +16,7 @@
     "syllable_tokenize",
     "word_detokenize",
     "word_tokenize",
+    "display_cell_tokenize",
 ]
 
 from pythainlp.corpus import thai_syllables, thai_words
@@ -38,6 +39,7 @@
     syllable_tokenize,
     word_detokenize,
     word_tokenize,
+    display_cell_tokenize,
 )
 
 from pythainlp.corpus import get_corpus as _get_corpus
diff --git a/pythainlp/tokenize/attacut.py b/pythainlp/tokenize/attacut.py
index 64cdddde4..d6826fe01 100644
--- a/pythainlp/tokenize/attacut.py
+++ b/pythainlp/tokenize/attacut.py
@@ -8,7 +8,7 @@
 :See Also:
     * `GitHub repository <https://github.com/PyThaiNLP/attacut>`_
 """
-from typing import List
+from typing import Dict, List
 
 from attacut import Tokenizer
 
@@ -26,6 +26,9 @@ def tokenize(self, text: str) -> List[str]:
         return self._tokenizer.tokenize(text)
 
 
+_tokenizers: Dict[str, AttacutTokenizer] = {}
+
+
 def segment(text: str, model: str = "attacut-sc") -> List[str]:
     """
     Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
@@ -40,6 +43,8 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
     if not text or not isinstance(text, str):
         return []
 
-    _tokenizer = AttacutTokenizer(model)
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)
 
-    return _tokenizer.tokenize(text)
+    return _tokenizers[model].tokenize(text)
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 1f8304c42..c3e73649d 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -733,6 +733,46 @@ def syllable_tokenize(
     )
 
 
+def display_cell_tokenize(text: str) -> List[str]:
+    """
+    Display cell tokenizer.
+
+    Tokenizes Thai text into display cells without splitting tone marks.
+
+    :param str text: text to be tokenized
+    :return: list of display cells
+    :rtype: List[str]
+    :Example:
+
+    Tokenize Thai text into display cells::
+
+        from pythainlp.tokenize import display_cell_tokenize
+
+        text = "แม่น้ำอยู่ที่ไหน"
+        display_cell_tokenize(text)
+        # output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    display_cells = []
+    current_cell = ""
+    text = text.replace("ำ", "ํา")
+
+    for char in text:
+        if re.match(r"[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]", char):
+            current_cell += char
+        else:
+            if current_cell:
+                display_cells.append(current_cell)
+            current_cell = char
+
+    if current_cell:
+        display_cells.append(current_cell)
+
+    return display_cells
+
+
 class Tokenizer:
     """
     Tokenizer class for a custom tokenizer.
diff --git a/pythainlp/tokenize/longest.py b/pythainlp/tokenize/longest.py
index 8bd21bf45..42b9e5bdf 100644
--- a/pythainlp/tokenize/longest.py
+++ b/pythainlp/tokenize/longest.py
@@ -12,7 +12,7 @@
 """
 import re
-from typing import List, Union
+from typing import Dict, List, Union
 
 from pythainlp import thai_tonemarks
 from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
@@ -134,16 +134,25 @@ def __segment(self, text: str):
                 token_statuses.append(_KNOWN)
             begin_pos += len(match)
 
-        return tokens
+        # Group consecutive spaces into one token
+        grouped_tokens = []
+        for token in tokens:
+            if token.isspace() and grouped_tokens and grouped_tokens[-1].isspace():
+                grouped_tokens[-1] += token
+            else:
+                grouped_tokens.append(token)
+
+        return grouped_tokens
 
     def tokenize(self, text: str) -> List[str]:
         tokens = self.__segment(text)
         return tokens
 
 
-def segment(
-    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
-) -> List[str]:
+_tokenizers: Dict[int, LongestMatchTokenizer] = {}
+
+
+def segment(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE) -> List[str]:
     """
     Dictionary-based longest matching word segmentation.
@@ -157,4 +166,9 @@ def segment(
     if not custom_dict:
         custom_dict = DEFAULT_WORD_DICT_TRIE
 
-    return LongestMatchTokenizer(custom_dict).tokenize(text)
+    global _tokenizers
+    custom_dict_ref_id = id(custom_dict)
+    if custom_dict_ref_id not in _tokenizers:
+        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+
+    return _tokenizers[custom_dict_ref_id].tokenize(text)
diff --git a/pythainlp/tokenize/newmm.py b/pythainlp/tokenize/newmm.py
index cd9f238cb..c999fb7bd 100644
--- a/pythainlp/tokenize/newmm.py
+++ b/pythainlp/tokenize/newmm.py
@@ -182,7 +182,7 @@ def segment(
         # try to break by space first
         space_idx = sample.rfind(" ")
         if space_idx >= 0:
-            cut_pos = space_idx + 1
+            cut_pos = space_idx + 1 + _TEXT_SCAN_BEGIN
         else:
             tokens = list(_onecut(sample, custom_dict))
             token_max_idx = 0
diff --git a/pythainlp/tokenize/pyicu.py b/pythainlp/tokenize/pyicu.py
index 492417f00..213080a85 100644
--- a/pythainlp/tokenize/pyicu.py
+++ b/pythainlp/tokenize/pyicu.py
@@ -15,9 +15,10 @@
 from icu import BreakIterator, Locale
 
 
+bd = BreakIterator.createWordInstance(Locale("th"))
 def _gen_words(text: str) -> str:
-    bd = BreakIterator.createWordInstance(Locale("th"))
+    global bd
     bd.setText(text)
     p = bd.first()
     for q in bd:
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
index 8e5ce8cb8..7613257a6 100644
--- a/pythainlp/util/__init__.py
+++ b/pythainlp/util/__init__.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 # SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
 # SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0
@@ -26,6 +26,7 @@
     "is_native_thai",
     "isthai",
     "isthaichar",
+    "longest_common_subsequence",
     "nectec_to_ipa",
     "normalize",
     "now_reign_year",
@@ -95,6 +96,7 @@
     thai_to_eng,
 )
 from pythainlp.util.keywords import find_keyword, rank
+from pythainlp.util.lcs import longest_common_subsequence
 from pythainlp.util.normalize import (
     maiyamok,
     normalize,
diff --git a/pythainlp/util/lcs.py b/pythainlp/util/lcs.py
new file mode 100644
index 000000000..104f5674c
--- /dev/null
+++ b/pythainlp/util/lcs.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+def longest_common_subsequence(str1: str, str2: str) -> str:
+    """
+    Find the longest common subsequence between two strings.
+
+    :param str str1: The first string.
+    :param str str2: The second string.
+    :return: The longest common subsequence.
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util.lcs import longest_common_subsequence
+
+        print(longest_common_subsequence("ABCBDAB", "BDCAB"))
+        # output: "BDAB"
+    """
+    m = len(str1)
+    n = len(str2)
+
+    # Create a 2D array to store lengths of longest common subsequences.
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+    # Build the dp array from the bottom up.
+    for i in range(m + 1):
+        for j in range(n + 1):
+            if i == 0 or j == 0:
+                dp[i][j] = 0
+            elif str1[i - 1] == str2[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1] + 1
+            else:
+                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+    # The following code reconstructs the LCS string from the dp table.
+    index = dp[m][n]
+
+    # Create a character array to store the LCS string
+    lcs = [""] * (index + 1)
+    lcs[index] = ""
+
+    # Start from the right-most, bottom-most corner and
+    # store characters in lcs[] one by one
+    i = m
+    j = n
+    while i > 0 and j > 0:
+
+        # If the current characters in str1 and str2 are the same,
+        # that character is part of the LCS
+        if str1[i - 1] == str2[j - 1]:
+            lcs[index - 1] = str1[i - 1]
+            i -= 1
+            j -= 1
+            index -= 1
+
+        # If not, move in the direction of the larger dp value
+        elif dp[i - 1][j] > dp[i][j - 1]:
+            i -= 1
+        else:
+            j -= 1
+
+    return "".join(lcs)
diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py
index 054a2ab7b..e7ee674f2 100644
--- a/tests/core/test_tokenize.py
+++ b/tests/core/test_tokenize.py
@@ -19,6 +19,7 @@
     tcc_p,
     word_detokenize,
     word_tokenize,
+    display_cell_tokenize,
 )
 from pythainlp.util import dict_trie
 
@@ -390,6 +391,40 @@ def test_longest(self):
             longest_tokenizer.word_tokenize("เฉียบพลัน"),
             ["เฉียบพลัน"],
         )
+        self.assertEqual(
+            longest.segment("ทดสอบ ทดสอบ ทดสอบ"),
+            ["ทดสอบ", " ", "ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            longest.segment("ทดสอบ ทดสอบ"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            longest.segment("ทดสอบ ทดสอบ"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+
+    def test_longest_custom_dict(self):
+        """Test switching the custom dict on longest segment function"""
+
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize(
+                "ปวดเฉียบพลัน", engine="longest", custom_dict=dict_trie(["ปวดเฉียบพลัน"])
+            ),
+            ["ปวดเฉียบพลัน"],
+        )
+        self.assertEqual(
+            word_tokenize(
+                "ทดสอบทดสอบ", engine="longest", custom_dict=dict_trie(["ทดสอบท"])
+            ),
+            ["ทดสอบท", "ดสอบ"],
+        )
+        self.assertEqual(
+            word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
+            ["ทดสอบ", " ", "ทดสอบ"],
+        )
 
     def test_mm(self):
         self.assertEqual(multi_cut.segment(None), [])
@@ -604,3 +639,13 @@ def test_tcc_p(self):
         # )
         self.assertEqual(list(tcc_p.tcc("")), [])
         self.assertEqual(tcc_p.tcc_pos(""), set())
+
+    def test_display_cell_tokenize(self):
+        self.assertEqual(display_cell_tokenize(""), [])
+        self.assertEqual(
+            display_cell_tokenize("แม่น้ำอยู่ที่ไหน"),
+            ["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"],
+        )
+        self.assertEqual(display_cell_tokenize("สวัสดี"), ["ส", "วั", "ส", "ดี"])
+        self.assertEqual(display_cell_tokenize("ทดสอบ"), ["ท", "ด", "ส", "อ", "บ"])
+        self.assertEqual(
+            display_cell_tokenize("ภาษาไทย"), ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"]
+        )
diff --git a/tests/core/test_util.py b/tests/core/test_util.py
index 430a9ce7a..73fcce8b9 100644
--- a/tests/core/test_util.py
+++ b/tests/core/test_util.py
@@ -32,6 +32,7 @@
     ipa_to_rtgs,
     isthai,
     isthaichar,
+    longest_common_subsequence,
     nectec_to_ipa,
     normalize,
     now_reign_year,
@@ -855,3 +856,13 @@ def test_spelling(self):
         self.assertEqual(
             spelling("กั้น"), ['กอ', 'อะ', 'นอ', 'กัน', 'ไม้โท', 'กั้น']
         )
+
+    def test_longest_common_subsequence(self):
+        self.assertEqual(longest_common_subsequence("ABCBDAB", "BDCAB"), "BDAB")
+        self.assertEqual(longest_common_subsequence("AGGTAB", "GXTXAYB"), "GTAB")
+        self.assertEqual(longest_common_subsequence("ABCDGH", "AEDFHR"), "ADH")
+        self.assertEqual(longest_common_subsequence("ABC", "AC"), "AC")
+        self.assertEqual(longest_common_subsequence("ABC", "DEF"), "")
+        self.assertEqual(longest_common_subsequence("", "ABC"), "")
+        self.assertEqual(longest_common_subsequence("ABC", ""), "")
+        self.assertEqual(longest_common_subsequence("", ""), "")
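
A minimal usage sketch for the new `display_cell_tokenize` function, based on the docstring and tests added above (illustrative only, not part of the patch)::

    from pythainlp.tokenize import display_cell_tokenize

    # Each display cell keeps a base character together with the above/below
    # vowels and tone marks that render on it; sara am (ำ) is first rewritten
    # as nikhahit + sara aa (ํ + า), as the patched code does.
    print(display_cell_tokenize("สวัสดี"))
    # ['ส', 'วั', 'ส', 'ดี']
    print(display_cell_tokenize("แม่น้ำอยู่ที่ไหน"))
    # ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']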
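
A similar sketch for the new `longest_common_subsequence` utility, using the expected values from the tests above (illustrative only)::

    from pythainlp.util import longest_common_subsequence

    print(longest_common_subsequence("ABCBDAB", "BDCAB"))   # "BDAB"
    print(longest_common_subsequence("AGGTAB", "GXTXAYB"))  # "GTAB"
    print(longest_common_subsequence("ABC", "DEF"))         # "" (no common characters)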
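
The attacut and longest engines above now cache their tokenizer objects instead of rebuilding them on every `segment()` call. Below is a schematic sketch of that caching pattern; `ExpensiveTokenizer` and `get_tokenizer` are hypothetical names used only for illustration::

    from typing import Dict

    class ExpensiveTokenizer:
        """Stand-in for AttacutTokenizer / LongestMatchTokenizer (hypothetical)."""

        def __init__(self, key: str):
            self.key = key  # imagine an expensive model or dictionary load here

    _cache: Dict[str, ExpensiveTokenizer] = {}

    def get_tokenizer(key: str) -> ExpensiveTokenizer:
        # Build the tokenizer only the first time a key is seen,
        # then reuse the same instance on later calls.
        if key not in _cache:
            _cache[key] = ExpensiveTokenizer(key)
        return _cache[key]

In the patch, `attacut.segment()` keys its cache on the model name, while `longest.segment()` keys it on `id(custom_dict)`, so passing a different `Trie` object creates and caches a separate tokenizer.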