
Commit

Merge branch 'dev' into add-spelling
wannaphong authored Jan 13, 2025
2 parents f49c480 + 3adc978 commit c9d8c91
Showing 13 changed files with 207 additions and 12 deletions.
1 change: 1 addition & 0 deletions docs/api/khavee.rst
@@ -49,6 +49,7 @@ Example
Here's a basic example of how to use the `KhaveeVerifier` class to verify Thai poetry:

::

from pythainlp.khavee import KhaveeVerifier
# Initialize a KhaveeVerifier instance
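
A fuller usage sketch (hedged: the `check_klon` method and its `k_type` parameter are assumed from the PyThaiNLP khavee API, not shown in this hunk)::

    from pythainlp.khavee import KhaveeVerifier

    # Initialize a KhaveeVerifier instance
    kv = KhaveeVerifier()

    # Check a poem against the Klon 8 form (assumed signature)
    kv.check_klon("ข้อความกลอนแปด", k_type=8)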
2 changes: 2 additions & 0 deletions docs/api/tokenize.rst
@@ -44,6 +44,8 @@ Modules

The `Tokenizer` class is a versatile tool for customizing tokenization processes and managing tokenization models. It provides various methods and attributes to fine-tune tokenization according to your specific needs.
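
A minimal usage sketch (hedged: the `custom_dict` word-list argument and the `word_tokenize` method are assumed from this class's API)::

    from pythainlp.tokenize import Tokenizer

    # Restrict tokenization to a custom vocabulary
    tokenizer = Tokenizer(custom_dict=["ทดสอบ", "ภาษาไทย"], engine="newmm")
    tokenizer.word_tokenize("ทดสอบภาษาไทย")
    # expected: ['ทดสอบ', 'ภาษาไทย']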

.. autofunction:: display_cell_tokenize

Tokenization Engines
--------------------

5 changes: 5 additions & 0 deletions docs/api/util.rst
@@ -287,6 +287,11 @@ Modules

The `Trie` class is a data structure for efficient dictionary operations. It's a valuable resource for managing and searching word lists and dictionaries in a structured and efficient manner.
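
For instance, the `dict_trie` helper (used by this commit's tests) builds a `Trie` from a word list; membership testing via `in` is assumed supported::

    from pythainlp.util import dict_trie

    trie = dict_trie(dict_source=["ปวด", "เฉียบพลัน", "ปวดเฉียบพลัน"])
    "ปวดเฉียบพลัน" in trie  # True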

.. autofunction:: longest_common_subsequence
:noindex:

The `longest_common_subsequence` function finds the longest common subsequence between two strings.
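
For example, matching the tests added in this commit::

    from pythainlp.util import longest_common_subsequence

    longest_common_subsequence("ABCBDAB", "BDCAB")
    # 'BDAB'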

.. autofunction:: pythainlp.util.morse.morse_encode
:noindex:

2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
@@ -16,6 +16,7 @@
"syllable_tokenize",
"word_detokenize",
"word_tokenize",
"display_cell_tokenize",
]

from pythainlp.corpus import thai_syllables, thai_words
@@ -38,6 +39,7 @@
syllable_tokenize,
word_detokenize,
word_tokenize,
display_cell_tokenize,
)

from pythainlp.corpus import get_corpus as _get_corpus
11 changes: 8 additions & 3 deletions pythainlp/tokenize/attacut.py
@@ -8,7 +8,7 @@
:See Also:
* `GitHub repository <https://github.com/PyThaiNLP/attacut>`_
"""
-from typing import List
+from typing import Dict, List

from attacut import Tokenizer

@@ -26,6 +26,9 @@ def tokenize(self, text: str) -> List[str]:
return self._tokenizer.tokenize(text)


_tokenizers: Dict[str, AttacutTokenizer] = {}


def segment(text: str, model: str = "attacut-sc") -> List[str]:
"""
Wrapper for AttaCut - Fast and Reasonably Accurate Word Tokenizer for Thai
@@ -40,6 +43,8 @@ def segment(text: str, model: str = "attacut-sc") -> List[str]:
if not text or not isinstance(text, str):
return []

-    _tokenizer = AttacutTokenizer(model)
+    global _tokenizers
+    if model not in _tokenizers:
+        _tokenizers[model] = AttacutTokenizer(model)

-    return _tokenizer.tokenize(text)
+    return _tokenizers[model].tokenize(text)
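
With this cache, repeated calls reuse one `AttacutTokenizer` per model name instead of re-loading the model every time. A usage sketch (requires the optional `attacut` package)::

    from pythainlp.tokenize.attacut import segment

    segment("ทดสอบภาษาไทย")  # loads and caches the default "attacut-sc" model
    segment("ทดสอบภาษาไทย", model="attacut-c")  # a second model is cached separately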
40 changes: 40 additions & 0 deletions pythainlp/tokenize/core.py
@@ -733,6 +733,46 @@ def syllable_tokenize(
)


def display_cell_tokenize(text: str) -> List[str]:
"""
Display cell tokenizer.
Tokenizes Thai text into display cells without splitting tone marks.
:param str text: text to be tokenized
:return: list of display cells
:rtype: List[str]
:Example:
Tokenize Thai text into display cells::
from pythainlp.tokenize import display_cell_tokenize
text = "แม่น้ำอยู่ที่ไหน"
display_cell_tokenize(text)
# output: ['แ', 'ม่', 'น้ํ', 'า', 'อ', 'ยู่', 'ที่', 'ไ', 'ห', 'น']
"""
if not text or not isinstance(text, str):
return []

display_cells = []
current_cell = ""
text = text.replace("ำ", "ํา")

for char in text:
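        # Combining marks (MAI HAN-AKAT, above/below vowels, tone marks, etc.)
        # extend the current display cell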
if re.match(r"[\u0E31\u0E34-\u0E3A\u0E47-\u0E4E]", char):
current_cell += char
else:
if current_cell:
display_cells.append(current_cell)
current_cell = char

if current_cell:
display_cells.append(current_cell)

return display_cells


class Tokenizer:
"""
Tokenizer class for a custom tokenizer.
26 changes: 20 additions & 6 deletions pythainlp/tokenize/longest.py
@@ -12,7 +12,7 @@
"""
import re
from typing import List, Union
from typing import Dict, List, Union

from pythainlp import thai_tonemarks
from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
@@ -134,16 +134,25 @@ def __segment(self, text: str):
token_statuses.append(_KNOWN)
begin_pos += len(match)

-        return tokens
+        # Group consecutive spaces into one token
+        grouped_tokens = []
+        for token in tokens:
+            if token.isspace() and grouped_tokens and grouped_tokens[-1].isspace():
+                grouped_tokens[-1] += token
+            else:
+                grouped_tokens.append(token)
+
+        return grouped_tokens

def tokenize(self, text: str) -> List[str]:
tokens = self.__segment(text)
return tokens


-def segment(
-    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
-) -> List[str]:
+_tokenizers: Dict[int, LongestMatchTokenizer] = {}
+
+
+def segment(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE) -> List[str]:
"""
Dictionary-based longest matching word segmentation.
@@ -157,4 +166,9 @@ def segment(
if not custom_dict:
custom_dict = DEFAULT_WORD_DICT_TRIE

-    return LongestMatchTokenizer(custom_dict).tokenize(text)
+    global _tokenizers
+    custom_dict_ref_id = id(custom_dict)
+    if custom_dict_ref_id not in _tokenizers:
+        _tokenizers[custom_dict_ref_id] = LongestMatchTokenizer(custom_dict)
+
+    return _tokenizers[custom_dict_ref_id].tokenize(text)
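
Because the cache is keyed by `id(custom_dict)`, passing the same `Trie` object reuses its tokenizer while a different object builds a new one. A sketch::

    from pythainlp.tokenize.longest import segment
    from pythainlp.util import dict_trie

    trie = dict_trie(["ปวดเฉียบพลัน"])
    segment("ปวดเฉียบพลัน", custom_dict=trie)  # builds and caches a tokenizer
    segment("ปวดเฉียบพลัน", custom_dict=trie)  # reuses the cached tokenizer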
2 changes: 1 addition & 1 deletion pythainlp/tokenize/newmm.py
@@ -182,7 +182,7 @@ def segment(
# try to break by space first
space_idx = sample.rfind(" ")
if space_idx >= 0:
-            cut_pos = space_idx + 1
+            cut_pos = space_idx + 1 + _TEXT_SCAN_BEGIN
else:
tokens = list(_onecut(sample, custom_dict))
token_max_idx = 0
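
The added `_TEXT_SCAN_BEGIN` offset matters because `space_idx` is an index into `sample`, which (judging from the surrounding code) is a scan window of the full text starting at `_TEXT_SCAN_BEGIN`; without the offset, safe mode could cut long input at the wrong position.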
3 changes: 2 additions & 1 deletion pythainlp/tokenize/pyicu.py
@@ -15,9 +15,10 @@

from icu import BreakIterator, Locale

+bd = BreakIterator.createWordInstance(Locale("th"))

def _gen_words(text: str) -> str:
-    bd = BreakIterator.createWordInstance(Locale("th"))
+    global bd
bd.setText(text)
p = bd.first()
for q in bd:
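
Creating the ICU `BreakIterator` once at import time avoids rebuilding it on every `_gen_words` call; `bd.setText(text)` then just resets the shared instance for each new input.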
4 changes: 3 additions & 1 deletion pythainlp/util/__init__.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
@@ -26,6 +26,7 @@
"is_native_thai",
"isthai",
"isthaichar",
"longest_common_subsequence",
"nectec_to_ipa",
"normalize",
"now_reign_year",
@@ -95,6 +96,7 @@
thai_to_eng,
)
from pythainlp.util.keywords import find_keyword, rank
from pythainlp.util.lcs import longest_common_subsequence
from pythainlp.util.normalize import (
maiyamok,
normalize,
67 changes: 67 additions & 0 deletions pythainlp/util/lcs.py
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0

def longest_common_subsequence(str1: str, str2: str) -> str:
"""
Find the longest common subsequence between two strings.
:param str str1: The first string.
:param str str2: The second string.
:return: The longest common subsequence.
:rtype: str
:Example:
::
from pythainlp.util.lcs import longest_common_subsequence
print(longest_common_subsequence("ABCBDAB", "BDCAB"))
# output: "BDAB"
"""
m = len(str1)
n = len(str2)

# Create a 2D array to store lengths of longest common subsequence.
dp = [[0] * (n + 1) for _ in range(m + 1)]

# Build the dp array from bottom up.
for i in range(m + 1):
for j in range(n + 1):
if i == 0 or j == 0:
dp[i][j] = 0
elif str1[i - 1] == str2[j - 1]:
dp[i][j] = dp[i - 1][j - 1] + 1
else:
dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    # Backtrack through the dp table to reconstruct the LCS
    index = dp[m][n]

    # Character array to collect the LCS characters
    lcs = [""] * index

# Start from the right-most-bottom-most corner and
# one by one store characters in lcs[]
i = m
j = n
while i > 0 and j > 0:

# If current character in str1 and str2 are same, then
# current character is part of LCS
if str1[i - 1] == str2[j - 1]:
lcs[index - 1] = str1[i - 1]
i -= 1
j -= 1
index -= 1

# If not same, then find the larger of two and
# go in the direction of larger value
elif dp[i - 1][j] > dp[i][j - 1]:
i -= 1
else:
j -= 1

return "".join(lcs)
45 changes: 45 additions & 0 deletions tests/core/test_tokenize.py
@@ -19,6 +19,7 @@
tcc_p,
word_detokenize,
word_tokenize,
display_cell_tokenize,
)
from pythainlp.util import dict_trie

@@ -390,6 +391,40 @@ def test_longest(self):
longest_tokenizer.word_tokenize("เฉียบพลัน"),
["เฉียบพลัน"],
)
self.assertEqual(
longest.segment("ทดสอบ ทดสอบ ทดสอบ"),
["ทดสอบ", " ", "ทดสอบ", " ", "ทดสอบ"],
)
self.assertEqual(
longest.segment("ทดสอบ ทดสอบ"),
["ทดสอบ", " ", "ทดสอบ"],
)
        self.assertEqual(
            longest.segment("ทดสอบ  ทดสอบ"),
            ["ทดสอบ", "  ", "ทดสอบ"],
        )

def test_longest_custom_dict(self):
"""Test switching the custom dict on longest segment function"""

self.assertEqual(
word_tokenize("ทดสอบ ทดสอบ", engine="longest"),
["ทดสอบ", " ", "ทดสอบ"],
)
self.assertEqual(
word_tokenize(
"ปวดเฉียบพลัน", engine="longest", custom_dict=dict_trie(["ปวดเฉียบพลัน"])
),
["ปวดเฉียบพลัน"],
)
self.assertEqual(
word_tokenize("ทดสอบทดสอบ", engine="longest", custom_dict=dict_trie(["ทดสอบท"])),
["ทดสอบท", "ดสอบ"],
)
        self.assertEqual(
            word_tokenize("ทดสอบ  ทดสอบ", engine="longest"),
            ["ทดสอบ", "  ", "ทดสอบ"],
        )

def test_mm(self):
self.assertEqual(multi_cut.segment(None), [])
@@ -604,3 +639,13 @@ def test_tcc_p(self):
# )
self.assertEqual(list(tcc_p.tcc("")), [])
self.assertEqual(tcc_p.tcc_pos(""), set())

def test_display_cell_tokenize(self):
self.assertEqual(display_cell_tokenize(""), [])
self.assertEqual(
display_cell_tokenize("แม่น้ำอยู่ที่ไหน"),
["แ", "ม่", "น้ํ", "า", "อ", "ยู่", "ที่", "ไ", "ห", "น"]
)
self.assertEqual(display_cell_tokenize("สวัสดี"), ['ส', 'วั', 'ส', 'ดี'])
self.assertEqual(display_cell_tokenize("ทดสอบ"), ["ท", "ด", "ส", "อ", "บ"])
self.assertEqual(display_cell_tokenize("ภาษาไทย"), ["ภ", "า", "ษ", "า", "ไ", "ท", "ย"])
11 changes: 11 additions & 0 deletions tests/core/test_util.py
@@ -32,6 +32,7 @@
ipa_to_rtgs,
isthai,
isthaichar,
longest_common_subsequence,
nectec_to_ipa,
normalize,
now_reign_year,
@@ -855,3 +856,13 @@ def test_spelling(self):
self.assertEqual(
spelling("กั้น"), ['กอ', 'อะ', 'นอ', 'กัน', 'ไม้โท', 'กั้น']
)

def test_longest_common_subsequence(self):
self.assertEqual(longest_common_subsequence("ABCBDAB", "BDCAB"), "BDAB")
self.assertEqual(longest_common_subsequence("AGGTAB", "GXTXAYB"), "GTAB")
self.assertEqual(longest_common_subsequence("ABCDGH", "AEDFHR"), "ADH")
self.assertEqual(longest_common_subsequence("ABC", "AC"), "AC")
self.assertEqual(longest_common_subsequence("ABC", "DEF"), "")
self.assertEqual(longest_common_subsequence("", "ABC"), "")
self.assertEqual(longest_common_subsequence("ABC", ""), "")
self.assertEqual(longest_common_subsequence("", ""), "")
