Skip to content

Commit

Permalink
Add list of words in sent_tokenize testset
Browse files (browse the repository at this point in the history)
  • Loading branch information
wannaphong committed Oct 28, 2024
1 parent 76f3310 commit 1a2b457
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 3 deletions.
6 changes: 3 additions & 3 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ def sent_tokenize(
result = []
_temp = []
for i, w in enumerate(text):
-if re.findall(r"\s", w) != [] and re.findall(r"\w", w) == []:
+if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []:
if _temp == []:
continue
result.append(_temp)
Expand All @@ -478,8 +478,8 @@ def sent_tokenize(
for i, w in enumerate(text):
if (
(re.findall(r"\s", w) != [] or
-re.findall(r"\n", w) != [])
-and re.findall(r"\w", w) == []
+re.findall(r"\n", w) != []) and
+re.findall(r"\w", w) == []
):
if _temp == []:
continue
Expand Down
17 changes: 17 additions & 0 deletions tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,23 @@ def test_sent_tokenize(self):
# engine="wtp-large",
# ),
# )
+sent_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
+self.assertEqual(
+sent_tokenize(sent_4, engine="crfcut"),
+[["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
+)
+self.assertEqual(
+sent_tokenize(sent_4, engine="whitespace"),
+[["ผม", "กิน", "ข้าว"], ["\n", "เธอ", "เล่น", "เกม"]],
+)
+self.assertEqual(
+sent_tokenize(sent_4, engine="whitespace+newline"),
+[["ผม", "กิน", "ข้าว"], ["เธอ", "เล่น", "เกม"]],
+)
+self.assertEqual(
+sent_tokenize(sent_4, engine="thaisum"),
+[["ผม", "กิน", "ข้าว", " ", "เธอ", "เล่น", "เกม"]],
+)
self.assertFalse(
" "
in sent_tokenize(
Expand Down

0 comments on commit 1a2b457

Please sign in to comment.