From 1a2b457063a604175c9ece8146f1c425920e6d4e Mon Sep 17 00:00:00 2001
From: Wannaphong
Date: Mon, 28 Oct 2024 22:29:31 +0700
Subject: [PATCH] Add list of words in sent_tokenize testset

---
 pythainlp/tokenize/core.py |  6 +++---
 tests/test_tokenize.py     | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 931980532..57669337d 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -460,7 +460,7 @@ def sent_tokenize(
         result = []
         _temp = []
         for i, w in enumerate(text):
-            if re.findall(r"\s", w) != [] and re.findall(r"\w", w) == []:
+            if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []:
                 if _temp == []:
                     continue
                 result.append(_temp)
@@ -478,8 +478,8 @@ def sent_tokenize(
         for i, w in enumerate(text):
             if (
                 (re.findall(r"\s", w) != [] or
-                 re.findall(r"\n", w) != [])
-                and re.findall(r"\w", w) == []
+                 re.findall(r"\n", w) != []) and
+                re.findall(r"\w", w) == []
             ):
                 if _temp == []:
                     continue
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index ad5a1f5e9..0d6026168 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -333,6 +333,23 @@ def test_sent_tokenize(self):
         #         engine="wtp-large",
         #     ),
         # )
+        sent_4 = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
+        self.assertEqual(
+            sent_tokenize(sent_4, engine="crfcut"),
+            [["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]],
+        )
+        self.assertEqual(
+            sent_tokenize(sent_4, engine="whitespace"),
+            [["ผม", "กิน", "ข้าว"], ["\n", "เธอ", "เล่น", "เกม"]],
+        )
+        self.assertEqual(
+            sent_tokenize(sent_4, engine="whitespace+newline"),
+            [["ผม", "กิน", "ข้าว"], ["เธอ", "เล่น", "เกม"]],
+        )
+        self.assertEqual(
+            sent_tokenize(sent_4, engine="thaisum"),
+            [["ผม", "กิน", "ข้าว", " ", "เธอ", "เล่น", "เกม"]],
+        )
         self.assertFalse(
             " " in sent_tokenize(
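
Note (not part of the patch): the core.py change makes the "whitespace" engine break sentences only on literal-space tokens, while "whitespace+newline" keeps breaking on any whitespace token, which is exactly what the new sent_4 assertions pin down. Below is a minimal standalone sketch of the two split conditions for list-of-words input; the helper names split_whitespace and split_whitespace_newline are hypothetical, not part of the PyThaiNLP API, and the final flush of _temp is assumed since the hunks above do not show it.

import re
from typing import List


def split_whitespace(tokens: List[str]) -> List[List[str]]:
    # Mirrors the patched "whitespace" condition: break only on tokens
    # that contain a literal space and no word character, so "\n"
    # tokens stay inside the sentence that follows them.
    result, _temp = [], []
    for w in tokens:
        if re.findall(r" ", w) != [] and re.findall(r"\w", w) == []:
            if _temp == []:
                continue
            result.append(_temp)
            _temp = []
        else:
            _temp.append(w)
    if _temp:  # assumed trailing flush
        result.append(_temp)
    return result


def split_whitespace_newline(tokens: List[str]) -> List[List[str]]:
    # Mirrors the "whitespace+newline" condition: break on any
    # whitespace token, spaces and newlines alike.
    result, _temp = [], []
    for w in tokens:
        if (
            (re.findall(r"\s", w) != [] or
             re.findall(r"\n", w) != []) and
            re.findall(r"\w", w) == []
        ):
            if _temp == []:
                continue
            result.append(_temp)
            _temp = []
        else:
            _temp.append(w)
    if _temp:  # assumed trailing flush
        result.append(_temp)
    return result


tokens = ["ผม", "กิน", "ข้าว", " ", "\n", "เธอ", "เล่น", "เกม"]
print(split_whitespace(tokens))
# [['ผม', 'กิน', 'ข้าว'], ['\n', 'เธอ', 'เล่น', 'เกม']]
print(split_whitespace_newline(tokens))
# [['ผม', 'กิน', 'ข้าว'], ['เธอ', 'เล่น', 'เกม']]

These two outputs match the sent_4 expectations in the patched test above.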