From 315fe438378a29a1356443a9ed2f0a88d4309e98 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 12 Jan 2025 20:23:09 +0700 Subject: [PATCH 1/3] Fix custom dict error for unsupported tokenization engines Fixes #1065 Add error handling for unsupported custom dictionaries in `word_tokenize` function. * Add a check for unsupported engines in the `word_tokenize` function in `pythainlp/tokenize/core.py`. * Raise a `NotImplementedError` if `custom_dict` is passed to an unsupported engine such as `attacut`, `icu`, `nercut`, `sefr_cut`, `tltk`, and `oskut`. * Update the docstring for the `word_tokenize` function to reflect the changes. --- pythainlp/tokenize/core.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 1f8304c42..2496bfc03 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -111,10 +111,10 @@ def word_tokenize( :param str engine: name of the tokenizer to be used :param pythainlp.util.Trie custom_dict: dictionary trie (some engine may not support) :param bool keep_whitespace: True to keep whitespace, a common mark - for end of phrase in Thai. - Otherwise, whitespace is omitted. + for end of phrase in Thai. + Otherwise, whitespace is omitted. :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated. - Otherwise, formatted numeric could be wrongly separated. + Otherwise, formatted numeric could be wrongly separated. :return: list of words :rtype: List[str] @@ -230,6 +230,10 @@ def word_tokenize( segments = segment(text, custom_dict, safe_mode=True) elif engine == "attacut": + if custom_dict: + raise NotImplementedError( + f"The {engine} engine does not support custom dictionaries." + ) from pythainlp.tokenize.attacut import segment segments = segment(text) @@ -250,22 +254,42 @@ def word_tokenize( else: segments = segment(text) elif engine == "icu": + if custom_dict: + raise NotImplementedError( + f"The {engine} engine does not support custom dictionaries." + ) from pythainlp.tokenize.pyicu import segment segments = segment(text) elif engine == "nercut": + if custom_dict: + raise NotImplementedError( + f"The {engine} engine does not support custom dictionaries." + ) from pythainlp.tokenize.nercut import segment segments = segment(text) elif engine == "sefr_cut": + if custom_dict: + raise NotImplementedError( + f"The {engine} engine does not support custom dictionaries." + ) from pythainlp.tokenize.sefr_cut import segment segments = segment(text) elif engine == "tltk": + if custom_dict: + raise NotImplementedError( + f"The {engine} engine does not support custom dictionaries." + ) from pythainlp.tokenize.tltk import segment segments = segment(text) elif engine == "oskut": + if custom_dict: + raise NotImplementedError( + f"The {engine} engine does not support custom dictionaries." + ) from pythainlp.tokenize.oskut import segment segments = segment(text) @@ -366,7 +390,7 @@ def sent_tokenize( and ``wtp-large`` to use ``wtp-canine-s-12l`` model. * *whitespace+newline* - split by whitespace and newline. * *whitespace* - split by whitespace, specifically with \ - :class:`regex` pattern ``r" +"`` + :class:`regex` pattern ``r" +"`` :Example: Split the text based on *whitespace*:: @@ -814,9 +838,9 @@ def __init__( used to create a trie, or an instantiated :class:`pythainlp.util.Trie` object. :param str engine: choose between different options of tokenizer engines - (i.e. *newmm*, *mm*, *longest*, *deepcut*) + (i.e. *newmm*, *mm*, *longest*, *deepcut*) :param bool keep_whitespace: True to keep whitespace, a common mark - for end of phrase in Thai + for end of phrase in Thai """ self.__trie_dict = Trie([]) if custom_dict: From 1f385633d729adecbaaf354ba60eef5d75329cd7 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Tue, 14 Jan 2025 11:32:25 +0700 Subject: [PATCH 2/3] Update custom dict error for unsupported tokenization engines --- pythainlp/tokenize/core.py | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index 2496bfc03..2fd58fed6 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -221,6 +221,18 @@ def word_tokenize( segments = [] + if custom_dict and engine in ( + "attacut", + "icu", + "nercut", + "sefr_cut", + "tltk", + "oskut" + ): + raise NotImplementedError( + f"The {engine} engine does not support custom dictionaries." + ) + if engine in ("newmm", "onecut"): from pythainlp.tokenize.newmm import segment @@ -230,10 +242,6 @@ def word_tokenize( segments = segment(text, custom_dict, safe_mode=True) elif engine == "attacut": - if custom_dict: - raise NotImplementedError( - f"The {engine} engine does not support custom dictionaries." - ) from pythainlp.tokenize.attacut import segment segments = segment(text) @@ -254,42 +262,22 @@ def word_tokenize( else: segments = segment(text) elif engine == "icu": - if custom_dict: - raise NotImplementedError( - f"The {engine} engine does not support custom dictionaries." - ) from pythainlp.tokenize.pyicu import segment segments = segment(text) elif engine == "nercut": - if custom_dict: - raise NotImplementedError( - f"The {engine} engine does not support custom dictionaries." - ) from pythainlp.tokenize.nercut import segment segments = segment(text) elif engine == "sefr_cut": - if custom_dict: - raise NotImplementedError( - f"The {engine} engine does not support custom dictionaries." - ) from pythainlp.tokenize.sefr_cut import segment segments = segment(text) elif engine == "tltk": - if custom_dict: - raise NotImplementedError( - f"The {engine} engine does not support custom dictionaries." - ) from pythainlp.tokenize.tltk import segment segments = segment(text) elif engine == "oskut": - if custom_dict: - raise NotImplementedError( - f"The {engine} engine does not support custom dictionaries." - ) from pythainlp.tokenize.oskut import segment segments = segment(text) From 9393cb8b1e64056e79fd54729fbf7a39ae770176 Mon Sep 17 00:00:00 2001 From: Wannaphong Date: Tue, 14 Jan 2025 13:32:11 +0700 Subject: [PATCH 3/3] Update test_tokenize.py --- tests/core/test_tokenize.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py index c1fd7ba06..b279931a3 100644 --- a/tests/core/test_tokenize.py +++ b/tests/core/test_tokenize.py @@ -355,6 +355,11 @@ def test_word_tokenize(self): "ไฟ", word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"])) ) + with self.assertRaises(NotImplementedError): + word_tokenize( + "รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]), engine="icu" + ) + def test_etcc(self): self.assertEqual(etcc.segment(None), []) self.assertEqual(etcc.segment(""), [])