diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 1f8304c42..2fd58fed6 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -111,10 +111,10 @@ def word_tokenize(
     :param str engine: name of the tokenizer to be used
     :param pythainlp.util.Trie custom_dict: dictionary trie (some engine may not support)
     :param bool keep_whitespace: True to keep whitespace, a common mark
-        for end of phrase in Thai.
-        Otherwise, whitespace is omitted.
+                                 for end of phrase in Thai.
+                                 Otherwise, whitespace is omitted.
     :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated.
-        Otherwise, formatted numeric could be wrongly separated.
+                                 Otherwise, formatted numeric could be wrongly separated.
     :return: list of words
     :rtype: List[str]

@@ -221,6 +221,18 @@ def word_tokenize(

     segments = []

+    if custom_dict and engine in (
+        "attacut",
+        "icu",
+        "nercut",
+        "sefr_cut",
+        "tltk",
+        "oskut"
+    ):
+        raise NotImplementedError(
+            f"The {engine} engine does not support custom dictionaries."
+        )
+
     if engine in ("newmm", "onecut"):
         from pythainlp.tokenize.newmm import segment

@@ -366,7 +378,7 @@ def sent_tokenize(
       and ``wtp-large`` to use ``wtp-canine-s-12l`` model.
     * *whitespace+newline* - split by whitespace and newline.
     * *whitespace* - split by whitespace, specifically with \
-        :class:`regex` pattern ``r" +"``
+      :class:`regex` pattern ``r" +"``
     :Example:

     Split the text based on *whitespace*::
@@ -814,9 +826,9 @@ def __init__(
                            used to create a trie, or an instantiated
                            :class:`pythainlp.util.Trie` object.
        :param str engine: choose between different options of tokenizer engines
-            (i.e. *newmm*, *mm*, *longest*, *deepcut*)
+                           (i.e. *newmm*, *mm*, *longest*, *deepcut*)
        :param bool keep_whitespace: True to keep whitespace, a common mark
-            for end of phrase in Thai
+                                     for end of phrase in Thai
        """
        self.__trie_dict = Trie([])
        if custom_dict:
diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py
index c1fd7ba06..b279931a3 100644
--- a/tests/core/test_tokenize.py
+++ b/tests/core/test_tokenize.py
@@ -355,6 +355,11 @@ def test_word_tokenize(self):
             "ไฟ", word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
         )

+        with self.assertRaises(NotImplementedError):
+            word_tokenize(
+                "รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]), engine="icu"
+            )
+
     def test_etcc(self):
         self.assertEqual(etcc.segment(None), [])
         self.assertEqual(etcc.segment(""), [])
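
For reviewers, a minimal usage sketch of the behaviour this patch introduces, assuming it is applied to a pythainlp checkout. The engine names, the dict_trie helper, and the error message come from the hunks above; the snippet itself is illustrative and not part of the patch::

    from pythainlp.tokenize import word_tokenize
    from pythainlp.util import dict_trie

    trie = dict_trie(["ไฟ"])

    # Engines that honour custom dictionaries (e.g. the default newmm)
    # keep working as before.
    print(word_tokenize("รถไฟฟ้า", custom_dict=trie, engine="newmm"))

    # Engines listed in the new guard now fail loudly instead of
    # silently ignoring the custom dictionary.
    try:
        word_tokenize("รถไฟฟ้า", custom_dict=trie, engine="icu")
    except NotImplementedError as exc:
        print(exc)  # The icu engine does not support custom dictionaries.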