From 315fe438378a29a1356443a9ed2f0a88d4309e98 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Sun, 12 Jan 2025 20:23:09 +0700
Subject: [PATCH 1/3] Fix custom dict error for unsupported tokenization
 engines

Fixes #1065

Add error handling to the `word_tokenize` function for tokenization engines that do not support custom dictionaries.

* Add a check for unsupported engines in the `word_tokenize` function in `pythainlp/tokenize/core.py`.
* Raise a `NotImplementedError` if `custom_dict` is passed to an engine that does not support it: `attacut`, `icu`, `nercut`, `sefr_cut`, `tltk`, or `oskut`.
* Re-indent continuation lines in the `word_tokenize`, `sent_tokenize`, and `__init__` docstrings (whitespace only; the docstring wording is unchanged).
---
 pythainlp/tokenize/core.py | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 1f8304c42..2496bfc03 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -111,10 +111,10 @@ def word_tokenize(
     :param str engine: name of the tokenizer to be used
     :param pythainlp.util.Trie custom_dict: dictionary trie (some engine may not support)
     :param bool keep_whitespace: True to keep whitespace, a common mark
-                                 for end of phrase in Thai.
-                                 Otherwise, whitespace is omitted.
+                                  for end of phrase in Thai.
+                                  Otherwise, whitespace is omitted.
     :param bool join_broken_num: True to rejoin formatted numeric that could be wrongly separated.
-                                 Otherwise, formatted numeric could be wrongly separated.
+                                  Otherwise, formatted numeric could be wrongly separated.
 
     :return: list of words
     :rtype: List[str]
@@ -230,6 +230,10 @@ def word_tokenize(
 
         segments = segment(text, custom_dict, safe_mode=True)
     elif engine == "attacut":
+        if custom_dict:
+            raise NotImplementedError(
+                f"The {engine} engine does not support custom dictionaries."
+            )
         from pythainlp.tokenize.attacut import segment
 
         segments = segment(text)
@@ -250,22 +254,42 @@ def word_tokenize(
         else:
             segments = segment(text)
     elif engine == "icu":
+        if custom_dict:
+            raise NotImplementedError(
+                f"The {engine} engine does not support custom dictionaries."
+            )
         from pythainlp.tokenize.pyicu import segment
 
         segments = segment(text)
     elif engine == "nercut":
+        if custom_dict:
+            raise NotImplementedError(
+                f"The {engine} engine does not support custom dictionaries."
+            )
         from pythainlp.tokenize.nercut import segment
 
         segments = segment(text)
     elif engine == "sefr_cut":
+        if custom_dict:
+            raise NotImplementedError(
+                f"The {engine} engine does not support custom dictionaries."
+            )
         from pythainlp.tokenize.sefr_cut import segment
 
         segments = segment(text)
     elif engine == "tltk":
+        if custom_dict:
+            raise NotImplementedError(
+                f"The {engine} engine does not support custom dictionaries."
+            )
         from pythainlp.tokenize.tltk import segment
 
         segments = segment(text)
     elif engine == "oskut":
+        if custom_dict:
+            raise NotImplementedError(
+                f"The {engine} engine does not support custom dictionaries."
+            )
         from pythainlp.tokenize.oskut import segment
 
         segments = segment(text)
@@ -366,7 +390,7 @@ def sent_tokenize(
             and ``wtp-large`` to use ``wtp-canine-s-12l`` model.
         * *whitespace+newline* - split by whitespace and newline.
         * *whitespace* - split by whitespace, specifically with \
-                         :class:`regex` pattern  ``r" +"``
+                          :class:`regex` pattern  ``r" +"``
     :Example:
 
     Split the text based on *whitespace*::
@@ -814,9 +838,9 @@ def __init__(
                     used to create a trie, or an instantiated
                     :class:`pythainlp.util.Trie` object.
         :param str engine: choose between different options of tokenizer engines
-                           (i.e.  *newmm*, *mm*, *longest*, *deepcut*)
+                            (i.e.  *newmm*, *mm*, *longest*, *deepcut*)
         :param bool keep_whitespace: True to keep whitespace, a common mark
-                                    for end of phrase in Thai
+                                     for end of phrase in Thai
         """
         self.__trie_dict = Trie([])
         if custom_dict:

From 1f385633d729adecbaaf354ba60eef5d75329cd7 Mon Sep 17 00:00:00 2001
From: Wannaphong <wannaphong@yahoo.com>
Date: Tue, 14 Jan 2025 11:32:25 +0700
Subject: [PATCH 2/3] Consolidate custom dict checks for unsupported engines

---
 pythainlp/tokenize/core.py | 36 ++++++++++++------------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index 2496bfc03..2fd58fed6 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -221,6 +221,18 @@ def word_tokenize(
 
     segments = []
 
+    if custom_dict and engine in (
+        "attacut",
+        "icu",
+        "nercut",
+        "sefr_cut",
+        "tltk",
+        "oskut"
+    ):
+        raise NotImplementedError(
+            f"The {engine} engine does not support custom dictionaries."
+        )
+
     if engine in ("newmm", "onecut"):
         from pythainlp.tokenize.newmm import segment
 
@@ -230,10 +242,6 @@ def word_tokenize(
 
         segments = segment(text, custom_dict, safe_mode=True)
     elif engine == "attacut":
-        if custom_dict:
-            raise NotImplementedError(
-                f"The {engine} engine does not support custom dictionaries."
-            )
         from pythainlp.tokenize.attacut import segment
 
         segments = segment(text)
@@ -254,42 +262,22 @@ def word_tokenize(
         else:
             segments = segment(text)
     elif engine == "icu":
-        if custom_dict:
-            raise NotImplementedError(
-                f"The {engine} engine does not support custom dictionaries."
-            )
         from pythainlp.tokenize.pyicu import segment
 
         segments = segment(text)
     elif engine == "nercut":
-        if custom_dict:
-            raise NotImplementedError(
-                f"The {engine} engine does not support custom dictionaries."
-            )
         from pythainlp.tokenize.nercut import segment
 
         segments = segment(text)
     elif engine == "sefr_cut":
-        if custom_dict:
-            raise NotImplementedError(
-                f"The {engine} engine does not support custom dictionaries."
-            )
         from pythainlp.tokenize.sefr_cut import segment
 
         segments = segment(text)
     elif engine == "tltk":
-        if custom_dict:
-            raise NotImplementedError(
-                f"The {engine} engine does not support custom dictionaries."
-            )
         from pythainlp.tokenize.tltk import segment
 
         segments = segment(text)
     elif engine == "oskut":
-        if custom_dict:
-            raise NotImplementedError(
-                f"The {engine} engine does not support custom dictionaries."
-            )
         from pythainlp.tokenize.oskut import segment
 
         segments = segment(text)

From 9393cb8b1e64056e79fd54729fbf7a39ae770176 Mon Sep 17 00:00:00 2001
From: Wannaphong <wannaphong@yahoo.com>
Date: Tue, 14 Jan 2025 13:32:11 +0700
Subject: [PATCH 3/3] Test that word_tokenize rejects custom_dict for icu engine

---
 tests/core/test_tokenize.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/core/test_tokenize.py b/tests/core/test_tokenize.py
index c1fd7ba06..b279931a3 100644
--- a/tests/core/test_tokenize.py
+++ b/tests/core/test_tokenize.py
@@ -355,6 +355,11 @@ def test_word_tokenize(self):
             "ไฟ", word_tokenize("รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]))
         )
 
+        with self.assertRaises(NotImplementedError):
+            word_tokenize(
+                "รถไฟฟ้า", custom_dict=dict_trie(["ไฟ"]), engine="icu"
+            )
+
     def test_etcc(self):
         self.assertEqual(etcc.segment(None), [])
         self.assertEqual(etcc.segment(""), [])