Skip to content

Commit

Permalink
Add pythainlp.util.spelling
Browse files Browse the repository at this point in the history
Thai word to spelling

spelling("เรียน") -> ['รอ', 'เอีย', 'นอ', 'เรียน']
  • Loading branch information
wannaphong committed Jan 10, 2025
1 parent 9a9d11f commit f49c480
Show file tree
Hide file tree
Showing 4 changed files with 169 additions and 2 deletions.
4 changes: 4 additions & 0 deletions docs/api/util.rst
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,10 @@ Modules

The `words_to_num` function is a numeral conversion utility that translates Thai word numerals into numerical form. It is important for numerical data extraction and computation.

.. autofunction:: spelling
:noindex:
The `spelling` function is a text processing tool that spells out a Thai word.

.. autofunction:: pythainlp.util.spell_words.spell_syllable
:noindex:

Expand Down
10 changes: 9 additions & 1 deletion pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,15 @@
"reorder_vowels",
"rhyme",
"sound_syllable",
"spelling",
"spell_words",
"syllable_length",
"syllable_open_close_detector",
"text_to_arabic_digit",
"text_to_num",
"text_to_thai_digit",
"th_zodiac",
"thai_consonant_to_spelling",
"thai_digit_to_arabic_digit",
"thai_keyboard_dist",
"thai_strptime",
Expand All @@ -64,6 +66,7 @@
"to_idna",
"to_lunar_date",
"tone_detector",
"tone_to_spelling",
"words_to_num",
]

Expand Down Expand Up @@ -132,4 +135,9 @@
syllable_open_close_detector,
tone_detector,
)
from pythainlp.util.pronounce import rhyme
from pythainlp.util.pronounce import (
rhyme,
spelling,
tone_to_spelling,
thai_consonant_to_spelling,
)
144 changes: 143 additions & 1 deletion pythainlp/util/pronounce.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
from typing import List
import re

from pythainlp.corpus import thai_words
from pythainlp.khavee import KhaveeVerifier
from pythainlp.tokenize import syllable_tokenize
from pythainlp.tokenize import Tokenizer
from pythainlp import thai_consonants, thai_tonemarks
from pythainlp.util import remove_tonemark

# Shared verifier instance; rhyme() calls its is_sumpus() check on word pairs.
kv = KhaveeVerifier()
# Lazily-built cache for rhyme(): stays None until the first call, then holds
# the list of thai_words() entries that tokenize to exactly one syllable.
all_thai_words_dict = None
Expand All @@ -30,11 +34,149 @@ def rhyme(word: str) -> List[str]:
"""
global all_thai_words_dict
list_sumpus = []
if all_thai_words_dict == None:
if all_thai_words_dict is None:
all_thai_words_dict = [
i for i in list(thai_words()) if len(syllable_tokenize(i)) == 1
]
for i in all_thai_words_dict:
if kv.is_sumpus(word, i) and i != word:
list_sumpus.append(i)
return sorted(list_sumpus)


# Written forms of the Thai vowels, with "อ" standing in for the consonant
# the vowel attaches to. spelling() puts these in its tokenizer dictionary.
thai_vowel = (
    "อะ,อา,อิ,อี,อึ,อื,อุ,อู,เอะ,เอ,แอะ,แอ,เอียะ,เอีย,เอือะ,เอือ,อัวะ,อัว,โอะ,"
    "โอ,เอาะ,ออ,เออะ,เออ,อำ,ใอ,ไอ,เอา,ฤ,ฤๅ,ฦ,ฦๅ"
).split(",")

# (regex, replacement) rewrite rules used by spelling(). Each rule captures
# the consonant a vowel is written around and rewrites the match as
# "<consonant><vowel form>", e.g. "เรีย" -> "รเอีย".
# Every vowel form that contains the "อ" placeholder yields one rule: the
# first "อ" becomes the capture group, and the replacement is the captured
# consonant followed by the untouched vowel form.
thai_vowel_all = [
    (form.replace("อ", "([ก-ฮ])", 1), "\\1" + form)
    for form in thai_vowel
    if "อ" in form
]
# The sign "ั" is not a vowel form of its own; it is rewritten as "อะ".
thai_vowel_all.append(("อั".replace("อ", "([ก-ฮ])", 1), "\\1อะ"))
# Longest patterns first, so multi-character vowels win over their prefixes.
# The sort is stable, so rules of equal length keep the order above.
thai_vowel_all.sort(key=lambda rule: len(rule[0]), reverse=True)


def thai_consonant_to_spelling(c: str) -> str:
    """
    Spell out a single Thai consonant.

    The spelling is the consonant followed by "อ". Any input that is not
    exactly one character found in ``thai_consonants`` is returned
    unchanged.

    :param str c: A Thai consonant
    :return: the consonant followed by "อ", or ``c`` itself when it is not
        a single Thai consonant
    :rtype: str

    :Example:
    ::

        from pythainlp.util import thai_consonant_to_spelling

        print(thai_consonant_to_spelling("ก"))
        # output: กอ
    """
    if len(c) == 1 and c in thai_consonants:
        return c + "อ"
    return c


def tone_to_spelling(t: str) -> str:
    """
    Return the Thai name of a tone mark.

    :param str t: A Thai tone mark
    :return: the name of the tone mark, or ``t`` itself when it is not one
        of the four tone marks
    :rtype: str

    :Example:
    ::

        from pythainlp.util import tone_to_spelling

        print(tone_to_spelling("่"))
        # output: ไม้เอก
    """
    tone_names = {
        "่": "ไม้เอก",
        "้": "ไม้โท",
        "๊": "ไม้ตรี",
        "๋": "ไม้จัตวา",
    }
    # Only strings are looked up, so an unhashable argument is passed
    # through unchanged instead of raising TypeError from the dict lookup.
    if isinstance(t, str):
        return tone_names.get(t, t)
    return t


# Tokenizer shared by every spelling() call; built on first use because
# constructing it (dictionary of vowel forms + consonants) is not free.
_spelling_tokenizer = None


def _get_spelling_tokenizer():
    """Return the shared vowel/consonant tokenizer, creating it on first use."""
    global _spelling_tokenizer
    if _spelling_tokenizer is None:
        _spelling_tokenizer = Tokenizer(
            custom_dict=thai_vowel + list(thai_consonants),
            engine="longest",
        )
    return _spelling_tokenizer


def spelling(word: str) -> List[str]:
    """
    Spell out a Thai word.

    This function supports Thai root words only.

    The result starts with the spelled-out pieces of the word (each
    consonant as "<consonant>อ", the vowel in its written form) and ends
    with the word itself. When the word carries a tone mark, the word
    without the tone mark and the name of the first tone mark are listed
    just before the full word.

    :param str word: A Thai word
    :return: spelling of the word; an empty list when ``word`` is empty or
        not a string
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import spelling

        print(spelling("เรียน"))
        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']

        print(spelling("เฝ้า"))
        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
    """
    # Type check first so that non-string objects are never truth-tested.
    if not isinstance(word, str) or not word:
        return []

    # Work on the word with tone marks and "็" stripped; both are restored
    # in the trailing entries of the result.
    word_pre = remove_tonemark(word).replace("็", "")
    tone = [tone_to_spelling(i) for i in word if i in thai_tonemarks]
    has_shortener = "็" in word

    # Apply only the first matching rule; thai_vowel_all is ordered with
    # the longest patterns first.
    word_output = word_pre
    for pattern, replacement in thai_vowel_all:
        if re.search(pattern, word_pre, re.U):
            if has_shortener and pattern == "เ([ก-ฮ])":
                # "เ" + consonant written with "็" is spelled as "เอะ".
                replacement = "\\1เอะ"
            word_output = re.sub(pattern, replacement, word_pre)
            break

    tokens = _get_spelling_tokenizer().word_tokenize(word_output)
    # Tokens containing "์" are left out of the spelling.
    output = [
        part
        for part in map(thai_consonant_to_spelling, tokens)
        if "์" not in part
    ]

    if word_pre == word:
        return output + [word]
    if tone:
        return output + [word_pre, tone[0], word]
    if has_shortener:
        return output + [word]
    return output + [word_pre, word]
13 changes: 13 additions & 0 deletions tests/core/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
to_lunar_date,
tone_detector,
words_to_num,
spelling,
)
from pythainlp.util.morse import morse_decode, morse_encode

Expand Down Expand Up @@ -842,3 +843,15 @@ def test_th_zodiac(self):

# def test_abbreviation_to_full_text(self):
# self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))

def test_spelling(self):
    # A non-string (empty list) input produces an empty spelling.
    self.assertEqual(spelling([]), [])

    # (word, expected spelling) pairs, checked in order.
    expected_by_word = (
        ("เรียน", ['รอ', 'เอีย', 'นอ', 'เรียน']),
        ("เฝ้า", ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']),
        ("คน", ['คอ', 'นอ', 'คน']),
        ("กัน", ['กอ', 'อะ', 'นอ', 'กัน']),
        ("กั้น", ['กอ', 'อะ', 'นอ', 'กัน', 'ไม้โท', 'กั้น']),
    )
    for word, expected in expected_by_word:
        self.assertEqual(spelling(word), expected)

0 comments on commit f49c480

Please sign in to comment.