# Source code for pythainlp.util.pronounce

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
from typing import List
import re

from pythainlp.corpus import thai_words
from pythainlp.khavee import KhaveeVerifier
from pythainlp.tokenize import syllable_tokenize
from pythainlp.tokenize import Tokenizer
from pythainlp import thai_consonants, thai_tonemarks
from pythainlp.util import remove_tonemark

# Shared KhaveeVerifier instance; rhyme() calls its is_sumpus() method.
kv = KhaveeVerifier()
# Cache for rhyme(): stays None until the first call, which replaces it with
# the entries of thai_words() that syllable_tokenize() returns as one token.
all_thai_words_dict = None


def rhyme(word: str) -> List[str]:
    """
    Find Thai words that rhyme with a given word.

    Candidates are the entries of the Thai word list that tokenize into
    exactly one syllable. That candidate list is built on the first call
    and stored in the module-level ``all_thai_words_dict`` so later calls
    reuse it.

    :param str word: A Thai word
    :return: Sorted list of candidates accepted by
        ``KhaveeVerifier.is_sumpus``, excluding ``word`` itself
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import rhyme

        print(rhyme("จีบ"))
        # output: ['กลีบ', 'กีบ', 'ครีบ', ...]
    """
    global all_thai_words_dict

    if all_thai_words_dict is None:
        # First call: keep only the single-syllable dictionary entries.
        all_thai_words_dict = [
            entry
            for entry in thai_words()
            if len(syllable_tokenize(entry)) == 1
        ]

    matches = [
        candidate
        for candidate in all_thai_words_dict
        if kv.is_sumpus(word, candidate) and candidate != word
    ]
    matches.sort()
    return matches
# Thai vowel forms written around the placeholder consonant "อ", followed by
# the four standalone vowel letters that take no placeholder.
thai_vowel = (
    "อะ,อา,อิ,อี,อึ,อื,อุ,อู,เอะ,เอ,แอะ,แอ,เอียะ,เอีย,เอือะ,เอือ,อัวะ,อัว,โอะ,"
    "โอ,เอาะ,ออ,เออะ,เออ,อำ,ใอ,ไอ,เอา,ฤ,ฤๅ,ฦ,ฦๅ"
).split(",")

# (regex pattern, replacement) pairs. Each pattern is a vowel form whose first
# placeholder "อ" is swapped for a capture group matching one consonant; the
# replacement emits the captured consonant followed by the vowel in its
# placeholder form.
thai_vowel_all = [
    (form.replace("อ", "([ก-ฮ])", 1), "\\1" + form)
    for form in thai_vowel
    if "อ" in form
]
# The "อั" form has no entry in thai_vowel; it is rewritten to "อะ".
thai_vowel_all.append(("([ก-ฮ])ั", "\\1อะ"))
# Longest patterns first (stable among equal lengths), so that a scan over
# the table meets the more specific vowel forms before the shorter ones.
thai_vowel_all.sort(key=lambda pair: len(pair[0]), reverse=True)
def thai_consonant_to_spelling(c: str) -> str:
    """
    Spell out a single Thai consonant.

    :param str c: A Thai consonant
    :return: the consonant followed by "อ"; any input that is not
        exactly one Thai consonant is returned unchanged
    :rtype: str

    :Example:
    ::

        from pythainlp.util import thai_consonant_to_spelling

        print(thai_consonant_to_spelling("ก"))
        # output: กอ
    """
    # The length check comes first so that multi-character or empty input
    # is never tested for membership in thai_consonants.
    is_single_consonant = len(c) == 1 and c in thai_consonants
    return c + "อ" if is_single_consonant else c
def tone_to_spelling(t: str) -> str:
    """
    Spell out the name of a Thai tone mark.

    :param str t: A Thai tone mark
    :return: the Thai name of the tone mark; any other input is
        returned unchanged
    :rtype: str

    :Example:
    ::

        from pythainlp.util import tone_to_spelling

        print(tone_to_spelling("่"))
        # output: ไม้เอก
    """
    tone_names = (
        ("่", "ไม้เอก"),
        ("้", "ไม้โท"),
        ("๊", "ไม้ตรี"),
        ("๋", "ไม้จัตวา"),
    )
    # Compared with == in a fixed order (rather than a dict lookup) so that
    # the input does not have to be hashable.
    for mark, name in tone_names:
        if t == mark:
            return name
    return t
def spelling(word: str) -> List[str]:
    """
    Spell out a Thai word.

    This function supports Thai root words only.

    :param str word: A Thai word
    :return: spelling; an empty list when ``word`` is empty or not a string
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import spelling

        print(spelling("เรียน"))
        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']

        print(spelling("เฝ้า"))
        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
    """
    if not (word and isinstance(word, str)):
        return []

    has_maitaikhu = "็" in word
    # Working copy of the word with tone marks and mai taikhu removed.
    stripped = remove_tonemark(word).replace("็", "")
    tone_names = [
        tone_to_spelling(ch) for ch in word if ch in thai_tonemarks
    ]

    # Apply only the first vowel pattern that matches, in table order.
    rewritten = stripped
    for pattern, replacement in thai_vowel_all:
        if re.search(pattern, stripped, re.U) is None:
            continue
        if has_maitaikhu and pattern == "เ([ก-ฮ])":
            # "เ" + consonant written with mai taikhu is spelled as "เอะ".
            replacement = "\\1เอะ"
        rewritten = re.sub(pattern, replacement, stripped)
        break

    tokenizer = Tokenizer(
        custom_dict=thai_vowel + list(thai_consonants), engine="longest"
    )
    parts = []
    for token in tokenizer.word_tokenize(rewritten):
        spelled = thai_consonant_to_spelling(token)
        # Drop any piece carrying the thanthakhat mark.
        if '์' not in spelled:
            parts.append(spelled)

    if stripped == word:
        # Nothing was stripped: the word itself closes the spelling.
        return parts + [word]
    if tone_names:
        return parts + [stripped, tone_names[0], word]
    if has_maitaikhu:
        return parts + [word]
    return parts + [stripped, word]