# Source code for pythainlp.khavee.core

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: C901

from typing import List, Union

from pythainlp.tokenize import subword_tokenize
from pythainlp.util import remove_tonemark, sound_syllable


class KhaveeVerifier:
    """
    Thai poetry verifier.

    Provides helpers that name the vowel (sara) and the final-sound group
    (marttra) of a Thai syllable, compare two syllables for rhyme, and
    check klon poems against their rhyme scheme.
    """

    # Written vowel character -> name of the simple vowel it stands for.
    _SIMPLE_VOWELS = {
        "ะ": "อะ",
        "ั": "อะ",
        "ิ": "อิ",
        "ุ": "อุ",
        "ึ": "อึ",
        "ี": "อี",
        "ู": "อู",
        "ื": "อือ",
        "เ": "เอ",
        "แ": "แอ",
        "า": "อา",
        "โ": "โอ",
        "ำ": "อำ",
        "อ": "ออ",
        "ไ": "ไอ",
        "ใ": "ไอ",
        "็": "ออ",
    }

    # Words made only of a vowel on the carrier อ: the word is its own name.
    _BARE_VOWEL_WORDS = ("เออะ", "เออ", "เอ", "เอะ", "เอา", "เอาะ")

    # Last characters that make check_marttra answer "กา".
    _OPEN_FINALS = ("า", "ะ", "ิ", "ี", "ุ", "ู", "อ")

    # (final consonants, marttra name) pairs; the groups are disjoint.
    _FINAL_GROUPS = (
        ("ง", "กง"),
        ("ม", "กม"),
        ("ว", "เกอว"),
        ("กขคฆ", "กก"),
        ("จชซฎฏฐฑฒดตถทธศษส", "กด"),
        ("ญณนรลฬ", "กน"),
        ("บปพฟภ", "กบ"),
    )

    # Vowel names for which a "กา" syllable is still reported as "karu".
    _KARU_VOWELS = (
        "อา",
        "อี",
        "อือ",
        "อู",
        "เอ",
        "แอ",
        "โอ",
        "ออ",
        "เออ",
        "เอีย",
        "เอือ",
        "อัว",
        "อำ",
        "ไอ",
        "เอา",
    )

    # Words that check_karu_lahu always reports as "lahu".
    _LAHU_WORDS = ("บ่", "ณ", "ธ", "ก็")

    def __init__(self):
        """
        KhaveeVerifier: Thai Poetry verifier
        """

    @staticmethod
    def _combine(sara: List[str], first: str, second: str, merged: str) -> None:
        """Replace one ``first`` and one ``second`` entry of ``sara`` by ``merged``."""
        sara.remove(first)
        sara.remove(second)
        sara.append(merged)

    def check_sara(self, word: str) -> str:
        """
        Check the vowels in the Thai word.

        :param str word: Thai word
        :return: vowel name of the word, or the message
            ``"Can't find Sara in this word"`` when none is found
        :rtype: str

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_sara("เริง"))
            # output: 'เออ'
        """
        if not word:
            # Nothing to inspect; indexing word[-1] below would fail.
            return "Can't find Sara in this word"

        # Karun case: drop the karun mark and the character before it.
        if "์" in word[-1]:
            word = word[:-2]

        # Pass 1: collect one simple-vowel name per vowel character.
        sara = []
        o_count = 0
        for ch in word:
            vowel = self._SIMPLE_VOWELS.get(ch)
            if vowel is not None:
                if ch == "อ":
                    o_count += 1
                sara.append(vowel)
            elif "รร" in word:
                # Every non-vowel character of a word containing "รร"
                # contributes อำ (final group กม) or อะ (otherwise).
                if self.check_marttra(word) == "กม":
                    sara.append("อำ")
                else:
                    sara.append("อะ")
            # NOTE(review): the original also had a branch mapping "ั" to
            # "อัว" when "ว" is in the word, but it could never run because
            # "ั" is already matched above. It is left out; enabling it
            # would change results for words such as "วัน".

        # A single อ that ends a word without เ is not counted as ออ.
        if o_count == 1 and word.endswith("อ") and "เ" not in word:
            sara.remove("ออ")

        # Two เ in a row are read as one แ. The name must be "แอ", the
        # form every other branch of this class uses for that vowel.
        if sara.count("เอ") > 1:
            self._combine(sara, "เอ", "เอ", "แอ")

        # Pass 2: merge simple vowels into compound vowel names.
        if "เอ" in sara and "อะ" in sara:
            self._combine(sara, "เอ", "อะ", "เอะ")
        elif "แอ" in sara and "อะ" in sara:
            self._combine(sara, "แอ", "อะ", "แอะ")

        if "เอะ" in sara and "ออ" in sara:
            self._combine(sara, "เอะ", "ออ", "เออะ")
        elif "เอ" in sara and "อิ" in sara:
            self._combine(sara, "เอ", "อิ", "เออ")
        elif "เอ" in sara and "ออ" in sara and word.endswith("อ"):
            self._combine(sara, "เอ", "ออ", "เออ")
        elif "โอ" in sara and "อะ" in sara:
            self._combine(sara, "โอ", "อะ", "โอะ")
        elif "เอ" in sara and "อี" in sara:
            self._combine(sara, "เอ", "อี", "เอีย")
        elif "เอ" in sara and "อือ" in sara:
            self._combine(sara, "เอ", "อือ", "อัว")
        elif "เอ" in sara and "อา" in sara:
            self._combine(sara, "เอ", "อา", "เอา")
        elif "เ" in word and "า" in word and "ะ" in word:
            sara = ["เอาะ"]

        if "อือ" in sara and "เออ" in sara:
            self._combine(sara, "เออ", "อือ", "เอือ")
        elif "ออ" in sara and len(sara) > 1:
            sara.remove("ออ")
        elif "ว" in word and len(sara) == 0:
            sara.append("อัว")

        # Pass 3: whole-word overrides; a later rule wins over an earlier one.
        if "ั" in word and self.check_marttra(word) == "กา":
            sara = ["ไอ"]

        if word in self._BARE_VOWEL_WORDS:
            sara = [word]

        if "ฤา" in word or "ฦา" in word:
            sara = ["อือ"]
        elif "ฤ" in word or "ฦ" in word:
            sara = ["อึ"]

        # No written vowel at all: choose an implied vowel by word length.
        if not sara and len(word) == 2:
            if word[-1] != "ร":
                sara.append("โอะ")
            else:
                sara.append("ออ")
        elif not sara and len(word) == 3:
            sara.append("ออ")

        if word == "บ่":
            sara = ["ออ"]

        if "ํ" in word:
            sara = ["อำ"]

        if "เ" in word and "ื" in word and "อ" in word:
            sara = ["เอือ"]

        if not sara:
            return "Can't find Sara in this word"

        return sara[0]

    def check_marttra(self, word: str) -> str:
        """
        Check the Thai spelling Section in the Thai word.

        :param str word: Thai word
        :return: name of spelling Section of the word, or the message
            ``"Cant find Marttra in this word"`` when none matches
        :rtype: str

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_marttra("สาว"))
            # output: 'เกอว'
        """
        # A trailing ร after ต or ท is dropped before classification.
        # The length guard keeps a one-letter word from raising IndexError.
        if len(word) >= 2 and word[-1] == "ร" and word[-2] in ("ต", "ท"):
            word = word[:-1]
        word = self.handle_karun_sound_silence(word)
        word = remove_tonemark(word)

        if not word:
            # Empty input, or nothing left after stripping.
            return "Cant find Marttra in this word"

        if (
            "ำ" in word
            or ("ํ" in word and "า" in word)
            or "ไ" in word
            or "ใ" in word
        ):
            return "กา"

        last = word[-1]
        if last in self._OPEN_FINALS:
            return "กา"
        if last == "ย":
            # ย after ี or ั is treated as part of the vowel.
            if "ี" in word or "ั" in word:
                return "กา"
            return "เกย"
        for finals, marttra in self._FINAL_GROUPS:
            if last in finals:
                return marttra
        if "็" in word:
            return "กา"
        return "Cant find Marttra in this word"

    def _rhyme_key(self, word: str):
        """Return the (marttra, sara) pair of ``word`` after normalisation."""
        marttra = self.check_marttra(word)
        sara = self.check_sara(word)
        if sara == "อะ" and marttra == "เกย":
            # อะ + เกย is compared as ไอ with final group กา.
            sara = "ไอ"
            marttra = "กา"
        if sara == "อำ" and marttra == "กม":
            # อำ + กม is compared as อำ with final group กา.
            marttra = "กา"
        return marttra, sara

    def is_sumpus(self, word1: str, word2: str) -> bool:
        """
        Check the rhyme between two words.

        Two words rhyme when both their vowel name and their final group
        agree. Each word is normalised on its own, so two words that both
        need the same rewrite still compare equal.

        :param str word1: Thai word
        :param str word2: Thai word
        :return: boolean
        :rtype: bool

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.is_sumpus("สรร", "อัน"))
            # output: True

            print(kv.is_sumpus("สรร", "แมว"))
            # output: False
        """
        return self._rhyme_key(word1) == self._rhyme_key(word2)

    def check_karu_lahu(self, text: str) -> str:
        """
        Classify a Thai syllable as ``"karu"`` or ``"lahu"``.

        The result is ``"lahu"`` for the words บ่, ณ, ธ and ก็, and for a
        syllable whose final group is "กา" with a vowel outside the listed
        vowels; every other syllable is ``"karu"``.

        :param str text: Thai word
        :return: ``"karu"`` or ``"lahu"``
        :rtype: str
        """
        if text in self._LAHU_WORDS:
            return "lahu"
        if self.check_marttra(text) != "กา":
            return "karu"
        if self.check_sara(text) in self._KARU_VOWELS:
            return "karu"
        return "lahu"

    def _check_klon_form(
        self,
        text: str,
        max_syllables: int,
        stated_limit: int,
        head_slots: List[int],
        failure_message: str,
    ) -> Union[List[str], str]:
        """
        Run the klon rhyme checks shared by every supported poem type.

        :param str text: poem, with its parts separated by whitespace
        :param int max_syllables: longest part accepted without an error
        :param int stated_limit: number quoted in the over-length message
        :param List[int] head_slots: token indexes of every second part
            that may rhyme with the end of the part before it
        :param str failure_message: returned when processing raises
        :return: a result message, or the list of problems found
        :rtype: Union[List[str], str]
        """

        def no_rhyme(pair, stanza_index: int) -> str:
            return (
                "Can't find rhyme between paragraphs "
                + str(pair)
                + " in paragraph "
                + str(stanza_index + 1)
            )

        try:
            errors = []
            first_last = []
            second_heads = []
            second_last = []
            third_last = []
            fourth_last = []

            for i, sent in enumerate(text.split()):
                sub_sent = subword_tokenize(sent, engine="dict")
                if len(sub_sent) > max_syllables:
                    # i is 0-based; report the 1-based sentence number.
                    errors.append(
                        "In sentence "
                        + str(i + 1)
                        + ", there are more than "
                        + str(stated_limit)
                        + " words. "
                        + str(sub_sent)
                    )
                position = (i + 1) % 4
                if position == 1:
                    first_last.append(sub_sent[-1])
                elif position == 2:
                    second_heads.append([sub_sent[k] for k in head_slots])
                    second_last.append(sub_sent[-1])
                elif position == 3:
                    third_last.append(sub_sent[-1])
                else:
                    fourth_last.append(sub_sent[-1])

            if not (
                len(first_last)
                == len(second_last)
                == len(third_last)
                == len(fourth_last)
            ):
                return "The poem does not have 4 complete sentences."

            for i, tail in enumerate(first_last):
                # The end of part 1 must rhyme with a head slot of part 2.
                if not any(self.is_sumpus(tail, h) for h in second_heads[i]):
                    errors.append(no_rhyme((tail, second_heads[i]), i))
                # The end of part 2 must rhyme with the end of part 3.
                if not self.is_sumpus(second_last[i], third_last[i]):
                    errors.append(
                        no_rhyme((second_last[i], third_last[i]), i)
                    )
                # ... and with the end of the previous stanza's part 4.
                if i > 0 and not self.is_sumpus(
                    second_last[i], fourth_last[i - 1]
                ):
                    errors.append(
                        no_rhyme((second_last[i], fourth_last[i - 1]), i)
                    )

            if not errors:
                return "The poem is correct according to the principle."
            return errors
        except Exception:
            # Malformed input (for example a part with too few tokens) is
            # reported as a message. KeyboardInterrupt and SystemExit are
            # no longer swallowed.
            return failure_message

    def check_klon(self, text: str, k_type: int = 8) -> Union[List[str], str]:
        """
        Check the suitability of the poem according to Thai principles.

        :param str text: Thai poem
        :param int k_type: type of Thai poem (8 or 4)
        :return: the check results of the suitability of the poem
            according to Thai principles.
        :rtype: Union[List[str], str]

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            print(kv.check_klon(
                'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง '
                'ลคคนเก่ง เอ๋งเอ๋งคะนอง มีคนจับจอง เขาชื่อน้องเธียร',
                k_type=4
            ))
            # output: The poem is correct according to the principle.

            print(kv.check_klon(
                'ฉันชื่อหมูกรอบ ฉันชอบกินไก่ แล้วก็วิ่งไล่ หมาชื่อนํ้าทอง '
                'ลคคนเก่ง เอ๋งเอ๋งเสียงหมา มีคนจับจอง เขาชื่อน้องเธียร',
                k_type=4
            ))
            # output: [
            #   "Can't find rhyme between paragraphs ('หมา', 'จอง') in paragraph 2",
            #   "Can't find rhyme between paragraphs ('หมา', 'ทอง') in paragraph 2"
            # ]
        """
        if k_type == 8:
            return self._check_klon_form(
                text,
                max_syllables=10,
                stated_limit=10,
                head_slots=[1, 2, 3, 4],
                failure_message=(
                    "Something went wrong. Make sure you enter it "
                    "in the correct form of klon 8."
                ),
            )
        if k_type == 4:
            return self._check_klon_form(
                text,
                max_syllables=5,
                stated_limit=4,
                head_slots=[1, 2],
                failure_message=(
                    "Something went wrong. Make sure you enter it "
                    "in the correct form."
                ),
            )
        return (
            "Something went wrong. Make sure you enter it in the correct form."
        )

    def check_aek_too(
        self, text: Union[List[str], str], dead_syllable_as_aek: bool = False
    ) -> Union[List[bool], List[str], bool, str]:
        """
        Checker of Thai tonal words

        :param Union[List[str], str] text: Thai word or list of Thai words
        :param bool dead_syllable_as_aek: if True, dead syllable will be
            considered as aek
        :return: the check result if the word is aek or too or False
            (not both) or list of check results if input is list
        :rtype: Union[List[bool], List[str], bool, str]
        :raises TypeError: if ``text`` is neither a str nor a list

        :Example:
        ::

            from pythainlp.khavee import KhaveeVerifier

            kv = KhaveeVerifier()

            # checking aek / too words
            print(
                kv.check_aek_too("เอง"),
                kv.check_aek_too("เอ่ง"),
                kv.check_aek_too("เอ้ง"),
            )
            # -> False, aek, too

            print(kv.check_aek_too(["เอง", "เอ่ง", "เอ้ง"]))  # a list works too
            # -> [False, 'aek', 'too']
        """
        if isinstance(text, list):
            return [self.check_aek_too(t, dead_syllable_as_aek) for t in text]

        if not isinstance(text, str):
            raise TypeError("text must be str or iterable list[str]")

        has_aek_mark = "่" in text
        has_too_mark = "้" in text
        if has_aek_mark and not has_too_mark:
            return "aek"
        if has_too_mark and not has_aek_mark:
            return "too"
        # Neither mark, or both at once: only the dead-syllable rule is left.
        if dead_syllable_as_aek and sound_syllable(text) == "dead":
            return "aek"
        return False

    def handle_karun_sound_silence(self, word: str) -> str:
        """
        Handle silent sounds in Thai words using '์' character (Karun)
        by stripping all characters before the 'Karun' character that
        should be silenced

        :param str word: Thai word
        :return: Thai word with silent words stripped
        :rtype: str
        """
        if not word.endswith("์"):
            return word

        thai_consonants = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ"
        # Index of the character just before the last karun mark.
        locate_silenced = word.rfind("์") - 1
        # Two characters are cut when the one two places before that index
        # is a consonant, otherwise one.
        # NOTE(review): for very short words this index is negative and
        # wraps to the end of the string; kept as in the original.
        can_silence_two = word[locate_silenced - 2] in thai_consonants
        cut_off = 2 if can_silence_two else 1
        return word[: locate_silenced + 1 - cut_off]