# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Syllable tools
"""
import re
from pythainlp import thai_consonants, thai_tonemarks
# Final ("spelling") consonants, keyed by the name of their final-sound class.
spelling_class = {
    "กง": ["ง"],
    "กม": ["ม"],
    "เกย": ["ย"],
    "เกอว": ["ว"],
    "กน": list("นญณรลฬ"),
    "กก": list("กขคฆ"),
    "กด": list("ดจชซฎฏฐฑฒตถทธศษส"),
    "กบ": list("บปภพฟ"),
}

# thai_consonants without "อ".
thai_consonants_all = list(thai_consonants)
thai_consonants_all.remove("อ")

# Flat list of every consonant that appears in some spelling class.
_temp = [c for members in spelling_class.values() for c in members]
# Consonants (other than "อ") that belong to no spelling class.
not_spelling_class = [c for c in thai_consonants_all if c not in _temp]

# Characters that mark a short vowel sound.
short = "ะัิึุ"
# Short vowel forms written around the consonant(s).
re_short = re.compile("เ(.*)ะ|แ(.*)ะ|เ(.*)อะ|โ(.*)ะ|เ(.*)าะ", re.U)
# เ-า is a live syllable.
pattern = re.compile("เ(.*)า", re.U)

# Final consonants of these classes make a live syllable.
_check_1 = []
for i in ("กง", "กน", "กม", "เกย", "เกอว"):
    _check_1 += spelling_class[i]

# Final consonants of these classes make a dead syllable.
_check_2 = [
    *spelling_class["กก"],
    *spelling_class["กบ"],
    *spelling_class["กด"],
]

# Initial consonants grouped into low / mid / high class.
thai_low_sonorants = list("งนมยรลว")
thai_low_aspirates = list("คชซทพฟฮ")
thai_low_irregular = list("ฆญณธภฅฌฑฒฬ")
thai_mid_plains = list("กจดตบปอฎฏ")
thai_high_aspirates = list("ขฉถผฝสห")
thai_high_irregular = list("ศษฃฐ")

thai_initial_consonant_type = {
    "low": thai_low_sonorants + thai_low_aspirates + thai_low_irregular,
    "mid": thai_mid_plains,
    "high": thai_high_aspirates + thai_high_irregular,
}

# Reverse lookup: consonant -> "low" / "mid" / "high".
thai_initial_consonant_to_type = {}
for k, v in thai_initial_consonant_type.items():
    for i in v:
        thai_initial_consonant_to_type[i] = k
def sound_syllable(syllable: str) -> str:
    """
    Sound syllable classification

    Classify a Thai syllable as a live syllable or a dead syllable.

    A syllable shorter than two characters (including the empty string)
    is reported as dead. A syllable that contains no consonant other
    than "อ" (for example "อา") is handled as a syllable without a final
    consonant instead of raising ``IndexError``.

    :param str syllable: Thai syllable
    :return: syllable's type (live or dead)
    :rtype: str

    :Example:
    ::

        from pythainlp.util import sound_syllable

        print(sound_syllable("มา"))
        # output: live

        print(sound_syllable("เลข"))
        # output: dead

        print(sound_syllable("อา"))
        # output: live
    """
    # Check the length first so that no indexing happens on short input.
    if len(syllable) < 2:
        return "dead"

    # thai_consonants_all excludes "อ", so this list may be empty; in that
    # case there is no final (spelling) consonant to look at.
    consonants = [c for c in syllable if c in thai_consonants_all]
    spelling_consonant = consonants[-1] if consonants else None

    ends_live = spelling_consonant in _check_1  # live final-consonant classes
    ends_dead = spelling_consonant in _check_2  # dead final-consonant classes

    has_short_pattern = bool(re_short.search(syllable))
    has_short_vowel = has_short_pattern or any(c in short for c in syllable)
    has_special_vowel = any(c in "ำใไ" for c in syllable)
    has_sara_ao = bool(pattern.search(syllable))  # เ-า

    # Dead final consonant and nothing that marks a long/live vowel.
    if (
        ends_dead
        and not any(c in "าีืแูเโ" for c in syllable)
        and not has_special_vowel
        and not has_sara_ao
    ):
        return "dead"

    # Long vowel present (note: "เ" alone is deliberately not in this set).
    if any(c in "าีืแูโ" for c in syllable):
        if not has_short_pattern and (
            ends_live or spelling_consonant != syllable[-1]
        ):
            return "live"
        if ends_dead or has_short_vowel:
            return "dead"
        return "live"

    # ำ ใ ไ and เ-า are always live.
    if has_special_vowel or has_sara_ao:
        return "live"

    if ends_live:
        # A short vowel with a single consonant has no live final sound.
        if has_short_vowel and len(consonants) < 2:
            return "dead"
        return "live"

    return "dead"
def syllable_open_close_detector(syllable: str) -> str:
    """
    Thai syllable open/close detector

    Report whether a Thai syllable is open or closed, based on how many
    consonants it contains: fewer than two consonants, or exactly two
    with "อ" as the last one, is open; anything else is closed.

    :param str syllable: Thai syllable
    :return: open / close
    :rtype: str

    :Example:
    ::

        from pythainlp.util import syllable_open_close_detector

        print(syllable_open_close_detector("มาก"))
        # output: close

        print(syllable_open_close_detector("คะ"))
        # output: open
    """
    found = [ch for ch in syllable if ch in thai_consonants]
    count = len(found)
    # A trailing "อ" in a two-consonant syllable does not close it.
    trailing_o_ang = count == 2 and found[-1] == "อ"
    if count >= 2 and not trailing_o_ang:
        return "close"
    return "open"
def syllable_length(syllable: str) -> str:
    """
    Thai syllable length

    Report whether a Thai syllable is long or short. The syllable is
    short when it has fewer than three consonants and contains one of
    the short-vowel characters, or when it matches one of the
    short-vowel patterns; otherwise it is long.

    :param str syllable: Thai syllable
    :return: syllable's length (long or short)
    :rtype: str

    :Example:
    ::

        from pythainlp.util import syllable_length

        print(syllable_length("มาก"))
        # output: long

        print(syllable_length("คะ"))
        # output: short
    """
    consonant_count = sum(1 for ch in syllable if ch in thai_consonants)
    has_short_mark = any(ch in short for ch in syllable)
    if consonant_count < 3 and has_short_mark:
        return "short"
    return "short" if re_short.search(syllable) else "long"
def _tone_mark_detector(syllable: str) -> str:
    """Return the first tone mark found in the syllable, or "" if none."""
    return next((ch for ch in syllable if ch in thai_tonemarks), "")
def _check_sonorant_syllable(syllable: str) -> bool:
    """
    Check whether the last low sonorant in the syllable is one of the
    syllable's last two consonants.

    Returns False (instead of raising IndexError) when the syllable
    contains no low sonorant; a syllable with a single consonant is
    compared against that one consonant only.

    :param str syllable: Thai syllable
    :return: True if the last low sonorant is the last or the
        second-to-last consonant, False otherwise
    :rtype: bool
    """
    sonorants = [c for c in syllable if c in thai_low_sonorants]
    if not sonorants:
        return False
    consonants = [c for c in syllable if c in thai_consonants]
    # Slicing tolerates syllables with fewer than two consonants.
    return sonorants[-1] in consonants[-2:]
def _tone_from_class_rules(
    consonant_class: str,
    sound: str,
    tone_mark: str,
    length: str,
    openness: str,
) -> str:
    """Apply the general tone rules for a low/mid/high initial consonant."""
    if consonant_class == "low":
        if openness == "close" and sound == "dead":
            if length == "short":
                return "h"
            if length == "long":
                return "f"
        if length == "short" and openness == "open":
            return "h"
        if tone_mark == "้":
            return "h"
        if tone_mark == "่":
            return "f"
        if sound == "live":
            return "m"
        return ""

    if consonant_class == "mid":
        if sound == "live" and tone_mark == "่":
            return "l"
        if tone_mark == "๋":
            return "r"
        if tone_mark == "๊":
            return "h"
        if tone_mark == "้":
            return "f"
        if sound == "dead":
            return "l"
        if sound == "live":
            return "m"
        return ""

    if consonant_class == "high":
        if sound == "live" and tone_mark == "่":
            return "l"
        if tone_mark == "้":
            return "f"
        if sound == "dead":
            return "l"
        if sound == "live":
            return "r"

    return ""


def tone_detector(syllable: str) -> str:
    """
    Thai tone detector for syllables

    The tone is derived from the class of the initial consonant
    (low / mid / high), the tone mark, whether the syllable is live or
    dead, its length and whether it is open or closed.

    Syllables that start with "อ" or "ห", have more than one consonant
    and whose last low sonorant is among the last two consonants are
    first checked against the leading-"อ"/"ห" rules. If none of those
    rules decides, the general rules for the initial consonant's class
    are applied.

    :param str syllable: Thai syllable
    :return: syllable's tone (l, m, h, r, f or empty if it cannot detect,
        e.g. when the syllable contains no consonant)
    :rtype: str

    :Example:
    ::

        from pythainlp.util import tone_detector

        print(tone_detector("มา"))
        # output: m

        print(tone_detector("ไม้"))
        # output: h
    """
    consonants = [c for c in syllable if c in thai_consonants]
    # Without a consonant there is no initial consonant class to work from.
    if not consonants:
        return ""

    initial_consonant = consonants[0]
    sound = sound_syllable(syllable)
    tone_mark = _tone_mark_detector(syllable)
    openness = syllable_open_close_detector(syllable)
    length = syllable_length(syllable)
    consonant_class = thai_initial_consonant_to_type.get(initial_consonant, "")

    # Leading "อ" / "ห" rules.
    if (
        len(consonants) > 1
        and initial_consonant in ("อ", "ห")
        and _check_sonorant_syllable(syllable)
    ):
        if initial_consonant == "อ":
            if sound == "live" and tone_mark == "่":
                return "l"
            if sound == "dead":
                return "l"
        else:
            if sound == "live":
                if tone_mark == "่":
                    return "l"
                if tone_mark == "้":
                    return "f"
                return "r"
            if sound == "dead":
                return "l"
        # No leading-consonant rule matched: use the general class rules
        # below instead of giving up with an empty result.

    return _tone_from_class_rules(
        consonant_class, sound, tone_mark, length, openness
    )