Source code for pythainlp.util.phoneme

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Phonemes util
"""
import unicodedata

from pythainlp.tokenize import Tokenizer
from pythainlp.util.trie import Trie

# Consonant table. Each entry is (IPA, NECTEC initial[, NECTEC final]).
# Entries with only two elements list no syllable-final form.
consonants_ipa_nectec = [
    ("k", "k", "k^"),
    ("kʰ", "kh"),
    ("ŋ", "ng", "ng^"),
    ("tɕ", "c"),
    ("tɕʰ", "ch"),
    ("s", "s"),
    ("j", "j", "j^"),
    ("d", "d"),
    # The initial used to be mistyped as "y", which made the unrelated
    # symbol "y" convert to IPA "t".
    ("t", "t", "t^"),
    ("tʰ", "th"),
    ("n", "n", "n^"),
    ("b", "b"),
    ("p", "p", "p^"),
    ("pʰ", "ph"),
    ("f", "f"),
    ("m", "m", "m^"),
    ("r", "r"),
    ("l", "l"),
    ("w", "w", "w^"),
    ("h", "h"),
    # Glottal stop: use the IPA letter U+0294 (as dict_ipa_rtgs does in this
    # module) rather than the ASCII question mark.
    ("ʔ", "z", "z^"),
]

# Monophthong table. Each entry is (IPA, NECTEC).
monophthong_ipa_nectec = [
    # Single-symbol vowels, plus the three "a + final" combinations.
    ("i", "i"),
    ("e", "e"),
    ("ɛ", "x"),
    # "v" was missing although "vv" and "va" are listed elsewhere in this
    # module's tables.
    ("ɯ", "v"),
    ("ɤ", "q"),
    ("a", "a"),
    ("am", "am^"),
    ("aj", "aj^"),
    ("aw", "aw^"),
    ("u", "u"),
    ("o", "o"),
    ("ɔ", "@"),
    # Doubled-symbol vowels.
    ("ii", "ii"),
    ("ee", "ee"),
    ("ɛɛ", "xx"),
    ("ɯɯ", "vv"),
    ("ɤɤ", "qq"),
    ("aa", "aa"),
    ("uu", "uu"),
    ("oo", "oo"),
    # Long -อ. The IPA side used to be an empty string, which silently
    # dropped the vowel from converted output.
    ("ɔɔ", "@@"),
]

# Diphthong table. Each entry is (IPA, NECTEC).
# The second row repeats the first with the leading vowel symbol doubled
# (presumably the long variants, by analogy with the monophthong table).
diphthong_ipa_nectec = [
    ("ia", "ia"), ("ɯa", "va"), ("ua", "ua"),
    ("iia", "iia"), ("ɯɯa", "vva"), ("uua", "uua"),
]

# Tone table. Each entry is (IPA tone-letter sequence, NECTEC tone digit).
tones_ipa_nectec = [
    ("˧", "0"),
    ("˨˩", "1"),
    ("˥˩", "2"),
    ("˦˥", "3"),
    ("˩˩˦", "4"),
]

# Lookup table from a NECTEC symbol to its IPA equivalent.
# First pass: the second element of every table entry (the initial form for
# consonants, the only form for vowels and tones) keyed to the IPA string.
dict_nectec_to_ipa = {
    nectec: ipa
    for ipa, nectec, *_ in (
        consonants_ipa_nectec
        + monophthong_ipa_nectec
        + diphthong_ipa_nectec
        + tones_ipa_nectec
    )
}
# Second pass: add the syllable-final consonant forms (third element), which
# exist only on the three-element consonant entries.
dict_nectec_to_ipa.update(
    (entry[2], entry[0]) for entry in consonants_ipa_nectec if len(entry) > 2
)


def nectec_to_ipa(pronunciation: str) -> str:
    """
    Convert NECTEC system to IPA system

    The input is split on ``-``; every piece found in
    ``dict_nectec_to_ipa`` is replaced by its IPA value, any other piece is
    kept unchanged, and the pieces are joined with single spaces.

    :param str pronunciation: NECTEC phoneme
    :return: IPA that is converted
    :rtype: str

    :Example:
    ::

        from pythainlp.util import nectec_to_ipa

        print(nectec_to_ipa("kl-uua-j^-2"))
        # output : 'kl uua j ˥˩'

    References
    ----------

    Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech
    processing and Speech corpus. In: Handbook of Thai Electronic Corpus.
    1st ed. p. 122–56.
    """
    return " ".join(
        dict_nectec_to_ipa.get(symbol, symbol)
        for symbol in pronunciation.split("-")
    )


# IPA -> RTGS lookup table used by ipa_to_rtgs().
dict_ipa_rtgs = {
    # Consonants
    "b": "b",
    "d": "d",
    "f": "f",
    "h": "h",
    # The RTGS rendering of j depends on its position in the syllable, but
    # the current implementation cannot distinguish the two cases.
    # The initial-position mapping is kept commented out: the later
    # "j": "i" entry uses the same key and would override it anyway, so it
    # never took effect.
    # See https://github.com/PyThaiNLP/pythainlp/issues/846
    # "j": "y",
    "k": "k",
    "kʰ": "kh",
    "l": "l",
    "m": "m",
    "n": "n",
    "ŋ": "ng",
    "p": "p",
    "pʰ": "ph",
    "r": "r",
    "s": "s",
    "t": "t",
    "tʰ": "th",
    "tɕ": "ch",
    "tɕʰ": "ch",
    "w": "w",
    "ʔ": "",
    "j": "i",
    # Vowels without the length mark
    "a": "a",
    "e": "e",
    "ɛ": "ae",
    "i": "i",
    "o": "o",
    "ɔ": "o",
    "u": "u",
    "ɯ": "ue",
    "ɤ": "oe",
    # Vowels with the length mark (same RTGS spelling)
    "aː": "a",
    "eː": "e",
    "ɛː": "ae",
    "iː": "i",
    "oː": "o",
    "ɔː": "o",
    "uː": "u",
    "ɯː": "ue",
    "ɤː": "oe",
    # Vowel sequences
    "ia": "ia",
    "ua": "ua",
    "ɯa": "uea",
    "aj": "ai",
    "aw": "ao",
    "ew": "eo",
    "ɛw": "aeo",
    "iw": "io",
    # Was "io" (a transposition typo): ɔ -> "o" plus final j -> "i" gives
    # "oi", in line with the "ɔːj" and "oːj" entries below.
    "ɔj": "oi",
    "uj": "ui",
    "aːj": "ai",
    "aːw": "ao",
    "eːw": "eo",
    "ɛːw": "aeo",
    "oːj": "oi",
    "ɔːj": "oi",
    "ɤːj": "oei",
    "iaw": "iao",
    "uaj": "uai",
    "ɯaj": "ueai",
    # Separator, passed through unchanged
    ".": ".",
}

# Overrides applied by ipa_to_rtgs() to the last token of its input.
dict_ipa_rtgs_final = {"w": "o"}

# Tokenizer that segments an IPA string into the keys of the two tables
# above, using the "newmm" engine over a trie of those keys.
trie = Trie(list(dict_ipa_rtgs) + list(dict_ipa_rtgs_final))
ipa_cut = Tokenizer(custom_dict=trie, engine="newmm")


def ipa_to_rtgs(ipa: str) -> str:
    """
    Convert IPA system to The Royal Thai General System of Transcription (RTGS)

    Docs: https://en.wikipedia.org/wiki/Help:IPA/Thai

    The input is segmented with ``ipa_cut`` and each token is looked up in
    ``dict_ipa_rtgs``; tokens with no entry are kept as they are. The
    ``dict_ipa_rtgs_final`` overrides are applied only to the very last token
    of the whole input, not to the end of every syllable. Any character
    that is still non-ASCII after NFKD normalization is dropped from the
    result.

    :param str ipa: IPA phoneme
    :return: The RTGS that is converted, according to rules listed in the Wikipedia page
    :rtype: str

    :Example:
    ::

        from pythainlp.util import ipa_to_rtgs

        print(ipa_to_rtgs("kluaj"))
        # output : 'kluai'
    """
    ipa_parts = ipa_cut.word_tokenize(ipa)
    last_index = len(ipa_parts) - 1

    rtgs_parts = []
    for index, ipa_part in enumerate(ipa_parts):
        # Membership is tested on the dicts directly; no per-token list copy.
        if index == last_index and ipa_part in dict_ipa_rtgs_final:
            rtgs_parts.append(dict_ipa_rtgs_final[ipa_part])
        else:
            rtgs_parts.append(dict_ipa_rtgs.get(ipa_part, ipa_part))

    rtgs = "".join(rtgs_parts)
    # Decompose accented characters, then discard everything non-ASCII.
    return (
        unicodedata.normalize("NFKD", rtgs)
        .encode("ascii", "ignore")
        .decode("ascii")
    )


# Code points U+02E5..U+02E9 are the five IPA tone letters (˥ ˦ ˧ ˨ ˩).
# Mapping each one to None makes str.translate delete it.
_IPA_TONE_LETTER_TABLE = dict.fromkeys(range(0x02E5, 0x02EA))


def remove_tone_ipa(ipa: str) -> str:
    """
    Remove Thai Tones from IPA system

    Every IPA tone letter is deleted, so any tone contour is removed
    completely, whichever combination of tone letters spells it. All
    other characters are returned unchanged.

    :param str ipa: IPA phoneme
    :return: IPA phoneme with tones removed
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_tone_ipa

        print(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"))
        # output : laː.sa.maj
    """
    return ipa.translate(_IPA_TONE_LETTER_TABLE)