Source code for pythainlp.util.phoneme

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Phonemes util
"""
import unicodedata
from pythainlp.util.trie import Trie
from pythainlp.tokenize import Tokenizer

# Consonant table. Each row is (IPA, NECTEC initial[, NECTEC final]).
# The optional third element is the NECTEC spelling used when the consonant
# closes a syllable; it is the initial spelling followed by "^".
consonants_ipa_nectec = [
    ("k", "k", "k^"),
    ("kʰ", "kh"),
    ("ŋ", "ng", "ng^"),
    ("tɕ", "c"),
    ("tɕʰ", "ch"),
    ("s", "s"),
    ("j", "j", "j^"),
    ("d", "d"),
    # The initial form was previously mistyped as "y", which made the
    # unrelated symbol "y" convert to /t/.
    ("t", "t", "t^"),
    ("tʰ", "th"),
    ("n", "n", "n^"),
    ("b", "b"),
    ("p", "p", "p^"),
    ("pʰ", "ph"),
    ("f", "f"),
    ("m", "m", "m^"),
    ("r", "r"),
    ("l", "l"),
    ("w", "w", "w^"),
    ("h", "h"),
    # Glottal stop: use the IPA letter U+0294 (the same key dict_ipa_rtgs
    # uses), not the ASCII question mark.
    ("ʔ", "z", "z^"),
]

# Vowel table. Each row is (IPA, NECTEC).
# Short vowels come first, followed by their long (doubled) counterparts.
# The "am"/"aj"/"aw" rows pair a short "a" with a final sound and are kept
# here exactly as in the original table.
monophthong_ipa_nectec = [
    ("i", "i"),
    ("e", "e"),
    ("ɛ", "x"),
    ("ɯ", "v"),  # short counterpart of "vv"; previously missing
    ("ɤ", "q"),
    ("a", "a"),
    ("am", "am^"),
    ("aj", "aj^"),
    ("aw", "aw^"),
    ("u", "u"),
    ("o", "o"),
    ("ɔ", "@"),
    ("ii", "ii"),
    ("ee", "ee"),
    ("ɛɛ", "xx"),
    ("ɯɯ", "vv"),
    ("ɤɤ", "qq"),
    ("aa", "aa"),
    ("uu", "uu"),
    ("oo", "oo"),
    # -อ long. The IPA side used to be an empty string, so "@@" was
    # silently converted to nothing.
    ("ɔɔ", "@@"),
]

# Diphthong table as (IPA, NECTEC) pairs: the three short forms first,
# then the three long forms. Built by pairing the two columns positionally.
diphthong_ipa_nectec = list(
    zip(
        ("ia", "ɯa", "ua", "iia", "ɯɯa", "uua"),  # IPA column
        ("ia", "va", "ua", "iia", "vva", "uua"),  # NECTEC column
    )
)

# Tone table as (IPA tone letters, NECTEC tone digit) pairs.
# NECTEC numbers the tones "0" to "4" in the order the contours are listed
# below, so the digit is derived from the position.
tones_ipa_nectec = [
    (contour, str(code))
    for code, contour in enumerate(("˧", "˨˩", "˥˩", "˦˥", "˩˩˦"))
]

# Lookup from a NECTEC symbol to its IPA equivalent.
# First pass: the second element of every row (initial consonant, vowel,
# diphthong or tone digit) maps to the row's IPA string.
dict_nectec_to_ipa = {
    nectec: ipa
    for ipa, nectec, *_ in (
        *consonants_ipa_nectec,
        *monophthong_ipa_nectec,
        *diphthong_ipa_nectec,
        *tones_ipa_nectec,
    )
}
# Second pass: consonant rows that carry a third element also register that
# syllable-final spelling against the same IPA string.
dict_nectec_to_ipa.update(
    (row[2], row[0]) for row in consonants_ipa_nectec if len(row) > 2
)


def nectec_to_ipa(pronunciation: str) -> str:
    """
    Convert NECTEC system to IPA system

    The input is split on ``-``. Every piece found in the NECTEC-to-IPA
    table is replaced by its IPA form; any other piece is kept unchanged.
    The pieces are then joined with single spaces.

    :param str pronunciation: NECTEC phoneme, with parts separated by ``-``
    :return: IPA that is converted, with parts separated by spaces
    :rtype: str

    :Example:
    ::

        from pythainlp.util import nectec_to_ipa

        print(nectec_to_ipa("kl-uua-j^-2"))
        # output : 'kl uua j ˥˩'


    References
    ----------

    Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech
    processing and Speech corpus. In: Handbook of Thai Electronic Corpus.
    1st ed. p. 122–56.
    """
    # dict.get with the part itself as the default gives the
    # "translate if known, otherwise pass through" behavior in one lookup.
    return " ".join(
        dict_nectec_to_ipa.get(part, part)
        for part in pronunciation.split("-")
    )


# Lookup from an IPA segment to its RTGS (Royal Thai General System of
# Transcription) spelling. Keys are also used to build the tokenizer
# dictionary, so multi-character segments (long vowels, diphthongs,
# vowel + glide combinations) are listed explicitly.
dict_ipa_rtgs = {
    "b": "b",
    "d": "d",
    "f": "f",
    "h": "h",
    # The conversion of "j" depends on its position in the syllable
    # ("y" as an initial, "i" as a final), but the current implementation
    # cannot handle both cases. Only the "j" -> "i" mapping further down is
    # kept; a second "j" -> "y" entry here would simply be overridden by it.
    # See https://github.com/PyThaiNLP/pythainlp/issues/846 for details.
    "k": "k",
    "kʰ": "kh",
    "l": "l",
    "m": "m",
    "n": "n",
    "ŋ": "ng",
    "p": "p",
    "pʰ": "ph",
    "r": "r",
    "s": "s",
    "t": "t",
    "tʰ": "th",
    "tɕ": "ch",
    "tɕʰ": "ch",
    "w": "w",
    "ʔ": "",
    "j": "i",
    "a": "a",
    "e": "e",
    "ɛ": "ae",
    "i": "i",
    "o": "o",
    "ɔ": "o",
    "u": "u",
    "ɯ": "ue",
    "ɤ": "oe",
    "aː": "a",
    "eː": "e",
    "ɛː": "ae",
    "iː": "i",
    "oː": "o",
    "ɔː": "o",
    "uː": "u",
    "ɯː": "ue",
    "ɤː": "oe",
    "ia": "ia",
    "ua": "ua",
    "ɯa": "uea",
    "aj": "ai",
    "aw": "ao",
    "ew": "eo",
    "ɛw": "aeo",
    "iw": "io",
    # Was "io" (copied from the "iw" row); the vowel is "o" followed by the
    # glide "i", matching the long form "ɔːj" below.
    "ɔj": "oi",
    "uj": "ui",
    "aːj": "ai",
    "aːw": "ao",
    "eːw": "eo",
    "ɛːw": "aeo",
    "oːj": "oi",
    "ɔːj": "oi",
    "ɤːj": "oei",
    "iaw": "iao",
    "uaj": "uai",
    "ɯaj": "ueai",
    ".": ".",
}

# Overrides applied only to the last token of an input: a trailing "w" is
# written "o" instead of "w".
dict_ipa_rtgs_final = {"w": "o"}

# Dictionary-based tokenizer that cuts an IPA string into the segments known
# to the two mapping tables above.
trie = Trie([*dict_ipa_rtgs, *dict_ipa_rtgs_final])
ipa_cut = Tokenizer(custom_dict=trie, engine="newmm")


def ipa_to_rtgs(ipa: str) -> str:
    """
    Convert IPA system to The Royal Thai General System of Transcription (RTGS)

    Docs: https://en.wikipedia.org/wiki/Help:IPA/Thai

    The input is tokenized into known IPA segments. Each segment is replaced
    by its RTGS spelling; segments with no mapping are kept as they are.
    Finally, any character that cannot be represented in ASCII is dropped.

    :param str ipa: IPA phoneme
    :return: The RTGS that is converted, according to rules listed in the Wikipedia page
    :rtype: str

    :Example:
    ::

        from pythainlp.util import ipa_to_rtgs

        print(ipa_to_rtgs("kluaj"))
        # output : 'kluai'

    """
    ipa_parts = ipa_cut.word_tokenize(ipa)
    last_index = len(ipa_parts) - 1

    rtgs_parts = []
    for i, ipa_part in enumerate(ipa_parts):
        # The final-position table only applies to the very last token.
        if i == last_index and ipa_part in dict_ipa_rtgs_final:
            rtgs_parts.append(dict_ipa_rtgs_final[ipa_part])
        else:
            # Unknown segments fall through unchanged.
            rtgs_parts.append(dict_ipa_rtgs.get(ipa_part, ipa_part))

    rtgs = "".join(rtgs_parts)
    # Decompose characters (NFKD) and then discard everything outside ASCII,
    # so leftover IPA-only symbols do not leak into the romanization.
    return (
        unicodedata.normalize("NFKD", rtgs)
        .encode("ascii", "ignore")
        .decode("ascii")
    )


def remove_tone_ipa(ipa: str) -> str:
    """
    Remove Thai Tones from IPA system

    Every occurrence of the five tone-letter sequences
    ``˩˩˦``, ``˥˩``, ``˨˩``, ``˦˥`` and ``˧`` is deleted from the input;
    all other characters are left untouched.

    :param str ipa: IPA phoneme
    :return: IPA phoneme with tones removed
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_tone_ipa

        print(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"))
        # output : laː.sa.maj

    """
    # Sequences are removed one after another in this fixed order.
    tone_marks = ("˩˩˦", "˥˩", "˨˩", "˦˥", "˧")
    for tone in tone_marks:
        ipa = ipa.replace(tone, "")
    return ipa