# Source code for pythainlp.util.phoneme

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Phonemes util
"""
import unicodedata
from pythainlp.util.trie import Trie
from pythainlp.tokenize import Tokenizer

# NECTEC <-> IPA correspondence tables.
#
# Consonant entries are (ipa, nectec_initial) or
# (ipa, nectec_initial, nectec_final); the syllable-final NECTEC form is
# written with a trailing "^".
consonants_ipa_nectec = [
    ("k", "k", "k^"),
    ("kʰ", "kh"),
    ("ŋ", "ng", "ng^"),
    ("tɕ", "c"),
    ("tɕʰ", "ch"),
    ("s", "s"),
    ("j", "j", "j^"),
    ("d", "d"),
    ("t", "t", "t^"),
    ("tʰ", "th"),
    ("n", "n", "n^"),
    ("b", "b"),
    ("p", "p", "p^"),
    ("pʰ", "ph"),
    ("f", "f"),
    ("m", "m", "m^"),
    ("r", "r"),
    ("l", "l"),
    ("w", "w", "w^"),
    ("h", "h"),
    ("ʔ", "z", "z^"),  # glottal stop, same symbol as used in dict_ipa_rtgs
]

# Vowel entries are (ipa, nectec). Long vowels double the symbol on both
# sides, e.g. "ɛɛ" <-> "xx".
monophthong_ipa_nectec = [
    ("i", "i"),
    ("e", "e"),
    ("ɛ", "x"),
    ("ɯ", "v"),
    ("ɤ", "q"),
    ("a", "a"),
    ("am", "am^"),
    ("aj", "aj^"),
    ("aw", "aw^"),
    ("u", "u"),
    ("o", "o"),
    ("ɔ", "@"),
    ("ii", "ii"),
    ("ee", "ee"),
    ("ɛɛ", "xx"),
    ("ɯɯ", "vv"),
    ("ɤɤ", "qq"),
    ("aa", "aa"),
    ("uu", "uu"),
    ("oo", "oo"),
    ("ɔɔ", "@@"),  # -อ long
]

diphthong_ipa_nectec = [
    ("ia", "ia"),
    ("ɯa", "va"),
    ("ua", "ua"),
    ("iia", "iia"),
    ("ɯɯa", "vva"),
    ("uua", "uua"),
]

# Tone entries are (ipa_tone_letters, nectec_tone_number).
tones_ipa_nectec = [
    ("˧", "0"),
    ("˨˩", "1"),
    ("˥˩", "2"),
    ("˦˥", "3"),
    ("˩˩˦", "4"),
]

# Lookup table keyed by NECTEC symbol. The initial/plain forms are added
# first, then the "^"-suffixed final forms of the consonants that have one.
dict_nectec_to_ipa = {
    entry[1]: entry[0]
    for entry in (
        consonants_ipa_nectec
        + monophthong_ipa_nectec
        + diphthong_ipa_nectec
        + tones_ipa_nectec
    )
}
dict_nectec_to_ipa.update(
    {entry[2]: entry[0] for entry in consonants_ipa_nectec if len(entry) > 2}
)


def nectec_to_ipa(pronunciation: str) -> str:
    """
    Convert a pronunciation written in the NECTEC system to IPA.

    The input is split on "-" and every piece is looked up in
    ``dict_nectec_to_ipa``. Pieces that have no entry in that table are
    kept unchanged. The resulting pieces are joined with single spaces.

    :param str pronunciation: NECTEC phonemes separated by "-"
    :return: the converted phonemes, separated by spaces
    :rtype: str

    :Example:
    ::

        from pythainlp.util import nectec_to_ipa

        print(nectec_to_ipa("kl-uua-j^-2"))
        # output : 'kl uua j ˥˩'


    References
    ----------

    Pornpimon Palingoon, Sumonmas Thatphithakkul. Chapter 4 Speech processing and Speech corpus. In: Handbook of Thai Electronic Corpus. 1st ed. p. 122–56.
    """
    return " ".join(
        # Unknown symbols fall through unchanged rather than raising.
        dict_nectec_to_ipa.get(phoneme, phoneme)
        for phoneme in pronunciation.split("-")
    )


# IPA -> RTGS lookup used for every phoneme that is not handled by the
# final-position table below. Each key must appear only once: a repeated
# key in a dict literal silently overrides the earlier value.
dict_ipa_rtgs = {
    # consonants
    "b": "b",
    "d": "d",
    "f": "f",
    "h": "h",
    "j": "y",
    "k": "k",
    "kʰ": "kh",
    "l": "l",
    "m": "m",
    "n": "n",
    "ŋ": "ng",
    "p": "p",
    "pʰ": "ph",
    "r": "r",
    "s": "s",
    "t": "t",
    "tʰ": "th",
    "tɕ": "ch",
    "tɕʰ": "ch",
    "w": "w",
    "ʔ": "",
    # short vowels
    "a": "a",
    "e": "e",
    "ɛ": "ae",
    "i": "i",
    "o": "o",
    "ɔ": "o",
    "u": "u",
    "ɯ": "ue",
    "ɤ": "oe",
    # long vowels
    "aː": "a",
    "eː": "e",
    "ɛː": "ae",
    "iː": "i",
    "oː": "o",
    "ɔː": "o",
    "uː": "u",
    "ɯː": "ue",
    "ɤː": "oe",
    # diphthongs and vowel + glide combinations
    "ia": "ia",
    "ua": "ua",
    "ɯa": "uea",
    "aj": "ai",
    "aw": "ao",
    "ew": "eo",
    "ɛw": "aeo",
    "iw": "io",
    "ɔj": "oi",
    "uj": "ui",
    "aːj": "ai",
    "aːw": "ao",
    "eːw": "eo",
    "ɛːw": "aeo",
    "oːj": "oi",
    "ɔːj": "oi",
    "ɤːj": "oei",
    "iaw": "iao",
    "uaj": "uai",
    "ɯaj": "ueai",
    # syllable separator
    ".": ".",
}

# Overrides applied when the phoneme is in final position: the glides are
# romanized as vowels there ("w" -> "o", "j" -> "i") and as consonants
# ("w", "y") elsewhere.
dict_ipa_rtgs_final = {
    "w": "o",
    "j": "i",
}
# Dictionary-driven tokenizer whose vocabulary is every IPA symbol that has
# an RTGS mapping (general table first, then the final-position table).
trie = Trie([*dict_ipa_rtgs.keys(), *dict_ipa_rtgs_final.keys()])
ipa_cut = Tokenizer(engine="newmm", custom_dict=trie)


def ipa_to_rtgs(ipa: str) -> str:
    """
    Convert IPA to the Royal Thai General System of Transcription (RTGS).

    The input is tokenized with ``ipa_cut`` and each token is replaced by
    its RTGS spelling. A token in final position (the last token of the
    input, or a token directly followed by the "." syllable separator) is
    looked up in ``dict_ipa_rtgs_final`` first; every other token is looked
    up in ``dict_ipa_rtgs``. Tokens with no mapping are kept as they are.
    The joined result is NFKD-normalized and any character that is still
    not ASCII is dropped.

    Docs: https://en.wikipedia.org/wiki/Help:IPA/Thai

    :param str ipa: IPA phonemes
    :return: the RTGS romanization
    :rtype: str

    :Example:
    ::

        from pythainlp.util import ipa_to_rtgs

        print(ipa_to_rtgs("kluaj"))
        # output : 'kluai'

    """
    tokens = ipa_cut.word_tokenize(ipa)
    last_index = len(tokens) - 1
    converted = []
    for index, phoneme in enumerate(tokens):
        # The short-circuit keeps tokens[index + 1] from being evaluated
        # for the last token.
        is_final = index == last_index or tokens[index + 1] == "."
        if is_final and phoneme in dict_ipa_rtgs_final:
            converted.append(dict_ipa_rtgs_final[phoneme])
        else:
            converted.append(dict_ipa_rtgs.get(phoneme, phoneme))
    text = unicodedata.normalize("NFKD", "".join(converted))
    # Encoding with errors="ignore" discards whatever is not plain ASCII,
    # e.g. tone letters or length marks that had no mapping.
    return text.encode("ascii", "ignore").decode("ascii")


def remove_tone_ipa(ipa: str) -> str:
    """
    Remove Thai tone marks from an IPA string.

    Every occurrence of the five tone-letter sequences
    ``˩˩˦``, ``˥˩``, ``˨˩``, ``˦˥`` and ``˧`` is deleted; all other
    characters are returned unchanged.

    :param str ipa: IPA phonemes
    :return: the IPA phonemes with the tone marks deleted
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_tone_ipa

        print(remove_tone_ipa("laː˦˥.sa˨˩.maj˩˩˦"))
        # output : laː.sa.maj

    """
    # Removed one after another in this order, longest sequence first.
    tone_marks = ("˩˩˦", "˥˩", "˨˩", "˦˥", "˧")
    for mark in tone_marks:
        ipa = ipa.replace(mark, "")
    return ipa