# Source code for pythainlp.transliterate.wiktionary

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Thai pronunciation transliteration from Wiktionary th-pron module.
Source code: https://en.wiktionary.org/wiki/Module:th-pron
"""

from __future__ import annotations

import re
import unicodedata
from typing import cast

# Regex character class for one "Thai" character: every code point from "ก"
# through "๛", plus a trailing combining mark (it appears to be the same
# combining macron that _TONE_FROM_MARK keys on). transliterate_wiktionary()
# uses it with "+" to find the runs of text it should process.
_THAI_RANGE: str = r"[ก-๛̄]"

# Supported romanization systems mapped to their column index in the
# three-element romanization lists stored in _INITIAL, _VOWEL and _CODA.
_SYSTEMS: dict[str, int] = {
    system: column for column, system in enumerate(("paiboon", "royin", "ipa"))
}

# Initial-consonant table. Each key is a Thai onset (a single consonant or a
# "ห" + sonorant digraph) mapped to:
#   "seq":   its romanization in each system, indexed by the values of
#            _SYSTEMS (0 = paiboon, 1 = royin, 2 = ipa);
#   "class": the consonant class ("mid", "low" or "high") used as the key
#            into the tone tables.
# In the royin column "@" (for "อ") and "$ng" (for "ง") are placeholders that
# transliterate_wiktionary() rewrites in its royin post-processing step.
_INITIAL: dict[str, dict[str, list[str] | str]] = {
    "ก": {"seq": ["g", "k", "k"], "class": "mid"},
    "จ": {"seq": ["j", "ch", "t͡ɕ"], "class": "mid"},
    "ด": {"seq": ["d", "d", "d"], "class": "mid"},
    "ฎ": {"seq": ["d", "d", "d"], "class": "mid"},
    "ฏ": {"seq": ["dt", "t", "t"], "class": "mid"},
    "ต": {"seq": ["dt", "t", "t"], "class": "mid"},
    "บ": {"seq": ["b", "b", "b"], "class": "mid"},
    "ป": {"seq": ["bp", "p", "p"], "class": "mid"},
    "อ": {"seq": ["", "@", "ʔ"], "class": "mid"},
    "ง": {"seq": ["ng", "$ng", "ŋ"], "class": "low"},
    "ณ": {"seq": ["n", "n", "n"], "class": "low"},
    "น": {"seq": ["n", "n", "n"], "class": "low"},
    "ม": {"seq": ["m", "m", "m"], "class": "low"},
    "ญ": {"seq": ["y", "y", "j"], "class": "low"},
    "ย": {"seq": ["y", "y", "j"], "class": "low"},
    "ร": {"seq": ["r", "r", "r"], "class": "low"},
    "ล": {"seq": ["l", "l", "l"], "class": "low"},
    "ฬ": {"seq": ["l", "l", "l"], "class": "low"},
    "ว": {"seq": ["w", "w", "w"], "class": "low"},
    "ค": {"seq": ["k", "kh", "kʰ"], "class": "low"},
    "ฅ": {"seq": ["k", "kh", "kʰ"], "class": "low"},
    "ฆ": {"seq": ["k", "kh", "kʰ"], "class": "low"},
    "ข": {"seq": ["k", "kh", "kʰ"], "class": "high"},
    "ฃ": {"seq": ["k", "kh", "kʰ"], "class": "high"},
    "ช": {"seq": ["ch", "ch", "t͡ɕʰ"], "class": "low"},
    "ฌ": {"seq": ["ch", "ch", "t͡ɕʰ"], "class": "low"},
    "ฉ": {"seq": ["ch", "ch", "t͡ɕʰ"], "class": "high"},
    "ฑ": {"seq": ["t", "th", "tʰ"], "class": "low"},
    "ฒ": {"seq": ["t", "th", "tʰ"], "class": "low"},
    "ท": {"seq": ["t", "th", "tʰ"], "class": "low"},
    "ธ": {"seq": ["t", "th", "tʰ"], "class": "low"},
    "ฐ": {"seq": ["t", "th", "tʰ"], "class": "high"},
    "ถ": {"seq": ["t", "th", "tʰ"], "class": "high"},
    "พ": {"seq": ["p", "ph", "pʰ"], "class": "low"},
    "ภ": {"seq": ["p", "ph", "pʰ"], "class": "low"},
    "ผ": {"seq": ["p", "ph", "pʰ"], "class": "high"},
    "ฟ": {"seq": ["f", "f", "f"], "class": "low"},
    "ฝ": {"seq": ["f", "f", "f"], "class": "high"},
    "ซ": {"seq": ["s", "s", "s"], "class": "low"},
    "ศ": {"seq": ["s", "s", "s"], "class": "high"},
    "ษ": {"seq": ["s", "s", "s"], "class": "high"},
    "ส": {"seq": ["s", "s", "s"], "class": "high"},
    "ฮ": {"seq": ["h", "h", "h"], "class": "low"},
    "ห": {"seq": ["h", "h", "h"], "class": "high"},
    # "ห" + sonorant digraphs: romanized like the bare sonorant, but high class.
    "หง": {"seq": ["ng", "$ng", "ŋ"], "class": "high"},
    "หน": {"seq": ["n", "n", "n"], "class": "high"},
    "หม": {"seq": ["m", "m", "m"], "class": "high"},
    "หญ": {"seq": ["y", "y", "j"], "class": "high"},
    "หย": {"seq": ["y", "y", "j"], "class": "high"},
    "หร": {"seq": ["r", "r", "r"], "class": "high"},
    "หล": {"seq": ["l", "l", "l"], "class": "high"},
    "หว": {"seq": ["w", "w", "w"], "class": "high"},
    # Ellipsis and empty onset carry no consonant class; the "" entry is also
    # the fallback used when a glide has no entry of its own.
    "…": {"seq": ["…", "…", "…"], "class": ""},
    "": {"seq": ["", "", ""], "class": ""},
}

# Vowel table, split by syllable shape: the "open" sub-table is consulted when
# the syllable has no final consonant, "closed" when it has one. Keys are the
# Thai vowel spelling with the initial consonant removed and the tone mark
# stripped (leading vowel + optional glide + following vowel marks/letters);
# values are the romanizations indexed by _SYSTEMS (paiboon, royin, ipa).
_VOWEL: dict[str, dict[str, list[str]]] = {
    # Syllables without a final consonant.
    "open": {
        "ะ": ["a", "a", "a"],
        "": ["a", "a", "a"],
        "ิ": ["i", "i", "i"],
        "ึ": ["ʉ", "ue", "ɯ"],
        "ุ": ["u", "u", "u"],
        "เะ": ["e", "e", "eʔ"],
        "แะ": ["ɛ", "ae", "ɛʔ"],
        "โะ": ["o", "o", "oʔ"],
        "เาะ": ["ɔ", "o", "ɔʔ"],
        "็": ["ɔ", "o", "ɔ"],
        "เิ": ["ə", "oe", "ɤ"],
        "เอะ": ["ə", "oe", "ɤʔ"],
        "า": ["aa", "a", "aː"],
        "ี": ["ii", "i", "iː"],
        "ู": ["uu", "u", "uː"],
        "ือ": ["ʉʉ", "ue", "ɯː"],
        "เ": ["ee", "e", "eː"],
        "แ": ["ɛɛ", "ae", "ɛː"],
        "โ": ["oo", "o", "oː"],
        "อ": ["ɔɔ", "o", "ɔː"],
        "ร": ["ɔɔn", "on", "ɔːn"],
        "เอ": ["əə", "oe", "ɤː"],
        "เียะ": ["ia", "ia", "ia̯ʔ"],
        "เือะ": ["ʉa", "uea", "ɯa̯ʔ"],
        "ัวะ": ["ua", "ua", "ua̯ʔ"],
        "เีย": ["iia", "ia", "ia̯"],
        "เือ": ["ʉʉa", "uea", "ɯa̯"],
        "ัว": ["uua", "ua", "ua̯"],
        "ิว": ["iu", "io", "iw"],
        "ีว": ["iiu", "io", "iːw"],
        "เ็ว": ["eo", "eo", "ew"],
        "แ็ว": ["ɛo", "aeo", "ɛw"],
        "เา": ["ao", "ao", "aw"],
        "เว": ["eeo", "eo", "eːw"],
        "แว": ["ɛɛo", "aeo", "ɛːw"],
        "าว": ["aao", "ao", "aːw"],
        "เอว": ["əəo", "oeu", "ɤːw"],
        "โว": ["oow", "ou", "oːw"],
        "เียว": ["iao", "iao", "ia̯w"],
        "ัย": ["ai", "ai", "aj"],
        "ใ": ["ai", "ai", "aj"],
        "ไ": ["ai", "ai", "aj"],
        "ไย": ["ai", "ai", "aj"],
        "ึย": ["ʉi", "uei", "ɯj"],
        "็อย": ["ɔi", "oi", "ɔj"],
        "เิ็ย": ["əi", "oei", "ɤj"],
        "ุย": ["ui", "ui", "uj"],
        "าย": ["aai", "ai", "aːj"],
        "อย": ["ɔɔi", "oi", "ɔːj"],
        "โย": ["ooi", "oi", "oːj"],
        "เย": ["əəi", "oei", "ɤːj"],
        "ูย": ["uui", "ui", "uːj"],
        "วย": ["uai", "uai", "ua̯j"],
        "เือย": ["ʉai", "ueai", "ɯa̯j"],
        "ำ": ["am", "am", "am"],
    },
    # Syllables that end in a final consonant.
    "closed": {
        "ั": ["a", "a", "a"],
        "รร": ["a", "a", "a"],
        "ิ": ["i", "i", "i"],
        "ึ": ["ʉ", "ue", "ɯ"],
        "ุ": ["u", "u", "u"],
        "เ": ["ee", "e", "eː"],
        "เ็": ["e", "e", "e"],
        "แ็": ["ɛ", "ae", "ɛ"],
        "แ": ["ɛɛ", "ae", "ɛː"],
        "": ["o", "o", "o"],
        "็อ": ["ɔ", "o", "ɔ"],
        "เิ็": ["ə", "oe", "ɤ"],
        "า": ["aa", "a", "aː"],
        "ี": ["ii", "i", "iː"],
        "ื": ["ʉʉ", "ue", "ɯː"],
        "ู": ["uu", "u", "uː"],
        "โ": ["oo", "o", "oː"],
        "อ": ["ɔɔ", "o", "ɔː"],
        "เิ": ["əə", "oe", "ɤː"],
        "เอ": ["əə", "oe", "ɤː"],
        "เีย": ["iia", "ia", "ia̯"],
        "เือ": ["ʉʉa", "uea", "ɯa̯"],
        "ว": ["uua", "ua", "ua̯"],
        "ไ": ["ai", "ai", "aj"],
        "เา": ["ao", "ao", "aw"],
        "็อย": ["ɔi", "oi", "ɔj"],
    },
}

# Vowel spellings that the syllable-length check treats as long on their own,
# in addition to its doubled-vowel-letter and "ː" tests on the romanization.
# Only key membership is consulted; every value is True.
_UNROM_LONG: dict[str, bool] = dict.fromkeys(
    (
        "เีย",
        "เือ",
        "ัว",
        "ว",
        "เือย",
        "วาย",
        "เอว",
        "เียว",
    ),
    True,
)

# Vowel spellings that make a syllable "live" regardless of the other
# live/dead tests (sonorant final, "-i" glide ending, long open vowel).
# Looked up with .get(), so every listed key maps to True.
_LIVE_EXC: dict[str, bool] = dict.fromkeys(
    (
        "ัย",
        "ใ",
        "ไ",
        "ไย",
        "ุย",
        "วย",
        "็อย",
        "เิ็ย",
        "เา",
        "ิว",
        "เ็ว",
        "แ็ว",
        "ำ",
    ),
    True,
)

# Final-consonant (coda) table: Thai final consonant -> romanization in each
# system, indexed by the values of _SYSTEMS (paiboon, royin, ipa).
# "ng$" in the royin column of "ง" is a placeholder that
# transliterate_wiktionary() rewrites in its royin post-processing step.
# Final strings that are not a key here are romanized character by character
# by _c2_decomp().
_CODA: dict[str, list[str]] = {
    "ก": ["k", "k", "k̚"],
    "ข": ["k", "k", "k̚"],
    "ฃ": ["k", "k", "k̚"],
    "ค": ["k", "k", "k̚"],
    "ฅ": ["k", "k", "k̚"],
    "ฆ": ["k", "k", "k̚"],
    "จ": ["t", "t", "t̚"],
    "ฉ": ["t", "t", "t̚"],
    "ช": ["ch", "ch", "t͡ɕʰ"],
    "ซ": ["s", "s", "s"],
    "ฌ": ["t", "t", "t̚"],
    "ฎ": ["t", "t", "t̚"],
    "ฏ": ["t", "t", "t̚"],
    "ฐ": ["t", "t", "t̚"],
    "ฑ": ["t", "t", "t̚"],
    "ฒ": ["t", "t", "t̚"],
    "ด": ["t", "t", "t̚"],
    "ต": ["t", "t", "t̚"],
    "ถ": ["t", "t", "t̚"],
    "ท": ["t", "t", "t̚"],
    "ธ": ["t", "t", "t̚"],
    "ศ": ["t", "t", "t̚"],
    "ษ": ["t", "t", "t̚"],
    "ส": ["s", "s", "s"],
    "บ": ["p", "p", "p̚"],
    "ป": ["p", "p", "p̚"],
    "ผ": ["p", "p", "p̚"],
    "ฝ": ["p", "p", "p̚"],
    "พ": ["p", "p", "p̚"],
    "ฟ": ["f", "f", "f"],
    "ภ": ["p", "p", "p̚"],
    "ง": ["ng", "ng$", "ŋ"],
    "ญ": ["n", "n", "n"],
    "ณ": ["n", "n", "n"],
    "น": ["n", "n", "n"],
    "ร": ["n", "n", "n"],
    "ล": ["l", "l", "l"],
    "ฬ": ["n", "n", "n"],
    "ม": ["m", "m", "m"],
    "ฯ": ["ʔ", "ʔ", "ʔ"],
}

# Tone of a syllable that carries an explicit tone mark: keyed first by the
# mark, then by the class of the initial consonant ("high", "mid", "low").
# The last key is a non-Thai combining mark that yields the mid tone for
# every class.
_TONE_FROM_MARK: dict[str, dict[str, str]] = {
    "่": {"high": "low", "mid": "low", "low": "falling"},
    "้": {"high": "falling", "mid": "falling", "low": "high"},
    "๊": {"high": "high", "mid": "high", "low": "high"},
    "๋": {"high": "rising", "mid": "rising", "low": "rising"},
    "̄": {"high": "mid", "mid": "mid", "low": "mid"},
}

# Tone of a syllable without a tone mark: keyed first by syllable type, then
# by the class of the initial consonant. Dead syllables are looked up as
# "dead-short" / "dead-long"; live syllables use the single "live" row
# whatever their vowel length.
_TONE_NO_MARK: dict[str, dict[str, str]] = {
    "dead-short": {"high": "low", "mid": "low", "low": "high"},
    "dead-long": {"high": "low", "mid": "low", "low": "falling"},
    "live": {"high": "rising", "mid": "mid", "low": "mid"},
}

# Paiboon output: tone name -> combining diacritic inserted directly after the
# first vowel letter of the romanized vowel. The values are bare combining
# characters, so they may render attached to the opening quote in an editor.
# The mid tone adds nothing.
_TONE_ROM_MARKS: dict[str, str] = {
    "high": "́",
    "mid": "",
    "low": "̀",
    "rising": "̌",
    "falling": "̂",
}

# IPA output: tone name -> tone-letter sequence appended after the romanized
# final consonant of the syllable.
_TONE_LEVELS: dict[str, str] = {
    "high": "˦˥",
    "mid": "˧",
    "low": "˨˩",
    "rising": "˩˩˦",
    "falling": "˥˩",
}

# Thai digit -> ASCII digit, paired position by position.
_SYMBOLS: dict[str, str] = dict(zip("๐๑๒๓๔๕๖๗๘๙", "0123456789"))

# Syllable patterns. All three are anchored with ^...$, so each only matches a
# string that is one complete syllable.

# Re-parses the remainder of a syllable once a leading "ห" has been split off.
# Groups: (1) optional glide ร/ล/ว, (2) vowel marks/letters,
# (3) at most one final consonant.
_MGVC_PATTERN = re.compile(
    r"^([รลว]?)([ิึุ็ีืัำู]?[าอรยว]?[วยร]?ะ?)([คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?)$"
)
# Syllable whose onset is written "ห" + a dot-below mark (presumably phinthu;
# verify the code point) + consonant.
# Groups: (1) optional leading vowel, (2) the three-character onset,
# (3) optional dot-below mark and glide ร/ล/ว, (4) vowel marks, optional tone
# mark and vowel letters, (5) up to two final consonants.
_FULL_PATTERN = re.compile(
    r"^([เแโใไ]?)(หฺ[ก-รลว-ฮ])(ฺ?[รลว]?)([ิึุ็ีืัู]?็?[่้๊๋̄]?[าอรยวำ]?[วยร]?ะ?)([คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?[คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?)$"
)
# Same shape as _FULL_PATTERN except that group (2) is a single consonant.
# transliterate_wiktionary() applies it after _FULL_PATTERN.
_PARTIAL_PATTERN = re.compile(
    r"^([เแโใไ]?)([ก-รลว-ฮ])(ฺ?[รลว]?)([ิึุ็ีืัู]?็?[่้๊๋̄]?[าอรยวำ]?[วยร]?ะ?)([คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?[คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?)$"
)


def _c2_decomp(c2_char: str, seq_idx: int) -> str:
    """Romanize a final-consonant string one character at a time.

    :param str c2_char: Final consonant(s) of a syllable.
    :param int seq_idx: Column of the romanization system (a ``_SYSTEMS`` value).
    :return: Concatenated ``_CODA`` romanizations; characters without a
        ``_CODA`` entry contribute an empty string.
    :rtype: str
    """
    silent = ["", "", ""]
    pieces = []
    for consonant in c2_char:
        pieces.append(_CODA.get(consonant, silent)[seq_idx])
    return "".join(pieces)



# [docs]
def transliterate_wiktionary(text: str, mode: str = "ipa") -> str:
    """Romanize Thai text following the rules of Wiktionary's th-pron module.

    Every run of Thai characters is treated as one word and matched, as a
    whole, against the module's syllable patterns; runs that do not match (or
    that contain two tone marks at most one character apart) are left as they
    are. Thai digits are then replaced by ASCII digits, system-specific
    clean-up is applied, and the result is NFC-normalized.

    :param str text: Thai text input (single word or text fragment).
    :param str mode: Output system: ``paiboon``, ``royin``, or ``ipa``.
        Any other value makes the function return ``text`` unchanged.

    :return: Transliterated text.
    :rtype: str

    :Example:

        >>> transliterate_wiktionary("แมว", mode="royin")
        'maeo'
    """
    seq_idx = _SYSTEMS.get(mode)
    if seq_idx is None:
        return text

    tone_mark_re = re.compile(r"[่้๊๋̄]")

    def romanize_syllable(found: re.Match[str]) -> str:
        lead_vowel, onset, glide, nucleus, final = found.groups()

        # Remember the first tone mark, then strip all of them from the vowel.
        mark_hit = tone_mark_re.search(nucleus)
        tone_mark = mark_hit.group(0) if mark_hit else None
        nucleus = tone_mark_re.sub("", nucleus)

        # Two-character onset starting with "ห": split the "ห" off and re-parse
        # the rest; a glide is folded back into the onset unless the vowel
        # part is exactly "ย".
        # NOTE(review): the two patterns applied below appear to yield an
        # onset of one or three code points, so this two-character test may
        # never fire — confirm.
        if re.match(r"^ห.$", onset):
            reparsed = _MGVC_PATTERN.match(onset[1] + glide + nucleus + final)
            if reparsed:
                glide, nucleus, final = reparsed.groups()
                onset = "ห"
                if glide and nucleus != "ย":
                    onset, glide = onset + glide, ""

        # A bare "ล" glide with nothing after it is really the final consonant.
        if glide == "ล" and not (nucleus + final):
            final, glide = glide, ""

        vowel_table = _VOWEL["open" if final == "" else "closed"]

        with_glide = lead_vowel + glide + nucleus
        if with_glide in vowel_table:
            # The glide letter is part of the vowel spelling.
            orig_v = with_glide
            v = vowel_table[with_glide][seq_idx]
            glide = ""
        else:
            orig_v = lead_vowel + nucleus
            vowel_entry = vowel_table.get(orig_v)
            # Unknown vowel spellings pass through untouched.
            v = vowel_entry[seq_idx] if vowel_entry else orig_v
            glide_entry = _INITIAL.get(glide.replace("ฺ", ""), _INITIAL[""])
            glide = cast(list[str], glide_entry["seq"])[seq_idx]

        onset_key = onset.replace("ฺ", "")
        if onset_key not in _INITIAL:
            # Unknown onset: leave the syllable exactly as written.
            return found.group(0)
        onset_entry = _INITIAL[onset_key]
        ini = cast(list[str], onset_entry["seq"])[seq_idx]
        cls = cast(str, onset_entry["class"])

        is_long = (
            re.search(r"([aiʉueɛoɔə])\1", v)
            or "ː" in v
            or orig_v in _UNROM_LONG
        )
        length = "long" if is_long else "short"

        is_live = (
            re.search(r"[มญณนรลฬง]", final)
            or (orig_v.endswith("ย") and v.endswith("i"))
            or (final == "" and length == "long")
            or _LIVE_EXC.get(orig_v)
        )
        life = "live" if is_live else "dead"

        if final in _CODA:
            coda = _CODA[final][seq_idx]
        else:
            coda = _c2_decomp(final, seq_idx)

        # An explicit tone mark overrides the class/length/liveness rules.
        if tone_mark:
            tone_by_class = _TONE_FROM_MARK.get(tone_mark)
        else:
            tone_by_class = _TONE_NO_MARK.get(
                f"{life}-{length}", _TONE_NO_MARK.get(life)
            )
        tone = tone_by_class.get(cls) if tone_by_class else None

        if mode == "paiboon":
            # Put the tone diacritic right after the first vowel letter.
            replacement = "\\g<1>\\g<2>" + _TONE_ROM_MARKS.get(tone, "")
            v = re.sub(r"^([^aiʉueɛoɔə]*)([aiʉueɛoɔə])", replacement, v)
        elif mode == "ipa":
            coda += _TONE_LEVELS.get(tone, "")

        return ini + glide + v + coda

    def romanize_word(found_word: re.Match[str]) -> str:
        word = found_word.group(0)

        # Two tone marks at most one character apart: not a valid syllable.
        if re.search(r"[่้๊๋̄].?[่้๊๋̄]", word):
            return word

        for pattern in (_FULL_PATTERN, _PARTIAL_PATTERN):
            word = pattern.sub(romanize_syllable, word)
        return word

    def to_ascii_digit(found_digit: re.Match[str]) -> str:
        digit = found_digit.group(0)
        return _SYMBOLS.get(digit, digit)

    text = re.sub(f"{_THAI_RANGE}+", romanize_word, text)
    text = re.sub(r"[๐-๙]", to_ascii_digit, text)

    if mode == "royin":
        # "@" (onset "อ"): dropped at the start of the text or after a
        # whitespace/non-word character, otherwise turned into "-".
        for pattern, repl in ((r"^@", ""), (r"([\s\W])@", r"\1")):
            text = re.sub(pattern, repl, text)
        text = text.replace("@", "-")
        # "$ng" (onset "ง"): hyphenated after a vowel letter, plain "ng"
        # everywhere else.
        for pattern, repl in (
            (r"^\$ng", "ng"),
            (r"([\s\W])\$ng", r"\1ng"),
            (r"([aeiou])\$ng", r"\1-ng"),
        ):
            text = re.sub(pattern, repl, text)
        text = text.replace("$ng", "ng")
        # "ng$" (final "ง"): the marker is simply removed.
        for pattern, repl in ((r"ng\$([^\w\s])", r"ng\1"), (r"ng\$", "ng")):
            text = re.sub(pattern, repl, text)
    elif mode == "ipa":
        # Spaces and hyphens/dashes become syllable dots.
        text = re.sub(r"[ \-–]", ".", text)
        # A glottal stop goes between a text-final a/i/ɯ/u and its tone letters.
        text = re.sub(r"([aiɯu])([˥-˩]+)$", r"\1ʔ\2", text)

    return unicodedata.normalize("NFC", text)




# [docs]
def get_word_dict(word: str) -> dict[str, str]:
    """Transliterate a word with every supported system at once.

    :param str word: Thai input word.
    :return: Mapping with the original text under ``word`` followed by its
        ``paiboon``, ``royin``, and ``ipa`` transliterations.
    :rtype: dict[str, str]

    :Example:

        >>> get_word_dict("แมว")
        {'word': 'แมว', 'paiboon': 'mɛɛo', 'royin': 'maeo', 'ipa': 'mɛːw˧'}
    """
    outputs: dict[str, str] = {"word": word}
    for system in ("paiboon", "royin", "ipa"):
        outputs[system] = transliterate_wiktionary(word, mode=system)
    return outputs