# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Thai pronunciation transliteration from Wiktionary th-pron module.
Source code: https://en.wiktionary.org/wiki/Module:th-pron
"""
from __future__ import annotations
import re
import unicodedata
from typing import cast
# Regex character class for a single "Thai" character: anything from ก to ๛
# plus the extra combining mark that also appears as a key of
# ``_TONE_FROM_MARK``. ``transliterate_wiktionary`` appends ``+`` to it to find
# maximal Thai runs in the input.
_THAI_RANGE: str = r"[ก-๛̄]"
# Supported output modes mapped to the index used to pick the matching entry
# from every three-element romanisation list in the tables below
# (``[paiboon, royin, ipa]``).
_SYSTEMS: dict[str, int] = {
"paiboon": 0,
"royin": 1,
"ipa": 2,
}
# Initial consonants.
# "seq" holds the romanisation per system, indexed by ``_SYSTEMS``;
# "class" is the consonant class ("mid", "low" or "high") that selects the tone
# in ``_TONE_FROM_MARK`` / ``_TONE_NO_MARK``.
# The royin entries "@" and "$ng" are placeholders that
# ``transliterate_wiktionary`` resolves in its royin post-processing step.
_INITIAL: dict[str, dict[str, list[str] | str]] = {
"ก": {"seq": ["g", "k", "k"], "class": "mid"},
"จ": {"seq": ["j", "ch", "t͡ɕ"], "class": "mid"},
"ด": {"seq": ["d", "d", "d"], "class": "mid"},
"ฎ": {"seq": ["d", "d", "d"], "class": "mid"},
"ฏ": {"seq": ["dt", "t", "t"], "class": "mid"},
"ต": {"seq": ["dt", "t", "t"], "class": "mid"},
"บ": {"seq": ["b", "b", "b"], "class": "mid"},
"ป": {"seq": ["bp", "p", "p"], "class": "mid"},
"อ": {"seq": ["", "@", "ʔ"], "class": "mid"},
"ง": {"seq": ["ng", "$ng", "ŋ"], "class": "low"},
"ณ": {"seq": ["n", "n", "n"], "class": "low"},
"น": {"seq": ["n", "n", "n"], "class": "low"},
"ม": {"seq": ["m", "m", "m"], "class": "low"},
"ญ": {"seq": ["y", "y", "j"], "class": "low"},
"ย": {"seq": ["y", "y", "j"], "class": "low"},
"ร": {"seq": ["r", "r", "r"], "class": "low"},
"ล": {"seq": ["l", "l", "l"], "class": "low"},
"ฬ": {"seq": ["l", "l", "l"], "class": "low"},
"ว": {"seq": ["w", "w", "w"], "class": "low"},
"ค": {"seq": ["k", "kh", "kʰ"], "class": "low"},
"ฅ": {"seq": ["k", "kh", "kʰ"], "class": "low"},
"ฆ": {"seq": ["k", "kh", "kʰ"], "class": "low"},
"ข": {"seq": ["k", "kh", "kʰ"], "class": "high"},
"ฃ": {"seq": ["k", "kh", "kʰ"], "class": "high"},
"ช": {"seq": ["ch", "ch", "t͡ɕʰ"], "class": "low"},
"ฌ": {"seq": ["ch", "ch", "t͡ɕʰ"], "class": "low"},
"ฉ": {"seq": ["ch", "ch", "t͡ɕʰ"], "class": "high"},
"ฑ": {"seq": ["t", "th", "tʰ"], "class": "low"},
"ฒ": {"seq": ["t", "th", "tʰ"], "class": "low"},
"ท": {"seq": ["t", "th", "tʰ"], "class": "low"},
"ธ": {"seq": ["t", "th", "tʰ"], "class": "low"},
"ฐ": {"seq": ["t", "th", "tʰ"], "class": "high"},
"ถ": {"seq": ["t", "th", "tʰ"], "class": "high"},
"พ": {"seq": ["p", "ph", "pʰ"], "class": "low"},
"ภ": {"seq": ["p", "ph", "pʰ"], "class": "low"},
"ผ": {"seq": ["p", "ph", "pʰ"], "class": "high"},
"ฟ": {"seq": ["f", "f", "f"], "class": "low"},
"ฝ": {"seq": ["f", "f", "f"], "class": "high"},
"ซ": {"seq": ["s", "s", "s"], "class": "low"},
"ศ": {"seq": ["s", "s", "s"], "class": "high"},
"ษ": {"seq": ["s", "s", "s"], "class": "high"},
"ส": {"seq": ["s", "s", "s"], "class": "high"},
"ฮ": {"seq": ["h", "h", "h"], "class": "low"},
"ห": {"seq": ["h", "h", "h"], "class": "high"},
# ห followed by a low-class sonorant: same romanisation as the bare sonorant,
# but the pair counts as high class.
"หง": {"seq": ["ng", "$ng", "ŋ"], "class": "high"},
"หน": {"seq": ["n", "n", "n"], "class": "high"},
"หม": {"seq": ["m", "m", "m"], "class": "high"},
"หญ": {"seq": ["y", "y", "j"], "class": "high"},
"หย": {"seq": ["y", "y", "j"], "class": "high"},
"หร": {"seq": ["r", "r", "r"], "class": "high"},
"หล": {"seq": ["l", "l", "l"], "class": "high"},
"หว": {"seq": ["w", "w", "w"], "class": "high"},
# The ellipsis maps to itself; the empty key is the fallback used when a
# syllable has no cluster consonant.
"…": {"seq": ["…", "…", "…"], "class": ""},
"": {"seq": ["", "", ""], "class": ""},
}
# Vowel spellings mapped to their romanisation per system (indexed by
# ``_SYSTEMS``).
# The outer key is the syllable shape: "open" is used when the syllable has no
# final consonant, "closed" when it has one. The inner key is the vowel
# spelling with the consonant slots removed (leading vowel sign + optional
# cluster letter + trailing vowel part), exactly as assembled in
# ``transliterate_wiktionary``.
_VOWEL: dict[str, dict[str, list[str]]] = {
# Syllables without a final consonant.
"open": {
"ะ": ["a", "a", "a"],
"": ["a", "a", "a"],
"ิ": ["i", "i", "i"],
"ึ": ["ʉ", "ue", "ɯ"],
"ุ": ["u", "u", "u"],
"เะ": ["e", "e", "eʔ"],
"แะ": ["ɛ", "ae", "ɛʔ"],
"โะ": ["o", "o", "oʔ"],
"เาะ": ["ɔ", "o", "ɔʔ"],
"็": ["ɔ", "o", "ɔ"],
"เิ": ["ə", "oe", "ɤ"],
"เอะ": ["ə", "oe", "ɤʔ"],
"า": ["aa", "a", "aː"],
"ี": ["ii", "i", "iː"],
"ู": ["uu", "u", "uː"],
"ือ": ["ʉʉ", "ue", "ɯː"],
"เ": ["ee", "e", "eː"],
"แ": ["ɛɛ", "ae", "ɛː"],
"โ": ["oo", "o", "oː"],
"อ": ["ɔɔ", "o", "ɔː"],
"ร": ["ɔɔn", "on", "ɔːn"],
"เอ": ["əə", "oe", "ɤː"],
"เียะ": ["ia", "ia", "ia̯ʔ"],
"เือะ": ["ʉa", "uea", "ɯa̯ʔ"],
"ัวะ": ["ua", "ua", "ua̯ʔ"],
"เีย": ["iia", "ia", "ia̯"],
"เือ": ["ʉʉa", "uea", "ɯa̯"],
"ัว": ["uua", "ua", "ua̯"],
"ิว": ["iu", "io", "iw"],
"ีว": ["iiu", "io", "iːw"],
"เ็ว": ["eo", "eo", "ew"],
"แ็ว": ["ɛo", "aeo", "ɛw"],
"เา": ["ao", "ao", "aw"],
"เว": ["eeo", "eo", "eːw"],
"แว": ["ɛɛo", "aeo", "ɛːw"],
"าว": ["aao", "ao", "aːw"],
"เอว": ["əəo", "oeu", "ɤːw"],
"โว": ["oow", "ou", "oːw"],
"เียว": ["iao", "iao", "ia̯w"],
"ัย": ["ai", "ai", "aj"],
"ใ": ["ai", "ai", "aj"],
"ไ": ["ai", "ai", "aj"],
"ไย": ["ai", "ai", "aj"],
"ึย": ["ʉi", "uei", "ɯj"],
"็อย": ["ɔi", "oi", "ɔj"],
"เิ็ย": ["əi", "oei", "ɤj"],
"ุย": ["ui", "ui", "uj"],
"าย": ["aai", "ai", "aːj"],
"อย": ["ɔɔi", "oi", "ɔːj"],
"โย": ["ooi", "oi", "oːj"],
"เย": ["əəi", "oei", "ɤːj"],
"ูย": ["uui", "ui", "uːj"],
"วย": ["uai", "uai", "ua̯j"],
"เือย": ["ʉai", "ueai", "ɯa̯j"],
"ำ": ["am", "am", "am"],
},
# Syllables that end in a final consonant.
"closed": {
"ั": ["a", "a", "a"],
"รร": ["a", "a", "a"],
"ิ": ["i", "i", "i"],
"ึ": ["ʉ", "ue", "ɯ"],
"ุ": ["u", "u", "u"],
"เ": ["ee", "e", "eː"],
"เ็": ["e", "e", "e"],
"แ็": ["ɛ", "ae", "ɛ"],
"แ": ["ɛɛ", "ae", "ɛː"],
"": ["o", "o", "o"],
"็อ": ["ɔ", "o", "ɔ"],
"เิ็": ["ə", "oe", "ɤ"],
"า": ["aa", "a", "aː"],
"ี": ["ii", "i", "iː"],
"ื": ["ʉʉ", "ue", "ɯː"],
"ู": ["uu", "u", "uː"],
"โ": ["oo", "o", "oː"],
"อ": ["ɔɔ", "o", "ɔː"],
"เิ": ["əə", "oe", "ɤː"],
"เอ": ["əə", "oe", "ɤː"],
"เีย": ["iia", "ia", "ia̯"],
"เือ": ["ʉʉa", "uea", "ɯa̯"],
"ว": ["uua", "ua", "ua̯"],
"ไ": ["ai", "ai", "aj"],
"เา": ["ao", "ao", "aw"],
"็อย": ["ɔi", "oi", "ɔj"],
},
}
# Vowel spellings that ``transliterate_wiktionary`` treats as long even when
# the romanised vowel contains neither a doubled vowel letter nor "ː".
# Only membership is tested; the ``True`` values are not read.
_UNROM_LONG: dict[str, bool] = {
"เีย": True,
"เือ": True,
"ัว": True,
"ว": True,
"เือย": True,
"วาย": True,
"เอว": True,
"เียว": True,
}
# Vowel spellings that make a syllable "live" for tone assignment even when
# none of the other live-syllable tests in ``transliterate_wiktionary`` apply.
# Looked up with ``.get()``, so a missing key counts as "not an exception".
_LIVE_EXC: dict[str, bool] = {
"ัย": True,
"ใ": True,
"ไ": True,
"ไย": True,
"ุย": True,
"วย": True,
"็อย": True,
"เิ็ย": True,
"เา": True,
"ิว": True,
"เ็ว": True,
"แ็ว": True,
"ำ": True,
}
# Final consonants mapped to their romanisation per system (indexed by
# ``_SYSTEMS``). The royin entry "ng$" is a placeholder whose "$" is stripped
# by the royin post-processing step of ``transliterate_wiktionary``.
_CODA: dict[str, list[str]] = {
"ก": ["k", "k", "k̚"],
"ข": ["k", "k", "k̚"],
"ฃ": ["k", "k", "k̚"],
"ค": ["k", "k", "k̚"],
"ฅ": ["k", "k", "k̚"],
"ฆ": ["k", "k", "k̚"],
"จ": ["t", "t", "t̚"],
"ฉ": ["t", "t", "t̚"],
"ช": ["ch", "ch", "t͡ɕʰ"],
"ซ": ["s", "s", "s"],
"ฌ": ["t", "t", "t̚"],
"ฎ": ["t", "t", "t̚"],
"ฏ": ["t", "t", "t̚"],
"ฐ": ["t", "t", "t̚"],
"ฑ": ["t", "t", "t̚"],
"ฒ": ["t", "t", "t̚"],
"ด": ["t", "t", "t̚"],
"ต": ["t", "t", "t̚"],
"ถ": ["t", "t", "t̚"],
"ท": ["t", "t", "t̚"],
"ธ": ["t", "t", "t̚"],
"ศ": ["t", "t", "t̚"],
"ษ": ["t", "t", "t̚"],
"ส": ["s", "s", "s"],
"บ": ["p", "p", "p̚"],
"ป": ["p", "p", "p̚"],
"ผ": ["p", "p", "p̚"],
"ฝ": ["p", "p", "p̚"],
"พ": ["p", "p", "p̚"],
"ฟ": ["f", "f", "f"],
"ภ": ["p", "p", "p̚"],
"ง": ["ng", "ng$", "ŋ"],
"ญ": ["n", "n", "n"],
"ณ": ["n", "n", "n"],
"น": ["n", "n", "n"],
"ร": ["n", "n", "n"],
"ล": ["l", "l", "l"],
"ฬ": ["n", "n", "n"],
"ม": ["m", "m", "m"],
"ฯ": ["ʔ", "ʔ", "ʔ"],
}
# Tone of a syllable that carries an explicit tone mark:
# tone mark -> consonant class of the initial -> tone name.
# The last key maps every class to "mid".
_TONE_FROM_MARK: dict[str, dict[str, str]] = {
"่": {"high": "low", "mid": "low", "low": "falling"},
"้": {"high": "falling", "mid": "falling", "low": "high"},
"๊": {"high": "high", "mid": "high", "low": "high"},
"๋": {"high": "rising", "mid": "rising", "low": "rising"},
"̄": {"high": "mid", "mid": "mid", "low": "mid"},
}
# Tone of a syllable without a tone mark:
# syllable type -> consonant class of the initial -> tone name.
# ``transliterate_wiktionary`` looks up "<life>-<length>" first and falls back
# to "<life>" alone, which is why "live" has no length suffix.
_TONE_NO_MARK: dict[str, dict[str, str]] = {
"dead-short": {"high": "low", "mid": "low", "low": "high"},
"dead-long": {"high": "low", "mid": "low", "low": "falling"},
"live": {"high": "rising", "mid": "mid", "low": "mid"},
}
# Tone name -> combining diacritic inserted right after the first vowel letter
# of the romanised vowel in "paiboon" mode (mid tone adds nothing).
_TONE_ROM_MARKS: dict[str, str] = {
"high": "́",
"mid": "",
"low": "̀",
"rising": "̌",
"falling": "̂",
}
# Tone name -> tone letters appended to the end of the syllable in "ipa" mode.
_TONE_LEVELS: dict[str, str] = {
"high": "˦˥",
"mid": "˧",
"low": "˨˩",
"rising": "˩˩˦",
"falling": "˥˩",
}
# Thai digits -> ASCII digits; applied to the whole text after the syllables
# have been transliterated.
_SYMBOLS: dict[str, str] = {
"๐": "0",
"๑": "1",
"๒": "2",
"๓": "3",
"๔": "4",
"๕": "5",
"๖": "6",
"๗": "7",
"๘": "8",
"๙": "9",
}
# Re-parses the remainder of a syllable (everything after a leading ห) into
# three groups: optional cluster consonant (ร/ล/ว), vowel part, and at most one
# final consonant.
_MGVC_PATTERN = re.compile(
r"^([รลว]?)([ิึุ็ีืัำู]?[าอรยว]?[วยร]?ะ?)([คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?)$"
)
# Whole-syllable pattern whose initial slot is ห + ฺ + one consonant.
# Groups, in the order unpacked by ``transliterate_wiktionary``:
#   1. leading vowel sign (เ แ โ ใ ไ), optional
#   2. initial consonant slot
#   3. optional ฺ followed by an optional cluster consonant (ร/ล/ว)
#   4. vowel marks/letters, with an optional tone mark inside
#   5. zero, one or two final consonants
_FULL_PATTERN = re.compile(
r"^([เแโใไ]?)(หฺ[ก-รลว-ฮ])(ฺ?[รลว]?)([ิึุ็ีืัู]?็?[่้๊๋̄]?[าอรยวำ]?[วยร]?ะ?)([คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?[คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?)$"
)
# Same five groups as ``_FULL_PATTERN``, but the initial slot is exactly one
# consonant. Both patterns are anchored, so they only match a string that is
# one complete syllable.
_PARTIAL_PATTERN = re.compile(
r"^([เแโใไ]?)([ก-รลว-ฮ])(ฺ?[รลว]?)([ิึุ็ีืัู]?็?[่้๊๋̄]?[าอรยวำ]?[วยร]?ะ?)([คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?[คฅฆกขฃพฟภบปชฌฑฒทธจฎฏดตฐถศษสมญณนรลฬง]?)$"
)
def _c2_decomp(c2_char: str, seq_idx: int) -> str:
    """Romanise a final-consonant string one character at a time.

    :param str c2_char: Final consonant(s) of a syllable; may be empty.
    :param int seq_idx: Index of the output system (see ``_SYSTEMS``).
    :return: Concatenation of the ``_CODA`` romanisation of every character;
        characters missing from ``_CODA`` contribute nothing.
    :rtype: str
    """
    silent = ["", "", ""]
    pieces = []
    for consonant in c2_char:
        romanised = _CODA.get(consonant, silent)
        pieces.append(romanised[seq_idx])
    return "".join(pieces)
# Tone-mark characters accepted inside the vowel slot (the same characters
# that key ``_TONE_FROM_MARK``).
_TONE_MARK_RE = re.compile(r"[่้๊๋̄]")
# Two tone marks with at most one character between them.
_DOUBLE_TONE_MARK_RE = re.compile(r"[่้๊๋̄].?[่้๊๋̄]")
# An initial slot made of ห plus exactly one more character.
_H_PAIR_RE = re.compile(r"^ห.$")
# The same romanised vowel letter twice in a row.
_DOUBLED_VOWEL_RE = re.compile(r"([aiʉueɛoɔə])\1")
# Final consonants that make a syllable "live".
_LIVE_CODA_RE = re.compile(r"[มญณนรลฬง]")
# Everything up to and including the first romanised vowel letter.
_FIRST_VOWEL_RE = re.compile(r"^([^aiʉueɛoɔə]*)([aiʉueɛoɔə])")


def _transliterate_syllable(
    match: re.Match[str], seq_idx: int, mode: str
) -> str:
    """Romanise one syllable matched by ``_FULL_PATTERN``/``_PARTIAL_PATTERN``.

    :param match: Match whose five groups are leading vowel sign, initial
        consonant slot, cluster slot, vowel part (may contain a tone mark)
        and final consonant(s).
    :param int seq_idx: Index of the output system (see ``_SYSTEMS``).
    :param str mode: Name of the output system; decides how the tone is shown.
    :return: The romanised syllable, or the matched text unchanged when the
        initial consonant is not in ``_INITIAL``.
    :rtype: str
    """
    v1, c1, g, v2, c2 = match.groups()

    # Pull the tone mark out of the vowel slot so that the vowel tables can be
    # keyed on the bare spelling.
    tmark_match = _TONE_MARK_RE.search(v2)
    tmark = tmark_match.group(0) if tmark_match else None
    v2 = _TONE_MARK_RE.sub("", v2)

    # Initial slot "ห" + one character: try to re-read that character as part
    # of the cluster/vowel/coda instead of as the initial consonant.
    # NOTE(review): c1 comes from _FULL_PATTERN (ห + ฺ + consonant) or
    # _PARTIAL_PATTERN (a single consonant), so this two-character test may
    # never succeed with the current patterns - confirm against upstream.
    if _H_PAIR_RE.match(c1):
        mgvc_match = _MGVC_PATTERN.match(c1[1] + g + v2 + c2)
        if mgvc_match:
            g_new, v2_new, c2_new = mgvc_match.groups()
            c1, g, v2, c2 = "ห", g_new, v2_new, c2_new
            if g and v2 != "ย":
                # Fold the sonorant back in so that the high-class "ห?" entry
                # of _INITIAL is used.
                c1, g = c1 + g, ""

    # A bare ล with nothing after it is the final consonant, not a cluster.
    if g == "ล" and not (v2 + c2):
        c2 = g
        g = ""

    vowels = _VOWEL["closed" if c2 != "" else "open"]
    if (v1 + g + v2) in vowels:
        # The "cluster" letter is really part of the vowel spelling (e.g. ว).
        orig_v = v1 + g + v2
        v = vowels[orig_v][seq_idx]
        g = ""
    else:
        orig_v = v1 + v2
        v_lookup = vowels.get(orig_v)
        # Unknown vowel spellings are passed through untouched.
        v = v_lookup[seq_idx] if v_lookup else orig_v
        g_lookup = _INITIAL.get(g.replace("ฺ", ""), _INITIAL[""])
        g = cast(list[str], g_lookup["seq"])[seq_idx]

    c1_clean = c1.replace("ฺ", "")
    if c1_clean not in _INITIAL:
        return match.group(0)
    ini = cast(list[str], _INITIAL[c1_clean]["seq"])[seq_idx]
    cls = cast(str, _INITIAL[c1_clean]["class"])

    # Vowel length and live/dead status only feed the tone lookup below.
    is_long = (
        bool(_DOUBLED_VOWEL_RE.search(v))
        or "ː" in v
        or orig_v in _UNROM_LONG
    )
    length = "long" if is_long else "short"
    is_live = (
        bool(_LIVE_CODA_RE.search(c2))
        or (orig_v.endswith("ย") and v.endswith("i"))
        or (c2 == "" and is_long)
        or bool(_LIVE_EXC.get(orig_v))
    )
    life = "live" if is_live else "dead"

    # Must come after the live/dead test, which inspects the Thai coda.
    c2 = _CODA[c2][seq_idx] if c2 in _CODA else _c2_decomp(c2, seq_idx)

    if tmark:
        tone_dict = _TONE_FROM_MARK.get(tmark)
    else:
        # "dead-short"/"dead-long" are keyed with the length; "live" is not.
        tone_dict = _TONE_NO_MARK.get(
            f"{life}-{length}", _TONE_NO_MARK.get(life)
        )
    tone = tone_dict.get(cls) if tone_dict else None

    if mode == "paiboon":
        tone_mark = _TONE_ROM_MARKS.get(tone, "") if tone is not None else ""
        # Attach the combining tone diacritic to the first vowel letter.
        v = _FIRST_VOWEL_RE.sub(lambda m: m.group(0) + tone_mark, v)
    elif mode == "ipa":
        c2 = c2 + (_TONE_LEVELS.get(tone, "") if tone is not None else "")

    return ini + g + v + c2


def _transliterate_word(word: str, seq_idx: int, mode: str) -> str:
    """Romanise one maximal run of Thai characters.

    The run is returned unchanged when it contains two tone marks at most one
    character apart, or when it matches neither syllable pattern.
    """
    if _DOUBLE_TONE_MARK_RE.search(word):
        return word

    def convert(match: re.Match[str]) -> str:
        return _transliterate_syllable(match, seq_idx, mode)

    word = _FULL_PATTERN.sub(convert, word)
    return _PARTIAL_PATTERN.sub(convert, word)


def transliterate_wiktionary(text: str, mode: str = "ipa") -> str:
    """Transliterate Thai text using Wiktionary th-pron logic.

    Every maximal run of Thai characters is matched as a whole against the
    syllable patterns, which are anchored at both ends; a run that is not a
    single recognisable syllable is left as it is. Thai digits are converted
    to ASCII digits, and the result is returned in Unicode NFC form.

    :param str text: Thai text input (single word or text fragment).
    :param str mode: Output mode: ``paiboon``, ``royin``, or ``ipa``.
        Unsupported modes return the input text unchanged.
    :return: Transliterated text.
    :rtype: str

    :Example:

        >>> transliterate_wiktionary("แมว", mode="royin")
        'maeo'
    """
    seq_idx = _SYSTEMS.get(mode)
    if seq_idx is None:
        return text

    text = re.sub(
        f"{_THAI_RANGE}+",
        lambda m: _transliterate_word(m.group(0), seq_idx, mode),
        text,
    )
    text = re.sub(
        r"[๐-๙]", lambda m: _SYMBOLS.get(m.group(0), m.group(0)), text
    )

    if mode == "royin":
        # "@" marks the initial อ: dropped at the start of the text or after a
        # space/non-word character, otherwise turned into a hyphen.
        text = re.sub(r"^@", "", text)
        text = re.sub(r"([\s\W])@", r"\1", text)
        text = text.replace("@", "-")
        # "$ng" marks initial ง: hyphenated after a vowel letter, plain "ng"
        # everywhere else.
        text = re.sub(r"^\$ng", "ng", text)
        text = re.sub(r"([\s\W])\$ng", r"\1ng", text)
        text = re.sub(r"([aeiou])\$ng", r"\1-ng", text)
        text = text.replace("$ng", "ng")
        # "ng$" marks final ง: the marker is simply removed.
        text = re.sub(r"ng\$([^\w\s])", r"ng\1", text)
        text = re.sub(r"ng\$", "ng", text)

    if mode == "ipa":
        # Spaces, hyphens and en dashes become syllable dots.
        text = re.sub(r"[ \-–]", ".", text)
        # Insert ʔ between a text-final short vowel and its tone letters.
        text = re.sub(r"([aiɯu])([˥-˩]+)$", r"\1ʔ\2", text)

    return unicodedata.normalize("NFC", text)
def get_word_dict(word: str) -> dict[str, str]:
    """Return Wiktionary transliteration outputs in all supported systems.

    The result holds the input under ``word`` followed by one entry per
    system listed in ``_SYSTEMS``, in that order.

    :param str word: Thai input word.
    :return: ``dict[str, str]`` with ``word``, ``paiboon``, ``royin``, and ``ipa``.
    :rtype: dict[str, str]

    :Example:

        >>> get_word_dict("แมว")
        {'word': 'แมว', 'paiboon': 'mɛɛo', 'royin': 'maeo', 'ipa': 'mɛːw˧'}
    """
    result: dict[str, str] = {"word": word}
    # Iterating _SYSTEMS keeps this in step with the modes that
    # transliterate_wiktionary actually supports.
    for system in _SYSTEMS:
        result[system] = transliterate_wiktionary(word, mode=system)
    return result