# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""The Royal Thai General System of Transcription (RTGS)
is the official system for rendering Thai words in the Latin alphabet.
It was published by the Royal Institute of Thailand.
:See Also:
* `Wikipedia <https://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription>`_
"""
from __future__ import annotations
import re
from pythainlp import thai_consonants, word_tokenize
# Latin vowel letters, used to tell romanized vowels apart from consonants
_ROMANIZED_VOWELS = "aeiou"

# Vowel rewrite rules, one "pattern,replacement" pair per line.
# Placeholders are expanded into regex capture groups right below the table:
#   *  any single Thai consonant
#   #  one of ค น พ ม ห (following ฤ is rendered "rue")
#   $  one of ก ต ท ป ศ ส (following ฤ is rendered "ri")
# Rules are applied strictly top to bottom, so a longer/more specific pattern
# must appear before any shorter pattern that would also match its prefix.
_vowel_patterns = """เ*ียว,\\1iao
แ*็ว,\\1aeo
เ*ือย,\\1ueai
แ*ว,\\1aeo
เ*็ว,\\1eo
เ*ว,\\1eo
*ิว,\\1io
*วย,\\1uai
เ*ย,\\1oei
*อย,\\1oi
โ*ย,\\1oi
*ุย,\\1ui
*าย,\\1ai
ไ*ย,\\1ai
*ัย,\\1ai
ไ**,\\1\\2ai
ไ*,\\1ai
ใ*,\\1ai
*ว*,\\1ua\\2
*ัวะ,\\1ua
*ัว,\\1ua
เ*ือะ,\\1uea
เ*ือ,\\1uea
เ*ียะ,\\1ia
เ*ีย,\\1ia
เ*อะ,\\1oe
เ*อ,\\1oe
เ*ิ,\\1oe
*อ,\\1o
เ*าะ,\\1o
เ*็,\\1e
โ*ะ,\\1o
โ*,\\1o
แ*ะ,\\1ae
แ*,\\1ae
เ*ะ,\\1e
*าว,\\1ao
เ*า,\\1ao
เ*,\\1e
*ู,\\1u
*ุ,\\1u
*ื,\\1ue
*ึ,\\1ue
*ี,\\1i
*ิ,\\1i
*ำ,\\1am
*า,\\1a
*ั,\\1a
*ะ,\\1a
#ฤ,\\1rue
$ฤ,\\1ri"""
# NOTE: the rule "เ*ะ -> e" used to be a second copy of "เ*าะ" (already
# consumed by the earlier "เ*าะ -> o" rule), so it could never match and
# short "e" syllables such as เตะ kept a stray "ะ" in the output.

# Expand the placeholders into capturing character classes.
_vowel_patterns = _vowel_patterns.replace("*", f"([{thai_consonants}])")
_vowel_patterns = _vowel_patterns.replace("#", "([คนพมห])")
_vowel_patterns = _vowel_patterns.replace("$", "([กตทปศส])")

# Final rule list: [regex pattern, replacement] pairs in application order.
_VOWELS = [x.split(",") for x in _vowel_patterns.split("\n")]
# Consonant romanization table.
# Maps each Thai letter to [form as syllable-initial, form as syllable-final].
_CONSONANTS = {
    thai: [as_initial, as_final]
    for thai, as_initial, as_final in (
        ("ก", "k", "k"),
        ("ข", "kh", "k"),
        ("ฃ", "kh", "k"),
        ("ค", "kh", "k"),
        ("ฅ", "kh", "k"),
        ("ฆ", "kh", "k"),
        ("ง", "ng", "ng"),
        ("จ", "ch", "t"),
        ("ฉ", "ch", "t"),
        ("ช", "ch", "t"),
        ("ซ", "s", "t"),
        ("ฌ", "ch", "t"),
        ("ญ", "y", "n"),
        ("ฎ", "d", "t"),
        ("ฏ", "t", "t"),
        ("ฐ", "th", "t"),
        # ฑ as an initial consonant can also be pronounced "d"
        ("ฑ", "th", "t"),
        ("ฒ", "th", "t"),
        ("ณ", "n", "n"),
        ("ด", "d", "t"),
        ("ต", "t", "t"),
        ("ถ", "th", "t"),
        ("ท", "th", "t"),
        ("ธ", "th", "t"),
        ("น", "n", "n"),
        ("บ", "b", "p"),
        ("ป", "p", "p"),
        ("ผ", "ph", "p"),
        ("ฝ", "f", "p"),
        ("พ", "ph", "p"),
        ("ฟ", "f", "p"),
        ("ภ", "ph", "p"),
        ("ม", "m", "m"),
        ("ย", "y", ""),
        ("ร", "r", "n"),
        ("ฤ", "rue", ""),
        ("ล", "l", "n"),
        ("ว", "w", ""),
        ("ศ", "s", "t"),
        ("ษ", "s", "t"),
        ("ส", "s", "t"),
        ("ห", "h", ""),
        ("ฬ", "l", "n"),
        ("อ", "", ""),
        ("ฮ", "h", ""),
    )
}
# Thanthakhat sign (U+0E4C); in the patterns below it always follows the
# letter(s) that are to be dropped together with it.
_THANTHAKHAT = "\u0e4c"
# Matches one Thai consonant (any character listed in thai_consonants).
_RE_CONSONANT = re.compile(f"[{thai_consonants}]")
# Matches every span that _normalize() deletes before romanization:
#   - the five listed two-consonant endings that carry a thanthakhat
#   - any single consonant followed by a thanthakhat
#   - a consonant, one character in U+0E30..U+0E39, then a thanthakhat
#   - any standalone character from the sign class on the last line
# The two-consonant alternatives come first so they win over the
# single-consonant alternative at the same position.
_RE_NORMALIZE = re.compile(
    f"จน์|มณ์|ณฑ์|ทร์|ตร์|[{thai_consonants}]{_THANTHAKHAT}|"
    f"[{thai_consonants}][\u0e30-\u0e39]{_THANTHAKHAT}"
    # Paiyannoi, Maiyamok, Tonemarks, Thanthakhat, Nikhahit, other signs
    r"|[\u0e2f\u0e46\u0e48-\u0e4f\u0e5a\u0e5b]"
)
def _normalize(word: str) -> str:
    """Strip the characters that take no part in romanization.

    Every span matched by ``_RE_NORMALIZE`` is deleted: letters silenced by
    a thanthakhat, paiyannoi, maiyamok, tone marks and the other signs
    listed in that pattern.

    :param word: Thai word to clean
    :return: *word* with all matched spans removed
    """
    return re.sub(_RE_NORMALIZE, "", word)
def _replace_vowels(word: str) -> str:
    """Rewrite Thai vowel forms in *word* as Latin letters.

    Each ``[pattern, replacement]`` rule of ``_VOWELS`` is applied in table
    order, every rule operating on the output of the previous one.

    :param word: normalized Thai word
    :return: the word with all matching vowel forms replaced
    """
    result = word
    for pattern, replacement in _VOWELS:
        result = re.sub(pattern, replacement, result)
    return result
def _replace_consonants(word: str, consonants: str) -> str:
    """Rewrite the Thai consonants left in *word* as Latin letters.

    *word* is expected to have gone through vowel replacement already, so it
    is a mix of Thai consonants and Latin vowel letters. The string is
    scanned left to right while tracking whether a vowel has been seen in
    the current syllable; that decides whether a consonant is rendered in
    its initial or final form and where an implicit "a" / "o" is inserted.

    Each consonant is looked up by the character itself rather than through
    a running index into *consonants*. The two are identical as long as
    *consonants* lists exactly the characters handled here, but
    ``_CONSONANTS`` also contains ฤ; if the caller's list omits it
    (NOTE(review): assumes ``thai_consonants`` has no ฤ - confirm), the old
    index drifted and could run past the end of the list.

    :param word: word after normalization and vowel replacement
    :param consonants: the Thai consonants found in *word*, in order
        (``_romanize`` passes the list returned by ``findall``); only its
        emptiness and length are consulted
    :return: the romanized word
    """
    _HO_HIP = "\u0e2b"  # ห
    _RO_RUA = "\u0e23"  # ร
    _LO_LING = "\u0e25"  # ล
    _WO_WAEN = "\u0e27"  # ว
    _DOUBLE_RO_RUA = _RO_RUA + _RO_RUA
    # Consonants that can be second in an initial cluster
    _CLUSTER_SECOND = {_RO_RUA, _LO_LING, _WO_WAEN}

    if not consonants:
        return word

    last_index = len(word) - 1
    skip_next = False  # set while standing on the first ร of a รร pair
    vowel_seen = False  # a vowel was emitted in the current syllable
    mod_chars = []

    for i, char in enumerate(word):
        if skip_next:
            # Second ร of a double RO RUA: already rendered with the first.
            skip_next = False
            continue

        if char not in _CONSONANTS:
            # Anything that is not a Thai consonant is copied through.
            vowel_seen = True
            mod_chars.append(char)
            continue

        if not mod_chars and char == _HO_HIP and len(consonants) != 1:
            # Leading ห is dropped unless it is the word's only consonant.
            continue

        if word[i : i + 2] == _DOUBLE_RO_RUA:
            # รร reads "an" at the very end of the word, "a" elsewhere.
            skip_next = True
            mod_chars.append("a")
            if i + 2 == len(word):
                mod_chars.append("n")
            vowel_seen = True
            continue

        initial, final = _CONSONANTS[char]

        if vowel_seen:
            # After a vowel: a consonant directly followed by another vowel
            # opens the next syllable, otherwise it closes this one.
            has_vowel_next = (
                i < last_index and word[i + 1] not in _CONSONANTS
            )
            if has_vowel_next:
                mod_chars.append(initial)
                vowel_seen = False  # new syllable starts here
            else:
                mod_chars.append(final)
            continue

        # No vowel yet in this syllable: still building the onset.
        # mod_chars holds romanized output, so any non-empty entry that is
        # not a Latin vowel means a consonant has already been emitted.
        has_initial = any(
            c and c not in _ROMANIZED_VOWELS for c in mod_chars
        )
        if not has_initial:
            # First consonant of the word; อ has an empty initial form.
            if initial:
                mod_chars.append(initial)
        elif i == last_index:
            # Word ends on a consonant with no vowel written: implicit "o".
            mod_chars.append("o")
            mod_chars.append(final)
            vowel_seen = True
        elif char in _CLUSTER_SECOND:
            # ร / ล / ว right after an initial consonant join its cluster.
            mod_chars.append(initial)
        else:
            # Any other consonant starts a new syllable; the previous one
            # receives an implicit "a".
            mod_chars.append("a")
            if initial:
                mod_chars.append(initial)

    return "".join(mod_chars)
# support function for romanize()
def _romanize(word: str) -> str:
    """Romanize one token.

    The token is normalized, its vowels are rewritten, and the remaining
    consonants are then converted.

    :param word: a single Thai word/token
    :return: the romanized token
    """
    # A lone ห is treated as silent.
    if word == "ห":
        return ""

    prepared = _replace_vowels(_normalize(word))
    found = _RE_CONSONANT.findall(prepared)

    # Exactly two bare consonants: put the implicit vowel "o" between them.
    if len(prepared) == 2 and len(found) == 2:
        prepared = prepared[0] + "o" + prepared[1]

    return _replace_consonants(prepared, found)
def _should_add_syllable_separator(
    prev_word: str, curr_word: str, prev_romanized: str
) -> bool:
    """Decide whether an 'a' goes between two adjacent romanized tokens.

    The answer is True only when all of the following hold:

    - the previous token produced a non-empty romanization that does not
      end in a Latin vowel;
    - the current token consists of exactly two Thai consonants and nothing
      else (e.g., 'กร');
    - the previous token, once normalized and vowel-replaced, is longer
      than the number of Thai consonants in the original previous token
      (used as the sign that it carries an explicit vowel).

    :param prev_word: The previous Thai word/token
    :param curr_word: The current Thai word/token
    :param prev_romanized: The romanized form of the previous word
    :return: True if 'a' should be added before the current word
    """
    if not prev_romanized or len(curr_word) < 2:
        return False

    # Current token: two consonants, no other characters.
    curr_consonant_count = len(_RE_CONSONANT.findall(curr_word))
    if curr_consonant_count != 2 or len(curr_word) != curr_consonant_count:
        return False

    # Previous romanization must end on a consonant sound.
    if prev_romanized[-1] in _ROMANIZED_VOWELS:
        return False

    # Explicit-vowel test on the previous token.
    reduced_prev = _replace_vowels(_normalize(prev_word))
    prev_consonant_count = len(_RE_CONSONANT.findall(prev_word))
    return len(reduced_prev) > prev_consonant_count
# Public entry point
def romanize(text: str) -> str:
    """Render Thai words in Latin alphabet, using RTGS

    Royal Thai General System of Transcription (RTGS),
    is the official system by the Royal Institute of Thailand.

    The text is tokenized, each token is romanized on its own, and the
    pieces are concatenated without spaces. A linking 'a' is prefixed to a
    token when ``_should_add_syllable_separator`` asks for one.

    :param text: Thai text to be romanized
    :type text: str
    :return: A string of Thai words rendered in the Latin alphabet
    :rtype: str
    """
    pieces = []
    previous_token = None

    for token in word_tokenize(text):
        latin = _romanize(token)

        # Only a non-first token with non-empty output can take the 'a'.
        if previous_token is not None and latin:
            if _should_add_syllable_separator(
                previous_token, token, pieces[-1]
            ):
                latin = "a" + latin

        pieces.append(latin)
        previous_token = token

    return "".join(pieces)