# Source code for pythainlp.util.khuap_klam

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Thai consonant cluster (Kham Khuap Klam) checker."""

from __future__ import annotations

import re
from typing import Optional

# Regex for true consonant clusters (คำควบกล้ำแท้):
# initial consonants ก ข ค ต ป ผ พ ฟ บ followed by ร ล ว
# Anchored at the start of the string; check_khuap_klam applies it to the
# first syllable of the pronunciation reading (after leading vowels are
# stripped), not to the written word.
_TRUE_CLUSTER_RE = re.compile(r"^[กขคตปผพฟบ][รลว]")

# Regex for false consonant clusters (คำควบกล้ำไม่แท้):
# written forms that look like clusters but are not pronounced as such
# Anchored at the start of the string; check_khuap_klam applies it to the
# written word (after leading vowels are stripped).
_FALSE_CLUSTER_RE = re.compile(r"^(ทร|จร|ศร|สร|ซร)")

# Leading vowels that appear before the initial consonant in written Thai
# Matches one or more of เ แ โ ใ ไ at the very start of the string.
_LEAD_VOWEL_RE = re.compile(r"^[เแโใไ]+")


def _strip_lead_vowels(text: str) -> str:
    """Return *text* without its run of leading vowels (เ แ โ ใ ไ).

    Only vowels at the very beginning of *text* are dropped; the same
    characters appearing later in the string are left untouched.

    :param str text: string to trim.
    :return: *text* with any leading vowel run removed, or *text* itself
        when it does not start with one of those vowels.
    :rtype: str
    """
    lead = _LEAD_VOWEL_RE.match(text)
    if lead is None:
        # Nothing to trim: the string does not start with a leading vowel.
        return text
    return text[lead.end():]


def check_khuap_klam(word: str) -> Optional[bool]:
    """Check whether a Thai word is a consonant cluster (Kham Khuap Klam).

    The decision combines two views of the word:

    * the *pronunciation* of its first syllable (from
      ``pronunciate(word, engine="w2p")``), which is tested for a true
      cluster, and
    * the *written* form, which is tested for a false-cluster spelling.

    The true-cluster test takes precedence over the false-cluster test.
    Leading vowels (เ แ โ ใ ไ) are ignored in both views.

    :param str word: Thai word to check.
    :return: ``True`` if the word is a *true* consonant cluster
        (คำควบกล้ำแท้), ``False`` if it is a *false* consonant cluster
        (คำควบกล้ำไม่แท้), or ``None`` if it is not a consonant cluster
        (including when *word* is empty).
    :rtype: Optional[bool]

    :Example:
    ::

        from pythainlp.util import check_khuap_klam

        # True consonant clusters (คำควบกล้ำแท้)
        print(check_khuap_klam("กราบ"))  # True
        print(check_khuap_klam("ปลา"))  # True
        print(check_khuap_klam("เพราะ"))  # True
        print(check_khuap_klam("ตรง"))  # True

        # False consonant clusters (คำควบกล้ำไม่แท้)
        print(check_khuap_klam("จริง"))  # False
        print(check_khuap_klam("ทราย"))  # False
        print(check_khuap_klam("เศร้า"))  # False

        # Not a consonant cluster
        print(check_khuap_klam("แม่"))  # None
        print(check_khuap_klam("ตา"))  # None
    """
    if not word:
        return None

    # Imported at call time rather than at module import time.
    # NOTE(review): presumably to avoid an import cycle or eager loading of
    # the transliteration engine -- confirm before hoisting to file level.
    from ..transliterate import pronunciate

    # Convert to pronunciation; remove the sub-consonant marker
    # (phinthu, U+0E3A) so it cannot sit between the cluster consonants.
    reading = pronunciate(word, engine="w2p").replace("\u0e3a", "")

    # Syllables in the reading are separated by "-"; only the first
    # syllable is relevant for the cluster check.
    first_syll_reading = reading.split("-")[0]

    # A true cluster is judged by how the word sounds.
    reading_core = _strip_lead_vowels(first_syll_reading)
    if _TRUE_CLUSTER_RE.match(reading_core):
        return True

    # A false cluster is judged by how the word is spelled.
    written_core = _strip_lead_vowels(word)
    if _FALSE_CLUSTER_RE.match(written_core):
        return False

    return None