Source code for pythainlp.util.thai

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Check if it is Thai text"""

from __future__ import annotations

import string
from collections import defaultdict
from types import MappingProxyType
from typing import Optional

from pythainlp import (
    thai_above_vowels,
    thai_below_vowels,
    thai_consonants,
    thai_digits,
    thai_follow_vowels,
    thai_lead_vowels,
    thai_punctuations,
    thai_signs,
    thai_tonemarks,
    thai_vowels,
)
from pythainlp.tools import warn_deprecation

# Characters skipped by default when measuring the proportion of Thai text:
# ASCII whitespace, ASCII digits, and ASCII punctuation.
_DEFAULT_IGNORE_CHARS: str = "".join(
    (string.whitespace, string.digits, string.punctuation)
)

# Inclusive code-point bounds used to decide whether a character is Thai.
# NOTE: despite the "_ASCII" suffix these are Unicode code points
# (0x0E00 == 3584 and 0x0E7F == 3711), not ASCII values.
_TH_FIRST_CHAR_ASCII: int = 0x0E00
_TH_LAST_CHAR_ASCII: int = 0x0E7F

# Map of Thai characters to their descriptive (Thai-language) names.
# Consonants, digits, and a few standalone letters/symbols map to themselves;
# vowels, tone marks, signs, and punctuation map to their conventional names.
# MappingProxyType makes this constant read-only at runtime.
_THAI_CHAR_NAMES: MappingProxyType[str, str] = MappingProxyType(
    {
        # Consonants (each maps to itself)
        **{char: char for char in thai_consonants},
        # Vowels and Signs
        "\u0e24": "ฤ",
        "\u0e26": "ฦ",
        "\u0e30": "สระ อะ",
        "\u0e31": "ไม้หันอากาศ",
        "\u0e32": "สระ อา",
        "\u0e33": "สระ อำ",
        "\u0e34": "สระ อิ",
        "\u0e35": "สระ อี",
        "\u0e36": "สระ อึ",
        "\u0e37": "สระ อือ",
        "\u0e38": "สระ อุ",
        "\u0e39": "สระ อู",
        "\u0e40": "สระ เอ",
        "\u0e41": "สระ แอ",
        "\u0e42": "สระ โอ",
        "\u0e43": "สระ ใอ",
        "\u0e44": "สระ ไอ",
        # U+0E45 is THAI CHARACTER LAKKHANGYAO (ๅ). It was previously
        # mislabelled as "ไม้ม้วน" (mai muan), which is the name of
        # U+0E43 (ใ) already covered above.
        "\u0e45": "ลากข้างยาว",
        "\u0e4d": "นฤคหิต",
        "\u0e47": "ไม้ไต่คู้",
        # Tone Marks
        "\u0e48": "ไม้เอก",
        "\u0e49": "ไม้โท",
        "\u0e4a": "ไม้ตรี",
        "\u0e4b": "ไม้จัตวา",
        # Other Signs
        "\u0e2f": "ไปยาลน้อย",
        "\u0e3a": "พินทุ",
        "\u0e46": "ไม้ยมก",
        "\u0e4c": "การันต์",
        "\u0e4e": "ยามักการ",
        # Punctuation
        "\u0e4f": "ฟองมัน",
        "\u0e5a": "อังคั่นคู่",
        "\u0e5b": "โคมุต",
        # Digits (each maps to itself)
        **{char: char for char in thai_digits},
        # Symbol (Thai baht currency sign)
        "\u0e3f": "฿",
    }
)


def is_thai_char(ch: str) -> bool:
    """Tell whether a single character falls in the Thai code-point range.

    The check is an inclusive range test of ``ord(ch)`` against the
    module-level first/last Thai code-point constants.

    :param ch: a single character (``ord`` rejects strings of any
        other length)
    :type ch: str
    :return: True if the code point of ch lies within the range,
        otherwise False.
    :rtype: bool

    :Example:

        >>> from pythainlp.util import is_thai_char
        >>> is_thai_char("ก")  # THAI CHARACTER KO KAI
        True
        >>> is_thai_char("๕")  # THAI DIGIT FIVE
        True
    """
    return _TH_FIRST_CHAR_ASCII <= ord(ch) <= _TH_LAST_CHAR_ASCII


def isthaichar(ch: str) -> bool:
    """Deprecated alias of :func:`is_thai_char`.

    Emits a deprecation warning through ``warn_deprecation`` and then
    delegates to :func:`is_thai_char`.

    .. deprecated:: 5.3.2
        Use :func:`is_thai_char` instead.

    :param ch: input character
    :type ch: str
    :return: whatever :func:`is_thai_char` returns for ch
    :rtype: bool
    """
    warn_deprecation(
        "pythainlp.util.isthaichar",
        "pythainlp.util.is_thai_char",
        "5.3.2",
        "6.0",  # presumably the planned removal version; confirm in warn_deprecation
    )
    outcome = is_thai_char(ch)
    return outcome
def is_thai(text: str, ignore_chars: str = ".") -> bool:
    """Tell whether a string consists solely of Thai characters.

    Characters listed in ``ignore_chars`` are exempt from the check.
    An empty ``text`` yields True because no character fails the test.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to "."
    :type ignore_chars: str, optional
    :return: True if every non-ignored character is Thai, otherwise False.
    :rtype: bool

    :Example:

        >>> from pythainlp.util import is_thai
        >>> is_thai("กาลเวลา")
        True
        >>> is_thai("กาลเวลา.")
        True
        >>> is_thai("กาล-เวลา")
        False
        >>> is_thai("กาล-เวลา +66", ignore_chars="01234567890+-., ")
        True
    """
    # A falsy ignore_chars (e.g. None or "") means "ignore nothing".
    skipped = ignore_chars or ""
    return all(ch in skipped or is_thai_char(ch) for ch in text)
def isthai(text: str, ignore_chars: str = ".") -> bool:
    """Deprecated alias of :func:`is_thai`.

    Emits a deprecation warning through ``warn_deprecation`` and then
    delegates to :func:`is_thai` with the same arguments.

    .. deprecated:: 5.3.2
        Use :func:`is_thai` instead.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to "."
    :type ignore_chars: str, optional
    :return: whatever :func:`is_thai` returns for the given arguments
    :rtype: bool
    """
    warn_deprecation(
        "pythainlp.util.isthai",
        "pythainlp.util.is_thai",
        "5.3.2",
        "6.0",  # presumably the planned removal version; confirm in warn_deprecation
    )
    return is_thai(text, ignore_chars=ignore_chars)
def count_thai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
    """Compute the percentage of Thai characters in a text.

    Characters found in ``ignore_chars`` are excluded from both the
    numerator and the denominator. Returns 0.0 when ``text`` is empty,
    is not a ``str``, or contains only ignored characters.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to whitespace,
        digits, and punctuation marks.
    :type ignore_chars: str, optional
    :return: proportion of Thai characters in the text (percentage)
    :rtype: float

    :Example:

        >>> from pythainlp.util import count_thai
        >>> count_thai("ไทยเอ็นแอลพี 3.0")
        100.0
        >>> count_thai("PyThaiNLP 3.0")
        0.0
        >>> count_thai("ใช้งาน PyThaiNLP 3.0")
        40.0
        >>> count_thai("ใช้งาน PyThaiNLP 3.0", ignore_chars="")
        30.0
    """
    if not (text and isinstance(text, str)):
        return 0.0

    # A falsy ignore_chars (e.g. None or "") means "ignore nothing".
    skipped = ignore_chars or ""

    # Only characters that are not ignored take part in the ratio.
    considered = [ch for ch in text if ch not in skipped]
    if not considered:
        return 0.0

    thai_total = sum(1 for ch in considered if is_thai_char(ch))
    return (thai_total / len(considered)) * 100
def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
    """Deprecated alias of :func:`count_thai`.

    Emits a deprecation warning through ``warn_deprecation`` and then
    delegates to :func:`count_thai` with the same arguments.

    .. deprecated:: 5.3.2
        Use :func:`count_thai` instead.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to whitespace,
        digits, and punctuation marks.
    :type ignore_chars: str, optional
    :return: whatever :func:`count_thai` returns for the given arguments
    :rtype: float
    """
    warn_deprecation(
        "pythainlp.util.countthai",
        "pythainlp.util.count_thai",
        "5.3.2",
        "6.0",  # presumably the planned removal version; confirm in warn_deprecation
    )
    return count_thai(text, ignore_chars=ignore_chars)
def display_thai_char(ch: str) -> str:
    """Make a combining Thai mark easier to read by prefixing an underscore.

    The prefix is added when ch is an above vowel, a tone mark, or one of
    Sara Am, Thanthakhat, Nikhahit, Yamakkan; any other input is returned
    unchanged.

    :param ch: input character
    :type ch: str
    :return: "_" + ch for the characters described above, otherwise ch
    :rtype: str

    :Example:

        >>> from pythainlp.util import display_thai_char
        >>> display_thai_char("้")
        '_้'
    """
    needs_prefix = (
        ch in thai_above_vowels
        or ch in thai_tonemarks
        # Sara Am, Thanthakhat, Nikhahit, Yamakkan
        or ch in "\u0e33\u0e4c\u0e4d\u0e4e"
    )
    return "_" + ch if needs_prefix else ch
def thai_word_tone_detector(word: Optional[str]) -> list[tuple[str, str]]:
    """Detect the tone of each syllable of a Thai word.

    The word is converted to a pronunciation string with
    ``pythainlp.transliterate.pronunciate``, split on "-" into syllables,
    and each syllable is passed to ``tone_detector``.

    :param word: Thai word, or None
    :type word: str, optional
    :return: list of tuples (syllable, tone) for each syllable.
        Tone values: ``l`` (low), ``m`` (mid), ``h`` (high),
        ``r`` (rising), ``f`` (falling), or empty string if
        it cannot be detected. Returns ``[]`` if word is None or empty.
    :rtype: list[tuple[str, str]]

    :Example:

        >>> from pythainlp.util import thai_word_tone_detector
        >>> print(thai_word_tone_detector("คนดี"))
        [('คน', 'm'), ('ดี', 'm')]
        >>> print(thai_word_tone_detector("มือถือ"))
        [('มือ', 'm'), ('ถือ', 'r')]
        >>> print(thai_word_tone_detector(None))
        []
    """
    if not word:
        return []

    # Imported lazily inside the function, as in the original code
    # (presumably to avoid import cycles or heavy import cost; confirm).
    from ..transliterate import pronunciate
    from ..util.syllable import tone_detector

    syllables = pronunciate(word).split("-")
    # "หฺ" (ho hip followed by phinthu) is replaced by a plain "ห" before
    # tone detection; the reported syllable itself is left untouched.
    tones = (tone_detector(syl.replace("หฺ", "ห")) for syl in syllables)
    return list(zip(syllables, tones))
def count_thai_chars(text: str) -> dict[str, int]:
    """Count Thai characters by type.

    Count Thai characters by type: consonants, vowels, lead_vowels,
    follow_vowels, above_vowels, below_vowels, tonemarks, signs,
    thai_digits, punctuations, and non_thai.

    ``vowels`` is an umbrella count that is tallied independently of the
    other categories, so a vowel is normally counted twice: once under
    ``vowels`` and once under its positional subtype. A character that is
    in ``thai_vowels`` but matches no other category is counted under
    ``vowels`` only; it is never reported as ``non_thai``.

    :param str text: input text
    :return: dict with counts of Thai characters by type
    :rtype: dict[str, int]

    :Example:

        >>> from pythainlp.util import count_thai_chars
        >>> count_thai_chars("ทดสอบภาษาไทย")  # doctest: +NORMALIZE_WHITESPACE
        {
            'vowels': 3,
            'lead_vowels': 1,
            'follow_vowels': 2,
            'above_vowels': 0,
            'below_vowels': 0,
            'consonants': 9,
            'tonemarks': 0,
            'signs': 0,
            'thai_digits': 0,
            'punctuations': 0,
            'non_thai': 0
        }
    """
    counts = {
        "vowels": 0,
        "lead_vowels": 0,
        "follow_vowels": 0,
        "above_vowels": 0,
        "below_vowels": 0,
        "consonants": 0,
        "tonemarks": 0,
        "signs": 0,
        "thai_digits": 0,
        "punctuations": 0,
        "non_thai": 0,
    }
    for c in text:
        # The umbrella vowel count is separate from the exclusive chain below.
        is_vowel = c in thai_vowels
        if is_vowel:
            counts["vowels"] += 1

        if c in thai_lead_vowels:
            counts["lead_vowels"] += 1
        elif c in thai_follow_vowels:
            counts["follow_vowels"] += 1
        elif c in thai_above_vowels:
            counts["above_vowels"] += 1
        elif c in thai_below_vowels:
            counts["below_vowels"] += 1
        elif c in thai_consonants:
            counts["consonants"] += 1
        elif c in thai_tonemarks:
            counts["tonemarks"] += 1
        elif c in thai_signs:
            counts["signs"] += 1
        elif c in thai_digits:
            counts["thai_digits"] += 1
        elif c in thai_punctuations:
            counts["punctuations"] += 1
        elif not is_vowel:
            # Previously an unconditional ``else``: a vowel belonging to no
            # positional subset (presumably ฤ / ฦ; confirm against the
            # pythainlp character constants) was also counted as non-Thai.
            counts["non_thai"] += 1
    return counts
def analyze_thai_text(text: str) -> dict[str, int]:
    """Tally the characters of a text under their descriptive names.

    Each character is looked up in the module's Thai character-name
    mapping; characters absent from that mapping are tallied under
    themselves. Keys appear in order of first occurrence in ``text``.

    :param str text: Thai text string to be analyzed
    :return: dict mapping character names to their count in the text
    :rtype: dict[str, int]

    :Example:

        >>> from pythainlp.util import analyze_thai_text
        >>> analyze_thai_text("คนดี")
        {'ค': 1, 'น': 1, 'ด': 1, 'สระ อี': 1}
        >>> analyze_thai_text("เล่น")
        {'สระ เอ': 1, 'ล': 1, 'ไม้เอก': 1, 'น': 1}
    """
    tally: dict[str, int] = defaultdict(int)
    for ch in text:
        # Fall back to the character itself when it has no descriptive name.
        tally[_THAI_CHAR_NAMES.get(ch, ch)] += 1
    # Return a plain dict so callers do not get defaultdict auto-insertion.
    return dict(tally)