Source code for pythainlp.util.thai

# -*- coding: utf-8 -*-
"""
Check if it is Thai text
"""
import string

from pythainlp import thai_above_vowels, thai_tonemarks

_DEFAULT_IGNORE_CHARS = string.whitespace + string.digits + string.punctuation
_TH_FIRST_CHAR_ASCII = 3584
_TH_LAST_CHAR_ASCII = 3711

[docs]def isthaichar(ch: str) -> bool:
    """
    This function checks if the input character is a Thai character.

    :param str ch: input character

    :return: returns **True** if the input character is a Thai characttr,
             otherwise returns **False**
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import isthaichar

        isthaichar("ก") # THAI CHARACTER KO KAI
        # output: True

        isthaichar("๐") # THAI DIGIT ZERO
        # output: True

        isthaichar("๕") # THAI DIGIT FIVE
        # output: True
    """
    ch_val = ord(ch)
    if ch_val >= _TH_FIRST_CHAR_ASCII and ch_val <= _TH_LAST_CHAR_ASCII:
        return True
    return False


[docs]def isthai(word: str, ignore_chars: str = ".") -> bool:
    """
    This function checks if all character in the input string
    are Thai character.

    :param str word: input text
    :param str ignore_chars: string characters to be ignored
                             (i.e. will be considered as Thai)

    :return: returns **True** if the input text all contains Thai characters,
             otherwise returns **False**
    :rtype: bool

    :Example:

    Check if all character is Thai character. By default,
    it ignores only full stop (".")::

        from pythainlp.util import isthai

        isthai("กาลเวลา")
        # output: True

        isthai("กาลเวลา.")
        # output: True

    Explicitly ignore digits, whitespace, and the following characters
    ("-", ".", "$", ",")::

        from pythainlp.util import isthai

        isthai("กาลเวลา, การเวลา-ก,  3.75$", ignore_chars="1234567890.-,$ ")
        # output: True

    """
    if not ignore_chars:
        ignore_chars = ""

    for ch in word:
        if ch not in ignore_chars and not isthaichar(ch):
            return False
    return True


[docs]def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
    """
    This function calculates percentage of Thai characters in the text
    with an option to ignored some characters.

    :param str text: input text
    :param str ignore_chars: string of characters to ignore from counting.
                             By default, the ignored characters are whitespace,
                             newline, digits, and punctuation.

    :return: percentage of Thai characters in the text
    :rtype: float

    :Example:

    Find the percentage of Thai characters in the textt with default
    ignored characters set (whitespace, newline character,
    punctuation and digits)::

        from pythainlp.util import countthai

        countthai("ดอนัลด์ จอห์น ทรัมป์ English: Donald John Trump")
        # output: 45.0

        countthai("(English: Donald John Trump)")
        # output: 0.0

    Find the percentage of Thai characters in the text while ignoring
    only punctuation but not whitespace, newline character and digits::

        import string

        string.punctuation
        # output: !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~

        countthai("ดอนัลด์ จอห์น ทรัมป์ English: Donald John Trump", \\
            ignore_chars=string.punctuation)
        # output: 39.130434782608695

        countthai("ดอนัลด์ จอห์น ทรัมป์ (English: Donald John Trump)", \\
            ignore_chars=string.punctuation)
        # output: 0.0
    """
    if not text or not isinstance(text, str):
        return 0.0

    if not ignore_chars:
        ignore_chars = ""

    num_thai = 0
    num_ignore = 0

    for ch in text:
        if ch in ignore_chars:
            num_ignore += 1
        elif isthaichar(ch):
            num_thai += 1

    num_count = len(text) - num_ignore

    if num_count == 0:
        return 0.0

    return (num_thai / num_count) * 100


[docs]def display_thai_char(char: str) -> str:
    """
    This function adds a underscore (_) prefix to high-position vowels and tone
    marks to ease readability

    :param str character:

    :return: returns **True** if the input text all contains Thai characters,
             otherwise returns **False**
    :rtype: bool

    :Example:

    display_thai_char("้")
    # output: "_้"

    """

    if char in thai_above_vowels or char in thai_tonemarks \
       or char in '\u0e33\u0e4c\u0e4d\u0e4e':
        # last condition is Sra Aum, Thanthakhat, Nikhahit, Yamakkan
        return "_" + char
    else:
        return char