Source code for pythainlp.util.thai

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Check if it is Thai text
"""
import string
from typing import List, Tuple

from pythainlp import (
    thai_lead_vowels,
    thai_follow_vowels,
    thai_above_vowels,
    thai_below_vowels,
    thai_consonants,
    thai_vowels,
    thai_tonemarks,
    thai_signs,
    thai_digits,
    thai_punctuations,
)
from pythainlp.transliterate import pronunciate
from pythainlp.util.syllable import tone_detector

# Default value of countthai()'s ``ignore_chars`` parameter: the ASCII
# whitespace, digit and punctuation characters from the ``string`` module.
_DEFAULT_IGNORE_CHARS = string.whitespace + string.digits + string.punctuation
# Inclusive bounds used by isthaichar(): 3584 == 0x0E00 and 3711 == 0x0E7F,
# i.e. the Unicode "Thai" block. Despite the "_ASCII" suffix these values are
# compared against ord() results (Unicode code points), not ASCII codes.
_TH_FIRST_CHAR_ASCII = 3584
_TH_LAST_CHAR_ASCII = 3711


def isthaichar(ch: str) -> bool:
    """Tell whether a single character falls inside the Thai code-point range.

    The check is purely numeric: ``ord(ch)`` must lie between
    ``_TH_FIRST_CHAR_ASCII`` and ``_TH_LAST_CHAR_ASCII``, both inclusive.

    :param ch: input character (a string of length one, as required by
        :func:`ord`)
    :type ch: str
    :return: True if ch is a Thai character, otherwise False.
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import isthaichar

        isthaichar("ก")  # THAI CHARACTER KO KAI
        # output: True

        isthaichar("๕")  # THAI DIGIT FIVE
        # output: True
    """
    # Chained comparison keeps both bounds inclusive.
    return _TH_FIRST_CHAR_ASCII <= ord(ch) <= _TH_LAST_CHAR_ASCII
def isthai(text: str, ignore_chars: str = ".") -> bool:
    """Report whether every non-ignored character of a string is Thai.

    Characters listed in ``ignore_chars`` are skipped; every other character
    must satisfy :func:`isthaichar`. An empty string yields True.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to "."
    :type ignore_chars: str, optional
    :return: True if every character in the input string is Thai (or
        ignored), otherwise False.
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import isthai

        isthai("กาลเวลา")
        # output: True

        isthai("กาลเวลา.")
        # output: True

        isthai("กาล-เวลา")
        # output: False

        isthai("กาล-เวลา +66", ignore_chars="01234567890+-., ")
        # output: True
    """
    # A falsy value (None, "") means "ignore nothing".
    skipped = ignore_chars or ""
    return all(ch in skipped or isthaichar(ch) for ch in text)
def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
    """Compute the percentage of Thai characters in a text.

    Characters listed in ``ignore_chars`` are excluded from both the
    numerator and the denominator. If the input is empty, is not a string,
    or consists only of ignored characters, the result is 0.0.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to whitespaces,
        digits, and punctuations.
    :type ignore_chars: str, optional
    :return: proportion of Thai characters in the text (percent)
    :rtype: float

    :Example:
    ::

        from pythainlp.util import countthai

        countthai("ไทยเอ็นแอลพี 3.0")
        # output: 100.0

        countthai("PyThaiNLP 3.0")
        # output: 0.0

        countthai("ใช้งาน PyThaiNLP 3.0")
        # output: 40.0

        countthai("ใช้งาน PyThaiNLP 3.0", ignore_chars="")
        # output: 30.0
    """
    if not text or not isinstance(text, str):
        return 0.0

    # A falsy value (None, "") means "ignore nothing".
    skipped = ignore_chars or ""

    # Only characters outside the ignore set take part in the ratio.
    counted = [ch for ch in text if ch not in skipped]
    if not counted:
        return 0.0

    thai_total = sum(1 for ch in counted if isthaichar(ch))
    return (thai_total / len(counted)) * 100
def display_thai_char(ch: str) -> str:
    """Make a Thai above-vowel, tone mark, or similar sign easier to read
    by putting an underscore (_) in front of it.

    Any other input is returned unchanged.

    :param ch: input character
    :type ch: str
    :return: "_" + ch for the characters described above, otherwise ch
    :rtype: str

    :Example:
    ::

        from pythainlp.util import display_thai_char

        display_thai_char("้")
        # output: "_้"
    """
    needs_prefix = (
        ch in thai_above_vowels
        or ch in thai_tonemarks
        # Sara Am, Thanthakhat, Nikhahit, Yamakkan
        or ch in "\u0e33\u0e4c\u0e4d\u0e4e"
    )
    return "_" + ch if needs_prefix else ch
def thai_word_tone_detector(word: str) -> List[Tuple[str, str]]:
    """
    Thai tone detector for a word.

    The word is converted to its pronunciation with
    pythainlp.transliterate.pronunciate, the result is split on "-" into
    syllables, and each syllable is passed to tone_detector.

    :param str word: Thai word.
    :return: one ``(syllable, tone)`` pair per syllable of the
        pronunciation; tone is l, m, h, r, f, or empty if it cannot be
        detected.
    :rtype: List[Tuple[str, str]]

    :Example:
    ::

        from pythainlp.util import thai_word_tone_detector

        print(thai_word_tone_detector("คนดี"))
        # output: [('คน', 'm'), ('ดี', 'm')]

        print(thai_word_tone_detector("มือถือ"))
        # output: [('มือ', 'm'), ('ถือ', 'r')]
    """
    syllables = pronunciate(word).split("-")
    return [
        # The tone is detected on a copy with "หฺ" replaced by a plain "ห";
        # the syllable handed back to the caller keeps its original form.
        (syllable, tone_detector(syllable.replace("หฺ", "ห")))
        for syllable in syllables
    ]
def count_thai_chars(text: str) -> dict:
    """
    Count Thai characters by type.

    Every character of ``text`` is assigned to at most one of the exclusive
    categories (lead_vowels, follow_vowels, above_vowels, below_vowels,
    consonants, tonemarks, signs, thai_digits, punctuations, non_thai),
    tested in that order. Independently of that, every character found in
    ``thai_vowels`` also increments the aggregate "vowels" counter.

    A character that is in ``thai_vowels`` but in none of the exclusive
    categories is counted under "vowels" only; it is never counted as
    "non_thai".

    :param str text: Text
    :return: Dict with numbers of Thai characters by type
    :rtype: dict

    :Example:
    ::

        from pythainlp.util import count_thai_chars

        count_thai_chars("ทดสอบภาษาไทย")
        # output: {
        # 'vowels': 3,
        # 'lead_vowels': 1,
        # 'follow_vowels': 2,
        # 'above_vowels': 0,
        # 'below_vowels': 0,
        # 'consonants': 9,
        # 'tonemarks': 0,
        # 'signs': 0,
        # 'thai_digits': 0,
        # 'punctuations': 0,
        # 'non_thai': 0
        # }
    """
    counts = {
        "vowels": 0,
        "lead_vowels": 0,
        "follow_vowels": 0,
        "above_vowels": 0,
        "below_vowels": 0,
        "consonants": 0,
        "tonemarks": 0,
        "signs": 0,
        "thai_digits": 0,
        "punctuations": 0,
        "non_thai": 0,
    }
    for c in text:
        # Aggregate vowel counter, independent of the exclusive chain below.
        is_vowel = c in thai_vowels
        if is_vowel:
            counts["vowels"] += 1

        if c in thai_lead_vowels:
            counts["lead_vowels"] += 1
        elif c in thai_follow_vowels:
            counts["follow_vowels"] += 1
        elif c in thai_above_vowels:
            counts["above_vowels"] += 1
        elif c in thai_below_vowels:
            counts["below_vowels"] += 1
        elif c in thai_consonants:
            counts["consonants"] += 1
        elif c in thai_tonemarks:
            counts["tonemarks"] += 1
        elif c in thai_signs:
            counts["signs"] += 1
        elif c in thai_digits:
            counts["thai_digits"] += 1
        elif c in thai_punctuations:
            counts["punctuations"] += 1
        elif not is_vowel:
            # Only characters that matched nothing at all are non-Thai; a
            # vowel without a positional group was already counted above.
            counts["non_thai"] += 1
    return counts