# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Check if it is Thai text
"""
import string
from typing import List, Tuple

from pythainlp import (
    thai_lead_vowels,
    thai_follow_vowels,
    thai_above_vowels,
    thai_below_vowels,
    thai_consonants,
    thai_vowels,
    thai_tonemarks,
    thai_signs,
    thai_digits,
    thai_punctuations,
)
from pythainlp.transliterate import pronunciate
from pythainlp.util.syllable import tone_detector
# Characters that countthai() leaves out of its ratio by default:
# ASCII whitespace, ASCII digits, and ASCII punctuation.
_DEFAULT_IGNORE_CHARS = string.whitespace + string.digits + string.punctuation
# Inclusive bounds of the Unicode "Thai" block (U+0E00..U+0E7F).
# Despite the "_ASCII" suffix, these are Unicode code points as returned
# by ord(), not ASCII values.
_TH_FIRST_CHAR_ASCII = 0x0E00  # 3584
_TH_LAST_CHAR_ASCII = 0x0E7F  # 3711


def isthaichar(ch: str) -> bool:
    """Tell whether a single character lies in the Thai Unicode block.

    :param ch: input character (a string of length one, as required
        by :func:`ord`)
    :type ch: str
    :return: True if ch is a Thai character, otherwise False.
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import isthaichar

        isthaichar("ก")  # THAI CHARACTER KO KAI
        # output: True

        isthaichar("๕")  # THAI DIGIT FIVE
        # output: True
    """
    # A chained comparison expresses the inclusive range test directly.
    return _TH_FIRST_CHAR_ASCII <= ord(ch) <= _TH_LAST_CHAR_ASCII
def isthai(text: str, ignore_chars: str = ".") -> bool:
    """Tell whether every character of a string is a Thai character.

    Characters listed in ``ignore_chars`` are skipped and never cause
    a False result.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to "."
    :type ignore_chars: str, optional
    :return: True if every non-ignored character in the input string
        is Thai, otherwise False.
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import isthai

        isthai("กาลเวลา")
        # output: True

        isthai("กาลเวลา.")
        # output: True

        isthai("กาล-เวลา")
        # output: False

        isthai("กาล-เวลา +66", ignore_chars="01234567890+-.,")
        # output: True
    """
    # A falsy ignore list (None, "") means "ignore nothing".
    skipped = ignore_chars or ""
    # all() stops at the first character that is neither skipped nor Thai.
    return all(ch in skipped or isthaichar(ch) for ch in text)
def countthai(text: str, ignore_chars: str = _DEFAULT_IGNORE_CHARS) -> float:
    """Compute the percentage of Thai characters in a text.

    Characters listed in ``ignore_chars`` are excluded from both the
    numerator and the denominator.

    :param text: input text
    :type text: str
    :param ignore_chars: characters to be ignored, defaults to whitespace,
        digits, and punctuation.
    :type ignore_chars: str, optional
    :return: proportion of Thai characters in the text (percent);
        0.0 when the text is empty, is not a str, or contains only
        ignored characters
    :rtype: float

    :Example:
    ::

        from pythainlp.util import countthai

        countthai("ไทยเอ็นแอลพี 3.0")
        # output: 100.0

        countthai("PyThaiNLP 3.0")
        # output: 0.0

        countthai("ใช้งาน PyThaiNLP 3.0")
        # output: 40.0

        countthai("ใช้งาน PyThaiNLP 3.0", ignore_chars="")
        # output: 30.0
    """
    if not (text and isinstance(text, str)):
        return 0.0

    # A falsy ignore list (None, "") means "ignore nothing".
    skipped = ignore_chars or ""

    # Only characters that are not ignored take part in the ratio.
    considered = [ch for ch in text if ch not in skipped]
    if not considered:
        # Everything was ignored; avoid dividing by zero.
        return 0.0

    thai_total = sum(1 for ch in considered if isthaichar(ch))
    return (thai_total / len(considered)) * 100
def display_thai_char(ch: str) -> str:
    """Make a high-position vowel or a tone mark easier to read by
    prefixing it with an underscore (_).

    Any other character is returned unchanged.

    :param ch: input character
    :type ch: str
    :return: "_" + ch for an above vowel, a tone mark, or one of
        Sara Am, Thanthakhat, Nikhahit, Yamakkan; otherwise ch
    :rtype: str

    :Example:
    ::

        from pythainlp.util import display_thai_char

        display_thai_char("้")
        # output: "_้"
    """
    prefixed_groups = (
        thai_above_vowels,
        thai_tonemarks,
        # Sara Am, Thanthakhat, Nikhahit, Yamakkan
        "\u0e33\u0e4c\u0e4d\u0e4e",
    )
    needs_prefix = any(ch in group for group in prefixed_groups)
    return "_" + ch if needs_prefix else ch
def thai_word_tone_detector(word: str) -> List[Tuple[str, str]]:
    """
    Detect the tone of each syllable of a Thai word.

    The word is first converted to its pronunciation with
    pythainlp.transliterate.pronunciate, the result is split into
    syllables on "-", and each syllable is passed to tone_detector.

    :param str word: Thai word.
    :return: one (syllable, tone) pair per syllable of the pronunciation;
        tone is l, m, h, r, f, or empty if it cannot be detected
    :rtype: List[Tuple[str, str]]

    :Example:
    ::

        from pythainlp.util import thai_word_tone_detector

        print(thai_word_tone_detector("คนดี"))
        # output: [('คน', 'm'), ('ดี', 'm')]

        print(thai_word_tone_detector("มือถือ"))
        # output: [('มือ', 'm'), ('ถือ', 'r')]
    """
    syllables = pronunciate(word).split("-")
    return [
        # The tone is detected on the syllable with "หฺ" replaced by "ห";
        # the syllable itself is returned as produced by pronunciate().
        (syllable, tone_detector(syllable.replace("หฺ", "ห")))
        for syllable in syllables
    ]
def count_thai_chars(text: str) -> dict:
    """
    Count the characters of a text by Thai character type.

    The result reports how many characters fall into each of these types:
    consonants, vowels, lead_vowels, follow_vowels, above_vowels,
    below_vowels, tonemarks, signs, thai_digits, punctuations, non_thai.

    :param str text: Text
    :return: Dict with numbers of Thai characters by type
    :rtype: dict

    :Example:
    ::

        from pythainlp.util import count_thai_chars

        count_thai_chars("ทดสอบภาษาไทย")
        # output: {
        # 'vowels': 3,
        # 'lead_vowels': 1,
        # 'follow_vowels': 2,
        # 'above_vowels': 0,
        # 'below_vowels': 0,
        # 'consonants': 9,
        # 'tonemarks': 0,
        # 'signs': 0,
        # 'thai_digits': 0,
        # 'punctuations': 0,
        # 'non_thai': 0
        # }
    """
    # Key order matches the order of the returned dict.
    tally = dict.fromkeys(
        (
            "vowels",
            "lead_vowels",
            "follow_vowels",
            "above_vowels",
            "below_vowels",
            "consonants",
            "tonemarks",
            "signs",
            "thai_digits",
            "punctuations",
            "non_thai",
        ),
        0,
    )

    # Mutually exclusive classes, tested in this order; first match wins.
    exclusive_classes = (
        (thai_lead_vowels, "lead_vowels"),
        (thai_follow_vowels, "follow_vowels"),
        (thai_above_vowels, "above_vowels"),
        (thai_below_vowels, "below_vowels"),
        (thai_consonants, "consonants"),
        (thai_tonemarks, "tonemarks"),
        (thai_signs, "signs"),
        (thai_digits, "thai_digits"),
        (thai_punctuations, "punctuations"),
    )

    for ch in text:
        # The overall vowel tally is kept independently of the exclusive
        # classification below.
        # NOTE(review): a character in thai_vowels that is in none of the
        # lead/follow/above/below sets keeps falling through the remaining
        # classes - confirm this is intended.
        if ch in thai_vowels:
            tally["vowels"] += 1

        for charset, key in exclusive_classes:
            if ch in charset:
                tally[key] += 1
                break
        else:
            # No class matched this character.
            tally["non_thai"] += 1

    return tally