# -*- coding: utf-8 -*-
"""
Text normalization
"""
import re
from pythainlp import thai_tonemarks
_NORMALIZE_RULE1 = [
"ะ",
"ั",
"็",
"า",
"ิ",
"ี",
"ึ",
"่",
"ํ",
"ุ",
"ู",
"ใ",
"ไ",
"โ",
"ื",
"่",
"้",
"๋",
"๊",
"ึ",
"์",
"๋",
"ำ",
] # เก็บพวกสระ วรรณยุกต์ที่ซ้ำกันแล้วมีปัญหา
# Rewrite rules for text that was typed in the wrong order or with the wrong
# key but still renders correctly on screen. Each entry is a
# (regex pattern, replacement) pair that rewrites such input to the expected
# key sequence, e.g. เ + เ becomes แ.
# A literal "t" inside a pattern is a placeholder: normalize() replaces it
# with a character class of the four tone marks before calling re.sub.
# The rules are applied one after another, in list order.
_NORMALIZE_RULE2 = [
    # Two consecutive SARA E -> a single SARA AE.
    ("เเ", "แ"),
    # NIKHAHIT, tone mark, SARA AA -> tone mark, SARA AM.
    ("ํ(t)า", "\\1ำ"),
    # NIKHAHIT, SARA AA, tone mark -> tone mark, SARA AM.
    ("ํา(t)", "\\1ำ"),
    # Tone mark (U+0E48..U+0E4B) typed before MAI HAN-AKAT or one of the
    # vowels U+0E34..U+0E37 -> swap so that the vowel comes first.
    ("([่-๋])([ัิ-ื])", "\\2\\1"),
    # Tone mark typed before SARA U / SARA UU -> swap so the vowel comes first.
    ("([่-๋])([ูุ])", "\\2\\1"),
    # SARA AM followed by a tone mark -> tone mark first, then SARA AM.
    ("ำ([่-๋])", "\\1ำ"),
    # THANTHAKHAT typed before MAI HAN-AKAT or one of the vowels
    # U+0E34..U+0E39 -> swap so that the vowel comes first.
    ("(์)([ัิ-ู])", "\\2\\1"),
]
def normalize(text: str) -> str:
    """
    Normalize Thai text.

    Two passes are applied, in this order:

    1. Every ``(pattern, replacement)`` pair in ``_NORMALIZE_RULE2`` is
       applied with :func:`re.sub`. Before matching, each ``t`` in the
       pattern is replaced by a character class of the four tone marks.
    2. Every run of a character listed in ``_NORMALIZE_RULE1`` is collapsed
       to a single occurrence of that character.

    :param str text: thai text
    :return: thai text
    :rtype: str

    **Example**::

        >>> print(normalize("เเปลก")=="แปลก")  # "เ เ ป ล ก" versus "แปลก"
        True
    """
    # Character class substituted for the "t" placeholder used in the
    # _NORMALIZE_RULE2 patterns.
    tone_class = "[่้๊๋]"

    # Pass 1: rewrite wrongly ordered / wrongly keyed sequences.
    for pattern, replacement in _NORMALIZE_RULE2:
        text = re.sub(pattern.replace("t", tone_class), replacement, text)

    # Pass 2: squeeze repeated marks ("X+" -> "X"). The entries are single
    # characters, so they need no placeholder substitution.
    for char in _NORMALIZE_RULE1:
        text = re.sub(char + "+", char, text)

    return text
def deletetone(text: str) -> str:
    """
    Remove tone marks from text.

    Every character of ``text`` that is contained in ``thai_tonemarks``
    (imported from ``pythainlp``) is dropped. All other characters are kept
    in their original order.

    :param str text: thai text
    :return: thai text without the characters found in ``thai_tonemarks``
    :rtype: str
    """
    return "".join(ch for ch in text if ch not in thai_tonemarks)