# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Text normalization
"""
import re
from typing import List, Union
from pythainlp import thai_above_vowels as above_v
from pythainlp import thai_below_vowels as below_v
from pythainlp import thai_follow_vowels as follow_v
from pythainlp import thai_lead_vowels as lead_v
from pythainlp import thai_tonemarks as tonemarks
from pythainlp.tokenize import word_tokenize
from pythainlp.tools import warn_deprecation
# Non-base characters that must never start a text: above vowels,
# below vowels, tone marks, Phinthu, Thanthakhat, Nikhahit, and Yamakkan.
_DANGLING_CHARS = f"{above_v}{below_v}{tonemarks}\u0e3a\u0e4c\u0e4d\u0e4e"
# One or more dangling characters anchored at the beginning of the text
_RE_REMOVE_DANGLINGS = re.compile(f"^[{_DANGLING_CHARS}]+")

_ZERO_WIDTH_CHARS = "\u200b\u200c"  # ZWSP, ZWNJ

# (regex pattern, replacement) pairs, applied in this order by
# reorder_vowels(). Order matters: later rules see the output of earlier ones.
_REORDER_PAIRS = [
    ("\u0e40\u0e40", "\u0e41"),  # Sara E + Sara E -> Sara Ae
    (
        f"([{tonemarks}\u0e4c]+)([{above_v}{below_v}]+)",
        "\\2\\1",
    ),  # TONE/Thanthakhat + ABV/BLW VOWEL -> ABV/BLW VOWEL + TONE/Thanthakhat
    (
        f"\u0e4d([{tonemarks}]*)\u0e32",
        "\\1\u0e33",
    ),  # Nikhahit + TONEMARK + Sara Aa -> TONEMARK + Sara Am
    (
        f"([{follow_v}]+)([{tonemarks}]+)",
        "\\2\\1",
    ),  # FOLLOW VOWEL + TONEMARK -> TONEMARK + FOLLOW VOWEL
    ("([^\u0e24\u0e26])\u0e45", "\\1\u0e32"),  # Lakkhangyao -> Sara Aa
]

# VOWELS + Phinthu, Thanthakhat, Nikhahit, Yamakkan
_NOREPEAT_CHARS = (
    f"{follow_v}{lead_v}{above_v}{below_v}\u0e3a\u0e4c\u0e4d\u0e4e"
)
# For every non-repeatable character: (pattern matching a run of that
# character, optionally separated by spaces; the single character itself)
_NOREPEAT_PAIRS = list(
    zip([f"({ch}[ ]*)+{ch}" for ch in _NOREPEAT_CHARS], _NOREPEAT_CHARS)
)

# A run of one or more tone marks
_RE_TONEMARKS = re.compile(f"[{tonemarks}]+")

# A newline together with any surrounding spaces/newlines
_RE_REMOVE_NEWLINES = re.compile("[ \n]*\n[ \n]*")
def _last_char(matchobj):
    """
    Return the final character of the text matched by ``matchobj``.

    Used as the replacement callback for ``_RE_TONEMARKS.sub()`` so that
    a run of tone marks collapses to the last mark in the run.
    """
    matched = matchobj.group()
    return matched[-1]
def remove_dangling(text: str) -> str:
    """
    Strip Thai non-base characters from the beginning of the text.

    Such characters are a common "typo", especially in form input fields:
    they can be visually hidden, so a user may type them by accident
    without noticing.

    A character is removed only when it is both:

        * a tone mark, above vowel, below vowel, or other non-base sign AND
        * located at the beginning of the text

    :param str text: input text
    :return: text without dangling Thai characters at the beginning
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_dangling

        remove_dangling("๊ก")
        # output: 'ก'
    """
    return re.sub(_RE_REMOVE_DANGLINGS, "", text)
def remove_dup_spaces(text: str) -> str:
    """
    Remove duplicate spaces. Replace multiple spaces with one space.

    Multiple newline characters and empty lines will be replaced
    with one newline character. Leading and trailing whitespace
    is stripped from the result.

    :param str text: input text
    :return: text without duplicated spaces and newlines
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_dup_spaces

        remove_dup_spaces("ก    ข    ค")
        # output: 'ก ข ค'
    """
    # Collapse every run of two or more spaces into a single space.
    # (Replacing a single space with a single space would never terminate.)
    text = re.sub(" {2,}", " ", text)
    # A newline plus any surrounding spaces/newlines becomes one newline,
    # which also removes empty lines.
    text = re.sub("[ \n]*\n[ \n]*", "\n", text)
    return text.strip()
def remove_tonemark(text: str) -> str:
    """
    Remove all Thai tone marks from the text.

    Thai script has four tone marks indicating four tones as follows:

        * Down tone (Thai: ไม้เอก  _่ )
        * Falling tone  (Thai: ไม้โท  _้ )
        * High tone (Thai: ไม้ตรี  _๊ )
        * Rising tone (Thai: ไม้จัตวา _๋ )

    Putting the wrong tone mark is a common mistake in Thai writing.
    With tone marks removed, the string can be used for
    approximate string matching.

    :param str text: input text
    :return: text without Thai tone marks
    :rtype: str

    :Example:
    ::

        from pythainlp.util import remove_tonemark

        remove_tonemark("สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด")
        # output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
    """
    # Map every tone mark to "deleted" and drop them all in one pass
    drop_tonemarks = str.maketrans("", "", tonemarks)
    return text.translate(drop_tonemarks)
def remove_zw(text: str) -> str:
    """
    Remove zero-width characters.

    These invisible characters may cause unexpected results from the
    user's point of view. Removing them makes string matching more robust.

    Characters to be removed:

        * Zero-width space (ZWSP)
        * Zero-width non-joiner (ZWNJ)

    :param str text: input text
    :return: text without zero-width characters
    :rtype: str
    """
    return "".join(ch for ch in text if ch not in _ZERO_WIDTH_CHARS)
def reorder_vowels(text: str) -> str:
    """
    Reorder vowels and tone marks to the standard logical order/spelling.

    Characters in the input text are reordered/transformed
    according to these rules:

        * Sara E + Sara E -> Sara Ae
        * Nikhahit + Sara Aa -> Sara Am
        * tone mark + non-base vowel -> non-base vowel + tone mark
        * follow vowel + tone mark -> tone mark + follow vowel

    :param str text: input text
    :return: text with vowels and tone marks in the standard logical order
    :rtype: str
    """
    # Apply every rewrite rule in its declared order
    for pattern, replacement in _REORDER_PAIRS:
        text = re.sub(pattern, replacement, text)
    return text
def remove_repeat_vowels(text: str) -> str:
    """
    Remove repeating vowels, tone marks, and signs.

    reorder_vowels() is called first, so that a double Sara E is
    converted to Sara Ae instead of being removed as a repetition.

    :param str text: input text
    :return: text without repeating Thai vowels, tone marks, and signs
    :rtype: str
    """
    cleaned = reorder_vowels(text)

    # Collapse each run of a non-repeatable character to one occurrence
    for pattern, replacement in _NOREPEAT_PAIRS:
        cleaned = re.sub(pattern, replacement, cleaned)

    # A run of tone marks is reduced to the last tone mark in the run
    return _RE_TONEMARKS.sub(_last_char, cleaned)
def normalize(text: str) -> str:
    """
    Normalize and clean Thai text with normalizing rules as follows:

        * Remove zero-width spaces
        * Remove duplicate spaces
        * Reorder tone marks and vowels to standard order/spelling
        * Remove duplicate vowels and signs
        * Remove duplicate tone marks
        * Remove dangling non-base characters at the beginning of text

    normalize() simply calls remove_zw(), remove_dup_spaces(),
    remove_repeat_vowels(), and remove_dangling(), in that order.

    Users who want a different selection or order of rules can call
    those functions directly.

    Note: for Unicode normalization, see unicodedata.normalize().

    :param str text: input text
    :return: normalized text according to the rules
    :rtype: str

    :Example:
    ::

        from pythainlp.util import normalize

        normalize("เเปลก")  # starts with two Sara E
        # output: แปลก

        normalize("นานาาา")
        # output: นานา
    """
    # The order of the steps is part of the contract; see the docstring
    pipeline = (
        remove_zw,
        remove_dup_spaces,
        remove_repeat_vowels,
        remove_dangling,
    )
    for step in pipeline:
        text = step(text)
    return text
def expand_maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. This function preprocesses Thai text by replacing
    Maiyamok with the word being repeated.

    A string input is tokenized with word_tokenize() first. Whitespace
    tokens between a word and its Maiyamok are dropped. A Maiyamok with
    no preceding word to repeat is discarded.

    :param Union[str, List[str]] sent: sentence (list or string)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("คนๆนก")
        # output: ['คน', 'คน', 'นก']
    """
    if isinstance(sent, str):
        sent = word_tokenize(sent)

    yamok = "ๆ"

    # Break up Maiyamok attached to other text, e.g. "นกๆๆ", "นกๆ ๆ",
    # "นกๆคน", so that every Maiyamok becomes a token of its own.
    # The capturing group keeps the Maiyamok itself in the split result.
    re_yamok = re.compile(f"({yamok})")
    tokens: List[str] = []
    for token in sent:
        # split() yields empty strings around adjacent matches; skip them
        tokens.extend(part for part in re_yamok.split(token) if part)

    output_toks: List[str] = []
    yamok_count = 0  # Maiyamok seen since the last repeated word
    # Walk backward so each Maiyamok is counted before the word it repeats
    for token in reversed(tokens):
        if token == yamok:
            yamok_count += 1
        elif yamok_count == 0:
            # ordinary token with no pending repetition
            output_toks.append(token)
        elif token.isspace():
            # drop the space between a word and its Maiyamok
            continue
        else:
            output_toks.extend([token] * (yamok_count + 1))
            yamok_count = 0

    # Tokens were collected in reverse order
    return output_toks[::-1]
def maiyamok(sent: Union[str, List[str]]) -> List[str]:
    """
    Expand Maiyamok.

    Deprecated. Use expand_maiyamok() instead.

    Maiyamok (ๆ) (Unicode U+0E46) is a Thai character indicating word
    repetition. This function preprocesses Thai text by replacing
    Maiyamok with the word being repeated.

    :param Union[str, List[str]] sent: sentence (list or string)
    :return: list of words
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import expand_maiyamok

        expand_maiyamok("คนๆนก")
        # output: ['คน', 'คน', 'นก']
    """
    # Tell callers about the replacement before delegating.
    # NOTE(review): assumes warn_deprecation(deprecated_func, replacing_func)
    # -- confirm against pythainlp.tools.
    warn_deprecation(
        "pythainlp.util.maiyamok", "pythainlp.util.expand_maiyamok"
    )
    return expand_maiyamok(sent)