# -*- coding: utf-8 -*-
"""
Thai tokenizers
"""
import re
import warnings
from typing import Iterable, List, Union
from pythainlp.corpus import thai_syllables, thai_words
from .trie import Trie
DEFAULT_DICT_TRIE = Trie(thai_words())
def word_tokenize(
text: str,
custom_dict: Trie = None,
engine: str = "newmm",
keep_whitespace: bool = True,
) -> List[str]:
"""
This function tokenizes running text into words.
:param str text: text to be tokenized
:param str engine: name of the tokenizer to be used
:param pythainlp.tokenize.Trie custom_dict: dictionary trie
:param bool keep_whitespace: True to keep whitespaces, a common mark
for end of phrase in Thai.
Otherwise, whitespaces are omitted.
:return: list of words
:rtype: list[str]
**Options for engine**
* *newmm* (default) - dictionary-based, Maximum Matching +
Thai Character Cluster
* *newmm-safe* - newmm, with a mechanism to avoid long
processing time for some long continuous text without spaces
* *longest* - dictionary-based, Longest Matching
* *icu* - wrapper for ICU (International Components for Unicode,
using PyICU), dictionary-based
* *attacut* - wrapper for
`AttaCut <https://github.com/PyThaiNLP/attacut>`_,
learning-based approach
* *deepcut* - wrapper for
`DeepCut <https://github.com/rkcosmos/deepcut>`_,
learning-based approach
.. warning::
* the option for engine named *ulmfit* has been deprecated since \
PyThaiNLP version 2.1
:Note:
- The parameter **custom_dict** can be provided as an argument \
only for the *newmm*, *newmm-safe*, *longest*, and *deepcut* engines.
:Example:
Tokenize text with different tokenizers::
from pythainlp.tokenize import word_tokenize
text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
word_tokenize(text, engine="newmm")
# output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
word_tokenize(text, engine='attacut')
# output: ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
Tokenize text, omitting whitespaces::
text = "วรรณกรรม ภาพวาด และการแสดงงิ้ว "
word_tokenize(text, engine="newmm")
# output:
# ['วรรณกรรม', ' ', 'ภาพวาด', ' ', 'และ', 'การแสดง', 'งิ้ว', ' ']
word_tokenize(text, engine="newmm", keep_whitespace=False)
# output: ['วรรณกรรม', 'ภาพวาด', 'และ', 'การแสดง', 'งิ้ว']
Tokenize with default and custom dictionary::
from pythainlp.corpus.common import thai_words
from pythainlp.tokenize import dict_trie
text = 'ชินโซ อาเบะ เกิด 21 กันยายน'
word_tokenize(text, engine="newmm")
# output:
# ['ชิน', 'โซ', ' ', 'อา', 'เบะ', ' ',
# 'เกิด', ' ', '21', ' ', 'กันยายน']
custom_dict_japanese_name = set(thai_words())
custom_dict_japanese_name.add('ชินโซ')
custom_dict_japanese_name.add('อาเบะ')
trie = dict_trie(dict_source=custom_dict_japanese_name)
word_tokenize(text, engine="newmm", custom_dict=trie)
# output:
# ['ชินโซ', ' ', 'อาเบะ',
# ' ', 'เกิด', ' ', '21', ' ', 'กันยายน']
"""
if not text or not isinstance(text, str):
return []
segments = []
if engine == "newmm" or engine == "onecut":
from .newmm import segment
segments = segment(text, custom_dict)
elif engine == "newmm-safe":
from .newmm import segment
segments = segment(text, custom_dict, safe_mode=True)
elif engine == "attacut":
from .attacut import segment
segments = segment(text)
elif engine == "longest":
from .longest import segment
segments = segment(text, custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
segments = segment(text, custom_dict)
elif engine == "deepcut": # deepcut can optionally use dictionary
from .deepcut import segment
if custom_dict:
custom_dict = list(custom_dict)
segments = segment(text, custom_dict)
else:
segments = segment(text)
elif engine == "icu":
from .pyicu import segment
segments = segment(text)
else: # default, use "newmm" engine
from .newmm import segment
segments = segment(text, custom_dict)
if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
return segments
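# A minimal, illustrative sketch of the "newmm-safe" option documented above,
# for long continuous text without spaces (the sample text is hypothetical):
#
#     from pythainlp.tokenize import word_tokenize
#
#     long_text = "ทดสอบ" * 1000  # a long, space-free run of text
#     tokens = word_tokenize(long_text, engine="newmm-safe")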
def dict_word_tokenize(
text: str,
custom_dict: Trie = DEFAULT_DICT_TRIE,
engine: str = "newmm",
keep_whitespace: bool = True,
) -> List[str]:
"""
DEPRECATED: Please use :func:`word_tokenize` with a ``custom_dict``
argument instead.
:param str text: text to be tokenized
:param dict custom_dict: a dictionary trie, or an iterable of words,
or a string of dictionary path
:param str engine: name of the tokenizer engine
(*newmm* [default], *longest*, or *attacut*)
:param bool keep_whitespace: True to keep whitespaces, a common mark
for end of phrase in Thai
:return: list of words
:rtype: list[str]
"""
warnings.warn(
"dict_word_tokenize is deprecated. Use word_tokenize with a custom_dict argument instead.",
DeprecationWarning,
)
return word_tokenize(
text=text,
custom_dict=custom_dict,
engine=engine,
keep_whitespace=keep_whitespace,
)
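# Illustrative migration sketch for the deprecated dict_word_tokenize()
# (the custom word list here is hypothetical):
#
#     from pythainlp.tokenize import dict_trie, word_tokenize
#
#     trie = dict_trie(dict_source=["ชินโซ", "อาเบะ"])
#     # before: dict_word_tokenize(text, custom_dict=trie, engine="newmm")
#     # after:
#     tokens = word_tokenize("ชินโซ อาเบะ", custom_dict=trie, engine="newmm")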
def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]:
"""
This function does not yet automatically recognize when a sentence
actually ends. Rather, it splits text where whitespace or a newline
is found.
:param str text: the text to be tokenized
:param str engine: choose between *'whitespace'* or *'whitespace+newline'*
:return: list of split sentences
:rtype: list[str]
**Options for engine**
* *whitespace+newline* (default) - split by whitespace and newline.
* *whitespace* - split by whitespace only. Specifically, with the \
regex pattern ``r" +"``
:Example:
Split the text based on *whitespace*::
from pythainlp.tokenize import sent_tokenize
sentence_1 = "ฉันไปประชุมเมื่อวันที่ 11 มีนาคม"
sentence_2 = "ข้าราชการได้รับการหมุนเวียนเป็นระยะ \\
และได้รับมอบหมายให้ประจำในระดับภูมิภาค"
sent_tokenize(sentence_1, engine="whitespace")
# output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม']
sent_tokenize(sentence_2, engine="whitespace")
# output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ',
# '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค']
Split the text based on *whitespace* and *newline*::
sent_tokenize(sentence_1, engine="whitespace+newline")
# output: ['ฉันไปประชุมเมื่อวันที่', '11', 'มีนาคม']
sent_tokenize(sentence_2, engine="whitespace+newline")
# output: ['ข้าราชการได้รับการหมุนเวียนเป็นระยะ',
# '\\nและได้รับมอบหมายให้ประจำในระดับภูมิภาค']
"""
if not text or not isinstance(text, str):
return []
sentences = []
if engine == "whitespace":
sentences = re.split(r" +", text, flags=re.U)
else: # default, use whitespace + newline
sentences = text.split()
return sentences
def subword_tokenize(text: str, engine: str = "tcc") -> List[str]:
"""
This function tokenizes text into inseparable units of
Thai contiguous characters, namely
`Thai Character Clusters (TCCs) \
<https://www.researchgate.net/publication/2853284_Character_Cluster_Based_Thai_Information_Retrieval>`_.
TCCs are units based on Thai spelling features that cannot be
separated any further, such as 'ก็', 'จะ', 'ไม่', and 'ฝา'.
If these units were split apart, they could no longer be spelled out.
This function applies the TCC rules to tokenize the text into
the smallest possible units.
For example, the word 'ขนมชั้น' would be tokenized
into 'ข', 'น', 'ม', and 'ชั้น'.
:param str text: text to be tokenized
:param str engine: name of the subword tokenizer
:return: list of subwords
:rtype: list[str]
**Options for engine**
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *ssg* - CRF syllable segmenter for Thai.
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
[In development]
:Example:
Tokenize text into subword based on *tcc*::
from pythainlp.tokenize import subword_tokenize
text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
text_2 = "ความแปลกแยกและพัฒนาการ"
subword_tokenize(text_1, engine='tcc')
# output: ['ยุ', 'ค', 'เริ่ม', 'แร', 'ก',
# 'ข', 'อ', 'ง', ' ', 'รา', 'ช', 'ว', 'ง',
# 'ศ', '์', 'ห', 'มิ', 'ง']
subword_tokenize(text_2, engine='tcc')
# output: ['ค', 'วา', 'ม', 'แป', 'ล', 'ก', 'แย', 'ก',
# 'และ', 'พัฒ', 'นา', 'กา', 'ร']
Tokenize text into subword based on *etcc* **(Work In Progress)**::
text_1 = "ยุคเริ่มแรกของ ราชวงศ์หมิง"
text_2 = "ความแปลกแยกและพัฒนาการ"
subword_tokenize(text_1, engine='etcc')
# output: ['ยุคเริ่มแรกของ ราชวงศ์หมิง']
subword_tokenize(text_2, engine='etcc')
# output: ['ความแปลกแยกและ', 'พัฒ', 'นาการ']
"""
if not text or not isinstance(text, str):
return []
if engine == "etcc":
from .etcc import segment
elif engine == "ssg":
from .ssg import segment
else: # default
from .tcc import segment
return segment(text)
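# Illustrative sketch of the "ssg" option, which has no example in the
# docstring above (output omitted, since it depends on the installed
# ssg model):
#
#     from pythainlp.tokenize import subword_tokenize
#
#     subword_tokenize("ความแปลกแยกและพัฒนาการ", engine="ssg")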
def syllable_tokenize(text: str, engine: str = "default") -> List[str]:
"""
This function tokenizes text into syllables (Thai: พยางค์), units of
pronunciation having one vowel sound. For example, the word 'รถไฟ'
contains two syllables: 'รถ' and 'ไฟ'.
Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize`
with *newmm* as the tokenizer. It first tokenizes the text with
the dictionary of Thai words from
:func:`pythainlp.corpus.common.thai_words`
and then with the dictionary of Thai syllables from
:func:`pythainlp.corpus.common.thai_syllables`.
As a result, only syllables are obtained.
:param str text: input string to be tokenized
:param str engine: name of the syllable tokenizer
:return: list of syllables where whitespaces in the text **are included**
:rtype: list[str]
**Options for engine**
* *default*
* *ssg* - CRF syllable segmenter for Thai.
:Example:
::
from pythainlp.tokenize import syllable_tokenize
text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
syllable_tokenize(text)
['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
"""
if not text or not isinstance(text, str):
return []
tokens = []
if engine == "default":
words = word_tokenize(text)
trie = dict_trie(dict_source=thai_syllables())
for word in words:
tokens.extend(word_tokenize(text=word, custom_dict=trie))
else:
from .ssg import segment
tokens = segment(text)
return tokens
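# Illustrative sketch of the "ssg" engine for syllable_tokenize()
# (output omitted; it depends on the installed ssg model):
#
#     from pythainlp.tokenize import syllable_tokenize
#
#     syllable_tokenize("รถไฟสมัยใหม่", engine="ssg")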
def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
"""
Create a dictionary trie to be used with the word_tokenize() function.
:param str|Iterable[str]|pythainlp.tokenize.Trie dict_source: a path to a
dictionary file, an iterable of words, or a pythainlp.tokenize.Trie object
:return: a trie object created from a dictionary input
:rtype: pythainlp.tokenize.Trie
"""
trie = None
if isinstance(dict_source, Trie):
trie = dict_source
elif isinstance(dict_source, str):
# Receive a file path of the dict to read
with open(dict_source, "r", encoding="utf8") as f:
_vocabs = f.read().splitlines()
trie = Trie(_vocabs)
elif isinstance(dict_source, Iterable):
# Note: Since both Trie and str are Iterable,
# the Iterable check has to come last,
# because it is the least specific.
# Received a sequence of vocabularies.
trie = Trie(dict_source)
else:
raise TypeError(
"Type of dict_source must be pythainlp.tokenize.Trie, or Iterable[str], or str (path to source file)"
)
return trie
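# A minimal sketch of the three dict_source types accepted by dict_trie()
# (the file path is a hypothetical placeholder):
#
#     from pythainlp.tokenize import dict_trie
#
#     trie_from_list = dict_trie(dict_source=["ชินโซ", "อาเบะ"])       # Iterable[str]
#     trie_from_file = dict_trie(dict_source="/path/to/wordlist.txt")  # word-per-line file
#     same_trie = dict_trie(dict_source=trie_from_list)                # Trie passes through unchanged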
class Tokenizer:
"""
This class allows users to pre-define a custom dictionary along with
a tokenizer and encapsulate them into a single object.
It is a wrapper for the two functions
:func:`pythainlp.tokenize.word_tokenize`
and :func:`pythainlp.tokenize.dict_trie`.
:Example:
Tokenizer object instantiated with :class:`pythainlp.tokenize.Trie`::
from pythainlp.tokenize import Tokenizer, dict_trie
from pythainlp.corpus.common import thai_words
custom_words_list = set(thai_words())
custom_words_list.add('อะเฟเซีย')
custom_words_list.add('Aphasia')
trie = dict_trie(dict_source=custom_words_list)
text = "อะเฟเซีย (Aphasia*) เป็นอาการผิดปกติของการพูด"
_tokenizer = Tokenizer(custom_dict=trie, engine='newmm')
# output: ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ',
'ผิดปกติ', 'ของ', 'การ', 'พูด']
Tokenizer object instantiated with a list of words::
text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"
_tokenizer = Tokenizer(custom_dict=list(thai_words()), engine='newmm')
_tokenizer.word_tokenize(text)
# output:
# ['อะ', 'เฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ',
# 'ผิดปกติ', 'ของ', 'การ', 'พูด']
Tokenizer object instantiated with a file path containing a list of
words separated by *newline*, with the tokenizer explicitly changed
after initiation::
PATH_TO_CUSTOM_DICTIONARY = './custom_dictionary.txt'
# write a file
with open(PATH_TO_CUSTOM_DICTIONARY, 'w', encoding='utf-8') as f:
f.write('อะเฟเซีย\\nAphasia\\nผิด\\nปกติ')
text = "อะเฟเซีย (Aphasia) เป็นอาการผิดปกติของการพูด"
# initiate an object from a file, with `attacut` as the tokenizer
_tokenizer = Tokenizer(custom_dict=PATH_TO_CUSTOM_DICTIONARY, \\
engine='attacut')
_tokenizer.word_tokenize(text)
# output:
# ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็น', 'อาการ', 'ผิด',
# 'ปกติ', 'ของ', 'การ', 'พูด']
# change tokenizer to `newmm`
_tokenizer.set_tokenize_engine(engine='newmm')
_tokenizer.word_tokenize(text)
# output:
# ['อะเฟเซีย', ' ', '(', 'Aphasia', ')', ' ', 'เป็นอาการ', 'ผิด',
# 'ปกติ', 'ของการพูด']
"""
def __init__(
self,
custom_dict: Union[Trie, Iterable[str], str] = None,
engine: str = "newmm",
):
"""
Initialize tokenizer object
:param custom_dict: a file path, a list of vocabularies to be
used to create a trie, or an instantiated
:class:`pythainlp.tokenize.Trie` object
:param str engine: name of the tokenizer engine
(i.e. *newmm*, *longest*, *attacut*)
"""
self.__trie_dict = None
self.__engine = engine
if custom_dict:
self.__trie_dict = dict_trie(custom_dict)
else:
self.__trie_dict = DEFAULT_DICT_TRIE
def word_tokenize(self, text: str) -> List[str]:
"""
:param str text: text to be tokenized
:return: list of words, tokenized from the text
:rtype: list[str]
"""
return word_tokenize(
text, custom_dict=self.__trie_dict, engine=self.__engine
)
def set_tokenize_engine(self, engine: str) -> None:
"""
Set the tokenizer's engine.
:param str engine: name of the tokenizer engine
(i.e. *newmm*, *longest*, *attacut*)
"""
self.__engine = engine
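# A minimal, illustrative sketch of switching engines on a Tokenizer object
# (assumes the attacut package is installed):
#
#     from pythainlp.tokenize import Tokenizer
#
#     _tokenizer = Tokenizer(engine="newmm")      # uses the default dictionary
#     _tokenizer.set_tokenize_engine("attacut")   # switch the engine in place
#     _tokenizer.word_tokenize("อะเฟเซียเป็นอาการผิดปกติของการพูด")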