# -*- coding: utf-8 -*-
"""
Thai tokenizers
"""
import re
import sys
from typing import Iterable, List, Union
from pythainlp.corpus import get_corpus, thai_syllables, thai_words
from marisa_trie import Trie
DEFAULT_DICT_TRIE = Trie(thai_words())
FROZEN_DICT_TRIE = Trie(get_corpus("words_th_frozen_201810.txt"))
def word_tokenize(
text: str,
custom_dict: Trie = None,
engine: str = "newmm",
keep_whitespace: bool = True,
) -> List[str]:
"""
    :param str text: text to be tokenized
    :param marisa_trie.Trie custom_dict: a custom dictionary trie (see dict_trie())
    :param str engine: tokenizer to be used
    :param bool keep_whitespace: True to keep whitespace, a common mark for the end of a phrase in Thai
    :return: list of words
    **Options for engine**
        * newmm (default) - dictionary-based, Maximum Matching + Thai Character Cluster
        * longest - dictionary-based, Longest Matching
        * mm or multi_cut - dictionary-based, Maximum Matching
        * deepcut - wrapper for deepcut, a neural-network-based tokenizer, https://github.com/rkcosmos/deepcut
        * icu - wrapper for ICU (International Components for Unicode, via PyICU), dictionary-based
        * ulmfit - dictionary-based (newmm with a fixed, frozen dictionary), for use with thai2fit
        * a custom_dict can be provided for the newmm, mm, longest, and deepcut engines
**Example**
>>> from pythainlp.tokenize import word_tokenize
>>> text = "โอเคบ่พวกเรารักภาษาบ้านเกิด"
>>> word_tokenize(text, engine="newmm")
['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษา', 'บ้านเกิด']
>>> word_tokenize(text, engine="icu")
['โอ', 'เค', 'บ่', 'พวก', 'เรา', 'รัก', 'ภาษา', 'บ้าน', 'เกิด']
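    A custom dictionary can also be supplied. The snippet below is an
    illustrative sketch: the dictionary entries are arbitrary samples and
    the exact segmentation depends on what the custom dictionary contains.
    >>> from pythainlp.tokenize import dict_trie, word_tokenize
    >>> trie = dict_trie(["โอเค", "บ่", "พวกเรา", "รัก", "ภาษาบ้านเกิด"])
    >>> word_tokenize("โอเคบ่พวกเรารักภาษาบ้านเกิด", custom_dict=trie)  # doctest: +SKIP
    ['โอเค', 'บ่', 'พวกเรา', 'รัก', 'ภาษาบ้านเกิด']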
"""
if not text or not isinstance(text, str):
return []
segments = []
if engine == "newmm" or engine == "onecut":
from .newmm import segment
segments = segment(text, custom_dict)
elif engine == "longest":
from .longest import segment
segments = segment(text, custom_dict)
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
segments = segment(text, custom_dict)
elif engine == "deepcut": # deepcut can optionally use dictionary
from .deepcut import segment
if custom_dict:
custom_dict = list(custom_dict)
segments = segment(text, custom_dict)
else:
segments = segment(text)
elif engine == "ulmfit": # ulmfit has its own specific dictionary
from .newmm import segment
segments = segment(text, custom_dict=FROZEN_DICT_TRIE)
elif engine == "icu":
from .pyicu import segment
segments = segment(text)
else: # default, use "newmm" engine
from .newmm import segment
segments = segment(text, custom_dict)
if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]
return segments
def dict_word_tokenize(
text: str,
custom_dict: Trie = DEFAULT_DICT_TRIE,
engine: str = "newmm",
keep_whitespace: bool = True,
) -> List[str]:
"""
    DEPRECATED: use word_tokenize() with the custom_dict argument instead.
    :param str text: text to be tokenized
    :param custom_dict: a dictionary trie (marisa_trie.Trie), an iterable of words, or a path to a dictionary file
    :param str engine: tokenizer engine: newmm (default), mm, longest, or deepcut
    :param bool keep_whitespace: True to keep whitespace, a common mark for the end of a phrase in Thai
:return: list of words
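    **Example** (a migration sketch; the dictionary below is an arbitrary
    sample and the outputs are illustrative only)
    >>> from pythainlp.tokenize import dict_trie, dict_word_tokenize, word_tokenize
    >>> trie = dict_trie(["ภาษา", "ไทย"])
    >>> dict_word_tokenize("ภาษาไทย", custom_dict=trie)  # doctest: +SKIP
    ['ภาษา', 'ไทย']
    >>> word_tokenize("ภาษาไทย", custom_dict=trie)  # doctest: +SKIP
    ['ภาษา', 'ไทย']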
"""
print(
"Deprecated. Use word_tokenize() with a custom_dict argument instead.",
file=sys.stderr,
)
return word_tokenize(
text=text,
custom_dict=custom_dict,
engine=engine,
keep_whitespace=keep_whitespace,
)
def sent_tokenize(text: str, engine: str = "whitespace+newline") -> List[str]:
"""
    This function does not yet detect where a sentence actually ends;
    it simply splits the text at whitespace (and, by default, newlines).
    :param str text: the text to be tokenized
    :param str engine: "whitespace" (split on spaces only) or "whitespace+newline" (default, split on any whitespace)
:return: list of sentences
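    **Example** (illustrative; with the default engine this is a plain
    whitespace split, so the result below is what str.split() produces)
    >>> from pythainlp.tokenize import sent_tokenize
    >>> sent_tokenize("ฉันรักภาษาไทย เพราะฉันเป็นคนไทย")
    ['ฉันรักภาษาไทย', 'เพราะฉันเป็นคนไทย']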
"""
if not text or not isinstance(text, str):
return []
sentences = []
if engine == "whitespace":
        sentences = re.split(r" +", text, flags=re.U)
else: # default, use whitespace + newline
sentences = text.split()
return sentences
def subword_tokenize(text: str, engine: str = "tcc") -> List[str]:
"""
:param str text: text to be tokenized
:param str engine: subword tokenizer
:return: list of subwords
**Options for engine**
* tcc (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* etcc - Enhanced Thai Character Cluster (Inrut et al. 2001) [In development]
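    **Example** (an illustrative sketch; the tcc output shown follows the
    Thai Character Cluster rules and may differ across versions)
    >>> from pythainlp.tokenize import subword_tokenize
    >>> subword_tokenize("ประเทศไทย", engine="tcc")  # doctest: +SKIP
    ['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']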
"""
if not text or not isinstance(text, str):
return []
if engine == "etcc":
from .etcc import segment
else: # default
from .tcc import segment
return segment(text)
def syllable_tokenize(text: str) -> List[str]:
"""
    Tokenize text into syllables: the text is first segmented with the default
    word tokenizer, then each word is re-segmented against a syllable dictionary.
    :param str text: input string to be tokenized
    :return: list of syllables
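    **Example** (illustrative; the actual split depends on the bundled
    syllable dictionary)
    >>> from pythainlp.tokenize import syllable_tokenize
    >>> syllable_tokenize("ภาษาไทย")  # doctest: +SKIP
    ['ภา', 'ษา', 'ไทย']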
"""
if not text or not isinstance(text, str):
return []
    tokens = []
    trie = dict_trie(dict_source=thai_syllables())
    for word in word_tokenize(text):
        tokens.extend(word_tokenize(text=word, custom_dict=trie))
    return tokens
def dict_trie(dict_source: Union[str, Iterable[str], Trie]) -> Trie:
"""
Create a dict trie which will be used for word_tokenize() function.
For more information on the trie data structure,
see: https://marisa-trie.readthedocs.io/en/latest/index.html
    :param dict_source: a list of vocabularies, an existing marisa_trie.Trie, or a path to a source file
:return: a trie created from a dictionary input
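    **Example** (a small sketch; the words below are arbitrary sample entries)
    >>> from pythainlp.tokenize import dict_trie
    >>> trie = dict_trie(["ปวด", "เฉียบ", "พลัน", "ปวดเฉียบพลัน"])
    >>> "ปวดเฉียบพลัน" in trie
    True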
"""
trie = None
if isinstance(dict_source, Trie):
trie = dict_source
elif isinstance(dict_source, str):
# Receive a file path of the dict to read
with open(dict_source, "r", encoding="utf8") as f:
_vocabs = f.read().splitlines()
trie = Trie(_vocabs)
elif isinstance(dict_source, Iterable):
        # Note: this check must come last, since Trie and str are also Iterable
        # Received a sequence of vocabulary words
trie = Trie(dict_source)
else:
raise TypeError(
"Type of dict_source must be marisa_trie.Trie, or Iterable[str], or str (path to source file)"
)
return trie
class Tokenizer:
def __init__(
self, custom_dict: Union[Trie, Iterable[str], str] = None, engine: str = "newmm"
):
"""
Initialize tokenizer object
        :param custom_dict: a file path, an iterable of words, or a Trie used to build the dictionary trie
        :param str engine: tokenizer engine: newmm (default), mm, or longest
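        **Example** (an illustrative sketch; the custom dictionary and the
        segmentation shown are assumptions, not fixed behaviour)
        >>> from pythainlp.tokenize import Tokenizer
        >>> tokenizer = Tokenizer(custom_dict=["ภาษา", "ไทย"], engine="newmm")
        >>> tokenizer.word_tokenize("ภาษาไทย")  # doctest: +SKIP
        ['ภาษา', 'ไทย']
        >>> tokenizer.set_tokenize_engine("longest")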
"""
self.__trie_dict = None
self.__engine = engine
if custom_dict:
self.__trie_dict = dict_trie(custom_dict)
else:
self.__trie_dict = DEFAULT_DICT_TRIE
    def word_tokenize(self, text: str) -> List[str]:
"""
:param str text: text to be tokenized
:return: list of words, tokenized from the text
"""
return word_tokenize(text, custom_dict=self.__trie_dict, engine=self.__engine)
    def set_tokenize_engine(self, engine: str) -> None:
"""
        :param str engine: tokenizer engine to switch to: newmm, mm, or longest
"""
self.__engine = engine