# -*- coding: utf-8 -*-
"""
The implementation of a tokenizer according to Thai Character Cluster (TCC)
rules proposed by `Theeramunkong et al. 2000. \
<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548>`_

Credits:
    * TCC: Jakkrit TeCho
    * Grammar: Wittawat Jitkrittum (`link to the source file \
      <https://github.com/wittawatj/jtcc/blob/master/TCC.g>`_)
    * Python code: Korakot Chaovavanich
"""
import re
from typing import Iterator, List, Set

# One cluster rule per line.  Two single-letter placeholders keep the rules
# readable and are expanded before compilation:
#   c -> [ก-ฮ]    (one character in the Thai consonant range)
#   t -> [่-๋]?   (an optional tone mark)
# The rules are joined with "|" in this order, so for a given start position
# the first rule that matches decides the cluster.
# NOTE(review): the line breaks between rules were reconstructed from a
# copy of this file in which all whitespace had been collapsed -- confirm the
# rule boundaries against the upstream source.
_RE_TCC = (
    """\
เc็c
เcctาะ
เccีtยะ
เccีtย(?=[เ-ไก-ฮ]|$)
เcc็c
เcิc์c
เcิtc
เcีtยะ?
เcืtอะ?
เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)
เctา?ะ?
cัtวะ
c[ัื]tc[ุิะ]?
c[ิุู]์
c[ะ-ู]t
c็
ct[ะาำ]?
แc็c
แcc์
แctะ
แcc็c
แccc์
โctะ
[เ-ไ]ct
""".replace("c", "[ก-ฮ]")
    .replace("t", "[่-๋]?")
    .split()
)

# Single compiled alternation of every rule above; compiled once at import.
_PAT_TCC = re.compile("|".join(_RE_TCC))
def tcc(text: str) -> "Iterator[str]":
    """
    TCC generator, generates Thai Character Clusters

    Scans ``text`` from left to right.  At each position the compiled
    cluster pattern ``_PAT_TCC`` is tried; when it matches, the matched span
    is yielded as one cluster, otherwise the single character at that
    position is yielded on its own.  Concatenating the yielded pieces gives
    back ``text``.

    :param str text: text to be tokenized to character clusters
    :return: subwords (character clusters); nothing is yielded when
        ``text`` is empty or is not a ``str``
    :rtype: Iterator[str]
    """
    if not text or not isinstance(text, str):
        return

    len_text = len(text)
    pos = 0
    while pos < len_text:
        # Match in place from ``pos`` instead of slicing ``text[pos:]``:
        # slicing copies the remaining tail on every step (quadratic work).
        m = _PAT_TCC.match(text, pos)
        end = m.end() if m else pos + 1
        yield text[pos:end]
        pos = end
def tcc_pos(text: str) -> Set[int]:
    """
    TCC positions

    Collects the running end offset of every cluster produced by
    :func:`tcc`.

    :param str text: text to be tokenized to character clusters
    :return: set of the end positions of subwords; an empty set when
        ``text`` is empty or is not a ``str``
    :rtype: set[int]
    """
    if not text or not isinstance(text, str):
        return set()

    p_set = set()
    p = 0
    for w in tcc(text):
        # ``p`` is the offset just past the cluster ``w``.
        p += len(w)
        p_set.add(p)

    return p_set
def segment(text: str) -> List[str]:
    """
    Subword segmentation

    :param str text: text to be tokenized to character clusters
    :return: list of subwords (character clusters), tokenized from the
        text; an empty list when ``text`` is empty or is not a ``str``
    :rtype: list[str]
    """
    # Same input guard as ``tcc_pos``; ``tcc`` yields nothing for such input,
    # so the result is unchanged and the contract is explicit here.
    if not text or not isinstance(text, str):
        return []

    return list(tcc(text))