# -*- coding: utf-8 -*-
"""
nercut 0.2
Dictionary-based maximal matching word segmentation, constrained with
Thai Character Cluster (TCC) boundaries, and combining tokens that are
parts of the same named-entity.
Code by Wannaphong Phatthiyaphaibun
"""
from typing import Iterable, List
from pythainlp.tag.named_entity import NER
_thainer = NER(engine="thainer")
[docs]def segment(
text: str,
taglist: Iterable[str] = [
"ORGANIZATION",
"PERSON",
"PHONE",
"EMAIL",
"DATE",
"TIME",
],
tagger=_thainer
) -> List[str]:
"""
Dictionary-based maximal matching word segmentation, constrained with
Thai Character Cluster (TCC) boundaries, and combining tokens that are
parts of the same named-entity.
:param str text: text to be tokenized to words
:parm list taglist: a list of named-entity tags to be used
:parm class tagger: ner tagger engine
:return: list of words, tokenized from the text
"""
if not isinstance(text, str):
return []
tagged_words = tagger.tag(text, pos=False)
words = []
combining_word = ""
for idx, (curr_word, curr_tag) in enumerate(tagged_words):
if curr_tag != "O":
tag = curr_tag[2:]
else:
tag = "O"
if curr_tag.startswith("B-") and tag in taglist:
combining_word = curr_word
elif (
curr_tag.startswith("I-")
and combining_word != ""
and tag in taglist
):
combining_word += curr_word
elif (
curr_tag == "O"
and combining_word != ""
):
words.append(combining_word)
combining_word = ""
words.append(curr_word)
else: # if tag is O
combining_word = ""
words.append(curr_word)
if idx + 1 == len(tagged_words):
if curr_tag.startswith("B-") and combining_word != "":
words.append(combining_word)
elif curr_tag.startswith("I-") and combining_word != "":
words.append(combining_word)
else:
pass
return words