# -*- coding: utf-8 -*-
"""
Named-entity recognizer
"""
import warnings
from typing import List, Tuple, Union
class NER:
    """
    Named-entity recognizer class.

    Thin front end that loads one of several NER back ends and forwards
    tagging requests to it.

    :param str engine: Named-entity recognizer engine
    :param str corpus: corpus the engine was trained on
        (default: ``"thainer"``)

    **Options for engine**
        * *thainer* - Thai NER engine
        * *wangchanberta* - wangchanberta model
        * *lst20_onnx* - LST20 NER model by wangchanberta with ONNX runtime
        * *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.

    **Options for corpus**
        * *thainer* - Thai NER corpus
        * *lst20* - lst20 corpus (wangchanberta only).
          `LST20 <https://aiforthai.in.th/corpus.php>`_ corpus
          by National Electronics and Computer Technology Center, Thailand.
          It is free for **non-commercial uses and research only**.
          You can read at
          `Facebook <https://www.facebook.com/dancearmy/posts/10157641945708284>`_.

    **Note**: the tltk engine only supports the NER model shipped with tltk.
    """

    def __init__(self, engine: str, corpus: str = "thainer") -> None:
        """
        Create the recognizer and load the requested back end immediately.

        :param str engine: name of the engine to load
        :param str corpus: name of the corpus (default: ``"thainer"``)
        :raises ValueError: if the engine/corpus combination is unsupported
        """
        self.load_engine(engine=engine, corpus=corpus)

    def load_engine(self, engine: str, corpus: str) -> None:
        """
        Load (or replace) the back end used by :meth:`tag`.

        Back-end modules are imported lazily inside each branch, so only
        the dependencies of the selected engine are imported.

        :param str engine: one of ``thainer``, ``lst20_onnx``,
            ``wangchanberta`` or ``tltk``
        :param str corpus: corpus name; must be ``thainer`` for the
            ``thainer`` engine, is passed as ``dataset_name`` to the
            ``wangchanberta`` engine, and is not inspected for
            ``lst20_onnx`` and ``tltk``
        :raises ValueError: if no branch matches the given engine/corpus
        """
        # Remembered so that tag() can pick the right call signature.
        self.name_engine = engine
        self.engine = None
        if engine == "thainer" and corpus == "thainer":
            from pythainlp.tag.thainer import ThaiNameTagger

            self.engine = ThaiNameTagger()
        elif engine == "lst20_onnx":
            from pythainlp.tag.lst20_ner_onnx import LST20_NER_ONNX

            self.engine = LST20_NER_ONNX()
        elif engine == "wangchanberta":
            from pythainlp.wangchanberta import ThaiNameTagger

            if corpus == "lst20":
                # The message body is kept flush-left on purpose: any
                # indentation here would become part of the warning text.
                warnings.warn("""
LST20 corpus are free for research and open source only.\n
If you want to use in Commercial use, please contact NECTEC.\n
https://www.facebook.com/dancearmy/posts/10157641945708284
""")
            self.engine = ThaiNameTagger(dataset_name=corpus)
        elif engine == "tltk":
            from pythainlp.tag import tltk

            # For tltk the module itself is used as the engine object.
            self.engine = tltk
        else:
            raise ValueError(
                "NER class not support {0} engine or {1} corpus.".format(
                    engine,
                    corpus
                )
            )

    def tag(
        self,
        text,
        pos=True,
        tag=False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
        """
        Tag named entities in a text, in IOB format.

        :param str text: text in Thai to be tagged
        :param bool pos: output with part-of-speech tag
            (not supported by wangchanberta; a warning is issued and the
            flag is not forwarded to the engine)
        :param bool tag: output like html tag.
        :return: whatever the engine's ``get_ner`` returns. Going by the
            example below, this is a list of tuples of tokenized word,
            POS tag (if the parameter `pos` is specified as `True`)
            and NER tag, or a single string marked up with html-like
            tags (if the parameter `tag` is specified as `True`).
        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]

        :Example:

            >>> from pythainlp.tag import NER
            >>>
            >>> ner = NER("thainer")
            >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์")
            [('ทดสอบ', 'VV', 'O'),
            ('นาย', 'NN', 'B-PERSON'),
            ('วรรณ', 'NN', 'I-PERSON'),
            ('พงษ์', 'NN', 'I-PERSON'),
            (' ', 'PU', 'I-PERSON'),
            ('ภัททิย', 'NN', 'I-PERSON'),
            ('ไพบูลย์', 'NN', 'I-PERSON')]
            >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
            'ทดสอบ<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
        """
        if pos and self.name_engine == "wangchanberta":
            warnings.warn(
                """wangchanberta is not support part-of-speech tag.
It have not part-of-speech tag in output."""
            )
        # These two engines are called without the `pos` keyword.
        if self.name_engine in ("wangchanberta", "lst20_onnx"):
            return self.engine.get_ner(text, tag=tag)
        return self.engine.get_ner(text, tag=tag, pos=pos)
class NNER:
    """
    Nested Named Entity Recognition

    :param str engine: Nested Named entity recognizer engine
        (default: ``"thai_nner"``)

    **Options for engine**
        * *thai_nner* - Thai NER engine
    """

    def __init__(self, engine: str = "thai_nner") -> None:
        """
        Create the recognizer and load its back end immediately.

        :param str engine: name of the engine (default: ``"thai_nner"``)
        """
        self.load_engine(engine)

    def load_engine(self, engine: str = "thai_nner") -> None:
        """
        Load (or replace) the back end used by :meth:`tag`.

        :param str engine: name of the engine. The value is currently not
            inspected: ``Thai_NNER`` is loaded whatever is passed.
        """
        # Imported lazily so the dependency is only needed when NNER is used.
        from pythainlp.tag.thai_nner import Thai_NNER

        self.engine = Thai_NNER()

    def tag(self, text) -> Tuple[List[str], List[dict]]:
        """
        Tag nested named entities in a text.

        The call is forwarded unchanged to the loaded engine's ``tag``.

        :param str text: text in Thai to be tagged
        :return: a tuple of the list of tokens and the list of entities;
            in the example below each entity is a dict with the keys
            ``text``, ``span`` and ``entity_type``.
        :rtype: Tuple[List[str], List[dict]]

        :Example:

            >>> from pythainlp.tag.named_entity import NNER
            >>> nner = NNER()
            >>> nner.tag("แมวทำอะไรตอนห้าโมงเช้า")
            ([
                '<s>',
                '',
                'แมว',
                'ทํา',
                '',
                'อะไร',
                'ตอน',
                '',
                'ห้า',
                '',
                'โมง',
                '',
                'เช้า',
                '</s>'
            ],
            [
                {
                    'text': ['', 'ห้า'],
                    'span': [7, 9],
                    'entity_type': 'cardinal'
                },
                {
                    'text': ['', 'ห้า', '', 'โมง'],
                    'span': [7, 11],
                    'entity_type': 'time'
                },
                {
                    'text': ['', 'โมง'],
                    'span': [9, 11],
                    'entity_type': 'unit'
                }
            ])
        """
        return self.engine.tag(text)