Source code for pythainlp.tag.named_entity

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Named-entity recognizer
"""
from typing import List, Tuple, Union


class NER:
    """
    Class of named-entity recognizer

    :param str engine: engine of named-entity recognizer
    :param str corpus: corpus

    **Options for engine**
        * *thainer-v2* - Thai NER engine v2.0 for Thai NER 2.0 (default)
        * *thainer* - Thai NER engine
        * *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.
        * *wangchanberta* - name tagger from ``pythainlp.wangchanberta``
          (accepted with the *thainer* corpus)
        * *phayathaibert* - named-entity tagger from
          ``pythainlp.phayathaibert`` (accepted with the *thainer-v2*
          corpus only)

    **Options for corpus**
        * *thainer* - Thai NER corpus (default)
        * *thainer-v2* - accepted only together with the *phayathaibert*
          engine

    **Note**: The tltk engine supports NER models from tltk only.
    """

    def __init__(
        self, engine: str = "thainer-v2", corpus: str = "thainer"
    ) -> None:
        self.load_engine(engine=engine, corpus=corpus)

    def load_engine(self, engine: str, corpus: str) -> None:
        """
        Load the backend for the given engine/corpus pair.

        On success, the engine name is stored in ``self.name_engine`` and
        the backend object in ``self.engine``. If the pair is not supported
        (or the backend fails to import or construct), the instance's
        existing ``name_engine``/``engine`` attributes are left untouched,
        so a failed reload does not break a working recognizer.

        :param str engine: engine of named-entity recognizer
        :param str corpus: corpus (ignored by the *tltk* engine)
        :raises ValueError: if the engine/corpus pair is not supported
        """
        # Backends are imported lazily so that only the dependencies of the
        # selected engine are needed. The backend is built into a local and
        # committed to ``self`` only after everything has succeeded.
        if engine == "thainer" and corpus == "thainer":
            from pythainlp.tag.thainer import ThaiNameTagger

            backend = ThaiNameTagger()
        elif engine == "thainer-v2" and corpus == "thainer":
            from pythainlp.wangchanberta import NamedEntityRecognition

            backend = NamedEntityRecognition(
                model="pythainlp/thainer-corpus-v2-base-model"
            )
        elif engine == "tltk":
            # The tltk module itself is the backend; it exposes get_ner().
            from pythainlp.tag import tltk

            backend = tltk
        elif engine == "wangchanberta" and corpus == "thainer":
            from pythainlp.wangchanberta import ThaiNameTagger

            backend = ThaiNameTagger(dataset_name=corpus)
        elif engine == "phayathaibert" and corpus == "thainer-v2":
            from pythainlp.phayathaibert.core import NamedEntityTagger

            backend = NamedEntityTagger()
        else:
            raise ValueError(
                "NER class not support {0} engine or {1} corpus.".format(
                    engine, corpus
                )
            )

        self.name_engine = engine
        self.engine = backend

    def tag(
        self, text: str, pos: bool = False, tag: bool = False
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
        """
        This function tags named entities in text in IOB format.

        The call is delegated to the loaded engine's ``get_ner``.

        :param str text: text in Thai to be tagged
        :param bool pos: output with part-of-speech tags.
                         (wangchanberta is not supported)
        :param bool tag: output HTML-like tags.
        :return: a list of tuples associated with tokenized words, NER tags,
                 POS tags (if the parameter `pos` is specified as `True`),
                 and output HTML-like tags (if the parameter `tag` is
                 specified as `True`).
                 Otherwise, return a list of tuples associated with tokenized
                 words and NER tags
        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]
        :Example:

            >>> from pythainlp.tag import NER
            >>>
            >>> ner = NER("thainer")
            >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์")
            [('ทดสอบ', 'O'),
            ('นาย', 'B-PERSON'),
            ('วรรณ', 'I-PERSON'),
            ('พงษ์', 'I-PERSON'),
            (' ', 'I-PERSON'),
            ('ภัททิย', 'I-PERSON'),
            ('ไพบูลย์', 'I-PERSON')]
            >>> ner.tag("ทดสอบนายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
            'ทดสอบ<PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
        """
        return self.engine.get_ner(text, tag=tag, pos=pos)
class NNER:
    """
    Nested named-entity recognizer.

    :param str engine: engine of nested named entity recognizer

    **Options for engine**
        * *thai_nner* - Thai NER engine
    """

    def __init__(self, engine: str = "thai_nner") -> None:
        self.load_engine(engine=engine)

    def load_engine(self, engine: str = "thai_nner") -> None:
        """
        Create the nested-NER backend and keep it in ``self.engine``.

        :param str engine: engine name. NOTE: the value is not inspected
                           here; a ``Thai_NNER`` instance is always created.
        """
        # Imported lazily so the backend's dependencies are only needed
        # when an NNER object is actually created.
        from pythainlp.tag.thai_nner import Thai_NNER

        recognizer = Thai_NNER()
        self.engine = recognizer

    def tag(self, text) -> Tuple[List[str], List[dict]]:
        """
        Tag nested named entities in the given text.

        The call is forwarded to the ``tag`` method of the loaded engine
        and its result is returned unchanged.

        :param str text: text in Thai to be tagged
        :return: the tokenized words and the NNER tags found for them.
        :rtype: Tuple[List[str], List[dict]]
        :Example:

            >>> from pythainlp.tag.named_entity import NNER
            >>> nner = NNER()
            >>> nner.tag("แมวทำอะไรตอนห้าโมงเช้า")
            ([
                '<s>',
                '',
                'แมว',
                'ทํา',
                '',
                'อะไร',
                'ตอน',
                '',
                'ห้า',
                '',
                'โมง',
                '',
                'เช้า',
                '</s>'
            ],
            [
                {
                    'text': ['', 'ห้า'],
                    'span': [7, 9],
                    'entity_type': 'cardinal'
                },
                {
                    'text': ['', 'ห้า', '', 'โมง'],
                    'span': [7, 11],
                    'entity_type': 'time'
                },
                {
                    'text': ['', 'โมง'],
                    'span': [9, 11],
                    'entity_type': 'unit'
                }
            ])
        """
        tagged = self.engine.tag(text)
        return tagged