# Source code for pythainlp.tag.named_entity

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Named-entity recognizer"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Union

if TYPE_CHECKING:
    from types import ModuleType

    from pythainlp.phayathaibert.core import NamedEntityTagger
    from pythainlp.tag.thai_nner import ThaiNNER
    from pythainlp.tag.thainer import ThaiNameTagger
    from pythainlp.wangchanberta.core import (
        NamedEntityRecognition,
    )
    from pythainlp.wangchanberta.core import (
        ThaiNameTagger as WangchanbertaThaiNameTagger,
    )

# Type alias for NER engine types.
#
# Every member except ``None`` is written as a string forward reference
# because the concrete names are imported only under ``TYPE_CHECKING`` and
# therefore do not exist at runtime.
# ``ModuleType`` is included because the "tltk" engine is stored as the
# imported ``tltk`` object itself rather than as an instance
# (presumably a module -- confirm against ``pythainlp.tag.tltk``).
# ``None`` represents the state before an engine has been loaded.
NEREngineType = Union[
    "ThaiNNER",
    "ModuleType",
    "ThaiNameTagger",
    "NamedEntityRecognition",
    "WangchanbertaThaiNameTagger",
    "NamedEntityTagger",
    None,
]


class NER:
    """Named-entity recognizer that delegates tagging to a selectable engine.

    :param str engine: engine of named-entity recognizer
    :param str corpus: corpus

    **Options for engine**
        * *phayathaibert* - PhayaThaiBERT-based Thai NER engine
        * *thainer* - Thai NER engine
        * *thai-nner* - Thai Nested NER engine
        * *thainer-v2* - Thai NER engine v2.0 for Thai NER 2.0 (default)
        * *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.
        * *wangchanberta* - WangchanBERTa-based Thai NER engine

    **Options for corpus**
        * *thainer* - Thai NER corpus (default)
        * *thainer-v2* - Thai NER v2 corpus

    **Accepted engine/corpus combinations**
        * *thai-nner* and *tltk* are accepted with any corpus value.
        * *thainer*, *thainer-v2* and *wangchanberta* require the
          *thainer* corpus.
        * *phayathaibert* requires the *thainer-v2* corpus.

    **Note**: The tltk engine supports NER models from tltk only.
    The thai-nner engine supports nested NER and ignores corpus parameter.
    """

    # Name of the engine most recently requested through ``load_engine``.
    name_engine: str
    # Backend object that performs the tagging; ``None`` until one is loaded.
    engine: NEREngineType

    def __init__(
        self, engine: str = "thainer-v2", corpus: str = "thainer"
    ) -> None:
        """Create the recognizer and load the requested engine immediately.

        :param str engine: engine name (default: ``"thainer-v2"``)
        :param str corpus: corpus name (default: ``"thainer"``)
        :raises ValueError: if the engine/corpus combination is unsupported
        """
        self.load_engine(engine=engine, corpus=corpus)

    def load_engine(self, engine: str, corpus: str) -> None:
        """Select and instantiate the backend for ``engine`` and ``corpus``.

        The backend module is imported lazily, only for the branch that is
        actually selected. ``name_engine`` is updated and ``engine`` is reset
        to ``None`` before any backend is constructed, so a failed load never
        leaves a previously loaded backend in place.

        :param str engine: engine name
        :param str corpus: corpus name
        :raises ValueError: if the engine/corpus combination is unsupported
        """
        self.name_engine = engine
        self.engine = None

        backend: Any = None
        selection = (engine, corpus)

        # Engines that ignore the corpus parameter
        if engine == "thai-nner":
            from pythainlp.tag.thai_nner import ThaiNNER

            backend = ThaiNNER()
        elif engine == "tltk":
            from pythainlp.tag import tltk

            backend = tltk
        # Engines that are tied to a specific corpus
        elif selection == ("thainer", "thainer"):
            from pythainlp.tag.thainer import ThaiNameTagger

            backend = ThaiNameTagger()
        elif selection == ("thainer-v2", "thainer"):
            from pythainlp.wangchanberta import NamedEntityRecognition

            backend = NamedEntityRecognition(
                model="pythainlp/thainer-corpus-v2-base-model"
            )
        elif selection == ("wangchanberta", "thainer"):
            from pythainlp.wangchanberta import (
                ThaiNameTagger as WangchanbertaThaiNameTagger,
            )  # noqa: I001,E501

            backend = WangchanbertaThaiNameTagger(dataset_name=corpus)
        elif selection == ("phayathaibert", "thainer-v2"):
            from pythainlp.phayathaibert.core import NamedEntityTagger

            backend = NamedEntityTagger()

        if backend is None:
            raise ValueError(
                f"NER class not support {engine} engine or {corpus} corpus."
            )
        self.engine = backend

    def tag(
        self, text: str, pos: bool = False, tag: bool = False
    ) -> Union[list[tuple[str, str]], list[tuple[str, str, str]], str]:
        """Tag named entities in text, in IOB format.

        The call is forwarded to the loaded engine's ``get_ner`` method.

        :param str text: text in Thai to be tagged
        :param bool pos: output with part-of-speech tags
            (wangchanberta is not supported)
        :param bool tag: output HTML-like tags.
        :return: a list of tuples associated with tokenized words, NER tags,
            POS tags (if the parameter `pos` is specified as `True`),
            and output HTML-like tags (if the parameter `tag` is
            specified as `True`).
            Otherwise, return a list of tuples associated with tokenized
            words and NER tags
        :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]], str]
        :raises RuntimeError: if no engine is currently loaded

        :Example:

            >>> from pythainlp.tag import NER
            >>>
            >>> ner = NER("thainer")
            >>> ner.tag("ทดสอบ นายวรรณพงษ์ ภัททิยไพบูลย์")
            [('ทดสอบ', 'O'),
            (' ', 'O'),
            ('นาย', 'B-PERSON'),
            ('วรรณ', 'I-PERSON'),
            ('พงษ์', 'I-PERSON'),
            (' ', 'I-PERSON'),
            ('ภัททิย', 'I-PERSON'),
            ('ไพบูลย์', 'I-PERSON')]
            >>> ner.tag("ทดสอบ นายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
            'ทดสอบ <PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
        """
        backend = self.engine
        if backend is None:
            raise RuntimeError("Engine not initialized")
        return backend.get_ner(text, tag=tag, pos=pos)


class NNER:
    """Nested named-entity recognizer.

    :param str engine: engine of nested named entity recognizer

    **Options for engine**
        * *thai_nner* - Thai NER engine

    **Note**: the ``engine`` argument is accepted but not inspected; the
    Thai-NNER backend is always the one that gets loaded.
    """

    # Backend instance created by ``load_engine``.
    engine: "ThaiNNER"

    def __init__(self, engine: str = "thai_nner") -> None:
        """Create the recognizer and load its backend immediately.

        :param str engine: engine name (default: ``"thai_nner"``)
        """
        self.load_engine(engine)

    def load_engine(self, engine: str = "thai_nner") -> None:
        """Instantiate the Thai-NNER backend and store it on ``self.engine``.

        The backend module is imported here, at call time, rather than at
        module import time.

        :param str engine: engine name; currently not used for selection
        """
        from pythainlp.tag.thai_nner import ThaiNNER

        backend = ThaiNNER()
        self.engine = backend

    def tag(
        self, text: str, top_level_only: bool = False
    ) -> tuple[list[str], list[dict[str, Any]]]:
        """Tag nested named entities in text.

        The call is forwarded to the loaded backend's ``tag`` method.

        :param str text: text in Thai to be tagged
        :param bool top_level_only: If True, return only top-level
            (outermost) entities. If False, return all nested entities.
            Default is False.
        :return: a tuple of (tokens, entities) where tokens is a list of
            tokenized strings and entities is a list of dictionaries
            containing 'text', 'span', and 'entity_type' keys.
        :rtype: tuple[list[str], list[dict[str, Any]]]

        .. note::
            The tokenized output may include empty strings as part of the
            tokenization process from the underlying Thai-NNER model.

        :Example:

            >>> from pythainlp.tag.named_entity import NNER
            >>> nner = NNER()
            >>> nner.tag("แมวทำอะไรตอนห้าโมงเช้า")
            ([
                '<s>',
                '',
                'แมว',
                'ทํา',
                '',
                'อะไร',
                'ตอน',
                '',
                'ห้า',
                '',
                'โมง',
                '',
                'เช้า',
                '</s>'
            ],
            [
                {
                    'text': ['', 'ห้า'],
                    'span': [7, 9],
                    'entity_type': 'cardinal'
                },
                {
                    'text': ['', 'ห้า', '', 'โมง'],
                    'span': [7, 11],
                    'entity_type': 'time'
                },
                {
                    'text': ['', 'โมง'],
                    'span': [9, 11],
                    'entity_type': 'unit'
                }
            ])
            >>> # Get only top-level entities (outermost entities)
            >>> nner.tag("แมวทำอะไรตอนห้าโมงเช้า", top_level_only=True)
            ([...], [{'text': ['', 'ห้า', '', 'โมง'], 'span': [7, 11], 'entity_type': 'time'}])
        """
        backend = self.engine
        return backend.tag(text, top_level_only=top_level_only)