Source code for pythainlp.tag.named_entity

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Named-entity recognizer
"""
from typing import List, Tuple, Union



[docs]
class NER:
    """
    Class of named-entity recognizer

    :param str engine: engine of named-entity recognizer
    :param str corpus: corpus

    **Options for engine**
        * *thainer-v2* - Thai NER engine v2.0 for Thai NER 2.0 (default)
        * *thainer* - Thai NER engine
        * *tltk* - wrapper for `TLTK <https://pypi.org/project/tltk/>`_.
        * *wangchanberta* - ``ThaiNameTagger`` from
          ``pythainlp.wangchanberta``
        * *phayathaibert* - ``NamedEntityTagger`` from
          ``pythainlp.phayathaibert.core``; accepted only together with
          ``corpus="thainer-v2"``

    **Options for corpus**
        * *thainer* - Thai NER corpus (default)
        * *thainer-v2* - accepted only with the *phayathaibert* engine

    **Note**: The tltk engine supports NER models from tltk only; the
    ``corpus`` argument is not checked for that engine.
    """

    def __init__(self, engine: str = "thainer-v2", corpus: str = "thainer") -> None:
        """
        Create the recognizer and load the requested engine right away.

        :param str engine: engine name (see the class documentation)
        :param str corpus: corpus name (see the class documentation)
        :raises ValueError: if the engine/corpus combination is not supported
        """
        self.load_engine(engine=engine, corpus=corpus)

    def load_engine(self, engine: str, corpus: str) -> None:
        """
        Load (or replace) the underlying NER engine.

        Back-end modules are imported lazily, inside the matching branch, so
        only the selected engine's dependencies are needed.

        The new engine is built in a local variable and the instance is
        updated only after it was created successfully; a failed call
        therefore leaves ``self.engine`` and ``self.name_engine`` untouched.

        :param str engine: engine name (see the class documentation)
        :param str corpus: corpus name (see the class documentation)
        :raises ValueError: if the engine/corpus combination is not supported
        """
        if engine == "thainer" and corpus == "thainer":
            from pythainlp.tag.thainer import ThaiNameTagger

            loaded = ThaiNameTagger()
        elif engine == "thainer-v2" and corpus == "thainer":
            from pythainlp.wangchanberta import NamedEntityRecognition

            loaded = NamedEntityRecognition(
                model="pythainlp/thainer-corpus-v2-base-model"
            )
        elif engine == "tltk":
            # The tltk module itself is used as the engine object; corpus is
            # intentionally not checked here.
            from pythainlp.tag import tltk

            loaded = tltk
        elif engine == "wangchanberta" and corpus == "thainer":
            from pythainlp.wangchanberta import ThaiNameTagger

            loaded = ThaiNameTagger(dataset_name=corpus)
        elif engine == "phayathaibert" and corpus == "thainer-v2":
            from pythainlp.phayathaibert.core import NamedEntityTagger

            loaded = NamedEntityTagger()
        else:
            raise ValueError(
                f"NER class not support {engine} engine or {corpus} corpus."
            )

        # Commit both attributes together so the name always matches the
        # engine that is actually loaded.
        self.name_engine = engine
        self.engine = loaded

    def tag(
        self,
        text: str,
        pos: bool = False,
        tag: bool = False,
    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
        """
        This function tags named entities in text in IOB format.

        The call is delegated to the loaded engine's ``get_ner`` method.

        :param str text: text in Thai to be tagged
        :param bool pos: output with part-of-speech tags.
            (wangchanberta is not supported)
        :param bool tag: output HTML-like tags.
        :return: a list of tuples associated with tokenized words, NER tags,
                 POS tags (if the parameter `pos` is specified as `True`),
                 and output HTML-like tags (if the parameter `tag` is
                 specified as `True`).
                 Otherwise, return a list of tuples associated with tokenized
                 words and NER tags
        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]
        :Example:

            >>> from pythainlp.tag import NER
            >>>
            >>> ner = NER("thainer")
            >>> ner.tag("ทดสอบ นายวรรณพงษ์ ภัททิยไพบูลย์")
            [('ทดสอบ', 'O'),
            (' ', 'O'),
            ('นาย', 'B-PERSON'),
            ('วรรณ', 'I-PERSON'),
            ('พงษ์', 'I-PERSON'),
            (' ', 'I-PERSON'),
            ('ภัททิย', 'I-PERSON'),
            ('ไพบูลย์', 'I-PERSON')]
            >>> ner.tag("ทดสอบ นายวรรณพงษ์ ภัททิยไพบูลย์", tag=True)
            'ทดสอบ <PERSON>นายวรรณพงษ์ ภัททิยไพบูลย์</PERSON>'
        """
        return self.engine.get_ner(text, tag=tag, pos=pos)





[docs]
class NNER:
    """
    Nested Named Entity Recognition

    :param str engine: engine of nested named entity recognizer

    **Options for engine**
        * *thai_nner* - Thai nested NER engine (default)
    """

    def __init__(self, engine: str = "thai_nner") -> None:
        """
        Create the recognizer and load the requested engine right away.

        :param str engine: engine name (see the class documentation)
        :raises ValueError: if the engine is not supported
        """
        self.load_engine(engine)

    def load_engine(self, engine: str = "thai_nner") -> None:
        """
        Load (or replace) the underlying nested NER engine.

        The engine name is validated before anything is imported, so an
        unsupported name fails fast instead of silently loading *thai_nner*.

        :param str engine: engine name; only ``"thai_nner"`` is supported
        :raises ValueError: if the engine is not supported
        """
        if engine != "thai_nner":
            raise ValueError(f"NNER class does not support the {engine} engine.")

        # Imported lazily so the back-end is only needed when it is used.
        from pythainlp.tag.thai_nner import Thai_NNER

        loaded = Thai_NNER()
        # Record the engine name alongside the engine object, mirroring NER.
        self.name_engine = engine
        self.engine = loaded

    def tag(self, text: str) -> Tuple[List[str], List[dict]]:
        """
        This function tags nested named entities.

        The call is delegated to the loaded engine's ``tag`` method.

        :param str text: text in Thai to be tagged

        :return: a tuple of two lists: the tokens, and the entities found.
                 In the example below each entity is a dict with the keys
                 ``text``, ``span`` and ``entity_type``.
        :rtype: Tuple[List[str], List[dict]]

        :Example:

            >>> from pythainlp.tag.named_entity import NNER
            >>> nner = NNER()
            >>> nner.tag("แมวทำอะไรตอนห้าโมงเช้า")
            ([
                '<s>',
                '',
                'แมว',
                'ทํา',
                '',
                'อะไร',
                'ตอน',
                '',
                'ห้า',
                '',
                'โมง',
                '',
                'เช้า',
                '</s>'
            ],
            [
                {
                    'text': ['', 'ห้า'],
                    'span': [7, 9],
                    'entity_type': 'cardinal'
                },
                {
                    'text': ['', 'ห้า', '', 'โมง'],
                    'span': [7, 11],
                    'entity_type': 'time'
                },
                {
                    'text': ['', 'โมง'],
                    'span': [9, 11],
                    'entity_type': 'unit'
                }
            ])
        """
        return self.engine.tag(text)