Source code for pythainlp.tag.tltk

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from typing import List, Tuple, Union
try:
    from tltk import nlp
except ImportError:
    raise ImportError("Not found tltk! Please install tltk by pip install tltk")
from pythainlp.tokenize import word_tokenize

# These two calls run once, at import time of this module.
# NOTE(review): presumably they load TLTK's POS-tagging and NER models so
# that pos_tag() and get_ner() below can use them -- confirm against TLTK.
nlp.pos_load()
nlp.ner_load()


def pos_tag(words: List[str], corpus: str = "tnc") -> List[Tuple[str, str]]:
    """
    Part-of-speech tagger from **TLTK**

    :param list[str] words: already-tokenized words to be tagged
    :param str corpus: corpus name; only ``"tnc"`` is accepted
    :return: whatever ``nlp.pos_tag_wordlist(words)`` returns (annotated
        as a list of two-string tuples)
    :rtype: list[tuple[str, str]]
    :raises ValueError: if `corpus` is anything other than ``"tnc"``
    """
    if corpus != "tnc":
        # Name the corpus the caller actually requested in the message.
        raise ValueError("tltk not support {0} corpus.".format(corpus))
    return nlp.pos_tag_wordlist(words)


def _post_process(text: str) -> str:
    return text.replace("<s/>", " ")


def _render_tagged_text(sent_ner: List[Tuple[str, str, str]]) -> str:
    """Join the words of *sent_ner* into one string with entity markup.

    Each item is a ``(word, pos, ner)`` tuple. A ``B-X`` tag opens ``<X>``
    (closing any entity still open first), an ``O`` tag closes the open
    entity, and any entity still open after the last word is closed.
    """
    pieces = []
    open_label = ""  # label of the entity currently open; "" when none
    for word, _, ner_label in sent_ner:
        if ner_label.startswith("B-"):
            if open_label:
                pieces.append("</" + open_label + ">")
            open_label = ner_label[2:]
            pieces.append("<" + open_label + ">")
        elif ner_label == "O" and open_label:
            pieces.append("</" + open_label + ">")
            open_label = ""
        pieces.append(word)
    if open_label:
        pieces.append("</" + open_label + ">")
    return "".join(pieces)


def get_ner(
    text: str, pos: bool = True, tag: bool = False
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
    """
    Named-entity recognizer from **TLTK**

    This function tags named-entities in text in IOB format.

    The text is tokenized with the ``tltk`` engine; single-space tokens
    are passed to the tagger as ``<s/>`` and turned back into spaces in
    the returned words.

    :param str text: text in Thai to be tagged
    :param bool pos: To include POS tags in the results (`True`) or
        exclude (`False`). The default value is `True`
    :param bool tag: output HTML-like tag. When `True`, `pos` is ignored.
    :return: if `tag` is `True`, a single string in which each entity is
        wrapped in HTML-like tags; otherwise a list of
        ``(word, POS tag, NER tag)`` tuples, or ``(word, NER tag)`` tuples
        when `pos` is `False`. An empty `text` always yields an empty
        list, even when `tag` is `True`.
    :rtype: Union[list[tuple[str, str]], list[tuple[str, str, str]], str]

    :Example:

        >>> from pythainlp.tag.tltk import get_ner
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง")
        [('เขา', 'PRON', 'O'),
        ('เรียน', 'VERB', 'O'),
        ('ที่', 'SCONJ', 'O'),
        ('โรงเรียน', 'NOUN', 'B-L'),
        ('นางรอง', 'VERB', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", pos=False)
        [('เขา', 'O'),
        ('เรียน', 'O'),
        ('ที่', 'O'),
        ('โรงเรียน', 'B-L'),
        ('นางรอง', 'I-L')]
        >>> get_ner("เขาเรียนที่โรงเรียนนางรอง", tag=True)
        'เขาเรียนที่<L>โรงเรียนนางรอง</L>'
    """
    if not text:
        # Kept as a list for backward compatibility, regardless of `tag`.
        return []
    list_word = [
        "<s/>" if token == " " else token
        for token in word_tokenize(text, engine="tltk")
    ]
    tagged_words = nlp.pos_tag_wordlist(list_word)
    # Local names avoid rebinding the `pos` parameter.
    sent_ner = [
        (_post_process(word), pos_label, ner_label)
        for word, pos_label, ner_label in nlp.ner(tagged_words)
    ]
    if tag:
        return _render_tagged_text(sent_ner)
    if pos is False:
        return [(word, ner_label) for word, _, ner_label in sent_ner]
    return sent_ner