Source code for pythainlp.tag

# -*- coding: utf-8 -*-
"""
Tagging each token in a sentence with supplementary information,
such as its part of speech and class of named-entity.
"""

from typing import List, Tuple

__all__ = ["pos_tag", "pos_tag_sents", "tag_provinces"]
from .locations import tag_provinces

# tag map for orchid to Universal Dependencies
# from Korakot Chaovavanich
_TAG_MAP_UD = {
    # NOUN
    "NOUN": "NOUN",
    "NCMN": "NOUN",
    "NTTL": "NOUN",
    "CNIT": "NOUN",
    "CLTV": "NOUN",
    "CMTR": "NOUN",
    "CFQC": "NOUN",
    "CVBL": "NOUN",
    # VERB
    "VACT": "VERB",
    "VSTA": "VERB",
    # PROPN
    "PROPN": "PROPN",
    "NPRP": "PROPN",
    # ADJ
    "ADJ": "ADJ",
    "NONM": "ADJ",
    "VATT": "ADJ",
    "DONM": "ADJ",
    # ADV
    "ADV": "ADV",
    "ADVN": "ADV",
    "ADVI": "ADV",
    "ADVP": "ADV",
    "ADVS": "ADV",
    # INT
    "INT": "INTJ",
    # PRON
    "PRON": "PRON",
    "PPRS": "PRON",
    "PDMN": "PRON",
    "PNTR": "PRON",
    # DET
    "DET": "DET",
    "DDAN": "DET",
    "DDAC": "DET",
    "DDBQ": "DET",
    "DDAQ": "DET",
    "DIAC": "DET",
    "DIBQ": "DET",
    "DIAQ": "DET",
    # NUM
    "NUM": "NUM",
    "NCNM": "NUM",
    "NLBL": "NUM",
    "DCNM": "NUM",
    # AUX
    "AUX": "AUX",
    "XVBM": "AUX",
    "XVAM": "AUX",
    "XVMM": "AUX",
    "XVBB": "AUX",
    "XVAE": "AUX",
    # ADP
    "ADP": "ADP",
    "RPRE": "ADP",
    # CCONJ
    "CCONJ": "CCONJ",
    "JCRG": "CCONJ",
    # SCONJ
    "SCONJ": "SCONJ",
    "PREL": "SCONJ",
    "JSBR": "SCONJ",
    "JCMP": "SCONJ",
    # PART
    "PART": "PART",
    "FIXN": "PART",
    "FIXV": "PART",
    "EAFF": "PART",
    "EITT": "PART",
    "AITT": "PART",
    "NEG": "PART",
    # PUNCT
    "PUNCT": "PUNCT",
    "PUNC": "PUNCT",
}


def _UD_Exception(w: str, tag: str) -> str:
    if w == "การ" or w == "ความ":
        return "NOUN"

    return tag


def _orchid_to_ud(tag) -> List[Tuple[str, str]]:
    _i = 0
    temp = []
    while _i < len(tag):
        temp.append((tag[_i][0], _UD_Exception(tag[_i][0], _TAG_MAP_UD[tag[_i][1]])))
        _i += 1

    return temp


def _artagger_tag(words: List[str], corpus: str = None) -> List[Tuple[str, str]]:
    if not words:
        return []

    from artagger import Tagger

    words_ = Tagger().tag(" ".join(words))

    return [(word.word, word.tag) for word in words_]


[docs]def pos_tag( words: List[str], engine: str = "perceptron", corpus: str = "orchid" ) -> List[Tuple[str, str]]: """ Part of Speech tagging function. :param list words: a list of tokenized words :param str engine: * unigram - unigram tagger * perceptron - perceptron tagger (default) * artagger - RDR POS tagger :param str corpus: * orchid - annotated Thai academic articles (default) * orchid_ud - annotated Thai academic articles using Universal Dependencies Tags * pud - Parallel Universal Dependencies (PUD) treebanks :return: returns a list of labels regarding which part of speech it is """ _corpus = corpus _tag = [] if corpus == "orchid_ud": corpus = "orchid" if not words: return [] if engine == "perceptron": from .perceptron import tag as tag_ elif engine == "artagger": tag_ = _artagger_tag else: # default, use "unigram" ("old") engine from .unigram import tag as tag_ _tag = tag_(words, corpus=corpus) if _corpus == "orchid_ud": _tag = _orchid_to_ud(_tag) return _tag
[docs]def pos_tag_sents( sentences: List[List[str]], engine: str = "perceptron", corpus: str = "orchid" ) -> List[List[Tuple[str, str]]]: """ Part of Speech tagging Sentence function. :param list sentences: a list of lists of tokenized words :param str engine: * unigram - unigram tagger * perceptron - perceptron tagger (default) * artagger - RDR POS tagger :param str corpus: * orchid - annotated Thai academic articles (default) * orchid_ud - annotated Thai academic articles using Universal Dependencies Tags * pud - Parallel Universal Dependencies (PUD) treebanks :return: returns a list of labels regarding which part of speech it is """ if not sentences: return [] return [pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences]