Source code for pythainlp.tag.pos_tag

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from typing import List, Tuple


def pos_tag(
    words: List[str], engine: str = "perceptron", corpus: str = "orchid"
) -> List[Tuple[str, str]]:
    """
    Marks words with part-of-speech (POS) tags, such as 'NOUN' and 'VERB'.

    :param list words: a list of tokenized words
    :param str engine:
        * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger
        * *tltk* - TLTK: Thai Language Toolkit (supports the TNC corpus only;\
            other corpora are converted to TNC)
    :param str corpus: the corpus used to create the language model for the tagger
        * *orchid* - `ORCHID \
            <https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \
            text from Thai academic articles (default)
        * *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags
        * *blackboard* - `blackboard treebank <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_
        * *blackboard_ud* - blackboard text, with tags mapped to Universal POS tags \
            from `Universal Dependencies <https://universaldependencies.org/>`_
        * *pud* - `Parallel Universal Dependencies (PUD) \
            <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
            treebanks, which natively use Universal POS tags
        * *tdtb* - `Thai Discourse Treebank \
            <https://github.com/nlp-chula/thai-discourse-treebank/tree/main>`_, \
            which natively uses Universal POS tags
        * *tud* - `Thai Universal Dependency Treebank (TUD) \
            <https://github.com/nlp-chula/TUD>`_
        * *tnc* - Thai National Corpus (supports the tltk engine only)
    :return: a list of tuples (word, POS tag)
    :rtype: list[tuple[str, str]]

    :Example:

    Tag words with the `orchid` corpus (default)::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \\
            'นายก', 'เชอร์ชิล']
        pos_tag(words)
        # output:
        # [('ฉัน', 'PPRS'), ('มี', 'VSTA'), ('ชีวิต', 'NCMN'), ('รอด', 'NCMN'),
        #  ('ใน', 'RPRE'), ('อาคาร', 'NCMN'), ('หลบภัย', 'NCMN'),
        #  ('ของ', 'RPRE'), ('นายก', 'NCMN'), ('เชอร์ชิล', 'NCMN')]

    Tag words with the `orchid_ud` corpus::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \\
            'นายก', 'เชอร์ชิล']
        pos_tag(words, corpus='orchid_ud')
        # output:
        # [('ฉัน', 'PROPN'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'),
        #  ('รอด', 'NOUN'), ('ใน', 'ADP'), ('อาคาร', 'NOUN'),
        #  ('หลบภัย', 'NOUN'), ('ของ', 'ADP'), ('นายก', 'NOUN'),
        #  ('เชอร์ชิล', 'NOUN')]

    Tag words with the `pud` corpus::

        from pythainlp.tag import pos_tag

        words = ['ฉัน','มี','ชีวิต','รอด','ใน','อาคาร','หลบภัย','ของ', \\
            'นายก', 'เชอร์ชิล']
        pos_tag(words, corpus='pud')
        # output:
        # [('ฉัน', 'PRON'), ('มี', 'VERB'), ('ชีวิต', 'NOUN'), ('รอด', 'VERB'),
        #  ('ใน', 'ADP'), ('อาคาร', 'NOUN'), ('หลบภัย', 'NOUN'),
        #  ('ของ', 'ADP'), ('นายก', 'NOUN'), ('เชอร์ชิล', 'PROPN')]

    Tag words with different engines, *perceptron* and *unigram*::

        from pythainlp.tag import pos_tag

        words = ['เก้าอี้','มี','จำนวน','ขา', ' ', '=', '3']

        pos_tag(words, engine='perceptron', corpus='orchid')
        # output:
        # [('เก้าอี้', 'NCMN'), ('มี', 'VSTA'), ('จำนวน', 'NCMN'),
        #  ('ขา', 'NCMN'), (' ', 'PUNC'),
        #  ('=', 'PUNC'), ('3', 'NCNM')]

        pos_tag(words, engine='unigram', corpus='pud')
        # output:
        # [('เก้าอี้', None), ('มี', 'VERB'), ('จำนวน', 'NOUN'), ('ขา', None),
        #  ('<space>', None), ('<equal>', None), ('3', 'NUM')]
    """
    if not words:
        return []

    _support_corpus = [
        "blackboard",
        "blackboard_ud",
        "orchid",
        "orchid_ud",
        "pud",
        "tdtb",
        "tud",
    ]

    if engine == "perceptron" and corpus in _support_corpus:
        from pythainlp.tag.perceptron import tag as tag_
    elif engine == "tltk":
        from pythainlp.tag.tltk import pos_tag as tag_

        corpus = "tnc"
    elif engine == "unigram" and corpus in _support_corpus:
        from pythainlp.tag.unigram import tag as tag_
    else:
        raise ValueError(
            "pos_tag does not support the {0} engine or the {1} corpus.".format(
                engine, corpus
            )
        )

    word_tags = tag_(words, corpus=corpus)

    return word_tags
def pos_tag_sents(
    sentences: List[List[str]],
    engine: str = "perceptron",
    corpus: str = "orchid",
) -> List[List[Tuple[str, str]]]:
    """
    Marks sentences with part-of-speech (POS) tags.

    :param list sentences: a list of lists of tokenized words
    :param str engine:
        * *perceptron* - perceptron tagger (default)
        * *unigram* - unigram tagger
        * *tltk* - TLTK: Thai Language Toolkit (supports the TNC corpus only;\
            other corpora are converted to TNC)
    :param str corpus: the corpus used to create the language model for the tagger
        * *orchid* - `ORCHID \
            <https://www.academia.edu/9127599/Thai_Treebank>`_ corpus, \
            text from Thai academic articles (default)
        * *orchid_ud* - ORCHID text, with tags mapped to Universal POS tags
        * *blackboard* - `blackboard treebank <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_
        * *blackboard_ud* - blackboard text, with tags mapped to Universal POS tags \
            from `Universal Dependencies <https://universaldependencies.org/>`_
        * *pud* - `Parallel Universal Dependencies (PUD) \
            <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
            treebanks, which natively use Universal POS tags
        * *tnc* - Thai National Corpus (supports the tltk engine only)
    :return: a list of lists of tuples (word, POS tag)
    :rtype: list[list[tuple[str, str]]]

    :Example:

    Label POS for two sentences::

        from pythainlp.tag import pos_tag_sents

        sentences = [['เก้าอี้','มี','3','ขา'], \\
            ['นก', 'บิน', 'กลับ', 'รัง']]
        pos_tag_sents(sentences, corpus='pud')
        # output:
        # [[('เก้าอี้', 'PROPN'), ('มี', 'VERB'), ('3', 'NUM'),
        #   ('ขา', 'NOUN')], [('นก', 'NOUN'), ('บิน', 'VERB'),
        #   ('กลับ', 'VERB'), ('รัง', 'NOUN')]]
    """
    if not sentences:
        return []

    return [
        pos_tag(sent, engine=engine, corpus=corpus) for sent in sentences
    ]
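
# Illustrative sketch, not part of the library API: pos_tag_sents() is a thin
# wrapper that applies pos_tag() to each tokenized sentence, so a raw paragraph
# can be tagged by sentence-splitting and word-tokenizing first. The helper name
# `_example_tag_paragraph` and the corpus choice are assumptions for
# demonstration only.
def _example_tag_paragraph(text: str) -> List[List[Tuple[str, str]]]:
    """Split a paragraph into sentences, tokenize each, and tag all of them (sketch only)."""
    from pythainlp.tokenize import sent_tokenize, word_tokenize

    sentences = [word_tokenize(sent) for sent in sent_tokenize(text)]
    return pos_tag_sents(sentences, engine="perceptron", corpus="orchid_ud")
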
def pos_tag_transformers(
    sentence: str,
    engine: str = "bert",
    corpus: str = "blackboard",
) -> List[List[Tuple[str, str]]]:
    """
    Marks a sentence with part-of-speech (POS) tags using transformer models.

    :param str sentence: the sentence to be tagged, as a raw string
    :param str engine:
        * *bert* - BERT: Bidirectional Encoder Representations from \
            Transformers (default; supports the blackboard corpus only)
        * *wangchanberta* - fine-tuned version of \
            airesearch/wangchanberta-base-att-spm-uncased on the PUD corpus \
            (supports the PUD corpus only)
        * *phayathaibert* - fine-tuned version of clicknext/phayathaibert \
            on the blackboard corpus (supports the blackboard corpus only)
        * *mdeberta* - mDeBERTa: Multilingual Decoding-enhanced BERT \
            with disentangled attention (supports the PUD corpus only)
    :param str corpus: the corpus used to create the language model for the tagger
        * *blackboard* - `blackboard treebank \
            <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_ \
            (supports the bert and phayathaibert engines)
        * *pud* - `Parallel Universal Dependencies (PUD) \
            <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
            treebanks, which natively use Universal POS tags \
            (supports the wangchanberta and mdeberta engines)
    :return: a list of lists of tuples (word, POS tag)
    :rtype: list[list[tuple[str, str]]]

    :Example:

    Label POS for a given sentence::

        from pythainlp.tag import pos_tag_transformers

        sentence = "แมวทำอะไรตอนห้าโมงเช้า"
        pos_tag_transformers(sentence, engine="bert", corpus='blackboard')
        # output:
        # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]]
    """
    try:
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
            TokenClassificationPipeline,
        )
    except ImportError:
        raise ImportError(
            "transformers is not installed. "
            "Please install it with `pip install transformers`."
        )

    if not sentence:
        return []

    _blackboard_support_engine = {
        "bert": "lunarlist/pos_thai",
        "phayathaibert": "lunarlist/pos_thai_phayathai",
    }

    _pud_support_engine = {
        "wangchanberta": "Pavarissy/wangchanberta-ud-thai-pud-upos",
        "mdeberta": "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
    }

    if corpus == "blackboard" and engine in _blackboard_support_engine:
        base_model = _blackboard_support_engine[engine]
        model = AutoModelForTokenClassification.from_pretrained(base_model)
        tokenizer = AutoTokenizer.from_pretrained(base_model)
    elif corpus == "pud" and engine in _pud_support_engine:
        base_model = _pud_support_engine[engine]
        model = AutoModelForTokenClassification.from_pretrained(base_model)
        tokenizer = AutoTokenizer.from_pretrained(base_model)
    else:
        raise ValueError(
            "pos_tag_transformers does not support the {0} engine "
            "or the {1} corpus.".format(engine, corpus)
        )

    pipeline = TokenClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )

    outputs = pipeline(sentence)
    word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]]

    return word_tags
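
# Illustrative sketch, not part of the library API: pos_tag_transformers()
# returns a singly nested list of (word, tag) pairs, so a small wrapper can
# flatten it to the flat shape that pos_tag() returns. The helper name
# `_example_transformer_tags` is an assumption for demonstration only; it needs
# the optional `transformers` dependency and downloads the model on first use.
def _example_transformer_tags(sentence: str) -> List[Tuple[str, str]]:
    """Tag a raw sentence with the default BERT model and flatten the result (sketch only)."""
    nested = pos_tag_transformers(sentence, engine="bert", corpus="blackboard")
    return [pair for group in nested for pair in group]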