# Source code for pythainlp.chunk

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Thai phrase structure (chunking) module.

This module provides chunk parsing for Thai text, following the
NLTK :mod:`nltk.chunk` naming convention.

:Example:

    .. code-block:: python

        from pythainlp.chunk import chunk_parse, CRFChunkParser
        from pythainlp.tag import pos_tag

        tokens = ["ผม", "รัก", "คุณ"]
        tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

        # Using the convenience function
        print(chunk_parse(tokens_pos))
        # output: ['B-NP', 'B-VP', 'I-VP']

        # Using the class directly
        with CRFChunkParser() as parser:
            print(parser.parse(tokens_pos))
        # output: ['B-NP', 'B-VP', 'I-VP']
"""
from __future__ import annotations

# Public names exported by this package.
__all__: list[str] = ["CRFChunkParser", "chunk_parse"]

from pythainlp.chunk.crfchunk import CRFChunkParser


def chunk_parse(
    sent: list[tuple[str, str]],
    engine: str = "crf",
    corpus: str = "orchidpp",
) -> list[str]:
    """Parse a Thai sentence into phrase-structure chunks (IOB format).

    :param list[tuple[str, str]] sent: list of (word, POS-tag) pairs.
    :param str engine: chunking engine; currently only ``"crf"`` is
        supported.
    :param str corpus: corpus name for the CRF model; currently only
        ``"orchidpp"`` is supported.
    :return: list of IOB chunk labels, one per token.
    :rtype: list[str]
    :raises ValueError: if ``engine`` is not a supported engine name.

    :Example:

        .. code-block:: python

            from pythainlp.chunk import chunk_parse
            from pythainlp.tag import pos_tag

            tokens = ["ผม", "รัก", "คุณ"]
            tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

            print(chunk_parse(tokens_pos))
            # output: ['B-NP', 'B-VP', 'I-VP']
    """
    # Reject unknown engines up front instead of silently falling back to
    # the CRF chunker, which would hide a caller's typo or wrong assumption.
    if engine != "crf":
        raise ValueError(
            f"chunk_parse does not support engine {engine!r}; "
            'the only supported engine is "crf".'
        )

    # Use the parser as a context manager so that whatever it holds is
    # released once the sentence has been parsed.
    with CRFChunkParser(corpus=corpus) as parser:
        return parser.parse(sent)