# Source code for pythainlp.chunk

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Thai phrase structure (chunking) module.

This module provides chunk parsing for Thai text, following the
NLTK :mod:`nltk.chunk` naming convention.

:Example:

    .. code-block:: python

        from pythainlp.chunk import chunk_parse, CRFChunkParser
        from pythainlp.tag import pos_tag

        tokens = ["ผม", "รัก", "คุณ"]
        tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

        # Using the convenience function
        print(chunk_parse(tokens_pos))
        # output: ['B-NP', 'B-VP', 'I-VP']

        # Using the class directly
        with CRFChunkParser() as parser:
            print(parser.parse(tokens_pos))
        # output: ['B-NP', 'B-VP', 'I-VP']
"""

from __future__ import annotations

__all__: list[str] = [
    "CRFChunkParser",
    "chunk_parse",
]

from pythainlp.chunk.crfchunk import CRFChunkParser



# [docs]
def chunk_parse(
    sent: list[tuple[str, str]],
    engine: str = "crf",
    corpus: str = "orchidpp",
) -> list[str]:
    """Parse a Thai sentence into phrase-structure chunks (IOB format).

    A :class:`CRFChunkParser` is created for the requested corpus, used
    to label the sentence, and closed again before this function
    returns.

    :param list[tuple[str, str]] sent: list of (word, POS-tag) pairs.
    :param str engine: chunking engine; currently only ``"crf"`` is
        supported. This argument is not consulted by the function: the
        CRF parser is always used.
    :param str corpus: corpus name for the CRF model; currently only
        ``"orchidpp"`` is supported. Passed through to
        :class:`CRFChunkParser`.
    :return: list of IOB chunk labels, one per token.
    :rtype: list[str]

    :Example:

    .. code-block:: python

        from pythainlp.chunk import chunk_parse
        from pythainlp.tag import pos_tag

        tokens = ["ผม", "รัก", "คุณ"]
        tokens_pos = pos_tag(tokens, engine="perceptron", corpus="orchid")

        print(chunk_parse(tokens_pos))
        # output: ['B-NP', 'B-VP', 'I-VP']
    """
    # Use the parser as a context manager so its exit hook runs as soon
    # as parsing finishes, including when ``parse`` raises, instead of
    # leaving cleanup to garbage collection.
    with CRFChunkParser(corpus=corpus) as parser:
        return parser.parse(sent)