# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""CRF-based Thai phrase structure (chunk) parser."""
from __future__ import annotations
from importlib.resources import as_file, files
from typing import TYPE_CHECKING, Any, Optional, Union
if TYPE_CHECKING:
import types
from contextlib import AbstractContextManager
from pycrfsuite import (
Tagger as CRFTagger, # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute]
)
from pythainlp.corpus import thai_stopwords
def _is_stopword(word: str) -> bool:
    """Check whether *word* appears in the Thai stopword list."""
    stopwords = thai_stopwords()
    return word in stopwords
def _doc2features(
    tokens: list[tuple[str, str]], index: int
) -> dict[str, Union[str, bool]]:
    """Extract CRF features for a single token in a POS-tagged sentence.

    Features cover the token itself plus a window of up to two tokens
    on each side; ``BOS``/``EOS`` flags mark sentence boundaries.

    :param list[tuple[str, str]] tokens: POS-tagged sentence,
        a list of (word, POS-tag) pairs.
    :param int index: index of the token to extract features for.
    :return: feature dictionary for the token.
    :rtype: dict[str, Union[str, bool]]
    """
    word, pos = tokens[index]
    features: dict[str, Union[str, bool]] = {
        "word": word,
        "word_is_stopword": _is_stopword(word),
        "pos": pos,
    }
    last = len(tokens) - 1
    if index - 2 >= 0:
        # NOTE(review): "prevz" looks like a typo, but these exact key
        # strings are presumably what the shipped model was trained
        # with — do not rename without retraining the model.
        w2, p2 = tokens[index - 2]
        features["prev-prev-word"] = w2
        features["prev-prevz-word_is_stopword"] = _is_stopword(w2)
        features["prev-prevz-pos"] = p2
    if index - 1 >= 0:
        w1, p1 = tokens[index - 1]
        features["prev-word"] = w1
        features["prev-word_is_stopword"] = _is_stopword(w1)
        features["prev-pos"] = p1
    else:
        features["BOS"] = True
    if index + 2 <= last:
        nn_word, nn_pos = tokens[index + 2]
        features["nextnext-word"] = nn_word
        features["nextnext-word_is_stopword"] = _is_stopword(nn_word)
        features["nextnext-pos"] = nn_pos
    if index + 1 <= last:
        n_word, n_pos = tokens[index + 1]
        features["next-word"] = n_word
        features["next-word_is_stopword"] = _is_stopword(n_word)
        features["next-pos"] = n_pos
    else:
        features["EOS"] = True
    return features
def _extract_features(
    doc: list[tuple[str, str]],
) -> list[dict[str, Union[str, bool]]]:
    """Build one feature dictionary per token of a POS-tagged sentence.

    :param list[tuple[str, str]] doc: list of (word, POS-tag) pairs.
    :return: per-token feature dictionaries, in token order.
    """
    return [_doc2features(doc, position) for position, _pair in enumerate(doc)]
class CRFChunkParser:
    """CRF-based chunk parser for Thai text.

    Parses a POS-tagged sentence into phrase-structure chunks
    (IOB format), following the NLTK :class:`nltk.chunk.ChunkParserI`
    convention.

    This class supports the context manager protocol for deterministic
    resource cleanup:

    .. code-block:: python

        from pythainlp.chunk import CRFChunkParser

        with CRFChunkParser() as parser:
            result = parser.parse(tokens_pos)

    :param str corpus: corpus name for the CRF model
        (default: ``"orchidpp"``).
    """

    # Corpus name the loaded model belongs to.
    corpus: str
    # Context manager returned by importlib.resources.as_file(); entered
    # manually in load_model() and exited via _release_model_file().
    _model_file_ctx: Optional[AbstractContextManager[Any]]
    # Underlying pycrfsuite tagger holding the opened model.
    tagger: CRFTagger
    # Feature sequence from the most recent parse() call.
    xseq: list[dict[str, Union[str, bool]]]

    def __init__(self, corpus: str = "orchidpp") -> None:
        self.corpus = corpus
        self._model_file_ctx = None
        self.load_model(self.corpus)

    def load_model(self, corpus: str) -> None:
        """Load the CRF model for the given corpus.

        Any previously opened model resource is released first, so
        calling this method repeatedly does not leak the importlib
        file context entered for the packaged model.

        :param str corpus: corpus name.
        """
        # Imported lazily so that importing this module does not require
        # pycrfsuite to be installed.
        from pycrfsuite import (
            Tagger as CRFTagger,  # noqa: PLC0415 # pyright: ignore[reportAttributeAccessIssue] # pyrefly: ignore[missing-module-attribute]
        )

        self._release_model_file()
        self.tagger = CRFTagger()
        if corpus == "orchidpp":
            corpus_files = files("pythainlp.corpus")
            model_file = corpus_files.joinpath("crfchunk_orchidpp.model")
            # Enter the context manually; it stays open for the lifetime
            # of the tagger and is exited by __exit__/__del__.
            self._model_file_ctx = as_file(model_file)
            model_path = self._model_file_ctx.__enter__()
            self.tagger.open(str(model_path))

    def parse(self, token_pos: list[tuple[str, str]]) -> list[str]:
        """Parse a POS-tagged sentence into IOB chunk labels.

        :param list[tuple[str, str]] token_pos: list of (word, POS-tag)
            pairs.
        :return: list of IOB chunk labels, one per token.
        :rtype: list[str]
        """
        self.xseq = _extract_features(token_pos)
        return self.tagger.tag(self.xseq)  # type: ignore[no-any-return]

    def _release_model_file(
        self,
        exc_type: Optional[type[BaseException]] = None,
        exc_val: Optional[BaseException] = None,
        exc_tb: Optional[types.TracebackType] = None,
    ) -> None:
        """Exit the model-file context at most once (best effort).

        The reference is cleared *before* ``__exit__`` runs, so even if
        cleanup fails here, ``__del__`` cannot attempt a second exit.
        """
        ctx = self._model_file_ctx
        self._model_file_ctx = None
        if ctx is not None:
            try:
                ctx.__exit__(exc_type, exc_val, exc_tb)
            except Exception:  # noqa: S110
                # Best-effort cleanup: never raise during teardown.
                pass

    def __enter__(self) -> CRFChunkParser:
        """Context manager entry."""
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[types.TracebackType],
    ) -> None:
        """Context manager exit — clean up resources."""
        self._release_model_file(exc_type, exc_val, exc_tb)

    def __del__(self) -> None:
        """Attempt resource cleanup on garbage collection.

        .. note::
            :meth:`__del__` is not guaranteed to be called.
            Use the context manager protocol for reliable cleanup.
        """
        self._release_model_file()