# Source code for pythainlp.coref.core

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

from typing import Any, Union, cast

from pythainlp.coref._fastcoref import CorefResult

# Module-level cache of loaded coreference models, keyed by
# (model_name, device), so a model is only instantiated the first time a
# given combination is requested.
_MODEL_CACHE: dict[tuple[str, str], Any] = {}



# [docs]
def coreference_resolution(
    texts: Union[str, list[str]],
    model_name: str = "han-coref-v1.0",
    device: str = "cpu",
) -> list[CorefResult]:
    """Resolve coreferences in one or more texts.

    A single string is treated as a one-element list. The model for a
    given ``(model_name, device)`` pair is created on first use and then
    reused from a module-level cache. When no model is available for
    ``model_name``, every input text is returned with empty cluster lists.

    :param Union[str, list[str]] texts: a text, or a list of texts, to
        apply coreference resolution to
    :param str model_name: coreference resolution model
    :param str device: device for running the coreference resolution
        model on ("cpu", "cuda", and others)
    :return: one coreference resolution result per input text
    :rtype: list[CorefResult]

    :Options for model_name:
        * *han-coref-v1.0* - (default) Han-Coref: Thai coreference
          resolution by PyThaiNLP v1.0

    :Example:

        >>> from pythainlp.coref import coreference_resolution  # doctest: +SKIP

        >>> print(  # doctest: +SKIP
        ...     coreference_resolution(
        ...         ["Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก"]
        ...     )
        ... )
        [
        {'text': 'Bill Gates ได้รับวัคซีน COVID-19 เข็มแรกแล้ว ระบุ ผมรู้สึกสบายมาก',
        'clusters_string': [['Bill Gates', 'ผม']],
        'clusters': [[(0, 10), (50, 52)]]}
        ]
    """
    # Normalise the input so the rest of the function only deals with lists.
    documents = [texts] if isinstance(texts, str) else texts

    cache_key = (model_name, device)
    if cache_key not in _MODEL_CACHE and model_name == "han-coref-v1.0":
        # Imported lazily so the model backend is only loaded when needed.
        from pythainlp.coref.han_coref import HanCoref

        _MODEL_CACHE[cache_key] = HanCoref(device=device)

    cached_model = _MODEL_CACHE.get(cache_key)
    if cached_model is None:
        # No model for this name: return each text with no clusters.
        return [
            CorefResult(text=document, clusters_string=[], clusters=[])
            for document in documents
        ]

    return cast(list[CorefResult], cached_model.predict(documents))