# Source code for pythainlp.tools.path

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""PyThaiNLP data tools

For text processing and text conversion, see pythainlp.util
"""

from __future__ import annotations

import os
import sys
from typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
    from os import PathLike

if sys.version_info >= (3, 11):
    from importlib.resources import files  # Available in Python 3.11+
else:
    from importlib_resources import files  # noqa: I001

# Directory name used for PyThaiNLP data when no environment variable
# overrides it; get_pythainlp_data_path() joins it onto "~" (the user's home).
PYTHAINLP_DEFAULT_DATA_DIR: str = "pythainlp-data"


def is_read_only_mode() -> bool:
    """Tell whether PyThaiNLP is currently running in read-only mode.

    In read-only mode PyThaiNLP refrains from **implicit background writes**
    to its internal data directory, i.e. writes that occur as a side effect
    and that the user may not expect. The mode is switched on by setting the
    ``PYTHAINLP_READ_ONLY`` environment variable to a truthy value
    (e.g. ``"1"``). After stripping whitespace and lower-casing, the values
    ``""``, ``"0"``, ``"false"``, ``"no"`` and ``"off"`` count as falsy;
    anything else is truthy.

    .. deprecated::
        ``PYTHAINLP_READ_MODE`` is deprecated.
        Use ``PYTHAINLP_READ_ONLY`` instead.
        The legacy variable enables read-only mode only when it is exactly
        ``"1"``, and reading it emits a :exc:`DeprecationWarning`.
        Setting both variables at the same time raises :exc:`ValueError`.

    Implicit writes that are blocked while read-only mode is active:

    - Creating the PyThaiNLP data directory
      (``~/pythainlp-data`` or as set by ``PYTHAINLP_DATA``).
    - :func:`pythainlp.corpus.download` — corpus downloads and catalog
      updates.
    - :func:`pythainlp.corpus.remove` — corpus file and catalog deletions.

    **Explicit** writes requested by the user are **not** blocked, because
    the destination path was deliberately supplied:

    - Saving a trained model to a user-specified path
      (e.g. ``model.save("my_model.json")``).
    - Training a tagger with an explicit ``save_loc`` argument.
    - Saving a tokenizer vocabulary to a user-specified directory.
    - CLI output files written to a path the user specified or invoked.

    .. note::
        Use :func:`~pythainlp.tools.path.is_offline_mode` (``PYTHAINLP_OFFLINE``)
        to disable only *automatic* background downloads while still allowing
        explicit :func:`~pythainlp.corpus.download` calls.

    :return: ``True`` if PyThaiNLP is in read-only mode, ``False`` otherwise.
    :rtype: bool

    :raises ValueError: if both ``PYTHAINLP_READ_ONLY`` and
        ``PYTHAINLP_READ_MODE`` are set at the same time.

    :Example:
    ::

        import os
        from pythainlp import is_read_only_mode

        os.environ["PYTHAINLP_READ_ONLY"] = "1"
        print(is_read_only_mode())  # True

        os.environ["PYTHAINLP_READ_ONLY"] = "0"
        print(is_read_only_mode())  # False
    """
    import warnings

    legacy_value = os.getenv("PYTHAINLP_READ_MODE")
    current_value = os.getenv("PYTHAINLP_READ_ONLY")

    if current_value is None:
        if legacy_value is None:
            # Neither variable is defined: read-only mode is off by default.
            return False
        # Only the deprecated variable is defined: honour it, but nudge the
        # user towards the new name. stacklevel=2 attributes the warning to
        # the caller of this function.
        warnings.warn(
            "PYTHAINLP_READ_MODE is deprecated; use PYTHAINLP_READ_ONLY instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return legacy_value == "1"

    if legacy_value is not None:
        # Both variables are defined (even if empty): refuse to guess.
        raise ValueError(
            "Both PYTHAINLP_READ_ONLY and PYTHAINLP_READ_MODE are set. "
            "Please use PYTHAINLP_READ_ONLY only and unset PYTHAINLP_READ_MODE."
        )

    falsy_tokens = ("", "0", "false", "no", "off")
    return current_value.strip().lower() not in falsy_tokens


def is_unsafe_pickle_allowed() -> bool:
    """Tell whether legacy pickle-based corpus files may be loaded.

    Unpickling a tampered file can run arbitrary code, so this permission is
    **off by default**. It is granted only when the
    ``PYTHAINLP_ALLOW_UNSAFE_PICKLE`` environment variable holds one of
    ``"1"``, ``"true"``, ``"yes"`` or ``"on"`` (surrounding whitespace and
    letter case are ignored). Enable it only for corpus files you trust.

    :return: ``True`` if legacy pickle loading is allowed, ``False`` otherwise.
    :rtype: bool
    """
    raw_flag = os.environ.get("PYTHAINLP_ALLOW_UNSAFE_PICKLE", "")
    normalized = raw_flag.strip().lower()
    # Opt-in list: anything not explicitly recognised keeps pickle disabled.
    if normalized in ("1", "true", "yes", "on"):
        return True
    return False


def is_offline_mode() -> bool:
    """Tell whether PyThaiNLP is currently running in offline mode.

    Offline mode is turned on by giving the ``PYTHAINLP_OFFLINE``
    environment variable a truthy value (e.g. ``"1"``).
    An unset variable or one of the falsy values
    (``""``, ``"0"``, ``"false"``, ``"no"``, ``"off"``; whitespace and
    letter case are ignored) leaves online mode active.

    The convention mirrors ``HF_HUB_OFFLINE`` in `huggingface_hub`.

    While offline mode is active, :func:`pythainlp.corpus.get_corpus_path`
    raises :exc:`FileNotFoundError` for any corpus that is not already
    cached locally, instead of triggering an automatic download.

    .. note::
        :func:`pythainlp.corpus.download` always executes regardless of
        this setting, because an explicit call to ``download()`` or
        ``thainlp data get`` is a deliberate user action.
        ``PYTHAINLP_OFFLINE`` only prevents *automatic* downloads
        initiated by :func:`~pythainlp.corpus.get_corpus_path`.

    :return: ``True`` if PyThaiNLP is in offline mode, ``False`` otherwise.
    :rtype: bool

    :Example:
    ::

        import os
        from pythainlp import is_offline_mode

        os.environ["PYTHAINLP_OFFLINE"] = "1"
        print(is_offline_mode())  # True

        os.environ["PYTHAINLP_OFFLINE"] = "0"
        print(is_offline_mode())  # False
    """
    flag = os.getenv("PYTHAINLP_OFFLINE", "").strip().lower()
    # Opt-out list: any value outside these falsy tokens enables offline mode.
    if flag in ("", "0", "false", "no", "off"):
        return False
    return True


def safe_path_join(base: str, *parts: str) -> str:
    """Join *base* with *parts*, verify containment, and return the normalized path.

    This is the authoritative path-traversal guard used throughout the library
    wherever a base directory and external path components are combined
    (e.g., :func:`get_full_data_path` and the internal corpus path helpers
    in :mod:`pythainlp.corpus.core`).

    Both paths are normalized with :func:`os.path.abspath`, which collapses
    ``..`` segments but does not resolve symbolic links. Containment is then
    decided component-wise with :func:`os.path.commonpath`, so a sibling
    directory that merely shares a name prefix with *base* is rejected, and a
    *base* that is a filesystem root correctly contains its children.

    :param str base: base directory that the result must reside within.
    :param parts: additional path components to append.
    :type parts: str

    :return: normalized absolute path of the joined result.
    :rtype: str

    :raises ValueError: if the resolved path escapes *base*.
    """
    abs_base = os.path.abspath(base)
    abs_full = os.path.abspath(os.path.join(abs_base, *parts))
    try:
        # A raw ``startswith(abs_base + os.sep)`` test breaks when abs_base
        # already ends with a separator (e.g. "/"); comparing the common
        # ancestor of both paths avoids that.
        is_contained = os.path.commonpath((abs_base, abs_full)) == abs_base
    except ValueError:
        # commonpath() raises when the paths cannot share an ancestor
        # (e.g. different drives on Windows): treat that as an escape.
        is_contained = False
    if not is_contained:
        raise ValueError(
            f"Path traversal attempt detected: resolved path {abs_full!r} "
            f"is outside the base directory {abs_base!r}."
        )
    return abs_full


def get_full_data_path(path: str) -> str:
    """Resolve *path* inside the PyThaiNLP data directory.

    The data directory is obtained from :func:`get_pythainlp_data_path` and
    combined with *path* through :func:`safe_path_join`, which rejects
    results that fall outside the data directory.

    :param str path: relative path or filename to append to the data directory.

    :return: normalized absolute path within the PyThaiNLP data directory.
    :rtype: str

    :raises ValueError: if *path* resolves to a location outside the
        PyThaiNLP data directory (path traversal attempt).

    :Example:
    ::

        from pythainlp.tools import get_full_data_path

        get_full_data_path("ttc_freq.txt")
        # output: '/root/pythainlp-data/ttc_freq.txt'
    """
    data_root = get_pythainlp_data_path()
    return safe_path_join(data_root, path)
def get_pythainlp_data_path() -> str:
    """Return the directory where PyThaiNLP keeps its (downloaded) data.

    The directory is created if it does not yet exist, unless
    :func:`is_read_only_mode` reports that read-only mode is active.

    The location is chosen as follows:

    1. ``PYTHAINLP_DATA`` environment variable (preferred).
    2. ``PYTHAINLP_DATA_DIR`` environment variable (deprecated; emits a
       :exc:`DeprecationWarning`).
    3. If **both** variables are set to non-empty values, :exc:`ValueError`
       is raised because the conflict must be resolved explicitly.
    4. If neither is set, ``~/pythainlp-data`` is used.

    A leading ``~`` in the chosen location is expanded to the user's home
    directory.

    .. deprecated::
        ``PYTHAINLP_DATA_DIR`` is deprecated.
        Use ``PYTHAINLP_DATA`` instead (follows the same pattern as
        ``NLTK_DATA``).

    :return: full path of directory for :mod:`pythainlp` downloaded data
    :rtype: str

    :raises ValueError: if both ``PYTHAINLP_DATA`` and
        ``PYTHAINLP_DATA_DIR`` are set.

    :Example:
    ::

        from pythainlp.tools import get_pythainlp_data_path

        get_pythainlp_data_path()
        # output: '/root/pythainlp-data'
    """
    import warnings

    preferred = os.getenv("PYTHAINLP_DATA")
    deprecated = os.getenv("PYTHAINLP_DATA_DIR")

    if preferred and deprecated:
        raise ValueError(
            "Both PYTHAINLP_DATA and PYTHAINLP_DATA_DIR are set. "
            "Please use PYTHAINLP_DATA only and unset PYTHAINLP_DATA_DIR."
        )

    if deprecated:
        # The preferred variable is necessarily empty or unset here, so fall
        # back to the legacy one while steering the user to the new name.
        warnings.warn(
            "PYTHAINLP_DATA_DIR is deprecated; use PYTHAINLP_DATA instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        chosen = deprecated
    else:
        chosen = preferred

    if not chosen:
        # No usable override: default to a directory under the user's home.
        chosen = os.path.join("~", PYTHAINLP_DEFAULT_DATA_DIR)

    path = os.path.expanduser(chosen)

    if is_read_only_mode():
        # Read-only mode forbids implicitly creating the data directory.
        return path

    os.makedirs(path, exist_ok=True)
    return path
def get_pythainlp_path() -> str:
    """Return the location of the installed :mod:`pythainlp` package.

    Note:
        When the package is installed as a zip file, the returned path
        may not be a standard filesystem path and should not be used
        for direct file I/O operations. Use importlib.resources for
        accessing package files in a zip-safe manner.

    :return: full path of :mod:`pythainlp` codes
    :rtype: str

    :Example:
    ::

        from pythainlp.tools import get_pythainlp_path

        get_pythainlp_path()
        # output: '/usr/local/lib/python3.6/dist-packages/pythainlp'
    """
    pkg_root = files("pythainlp")

    if not hasattr(pkg_root, "__fspath__"):
        # Traversable objects without __fspath__ only offer a string form.
        return str(pkg_root)

    # Path-like result: convert it through the os.fspath protocol.
    return os.fspath(cast("PathLike[str]", pkg_root))