# Source code for pythainlp.corpus.wikipedia

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Provides an optional word list from Thai Wikipedia titles."""

from __future__ import annotations

from typing import Optional

from pythainlp.corpus.core import get_corpus

# Module-level cache for the word set returned by thai_wikipedia_titles().
# It starts as None and is assigned the result of get_corpus() the first
# time that function needs to load the corpus.
_WIKIPEDIA_TITLES: Optional[frozenset[str]] = None
# Name of the corpus file handed to get_corpus() when filling the cache.
_WIKIPEDIA_TITLES_FILENAME: str = "wikipedia_titles_th.txt"


def thai_wikipedia_titles() -> frozenset[str]:
    """Return a frozenset of words from Thai Wikipedia titles corpus.

    They are mostly nouns and noun phrases, including event, organization,
    people, place, and product names. Commonly misspelled words are
    included intentionally.

    The corpus is loaded through ``get_corpus`` on the first call and the
    result is stored in the module-level ``_WIKIPEDIA_TITLES`` variable;
    subsequent calls return that stored object without loading again.

    See: `dev/pythainlp/corpus/wikipedia_titles_th.txt\
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/wikipedia_titles_th.txt>`_

    More info:
    https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md

    :return: :class:`frozenset` containing Thai words.
    :rtype: :class:`frozenset`
    """
    global _WIKIPEDIA_TITLES
    # Test the sentinel explicitly. A truthiness check would treat an empty
    # (but already loaded) frozenset as "not loaded" and call get_corpus()
    # again on every invocation.
    if _WIKIPEDIA_TITLES is None:
        _WIKIPEDIA_TITLES = get_corpus(
            _WIKIPEDIA_TITLES_FILENAME, comments=False
        )
    return _WIKIPEDIA_TITLES