Source code for pythainlp.corpus.tnc

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Thai National Corpus word frequency
"""

from __future__ import annotations

__all__ = [
    "bigram_word_freqs",
    "trigram_word_freqs",
    "unigram_word_freqs",
    "word_freqs",
]

from collections import defaultdict

from pythainlp.corpus import get_corpus, get_corpus_path

_UNIGRAM_FILENAME = "tnc_freq.txt"
_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"
_TRIGRAM_CORPUS_NAME = "tnc_trigram_word_freqs"


def word_freqs() -> list[tuple[str, int]]:
    """Get word frequencies from the Thai National Corpus (TNC).

    The data comes from `pythainlp/corpus/tnc_freq.txt
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_.

    Credit: Korakot Chaovavanich
    https://www.facebook.com/groups/thainlp/posts/434330506948445

    :return: ``(word, frequency)`` pairs, one for every corpus line that
        has at least two tab-separated fields
    :rtype: list[tuple[str, int]]
    :raises ValueError: if the second field of such a line is not an integer
    """
    # Lines with fewer than two tab-separated fields are ignored.
    rows = (entry.split("\t") for entry in get_corpus(_UNIGRAM_FILENAME))
    return [(cols[0], int(cols[1])) for cols in rows if len(cols) >= 2]
def unigram_word_freqs() -> dict[str, int]:
    """Get unigram word frequencies from the Thai National Corpus (TNC).

    Reads the same corpus file as :func:`word_freqs`. For every line that
    has at least two tab-separated fields, the first field is the word and
    the last field is its count.

    :return: mapping from word to frequency; the object is a
        ``defaultdict(int)``, so looking up an unseen word yields ``0``
    :rtype: dict[str, int]
    :raises ValueError: if the last field of such a line is not an integer
    """
    freqs: dict[str, int] = defaultdict(int)
    for line in get_corpus(_UNIGRAM_FILENAME):
        # The file is tab-separated (word_freqs() parses it with "\t").
        # Use an explicit escape so the delimiter cannot be mistaken for,
        # or silently converted to, a plain space.
        fields = line.strip().split("\t")
        if len(fields) >= 2:
            freqs[fields[0]] = int(fields[-1])
    return freqs
def bigram_word_freqs() -> dict[tuple[str, str], int]:
    """Get bigram word frequencies from the Thai National Corpus (TNC).

    Each usable line of the corpus file contributes one entry: its first
    two fields form the key and its last field is the count. Lines with
    fewer than three fields are skipped.

    Loading is best-effort. If the corpus path cannot be resolved, an empty
    mapping is returned. If the file cannot be opened or read, a warning is
    issued and whatever was parsed up to that point is returned.

    :return: mapping from ``(word1, word2)`` to frequency; the object is a
        ``defaultdict(int)``, so looking up an unseen bigram yields ``0``
    :rtype: dict[tuple[str, str], int]
    :raises ValueError: if the last field of a usable line is not an integer
    """
    freqs: dict[tuple[str, str], int] = defaultdict(int)
    path = get_corpus_path(_BIGRAM_CORPUS_NAME)
    if not path:
        return freqs

    try:
        # "utf-8-sig" transparently drops a leading byte-order mark.
        with open(str(path), encoding="utf-8-sig") as fh:
            for line in fh:
                record = line.strip()
                # Split on tabs when the line has any, otherwise on single
                # spaces, so a tab-delimited file is not silently skipped.
                # NOTE(review): confirm the delimiter of the downloaded
                # corpus file and narrow this to that one separator.
                fields = record.split("\t" if "\t" in record else " ")
                if len(fields) >= 3:
                    freqs[(fields[0], fields[1])] = int(fields[-1])
    except OSError as err:  # IOError is just an alias of OSError
        import warnings

        # Stay best-effort, but make the failure visible to the caller.
        warnings.warn(
            f"Could not read TNC bigram corpus file {path!r}: {err}",
            stacklevel=2,
        )
    return freqs
def trigram_word_freqs() -> dict[tuple[str, str, str], int]:
    """Get trigram word frequencies from the Thai National Corpus (TNC).

    Each usable line of the corpus file contributes one entry: its first
    three fields form the key and its last field is the count. Lines with
    fewer than four fields are skipped.

    Loading is best-effort. If the corpus path cannot be resolved, an empty
    mapping is returned. If the file cannot be opened or read, a warning is
    issued and whatever was parsed up to that point is returned.

    :return: mapping from ``(word1, word2, word3)`` to frequency; the
        object is a ``defaultdict(int)``, so looking up an unseen trigram
        yields ``0``
    :rtype: dict[tuple[str, str, str], int]
    :raises ValueError: if the last field of a usable line is not an integer
    """
    freqs: dict[tuple[str, str, str], int] = defaultdict(int)
    path = get_corpus_path(_TRIGRAM_CORPUS_NAME)
    if not path:
        return freqs

    try:
        # "utf-8-sig" transparently drops a leading byte-order mark.
        with open(str(path), encoding="utf-8-sig") as fh:
            for line in fh:
                record = line.strip()
                # Split on tabs when the line has any, otherwise on single
                # spaces, so a tab-delimited file is not silently skipped.
                # NOTE(review): confirm the delimiter of the downloaded
                # corpus file and narrow this to that one separator.
                fields = record.split("\t" if "\t" in record else " ")
                if len(fields) >= 4:
                    key = (fields[0], fields[1], fields[2])
                    freqs[key] = int(fields[-1])
    except OSError as err:  # IOError is just an alias of OSError
        import warnings

        # Stay best-effort, but make the failure visible to the caller.
        warnings.warn(
            f"Could not read TNC trigram corpus file {path!r}: {err}",
            stacklevel=2,
        )
    return freqs