Source code for pythainlp.corpus.tnc

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project.
# SPDX-License-Identifier: Apache-2.0
"""
Thai National Corpus word frequency
"""

# Names exported by a star-import of this module.
__all__ = [
    "bigram_word_freqs",
    "trigram_word_freqs",
    "unigram_word_freqs",
    "word_freqs",
]

from collections import defaultdict
from typing import List, Tuple

from pythainlp.corpus import get_corpus, get_corpus_path

# Name handed to get_corpus() to load the unigram frequency list.
_UNIGRAM_FILENAME = "tnc_freq.txt"
# Names handed to get_corpus_path() to locate the bigram and trigram
# frequency files.
_BIGRAM_CORPUS_NAME = "tnc_bigram_word_freqs"
_TRIGRAM_CORPUS_NAME = "tnc_trigram_word_freqs"



[docs]
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from Thai National Corpus (TNC)

    See: `dev/pythainlp/corpus/tnc_freq.txt
    <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_

    Credit: Korakot Chaovavanich https://www.facebook.com/groups/thainlp/posts/434330506948445

    :return: ``(word, frequency)`` pairs, one for every corpus line that
        has at least two tab-separated fields; shorter lines are skipped.
        The order follows the iteration order of ``get_corpus``.
    :rtype: List[Tuple[str, int]]
    """
    # Split each corpus line on tabs lazily; filtering happens below.
    rows = (entry.split("\t") for entry in get_corpus(_UNIGRAM_FILENAME))
    return [
        (fields[0], int(fields[1]))  # first field: word, second: count
        for fields in rows
        if len(fields) >= 2
    ]




[docs]
def unigram_word_freqs() -> dict[str, int]:
    """
    Get unigram word frequency from Thai National Corpus (TNC)

    Each corpus line is stripped of surrounding whitespace and split on
    tabs. The first field is used as the word and the last field as its
    count. Lines with fewer than two fields are skipped.

    :return: mapping of word to frequency. The mapping is a
        ``defaultdict(int)``, so looking up a word that is not in the
        corpus yields ``0`` (and stores that key).
    :rtype: dict[str, int]
    :raises ValueError: if the last field of a line is not an integer
    """
    freqs: dict[str, int] = defaultdict(int)
    # Iterate the corpus directly; no intermediate list copy is needed.
    for line in get_corpus(_UNIGRAM_FILENAME):
        # Use the "\t" escape rather than a literal tab character, which
        # is invisible in source and easily replaced by spaces.
        fields = line.strip().split("\t")
        if len(fields) >= 2:
            freqs[fields[0]] = int(fields[-1])

    return freqs




[docs]
def bigram_word_freqs() -> dict[Tuple[str, str], int]:
    """
    Get bigram word frequency from Thai National Corpus (TNC)

    The corpus file is read line by line. Each line is stripped of
    surrounding whitespace and split on tabs: the first two fields form
    the bigram and the last field is its count. Lines with fewer than
    three fields (for example blank lines) are skipped.

    :return: mapping of ``(word1, word2)`` to frequency. The mapping is a
        ``defaultdict(int)``, so looking up an unseen bigram yields ``0``
        (and stores that key). It is empty when ``get_corpus_path``
        returns a falsy value for the bigram corpus.
    :rtype: dict[Tuple[str, str], int]
    :raises ValueError: if the last field of a line is not an integer
    """
    freqs: dict[tuple[str, str], int] = defaultdict(int)
    path = get_corpus_path(_BIGRAM_CORPUS_NAME)
    if not path:
        return freqs

    # "utf-8-sig" drops a leading byte-order mark if the file has one.
    with open(str(path), "r", encoding="utf-8-sig") as fh:
        # Stream the file instead of loading every line with readlines().
        for line in fh:
            fields = line.strip().split("\t")
            if len(fields) < 3:
                # Not "word1<TAB>word2<TAB>count": skip it rather than
                # raise IndexError or store a mis-keyed entry.
                continue
            freqs[(fields[0], fields[1])] = int(fields[-1])

    return freqs




[docs]
def trigram_word_freqs() -> dict[Tuple[str, str, str], int]:
    """
    Get trigram word frequency from Thai National Corpus (TNC)

    The corpus file is read line by line. Each line is stripped of
    surrounding whitespace and split on tabs: the first three fields form
    the trigram and the last field is its count. Lines with fewer than
    four fields (for example blank lines) are skipped.

    :return: mapping of ``(word1, word2, word3)`` to frequency. The
        mapping is a ``defaultdict(int)``, so looking up an unseen trigram
        yields ``0`` (and stores that key). It is empty when
        ``get_corpus_path`` returns a falsy value for the trigram corpus.
    :rtype: dict[Tuple[str, str, str], int]
    :raises ValueError: if the last field of a line is not an integer
    """
    freqs: dict[tuple[str, str, str], int] = defaultdict(int)
    path = get_corpus_path(_TRIGRAM_CORPUS_NAME)
    if not path:
        return freqs

    # "utf-8-sig" drops a leading byte-order mark if the file has one.
    with open(str(path), "r", encoding="utf-8-sig") as fh:
        # Stream the file instead of loading every line with readlines().
        for line in fh:
            fields = line.strip().split("\t")
            if len(fields) < 4:
                # Not "w1<TAB>w2<TAB>w3<TAB>count": skip it rather than
                # raise IndexError or store a mis-keyed entry.
                continue
            freqs[(fields[0], fields[1], fields[2])] = int(fields[-1])

    return freqs