Source code for pythainlp.lm.text_util

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: C901

from typing import List, Tuple, Dict



[docs]
def calculate_ngram_counts(
        list_words: List[str],
        n_min: int = 2,
        n_max: int = 4) -> Dict[Tuple[str, ...], int]:
    """
    Count every contiguous n-gram of the word list for each size
    from ``n_min`` to ``n_max`` (both inclusive).

    Sizes smaller than 1 are ignored, because an n-gram needs at least
    one word. Sizes larger than the list simply contribute nothing.

    :param List[str] list_words: List of string
    :param int n_min: The minimum n-gram size (default: 2).
    :param int n_max: The maximum n-gram size (default: 4).

    :return: A dictionary where keys are n-grams (tuples of words) \
        and values are their counts.
    :rtype: Dict[Tuple[str, ...], int]

    :Example:
    ::

        calculate_ngram_counts(["a", "b", "a", "b"], n_min=2, n_max=2)
        # output: {('a', 'b'): 2, ('b', 'a'): 1}
    """
    ngram_counts: Dict[Tuple[str, ...], int] = {}
    total_words = len(list_words)

    # Clamp the lower bound to 1: with n == 0 every window is the empty
    # tuple, and with n < 0 the slice below wraps around the list end,
    # both of which would pollute the result with meaningless keys.
    for n in range(max(n_min, 1), n_max + 1):
        # range() is empty when the list is shorter than n.
        for i in range(total_words - n + 1):
            ngram = tuple(list_words[i:i + n])
            ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1

    return ngram_counts




[docs]
def remove_repeated_ngrams(string_list: List[str], n: int = 2) -> List[str]:
    """
    Remove repeated n-grams

    The list is scanned with a sliding window of size ``n``. A window
    whose n-gram has been seen before is skipped. A new n-gram is merged
    into the output: if the output already ends with the first ``n - 1``
    words of the n-gram, only its last word is appended, otherwise the
    whole n-gram is appended.

    After the scan, the last ``n - 1`` words of the input are appended
    (skipping a word equal to the current last output word) unless the
    output already ends with them. This also covers an input that is
    shorter than ``n``.

    If ``string_list`` is empty or ``n`` is not positive, the input
    list object itself is returned unchanged.

    :param List[str] string_list: List of string
    :param int n: n-gram size
    :return: List of string
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.lm import remove_repeated_ngrams

        remove_repeated_ngrams(['เอา', 'เอา', 'แบบ', 'ไหน'], n=1)
        # output: ['เอา', 'แบบ', 'ไหน']
    """
    if not string_list or n <= 0:
        return string_list

    total = len(string_list)
    overlap = n - 1  # words shared by two consecutive windows
    seen_ngrams = set()
    output_list: List[str] = []

    # range() is empty when the list is shorter than n.
    for i in range(total - n + 1):
        ngram = tuple(string_list[i:i + n])
        if ngram in seen_ngrams:
            continue
        seen_ngrams.add(ngram)

        # "overlap and" guards n == 1, where a [-0:] slice would be the
        # whole list rather than an empty suffix.
        if overlap and output_list[-overlap:] == list(ngram[:-1]):
            output_list.append(ngram[-1])
        else:
            output_list.extend(ngram)

    # Handle the tail exactly once. It is already present when the final
    # window was new; re-adding it would duplicate the end of the input
    # (e.g. ['a', 'b', 'c'] with n=3 must stay ['a', 'b', 'c']).
    # max() keeps the start index from going negative for short inputs.
    tail = list(string_list[max(total - overlap, 0):])
    if tail and output_list[-len(tail):] != tail:
        for word in tail:
            if not output_list or output_list[-1] != word:
                output_list.append(word)

    return output_list