Source code for pythainlp.lm.text_util

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: C901
from __future__ import annotations


[docs] def calculate_ngram_counts( list_words: list[str], n_min: int = 2, n_max: int = 4 ) -> dict[tuple[str, ...], int]: """Calculates the counts of n-grams in the list words for the specified range. :param List[str] list_words: List of string :param int n_min: The minimum n-gram size (default: 2). :param int n_max: The maximum n-gram size (default: 4). :return: A dictionary where keys are n-grams and values are their counts. :rtype: dict[tuple[str, ...], int] """ if not list_words: return {} ngram_counts: dict[tuple[str, ...], int] = {} for n in range(n_min, n_max + 1): for i in range(len(list_words) - n + 1): ngram = tuple(list_words[i : i + n]) ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1 return ngram_counts
[docs] def remove_repeated_ngrams(string_list: list[str], n: int = 2) -> list[str]: """Remove repeated n-grams :param List[str] string_list: List of string :param int n: n-gram size :return: List of string :rtype: list[str] :Example: :: from pythainlp.lm import remove_repeated_ngrams remove_repeated_ngrams(["เอา", "เอา", "แบบ", "ไหน"], n=1) # output: ['เอา', 'แบบ', 'ไหน'] """ if not string_list or n <= 0: return string_list unique_ngrams = set() output_list: list[str] = [] for i in range(len(string_list)): if i + n <= len(string_list): ngram = tuple(string_list[i : i + n]) if ngram not in unique_ngrams: unique_ngrams.add(ngram) if not output_list or output_list[-(n - 1) :] != list( ngram[:-1] ): output_list.extend(ngram) else: output_list.append(ngram[-1]) else: for char in string_list[i:]: if not output_list or output_list[-1] != char: output_list.append(char) return output_list