# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
# ruff: noqa: C901
from typing import List, Tuple, Dict
[docs]
def calculate_ngram_counts(
list_words: List[str],
n_min: int = 2,
n_max: int = 4) -> Dict[Tuple[str], int]:
"""
Calculates the counts of n-grams in the list words for the specified range.
:param List[str] list_words: List of string
:param int n_min: The minimum n-gram size (default: 2).
:param int n_max: The maximum n-gram size (default: 4).
:return: A dictionary where keys are n-grams and values are their counts.
:rtype: Dict[Tuple[str], int]
"""
ngram_counts = {}
for n in range(n_min, n_max + 1):
for i in range(len(list_words) - n + 1):
ngram = tuple(list_words[i:i + n])
ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1
return ngram_counts
[docs]
def remove_repeated_ngrams(string_list: List[str], n: int = 2) -> List[str]:
"""
Remove repeated n-grams
:param List[str] string_list: List of string
:param int n: n-gram size
:return: List of string
:rtype: List[str]
:Example:
::
from pythainlp.lm import remove_repeated_ngrams
remove_repeated_ngrams(['เอา', 'เอา', 'แบบ', 'ไหน'], n=1)
# output: ['เอา', 'แบบ', 'ไหน']
"""
if not string_list or n <= 0:
return string_list
unique_ngrams = set()
output_list = []
for i in range(len(string_list)):
if i + n <= len(string_list):
ngram = tuple(string_list[i:i + n])
if ngram not in unique_ngrams:
unique_ngrams.add(ngram)
if not output_list or output_list[-(n - 1):] != list(ngram[:-1]):
output_list.extend(ngram)
else:
output_list.append(ngram[-1])
else:
for char in string_list[i:]:
if not output_list or output_list[-1] != char:
output_list.append(char)
return output_list