Source code for pythainlp.generate.core

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Text generator using n-gram language model

The code is adapted from
https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
"""

import random
from typing import List, Union

from pythainlp.corpus.oscar import (
    unigram_word_freqs as oscar_word_freqs_unigram,
)
from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
from pythainlp.corpus.tnc import trigram_word_freqs as tnc_word_freqs_trigram
from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram
from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram


class Unigram:
    """
    Text generator using Unigram

    Words are drawn uniformly at random from the corpus vocabulary whose
    relative frequency is at least the ``prob`` threshold given to
    :meth:`gen_sentence`.

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
        * *ttc* - Thai Textbook Corpus (TTC)
        * *oscar* - OSCAR Corpus
    :raises ValueError: if *name* is not one of the supported corpus names
    """

    def __init__(self, name: str = "tnc"):
        if name == "tnc":
            self.counts = tnc_word_freqs_unigram()
        elif name == "ttc":
            self.counts = ttc_word_freqs_unigram()
        elif name == "oscar":
            self.counts = oscar_word_freqs_unigram()
        else:
            # Fail early with a clear message instead of an AttributeError
            # on the missing ``self.counts`` a few lines below.
            raise ValueError(
                f"Unknown corpus name: {name!r}. "
                "Supported names are 'tnc', 'ttc' and 'oscar'."
            )
        self.word = list(self.counts.keys())
        # Total number of word occurrences in the corpus.
        self.n = sum(self.counts[w] for w in self.word)
        # Relative frequency of every word.
        self.prob = {w: self.counts[w] / self.n for w in self.word}
        # Vocabulary filtered by the threshold of the latest gen_sentence().
        self._word_prob: dict = {}

    def gen_sentence(
        self,
        start_seq: str = "",
        N: int = 3,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        :param str start_seq: word to begin sentence with
            (a random corpus word is used when empty)
        :param int N: number of words to add after the start word
        :param float prob: minimum relative frequency a word needs
            to be eligible
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Unigram

            gen = Unigram()

            gen.gen_sentence("แมว")
            # output: 'แมวเวลานะนั้น'
        """
        if not start_seq:
            start_seq = random.choice(self.word)
        rand_text = start_seq.lower()
        # Keep only the words frequent enough to pass the threshold.
        self._word_prob = {}
        for w in self.word:
            p = self.counts[w] / self.n
            if p >= prob:
                self._word_prob[w] = p
        return self._next_word(
            rand_text, N, output_str, prob=prob, duplicate=duplicate
        )

    def _next_word(
        self,
        text: str,
        N: int,
        output_str: bool,
        prob: float,
        duplicate: bool = False,
    ):
        """
        Append up to *N* randomly chosen words to *text*.

        :param str text: first word of the sentence
        :param int N: requested number of additional words
        :param bool output_str: join the words into one string
        :param float prob: unused here; the threshold has already been
            applied to ``self._word_prob``
        :param bool duplicate: allow a word to appear more than once

        :return: list of words or a word string
        """
        words = [text]
        word_list = list(self._word_prob.keys())

        # Number of words that can actually be drawn.  Without duplicates
        # the start word itself is not available any more; ignoring that
        # made the rejection loop below spin forever once every candidate
        # had been used.
        available = len(word_list)
        if not duplicate and text in self._word_prob:
            available -= 1
        N = min(N, available)

        used = set(words)
        for _ in range(N):
            w = random.choice(word_list)
            if not duplicate:
                # Terminates: at least one unused word is left (see above).
                while w in used:
                    w = random.choice(word_list)
                used.add(w)
            words.append(w)

        if output_str:
            return "".join(words)
        return words
class Bigram:
    """
    Text generator using Bigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
    :raises ValueError: if *name* is not a supported corpus name
    """

    def __init__(self, name: str = "tnc"):
        if name != "tnc":
            # Fail early with a clear message instead of an AttributeError
            # on the missing ``self.uni`` below.
            raise ValueError(
                f"Unknown corpus name: {name!r}. Supported name is 'tnc'."
            )
        self.uni = tnc_word_freqs_unigram()
        self.bi = tnc_word_freqs_bigram()
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        # Second word of every bigram; pool for a random start word.
        self.words = [i[-1] for i in self.bi_keys]

    def prob(self, t1: str, t2: str) -> float:
        """
        probability of word

        Computed as ``count(t1, t2) / count(t1)``.

        :param str t1: text 1
        :param str t2: text 2

        :return: probability value; 0.0 when the bigram or the first
            word is not in the corpus
        :rtype: float
        """
        try:
            v = self.bi[(t1, t2)] / self.uni[t1]
        except (KeyError, ZeroDivisionError):
            # Unseen bigram / unseen first word / zero count.
            v = 0.0
        return v

    def gen_sentence(
        self,
        start_seq: str = "",
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        :param str start_seq: word to begin sentence with
            (a random word is used when empty)
        :param int N: maximum number of words to add after the start word
        :param float prob: minimum bigram probability a continuation
            needs to be eligible
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Bigram

            gen = Bigram()

            gen.gen_sentence("แมว")
            # output: 'แมวไม่ได้รับเชื้อมัน'
        """
        if not start_seq:
            start_seq = random.choice(self.words)
        late_word = start_seq
        list_word = [start_seq]

        for _ in range(N):
            # Bigrams that continue the current word, respect the
            # duplicate setting and are probable enough.
            candidates = [
                pair
                for pair in self.bi_keys
                if pair[0] == late_word
                and (duplicate or pair[1] not in list_word)
                and self.prob(late_word, pair[-1]) >= prob
            ]
            if not candidates:
                break
            # Choose among the candidates themselves.  Choosing a
            # probability value and mapping it back with list.index()
            # always returned the first of several equally probable
            # continuations, so the others could never be generated.
            late_word = random.choice(candidates)[-1]
            list_word.append(late_word)

        if output_str:
            return "".join(list_word)
        return list_word
class Trigram:
    """
    Text generator using Trigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
    :raises ValueError: if *name* is not a supported corpus name
    """

    def __init__(self, name: str = "tnc"):
        if name != "tnc":
            # Fail early with a clear message instead of an AttributeError
            # on the missing ``self.uni`` below.
            raise ValueError(
                f"Unknown corpus name: {name!r}. Supported name is 'tnc'."
            )
        self.uni = tnc_word_freqs_unigram()
        self.bi = tnc_word_freqs_bigram()
        self.ti = tnc_word_freqs_trigram()
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        self.ti_keys = list(self.ti.keys())
        # Second word of every bigram.
        self.words = [i[-1] for i in self.bi_keys]

    def prob(self, t1: str, t2: str, t3: str) -> float:
        """
        probability of word

        Computed as ``count(t1, t2, t3) / count(t1, t2)``.

        :param str t1: text 1
        :param str t2: text 2
        :param str t3: text 3

        :return: probability value; 0.0 when the trigram or its leading
            bigram is not in the corpus
        :rtype: float
        """
        try:
            v = self.ti[(t1, t2, t3)] / self.bi[(t1, t2)]
        except (KeyError, ZeroDivisionError):
            # Unseen trigram / unseen leading bigram / zero count.
            v = 0.0
        return v

    def gen_sentence(
        self,
        start_seq: Union[str, tuple] = "",
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        :param start_seq: word to begin sentence with. A two-word tuple
            is used directly as the starting context. A single word is
            extended with a random bigram that starts with it. A random
            bigram is used when empty.
        :param int N: maximum number of words to add after the
            starting two-word context
        :param float prob: minimum trigram probability a continuation
            needs to be eligible
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Trigram

            gen = Trigram()

            gen.gen_sentence()
            # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ'
        """
        if not start_seq:
            context = random.choice(self.bi_keys)
        elif isinstance(start_seq, str):
            # A trigram model needs a two-word context.  A bare string
            # never matched any context and was later split into
            # characters, so build the context from a bigram that starts
            # with the given word.
            seeds = [pair for pair in self.bi_keys if pair[0] == start_seq]
            if not seeds:
                # Nothing to continue from: return the word unchanged.
                return start_seq if output_str else [start_seq]
            context = random.choice(seeds)
        else:
            context = tuple(start_seq)

        # Successive two-word contexts; each overlaps the previous by one.
        contexts = [context]
        for _ in range(N):
            candidates = [
                tri
                for tri in self.ti_keys
                if tri[:2] == context
                and (duplicate or tri[1:] not in contexts)
                and self.prob(tri[0], tri[1], tri[2]) >= prob
            ]
            if not candidates:
                break
            # Choose among the candidates themselves.  Mapping a chosen
            # probability back with list.index() always returned the
            # first of several equally probable continuations.
            context = random.choice(candidates)[1:]
            contexts.append(context)

        # Merge the overlapping contexts: every context after the first
        # contributes only its last word.
        words = list(contexts[0])
        words.extend(ctx[-1] for ctx in contexts[1:])

        if not duplicate:
            # Drop repeated words, keeping first occurrences.  This used
            # to run unconditionally, which made duplicate=True a no-op.
            unique_words = []
            for w in words:
                if w not in unique_words:
                    unique_words.append(w)
            words = unique_words

        if output_str:
            return "".join(words)
        return words