# Source code for pythainlp.generate.core

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Text generator using n-gram language model

The code is from
https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
"""

import random
from typing import List, Union

from pythainlp.corpus.oscar import (
    unigram_word_freqs as oscar_word_freqs_unigram,
)
from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
from pythainlp.corpus.tnc import trigram_word_freqs as tnc_word_freqs_trigram
from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram
from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram



# [docs]
class Unigram:
    """
    Text generator using Unigram

    Words are picked uniformly at random from the corpus words whose
    relative frequency is at least the threshold passed to
    :meth:`gen_sentence`.

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
        * *ttc* - Thai Textbook Corpus (TTC)
        * *oscar* - OSCAR Corpus
    :raises ValueError: if *name* is not one of the corpus names above
    """

    def __init__(self, name: str = "tnc"):
        if name == "tnc":
            self.counts = tnc_word_freqs_unigram()
        elif name == "ttc":
            self.counts = ttc_word_freqs_unigram()
        elif name == "oscar":
            self.counts = oscar_word_freqs_unigram()
        else:
            # Fail early with a clear message instead of an AttributeError
            # on the missing ``self.counts`` a few lines below.
            raise ValueError(
                f"Unknown corpus name: {name!r}; "
                "expected 'tnc', 'ttc' or 'oscar'"
            )
        self.word = list(self.counts.keys())
        # Total number of word occurrences in the corpus.
        self.n = sum(self.counts.values())
        # Relative frequency of every word.
        self.prob = {w: c / self.n for w, c in self.counts.items()}
        # Words that passed the threshold of the latest gen_sentence() call.
        self._word_prob: dict = {}

    def gen_sentence(
        self,
        start_seq: str = "",
        N: int = 3,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        Generate a sentence by appending random words to a start word.

        :param str start_seq: word to begin sentence with; a random corpus
            word is used when empty. The word is lower-cased.
        :param int N: number of words to append after the start word
        :param float prob: minimum relative frequency a word must have
            to be eligible
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Unigram

            gen = Unigram()

            gen.gen_sentence("แมว")
            # output: 'แมวเวลานะนั้น'
        """
        if not start_seq:
            start_seq = random.choice(self.word)
        rand_text = start_seq.lower()
        # Keep only the words that are frequent enough.
        self._word_prob = {
            w: p for w, p in self.prob.items() if p >= prob
        }
        return self._next_word(
            rand_text, N, output_str, prob=prob, duplicate=duplicate
        )

    def _next_word(
        self,
        text: str,
        N: int,
        output_str: bool,
        prob: float,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        Append up to *N* random eligible words to *text*.

        :param str text: first word of the sentence
        :param int N: requested number of words to append; capped by the
            number of eligible words
        :param bool output_str: join the words into one string
        :param float prob: unused here; the threshold has already been
            applied when ``self._word_prob`` was built
        :param bool duplicate: allow a word to appear more than once

        :return: list of words or a word string
        """
        words = [text]
        word_list = list(self._word_prob.keys())
        if duplicate:
            count = max(0, min(N, len(word_list)))
            words.extend(random.choice(word_list) for _ in range(count))
        else:
            # Sample without replacement from the words not used yet.
            # Retrying random.choice() until an unused word shows up never
            # terminates once every eligible word is already in the sentence
            # (e.g. when the start word itself is eligible and N is large).
            pool = [w for w in word_list if w != text]
            count = max(0, min(N, len(pool)))
            words.extend(random.sample(pool, count))

        if output_str:
            return "".join(words)
        return words




# [docs]
class Bigram:
    """
    Text generator using Bigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
    :raises ValueError: if *name* is not a supported corpus name
    """

    def __init__(self, name: str = "tnc"):
        if name == "tnc":
            self.uni = tnc_word_freqs_unigram()
            self.bi = tnc_word_freqs_bigram()
        else:
            # Fail early with a clear message instead of an AttributeError
            # on the missing ``self.uni`` below.
            raise ValueError(
                f"Unknown corpus name: {name!r}; expected 'tnc'"
            )
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        # Second word of every bigram; used to pick a random start word.
        self.words = [i[-1] for i in self.bi_keys]

    def prob(self, t1: str, t2: str) -> float:
        """
        probability of word *t2* following word *t1*

        Computed as ``count(t1, t2) / count(t1)``.

        :param str t1: text 1
        :param str t2: text 2

        :return: probability value; 0.0 when the bigram or the first word
            is not in the corpus, or when the count of *t1* is zero
        :rtype: float
        """
        try:
            return self.bi[(t1, t2)] / self.uni[t1]
        except (KeyError, ZeroDivisionError):
            # Unseen bigram / unseen word / zero count: no probability mass.
            return 0.0

    def gen_sentence(
        self,
        start_seq: str = "",
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        Generate a sentence by repeatedly picking a word that follows
        the latest word in a corpus bigram.

        Generation stops early when no eligible next word exists.

        :param str start_seq: word to begin sentence with; a random word
            is used when empty
        :param int N: maximum number of words to append
        :param float prob: minimum bigram probability a candidate
            must have to be eligible
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Bigram

            gen = Bigram()

            gen.gen_sentence("แมว")
            # output: 'แมวไม่ได้รับเชื้อมัน'
        """
        if not start_seq:
            start_seq = random.choice(self.words)
        late_word = start_seq
        list_word = [start_seq]

        for _ in range(N):
            # Bigrams that continue the latest word, are likely enough, and
            # (unless duplicates are allowed) introduce a new word.
            candidates = [
                pair
                for pair in self.bi_keys
                if pair[0] == late_word
                and (duplicate or pair[1] not in list_word)
                and self.prob(late_word, pair[-1]) >= prob
            ]
            if not candidates:
                break
            # Choose among the candidates themselves. Choosing a probability
            # value and mapping it back with list.index() would always land
            # on the first candidate sharing that value.
            late_word = random.choice(candidates)[-1]
            list_word.append(late_word)

        if output_str:
            return "".join(list_word)

        return list_word





# [docs]
class Trigram:
    """
    Text generator using Trigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
    :raises ValueError: if *name* is not a supported corpus name
    """

    def __init__(self, name: str = "tnc"):
        if name == "tnc":
            self.uni = tnc_word_freqs_unigram()
            self.bi = tnc_word_freqs_bigram()
            self.ti = tnc_word_freqs_trigram()
        else:
            # Fail early with a clear message instead of an AttributeError
            # on the missing ``self.uni`` below.
            raise ValueError(
                f"Unknown corpus name: {name!r}; expected 'tnc'"
            )
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        self.ti_keys = list(self.ti.keys())
        # Second word of every bigram.
        self.words = [i[-1] for i in self.bi_keys]

    def prob(self, t1: str, t2: str, t3: str) -> float:
        """
        probability of word *t3* following the words *t1* *t2*

        Computed as ``count(t1, t2, t3) / count(t1, t2)``.

        :param str t1: text 1
        :param str t2: text 2
        :param str t3: text 3

        :return: probability value; 0.0 when the trigram or its leading
            bigram is not in the corpus, or when the bigram count is zero
        :rtype: float
        """
        try:
            return self.ti[(t1, t2, t3)] / self.bi[(t1, t2)]
        except (KeyError, ZeroDivisionError):
            # Unseen trigram / unseen bigram / zero count.
            return 0.0

    def gen_sentence(
        self,
        start_seq: Union[str, tuple] = "",
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        Generate a sentence by repeatedly picking a word that follows
        the latest two words in a corpus trigram.

        Generation stops early when no eligible next word exists.

        :param start_seq: word to begin sentence with. A single word is
            extended to a random corpus bigram starting with that word
            (the word alone is returned if there is none). A pair of words
            is used as the starting bigram as is. A random corpus bigram
            is used when empty.
        :param int N: maximum number of words to append after the
            starting bigram
        :param float prob: minimum trigram probability a candidate
            must have to be eligible
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Trigram

            gen = Trigram()

            gen.gen_sentence()
            # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ'
        """
        if not start_seq:
            late_word = tuple(random.choice(self.bi_keys))
        elif isinstance(start_seq, str):
            # Trigram prefixes are word pairs, so a lone word must first be
            # extended to a bigram. Comparing the string itself against the
            # prefixes never matches, and flattening it would split the word
            # into characters.
            openings = [
                pair for pair in self.bi_keys if pair[0] == start_seq
            ]
            if not openings:
                return start_seq if output_str else [start_seq]
            late_word = tuple(random.choice(openings))
        else:
            late_word = tuple(start_seq)
        words = list(late_word)

        for _ in range(N):
            # Trigrams that continue the latest bigram, are likely enough,
            # and (unless duplicates are allowed) introduce a new word.
            candidates = [
                tri
                for tri in self.ti_keys
                if tri[:2] == late_word
                and (duplicate or tri[2] not in words)
                and self.prob(tri[0], tri[1], tri[2]) >= prob
            ]
            if not candidates:
                break
            # Choose among the candidates themselves; mapping a chosen
            # probability back with list.index() would always land on the
            # first candidate sharing that value.
            chosen = random.choice(candidates)
            late_word = tuple(chosen[1:])
            words.append(chosen[2])

        if not duplicate:
            # The starting bigram itself may repeat a word; keep the first
            # occurrence only. Repeats are kept when duplicates are allowed.
            words = list(dict.fromkeys(words))

        if output_str:
            return "".join(words)

        return words