# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Text generator using n-gram language model

Code adapted from
https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
"""
import random
from typing import List, Optional, Tuple, Union

from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram
from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
from pythainlp.corpus.tnc import trigram_word_freqs as tnc_word_freqs_trigram
from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram
from pythainlp.corpus.oscar import (
    unigram_word_freqs as oscar_word_freqs_unigram,
)


class Unigram:
    """
    Text generator using Unigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
        * *ttc* - Thai Textbook Corpus (TTC)
        * *oscar* - OSCAR Corpus
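
    A corpus is chosen by name when the generator is constructed, for
    example (illustrative sketch; it assumes the corpus word-frequency
    data can be loaded):

    :Example:
    ::

        from pythainlp.generate import Unigram

        gen = Unigram()  # Thai National Corpus (default)
        gen_ttc = Unigram("ttc")  # Thai Textbook Corpus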
"""
    def __init__(self, name: str = "tnc"):
        if name == "tnc":
            self.counts = tnc_word_freqs_unigram()
        elif name == "ttc":
            self.counts = ttc_word_freqs_unigram()
        elif name == "oscar":
            self.counts = oscar_word_freqs_unigram()
        self.word = list(self.counts.keys())
        # total token count of the corpus
        self.n = sum(self.counts[i] for i in self.word)
        # unigram probability of each word: count(word) / total count
        self.prob = {i: self.counts[i] / self.n for i in self.word}
        self._word_prob = {}

    def gen_sentence(
        self,
        start_seq: Optional[str] = None,
        N: int = 3,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        :param str start_seq: word to begin the sentence with
            (a random word from the corpus if not given)
        :param int N: number of words to generate
        :param float prob: minimum unigram probability for a word to be considered
        :param bool output_str: return the output as a single string
        :param bool duplicate: allow duplicate words in the sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Unigram

            gen = Unigram()

            gen.gen_sentence("แมว")
            # output: 'แมวเวลานะนั้น'
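
            # output_str=False returns the generated words as a list instead
            # of a joined string (illustrative call; output varies per run)
            gen.gen_sentence("แมว", output_str=False)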
"""
if start_seq is None:
start_seq = random.choice(self.word)
rand_text = start_seq.lower()
self._word_prob = {
i: self.counts[i] / self.n
for i in self.word
if self.counts[i] / self.n >= prob
}
return self._next_word(
rand_text, N, output_str, prob=prob, duplicate=duplicate
)

    def _next_word(
        self,
        text: str,
        N: int,
        output_str: bool,
        prob: float,
        duplicate: bool = False,
    ):
        self.words = []
        self.words.append(text)
        self._word_list = list(self._word_prob.keys())
        if N > len(self._word_list):
            N = len(self._word_list)
        for _ in range(N):
            self._word = random.choice(self._word_list)
            if duplicate is False:
                # resample until the word has not been used in this sentence
                while self._word in self.words:
                    self._word = random.choice(self._word_list)
            self.words.append(self._word)
        if output_str:
            return "".join(self.words)
        return self.words


class Bigram:
    """
    Text generator using Bigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
    """

    def __init__(self, name: str = "tnc"):
        if name == "tnc":
            self.uni = tnc_word_freqs_unigram()
            self.bi = tnc_word_freqs_bigram()
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        # the second word of every bigram in the corpus
        self.words = [i[-1] for i in self.bi_keys]

    def prob(self, t1: str, t2: str) -> float:
        """
        Conditional probability of the word pair:
        P(t2 | t1) = count(t1, t2) / count(t1)

        :param str t1: first word
        :param str t2: second word

        :return: probability value
        :rtype: float
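
        :Example:
        ::

            from pythainlp.generate import Bigram

            gen = Bigram()
            # conditional probability of "ไป" given "ไม่"
            # (illustrative word pair; the value depends on the corpus counts)
            gen.prob("ไม่", "ไป")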
"""
try:
v = self.bi[(t1, t2)] / self.uni[t1]
except ZeroDivisionError:
v = 0.0
return v

    def gen_sentence(
        self,
        start_seq: Optional[str] = None,
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        :param str start_seq: word to begin the sentence with
            (a random word from the corpus if not given)
        :param int N: number of words to generate
        :param float prob: minimum bigram probability for a next word to be considered
        :param bool output_str: return the output as a single string
        :param bool duplicate: allow duplicate words in the sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Bigram

            gen = Bigram()

            gen.gen_sentence("แมว")
            # output: 'แมวไม่ได้รับเชื้อมัน'
        """
        if start_seq is None:
            start_seq = random.choice(self.words)
        self.late_word = start_seq
        self.list_word = []
        self.list_word.append(start_seq)
        for _ in range(N):
            if duplicate:
                # all bigrams that start with the latest word
                self._temp = [
                    j for j in self.bi_keys if j[0] == self.late_word
                ]
            else:
                # all bigrams that start with the latest word and whose
                # second word has not been used in this sentence yet
                self._temp = [
                    j
                    for j in self.bi_keys
                    if j[0] == self.late_word and j[1] not in self.list_word
                ]
            self._probs = [
                self.prob(self.late_word, next_word[-1])
                for next_word in self._temp
            ]
            # keep only candidates whose probability passes the threshold
            self._p2 = [j for j in self._probs if j >= prob]
            if len(self._p2) == 0:
                break
            self.items = self._temp[self._probs.index(random.choice(self._p2))]
            self.late_word = self.items[-1]
            self.list_word.append(self.late_word)
        if output_str:
            return "".join(self.list_word)
        return self.list_word


class Trigram:
    """
    Text generator using Trigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
    """

    def __init__(self, name: str = "tnc"):
        if name == "tnc":
            self.uni = tnc_word_freqs_unigram()
            self.bi = tnc_word_freqs_bigram()
            self.ti = tnc_word_freqs_trigram()
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        self.ti_keys = list(self.ti.keys())
        self.words = [i[-1] for i in self.bi_keys]

    def prob(self, t1: str, t2: str, t3: str) -> float:
        """
        Conditional probability of the word triple:
        P(t3 | t1, t2) = count(t1, t2, t3) / count(t1, t2)

        :param str t1: first word
        :param str t2: second word
        :param str t3: third word

        :return: probability value
        :rtype: float
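
        :Example:
        ::

            from pythainlp.generate import Trigram

            gen = Trigram()
            # conditional probability of "ไป" given the pair ("ผม", "จะ")
            # (illustrative words; the value depends on the corpus counts)
            gen.prob("ผม", "จะ", "ไป")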
"""
try:
v = self.ti[(t1, t2, t3)] / self.bi[(t1, t2)]
except ZeroDivisionError:
v = 0.0
return v

    def gen_sentence(
        self,
        start_seq: Optional[Tuple[str, str]] = None,
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        :param tuple start_seq: two-word tuple to begin the sentence with
            (a random bigram from the corpus if not given)
        :param int N: number of words to generate
        :param float prob: minimum trigram probability for a next word pair to be considered
        :param bool output_str: return the output as a single string
        :param bool duplicate: allow duplicate words in the sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Trigram

            gen = Trigram()

            gen.gen_sentence()
            # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ'
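
            # start_seq, if given, is a two-word tuple, ideally one that
            # occurs in the corpus (illustrative pair; output will vary)
            gen.gen_sentence(("ผม", "จะ"))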
"""
if start_seq is None:
start_seq = random.choice(self.bi_keys)
self.late_word = start_seq
self.list_word = []
self.list_word.append(start_seq)
for i in range(N):
if duplicate:
self._temp = [
j for j in self.ti_keys if j[:2] == self.late_word
]
else:
self._temp = [
j
for j in self.ti_keys
if j[:2] == self.late_word and j[1:] not in self.list_word
]
self._probs = [
self.prob(word[0], word[1], word[2]) for word in self._temp
]
self._p2 = [j for j in self._probs if j >= prob]
if len(self._p2) == 0:
break
self.items = self._temp[self._probs.index(random.choice(self._p2))]
self.late_word = self.items[1:]
self.list_word.append(self.late_word)
self.listdata = []
for i in self.list_word:
for j in i:
if j not in self.listdata:
self.listdata.append(j)
if output_str:
return "".join(self.listdata)
return self.listdata