# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Text generators using n-gram language models.

The code is adapted from:
https://towardsdatascience.com/understanding-word-n-grams-and-n-gram-probability-in-natural-language-processing-9d9eef0fa058
"""
import random
from typing import List, Union
from pythainlp.corpus.oscar import (
unigram_word_freqs as oscar_word_freqs_unigram,
)
from pythainlp.corpus.tnc import bigram_word_freqs as tnc_word_freqs_bigram
from pythainlp.corpus.tnc import trigram_word_freqs as tnc_word_freqs_trigram
from pythainlp.corpus.tnc import unigram_word_freqs as tnc_word_freqs_unigram
from pythainlp.corpus.ttc import unigram_word_freqs as ttc_word_freqs_unigram
class Unigram:
    """
    Text generator using Unigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
        * *ttc* - Thai Textbook Corpus (TTC)
        * *oscar* - OSCAR Corpus
    :raises ValueError: if *name* is not one of the supported corpus names
    """

    def __init__(self, name: str = "tnc"):
        if name == "tnc":
            self.counts = tnc_word_freqs_unigram()
        elif name == "ttc":
            self.counts = ttc_word_freqs_unigram()
        elif name == "oscar":
            self.counts = oscar_word_freqs_unigram()
        else:
            # Fail early instead of leaving the instance half-initialised
            # (which would surface later as an opaque AttributeError).
            raise ValueError(
                f"Unknown corpus name: {name!r}. "
                "Supported names are 'tnc', 'ttc' and 'oscar'."
            )
        self.word = list(self.counts.keys())
        # Total number of word occurrences in the corpus.
        self.n = sum(self.counts[w] for w in self.word)
        # Relative frequency of every word.
        self.prob = {
            w: (self.counts[w] / self.n if self.n else 0.0) for w in self.word
        }
        # Words whose relative frequency passed the threshold of the most
        # recent gen_sentence() call.
        self._word_prob: dict = {}

    def gen_sentence(
        self,
        start_seq: str = "",
        N: int = 3,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        Generate a sentence by drawing words uniformly at random from the
        words whose relative frequency is at least *prob*.

        :param str start_seq: word to begin sentence with
            (a random corpus word is used when empty)
        :param int N: maximum number of words to add after the start word
        :param float prob: minimum relative frequency a word must have
            to be a candidate
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Unigram

            gen = Unigram()

            gen.gen_sentence("แมว")
            # output: 'แมวเวลานะนั้น'
        """
        if not start_seq:
            start_seq = random.choice(self.word)
        rand_text = start_seq.lower()

        word_prob = {}
        if self.n:
            for w in self.word:
                p = self.counts[w] / self.n
                if p >= prob:
                    word_prob[w] = p
        self._word_prob = word_prob

        return self._next_word(
            rand_text, N, output_str, prob=prob, duplicate=duplicate
        )

    def _next_word(
        self,
        text: str,
        N: int,
        output_str: bool,
        prob: float,
        duplicate: bool = False,
    ):
        """
        Append up to *N* randomly chosen candidate words to *text*.

        :param str text: the first word of the sentence
        :param int N: maximum number of words to append
        :param bool output_str: join the words into one string if True
        :param float prob: unused here; candidates were already filtered
            by gen_sentence()
        :param bool duplicate: allow the same word to appear more than once

        :return: list of words or a word string
        """
        words = [text]
        candidates = list(self._word_prob.keys())

        if duplicate:
            count = max(0, min(N, len(candidates)))
            for _ in range(count):
                words.append(random.choice(candidates))
        else:
            # Drop the start word up front and sample without replacement.
            # Re-drawing until an unused word shows up never terminates
            # once every candidate has been used.
            unused = [w for w in candidates if w != text]
            count = max(0, min(N, len(unused)))
            words.extend(random.sample(unused, count))

        if output_str:
            return "".join(words)
        return words
class Bigram:
    """
    Text generator using Bigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
    :raises ValueError: if *name* is not a supported corpus name
    """

    def __init__(self, name: str = "tnc"):
        if name != "tnc":
            # Fail early instead of leaving the instance without its
            # frequency tables.
            raise ValueError(
                f"Unknown corpus name: {name!r}. Supported name is 'tnc'."
            )
        self.uni = tnc_word_freqs_unigram()
        self.bi = tnc_word_freqs_bigram()
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        # Second word of every bigram; used to pick a random start word.
        self.words = [i[-1] for i in self.bi_keys]

    def prob(self, t1: str, t2: str) -> float:
        """
        Conditional probability of *t2* following *t1*, computed as
        count(t1, t2) / count(t1).

        :param str t1: text 1
        :param str t2: text 2

        :return: probability value (0.0 when the bigram or the first
            word is not in the frequency tables)
        :rtype: float
        """
        try:
            v = self.bi[(t1, t2)] / self.uni[t1]
        except (KeyError, ZeroDivisionError):
            # Unseen n-gram or zero count: treat as impossible.
            v = 0.0
        return v

    def gen_sentence(
        self,
        start_seq: str = "",
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        Generate a sentence by repeatedly picking, uniformly at random,
        a next word whose bigram probability given the previous word is
        at least *prob*.

        :param str start_seq: word to begin sentence with
            (a random word is used when empty)
        :param int N: maximum number of words to add after the start word
        :param float prob: minimum bigram probability for a next word
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Bigram

            gen = Bigram()

            gen.gen_sentence("แมว")
            # output: 'แมวไม่ได้รับเชื้อมัน'
        """
        if not start_seq:
            start_seq = random.choice(self.words)
        late_word = start_seq
        list_word = [start_seq]
        used = {start_seq}

        for _ in range(N):
            # Collect the eligible next words themselves. Picking a
            # probability value and mapping it back with list.index()
            # would always resolve ties to the first candidate.
            candidates = [
                key[-1]
                for key in self.bi_keys
                if key[0] == late_word
                and (duplicate or key[-1] not in used)
                and self.prob(late_word, key[-1]) >= prob
            ]
            if not candidates:
                break
            late_word = random.choice(candidates)
            list_word.append(late_word)
            used.add(late_word)

        if output_str:
            return "".join(list_word)
        return list_word
class Trigram:
    """
    Text generator using Trigram

    :param str name: corpus name
        * *tnc* - Thai National Corpus (default)
    :raises ValueError: if *name* is not a supported corpus name
    """

    def __init__(self, name: str = "tnc"):
        if name != "tnc":
            # Fail early instead of leaving the instance without its
            # frequency tables.
            raise ValueError(
                f"Unknown corpus name: {name!r}. Supported name is 'tnc'."
            )
        self.uni = tnc_word_freqs_unigram()
        self.bi = tnc_word_freqs_bigram()
        self.ti = tnc_word_freqs_trigram()
        self.uni_keys = list(self.uni.keys())
        self.bi_keys = list(self.bi.keys())
        self.ti_keys = list(self.ti.keys())
        self.words = [i[-1] for i in self.bi_keys]

    def prob(self, t1: str, t2: str, t3: str) -> float:
        """
        Conditional probability of *t3* following (*t1*, *t2*), computed
        as count(t1, t2, t3) / count(t1, t2).

        :param str t1: text 1
        :param str t2: text 2
        :param str t3: text 3

        :return: probability value (0.0 when the trigram or its leading
            bigram is not in the frequency tables)
        :rtype: float
        """
        try:
            v = self.ti[(t1, t2, t3)] / self.bi[(t1, t2)]
        except (KeyError, ZeroDivisionError):
            # Unseen n-gram or zero count: treat as impossible.
            v = 0.0
        return v

    def gen_sentence(
        self,
        start_seq: Union[str, tuple] = "",
        N: int = 4,
        prob: float = 0.001,
        output_str: bool = True,
        duplicate: bool = False,
    ) -> Union[List[str], str]:
        """
        Generate a sentence by repeatedly picking, uniformly at random,
        a next word whose trigram probability given the previous two
        words is at least *prob*.

        :param start_seq: word to begin sentence with. A single word is
            expanded to a random bigram starting with that word; a
            two-word tuple is used as the starting bigram directly; when
            empty, a random bigram is used.
        :param int N: maximum number of words to add after the
            starting bigram
        :param float prob: minimum trigram probability for a next word
        :param bool output_str: output as string
        :param bool duplicate: allow duplicate words in sentence

        :return: list of words or a word string
        :rtype: List[str], str

        :Example:
        ::

            from pythainlp.generate import Trigram

            gen = Trigram()

            gen.gen_sentence()
            # output: 'ยังทำตัวเป็นเซิร์ฟเวอร์คือ'
        """
        if not start_seq:
            context = tuple(random.choice(self.bi_keys))
        elif isinstance(start_seq, str):
            # Trigram keys are matched on their leading *pair* of words,
            # so a lone start word must first be extended to a bigram.
            seeds = [key for key in self.bi_keys if key[0] == start_seq]
            if not seeds:
                # Nothing in the corpus follows this word.
                return start_seq if output_str else [start_seq]
            context = tuple(random.choice(seeds))
        else:
            context = tuple(start_seq)

        # Track the sentence as a flat word list. Flattening overlapping
        # bigrams with de-duplication afterwards would silently drop
        # legitimate repeats when duplicate=True.
        if duplicate:
            words = list(context)
        else:
            words = list(dict.fromkeys(context))
        used = set(words)

        for _ in range(N):
            # Collect eligible trigrams directly. Picking a probability
            # value and mapping it back with list.index() would always
            # resolve ties to the first candidate.
            candidates = [
                key
                for key in self.ti_keys
                if key[:2] == context
                and (duplicate or key[2] not in used)
                and self.prob(key[0], key[1], key[2]) >= prob
            ]
            if not candidates:
                break
            chosen = random.choice(candidates)
            words.append(chosen[2])
            used.add(chosen[2])
            # Slide the two-word window forward by one word.
            context = tuple(chosen[1:])

        if output_str:
            return "".join(words)
        return words