# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import re
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
    import pandas as pd
    import torch
    from transformers import PreTrainedModel, PreTrainedTokenizerBase
class WangChanGLM:
    exclude_pattern: "re.Pattern"
    stop_token: str
    PROMPT_DICT: dict[str, str]
    device: str
    torch_dtype: "torch.dtype"
    model_path: str
    model: "PreTrainedModel"
    tokenizer: "PreTrainedTokenizerBase"
    df: "pd.DataFrame"
    exclude_ids: list[int]
    def __init__(self) -> None:
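        """Set up the prompt templates and the non-Thai exclusion pattern."""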
        self.exclude_pattern: "re.Pattern" = re.compile(r"[^ก-๙]+")
        self.stop_token: str = "\n"  # noqa: S105
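        # Prompt templates: "prompt_input" grounds the answer in a given
        # context, "prompt_no_input" is instruction-only, and
        # "prompt_chatbot" formats a single human/bot exchange.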
        self.PROMPT_DICT: dict[str, str] = {
            "prompt_input": (
                "<context>: {input}\n<human>: {instruction}\n<bot>: "
            ),
            "prompt_no_input": ("<human>: {instruction}\n<bot>: "),
            "prompt_chatbot": ("<human>: {human}\n<bot>: {bot}"),
        }
    def is_exclude(self, text: str) -> bool:
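        """Return True if ``text`` contains any non-Thai character."""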
        return bool(self.exclude_pattern.search(text))
    def load_model(
        self,
        model_path: str = "pythainlp/wangchanglm-7.5B-sft-en-sharded",
        return_dict: bool = True,
        load_in_8bit: bool = False,
        device: str = "cuda",
        torch_dtype: Optional["torch.dtype"] = None,
        offload_folder: str = "./",
        low_cpu_mem_usage: bool = True,
    ) -> None:
        """Load the model and tokenizer from the Hugging Face Hub.

        :param str model_path: Hugging Face Hub path of the model
        :param bool return_dict: whether the model should return a ModelOutput
        :param bool load_in_8bit: load the model with 8-bit quantization
        :param str device: device to load the model on (cpu, cuda, or other)
        :param Optional[torch.dtype] torch_dtype: data type of model weights
        :param str offload_folder: folder for offloading weights to disk
        :param bool low_cpu_mem_usage: reduce CPU memory usage while loading
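        :Example:
        ::

            from pythainlp.generate.wangchanglm import WangChanGLM
            import torch

            model = WangChanGLM()
            # CPU + bfloat16 keeps memory modest; adjust to your hardware.
            model.load_model(device="cpu", torch_dtype=torch.bfloat16)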
"""
        import pandas as pd
        from transformers import AutoModelForCausalLM, AutoTokenizer

        self.device: str = device
        self.torch_dtype: "torch.dtype" = torch_dtype
        self.model_path: str = model_path
        self.model: "PreTrainedModel" = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            return_dict=return_dict,
            load_in_8bit=load_in_8bit,
            device_map=device,
            torch_dtype=torch_dtype,
            offload_folder=offload_folder,
            low_cpu_mem_usage=low_cpu_mem_usage,
        )
        self.tokenizer: "PreTrainedTokenizerBase" = (
            AutoTokenizer.from_pretrained(self.model_path)
        )
        self.df: "pd.DataFrame" = pd.DataFrame(
            self.tokenizer.vocab.items(), columns=["text", "idx"]
        )
        self.df["is_exclude"] = self.df.text.map(self.is_exclude)
        # IDs of tokens containing non-Thai characters; passed to generate()
        # as begin_suppress_tokens when thai_only=True. Note: the original
        # `is True` identity check never matches a pandas Series, so a plain
        # boolean mask is used instead.
        self.exclude_ids: list[int] = self.df[
            self.df.is_exclude
        ].idx.tolist()
    def gen_instruct(
        self,
        text: str,
        max_new_tokens: int = 512,
        top_p: float = 0.95,
        temperature: float = 0.9,
        top_k: int = 50,
        no_repeat_ngram_size: int = 2,
        typical_p: float = 1.0,
        thai_only: bool = True,
        skip_special_tokens: bool = True,
    ) -> str:
        """Generate text from an already-formatted prompt.

        :param str text: prompt text, formatted with a PROMPT_DICT template
        :param int max_new_tokens: maximum number of new tokens to generate
        :param float top_p: nucleus sampling probability threshold
        :param float temperature: sampling temperature
        :param int top_k: number of highest-probability tokens to sample from
        :param int no_repeat_ngram_size: size of n-grams that may not repeat
        :param float typical_p: typical decoding mass
        :param bool thai_only: suppress tokens with non-Thai characters
        :param bool skip_special_tokens: skip special tokens when decoding
        :return: the generated answer
        :rtype: str
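        :Example:
        ::

            # Assumes a model loaded via load_model(); the prompt is built
            # with PROMPT_DICT, as instruct_generate() does internally.
            prompt = model.PROMPT_DICT["prompt_no_input"].format_map(
                {"instruction": "ขอวิธีลดน้ำหนัก", "input": ""}
            )
            print(model.gen_instruct(prompt))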
"""
        import torch

        # Move inputs onto the model's device so generation also works
        # when the model is loaded on GPU.
        batch = self.tokenizer(text, return_tensors="pt").to(
            self.model.device
        )
        with torch.autocast(device_type=self.device, dtype=self.torch_dtype):
            if thai_only:
                output_tokens = self.model.generate(
                    input_ids=batch["input_ids"],
                    max_new_tokens=max_new_tokens,  # 512
                    begin_suppress_tokens=self.exclude_ids,
                    no_repeat_ngram_size=no_repeat_ngram_size,
                    # oasst k50
                    top_k=top_k,
                    top_p=top_p,  # 0.95
                    typical_p=typical_p,
                    temperature=temperature,  # 0.9
                )
            else:
                output_tokens = self.model.generate(
                    input_ids=batch["input_ids"],
                    max_new_tokens=max_new_tokens,  # 512
                    no_repeat_ngram_size=no_repeat_ngram_size,
                    # oasst k50
                    top_k=top_k,
                    top_p=top_p,  # 0.95
                    typical_p=typical_p,
                    temperature=temperature,  # 0.9
                )
        # Decode only the newly generated tokens, dropping the prompt echo.
        return self.tokenizer.decode(  # type: ignore[no-any-return]
            output_tokens[0][len(batch["input_ids"][0]) :],
            skip_special_tokens=skip_special_tokens,
        )
    def instruct_generate(
        self,
        instruct: str,
        context: str = "",
        max_new_tokens: int = 512,
        temperature: float = 0.9,
        top_p: float = 0.95,
        top_k: int = 50,
        no_repeat_ngram_size: int = 2,
        typical_p: float = 1.0,
        thai_only: bool = True,
        skip_special_tokens: bool = True,
    ) -> str:
        """Generate an answer from an instruction, with optional context.

        :param str instruct: the instruction
        :param str context: context to ground the answer (default: empty)
        :param int max_new_tokens: maximum number of new tokens to generate
        :param float top_p: nucleus sampling probability threshold
        :param float temperature: sampling temperature
        :param int top_k: number of highest-probability tokens to sample from
        :param int no_repeat_ngram_size: size of n-grams that may not repeat
        :param float typical_p: typical decoding mass
        :param bool thai_only: suppress tokens with non-Thai characters
        :param bool skip_special_tokens: skip special tokens when decoding
        :return: the generated answer
        :rtype: str

        :Example:
        ::

            from pythainlp.generate.wangchanglm import WangChanGLM
            import torch

            model = WangChanGLM()
            model.load_model(device="cpu", torch_dtype=torch.bfloat16)
            print(model.instruct_generate(instruct="ขอวิธีลดน้ำหนัก"))
            # output: ลดน้ําหนักให้ได้ผล ต้องทําอย่างค่อยเป็นค่อยไป
            # ปรับเปลี่ยนพฤติกรรมการกินอาหาร
            # ออกกําลังกายอย่างสม่ําเสมอ
            # และพักผ่อนให้เพียงพอ
            # ที่สําคัญควรหลีกเลี่ยงอาหารที่มีแคลอรี่สูง
            # เช่น อาหารทอด อาหารมัน อาหารที่มีน้ําตาลสูง
            # และเครื่องดื่มแอลกอฮอล์
            # (English: lose weight gradually by changing eating habits,
            # exercising regularly, and resting enough; avoid high-calorie
            # food such as fried, fatty, and sugary food, and alcohol.)
        """
        if not context:
            prompt = self.PROMPT_DICT["prompt_no_input"].format_map(
                {"instruction": instruct, "input": ""}
            )
        else:
            prompt = self.PROMPT_DICT["prompt_input"].format_map(
                {"instruction": instruct, "input": context}
            )
        result = self.gen_instruct(
            prompt,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            no_repeat_ngram_size=no_repeat_ngram_size,
            typical_p=typical_p,
            thai_only=thai_only,
            skip_special_tokens=skip_special_tokens,
        )
        return result
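

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the library API):
    # exercises the context-grounded "prompt_input" path. The Thai strings
    # below are hypothetical inputs; swap device/dtype for your hardware.
    import torch

    demo = WangChanGLM()
    demo.load_model(device="cpu", torch_dtype=torch.bfloat16)
    print(
        demo.instruct_generate(
            instruct="สรุปข้อความนี้",  # "Summarize this text"
            context="PyThaiNLP เป็นไลบรารีประมวลผลภาษาไทย",
            # context: "PyThaiNLP is a Thai language processing library"
            max_new_tokens=128,
        )
    )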