# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
import re
from typing import Optional
import torch
class WangChanGLM:
    """Interface to the WangChanGLM instruction-following language model."""

    def __init__(self):
        # Tokens containing any character outside the Thai range (ก-๙) are
        # flagged so their ids can be suppressed during generation.
        self.exclude_pattern = re.compile(r"[^ก-๙]+")
        self.stop_token = "\n"
self.PROMPT_DICT = {
"prompt_input": (
"<context>: {input}\n<human>: {instruction}\n<bot>: "
),
"prompt_no_input": (
"<human>: {instruction}\n<bot>: "
),
"prompt_chatbot": (
"<human>: {human}\n<bot>: {bot}"
),
}
    def is_exclude(self, text: str) -> bool:
        """Check whether ``text`` contains any non-Thai character."""
        return bool(self.exclude_pattern.search(text))
    def load_model(
        self,
        model_path: str = "pythainlp/wangchanglm-7.5B-sft-en-sharded",
        return_dict: bool = True,
        load_in_8bit: bool = False,
        device: str = "cuda",
        torch_dtype=torch.float16,
        offload_folder: str = "./",
        low_cpu_mem_usage: bool = True,
    ):
"""
Load model
:param str model_path: model path
:param bool return_dict: return dict
:param bool load_in_8bit: load model in 8bit
:param str device: device (cpu, cuda or other)
:param torch_dtype torch_dtype: torch_dtype
:param str offload_folder: offload folder
:param bool low_cpu_mem_usage: low cpu mem usage
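
        A minimal usage sketch; loading on CPU with ``bfloat16`` is an
        illustrative choice, not a requirement:

        :Example:
        ::

            from pythainlp.generate.wangchanglm import WangChanGLM
            import torch

            model = WangChanGLM()
            model.load_model(device="cpu", torch_dtype=torch.bfloat16)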
"""
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
self.device = device
self.torch_dtype = torch_dtype
self.model_path = model_path
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
return_dict=return_dict,
load_in_8bit=load_in_8bit,
device_map=device,
torch_dtype=torch_dtype,
offload_folder=offload_folder,
low_cpu_mem_usage=low_cpu_mem_usage
)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
        # Build a table of vocabulary items and mark the non-Thai ones so
        # their ids can be suppressed when thai_only generation is requested.
        self.df = pd.DataFrame(self.tokenizer.vocab.items(), columns=["text", "idx"])
        self.df["is_exclude"] = self.df.text.map(self.is_exclude)
        self.exclude_ids = self.df[self.df.is_exclude].idx.tolist()
    def gen_instruct(
        self,
        text: str,
        max_new_tokens: int = 512,
        top_p: float = 0.95,
        temperature: float = 0.9,
        top_k: int = 50,
        no_repeat_ngram_size: int = 2,
        typical_p: float = 1.0,
        thai_only: bool = True,
        skip_special_tokens: bool = True,
    ):
"""
Generate Instruct
:param str text: text
:param int max_new_tokens: maximum number of new tokens
:param float top_p: top p
:param float temperature: temperature
:param int top_k: top k
:param int no_repeat_ngram_size: do not repeat ngram size
:param float typical_p: typical p
:param bool thai_only: Thai only
:param bool skip_special_tokens: skip special tokens
:return: the answer from Instruct
:rtype: str
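
        A minimal sketch, assuming ``load_model`` has already been called.
        ``gen_instruct`` expects the prompt template to be applied already;
        ``instruct_generate`` below does that for you:

        :Example:
        ::

            prompt = model.PROMPT_DICT["prompt_no_input"].format_map(
                {"instruction": "ขอวิธีลดน้ำหนัก"}  # "How can I lose weight?"
            )
            print(model.gen_instruct(prompt))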
"""
        # Move the tokenized prompt to the same device as the model.
        batch = self.tokenizer(text, return_tensors="pt").to(self.device)
        with torch.autocast(device_type=self.device, dtype=self.torch_dtype):
            gen_kwargs = dict(
                input_ids=batch["input_ids"],
                max_new_tokens=max_new_tokens,  # 512
                no_repeat_ngram_size=no_repeat_ngram_size,
                # oasst k50
                top_k=top_k,
                top_p=top_p,  # 0.95
                typical_p=typical_p,
                temperature=temperature,  # 0.9
            )
            if thai_only:
                # Suppress non-Thai token ids at the beginning of generation.
                gen_kwargs["begin_suppress_tokens"] = self.exclude_ids
            output_tokens = self.model.generate(**gen_kwargs)
        # Decode only the newly generated tokens, dropping the prompt.
        return self.tokenizer.decode(
            output_tokens[0][len(batch["input_ids"][0]):],
            skip_special_tokens=skip_special_tokens,
        )
    def instruct_generate(
        self,
        instruct: str,
        context: Optional[str] = None,
        max_new_tokens: int = 512,
        temperature: float = 0.9,
        top_p: float = 0.95,
        top_k: int = 50,
        no_repeat_ngram_size: int = 2,
        typical_p: float = 1.0,
        thai_only: bool = True,
        skip_special_tokens: bool = True,
    ):
"""
Generate Instruct
:param str instruct: Instruct
:param str context: context
:param int max_new_tokens: maximum number of new tokens
:param float top_p: top p
:param float temperature: temperature
:param int top_k: top k
:param int no_repeat_ngram_size: do not repeat ngram size
:param float typical_p: typical p
:param bool thai_only: Thai only
:param bool skip_special_tokens: skip special tokens
:return: the answer from Instruct
:rtype: str
        :Example:
        ::

            from pythainlp.generate.wangchanglm import WangChanGLM
            import torch

            model = WangChanGLM()
            model.load_model(device="cpu", torch_dtype=torch.bfloat16)
            print(model.instruct_generate(instruct="ขอวิธีลดน้ำหนัก"))
            # instruction: "How can I lose weight?"
            # output: ลดน้ําหนักให้ได้ผล ต้องทําอย่างค่อยเป็นค่อยไป
            # ปรับเปลี่ยนพฤติกรรมการกินอาหาร
            # ออกกําลังกายอย่างสม่ําเสมอ
            # และพักผ่อนให้เพียงพอ
            # ที่สําคัญควรหลีกเลี่ยงอาหารที่มีแคลอรี่สูง
            # เช่น อาหารทอด อาหารมัน อาหารที่มีน้ําตาลสูง
            # และเครื่องดื่มแอลกอฮอล์
            # ("To lose weight effectively, go gradually: change your eating
            # habits, exercise regularly, and get enough rest. Above all,
            # avoid high-calorie foods such as fried or fatty foods, sugary
            # foods, and alcoholic drinks.")
"""
if context in (None, ""):
prompt = self.PROMPT_DICT['prompt_no_input'].format_map(
{'instruction': instruct, 'input': ''}
)
else:
prompt = self.PROMPT_DICT['prompt_input'].format_map(
{'instruction': instruct, 'input': context}
)
result = self.gen_instruct(
prompt,
max_new_tokens=max_new_tokens,
top_p=top_p,
top_k=top_k,
temperature=temperature,
no_repeat_ngram_size=no_repeat_ngram_size,
typical_p=typical_p,
thai_only=thai_only,
skip_special_tokens=skip_special_tokens
)
return result
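

# A hedged sketch (illustrative only): PROMPT_DICT["prompt_chatbot"] is defined
# above but not used by any method, so a multi-turn prompt has to be assembled
# by hand before calling gen_instruct, e.g.:
#
#   model = WangChanGLM()
#   model.load_model(device="cpu", torch_dtype=torch.bfloat16)
#   history = model.PROMPT_DICT["prompt_chatbot"].format_map(
#       {"human": "สวัสดี", "bot": "สวัสดีครับ"}  # "Hello" / "Hello" (reply)
#   )
#   prompt = history + "\n<human>: ขอวิธีลดน้ำหนัก\n<bot>: "
#   print(model.gen_instruct(prompt))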