Source code for pythaitts.preprocess

# -*- coding: utf-8 -*-
"""
Thai Text Preprocessing for TTS

This module provides text preprocessing functions for Thai Text-to-Speech,
including number-to-Thai-text conversion and expansion of the Thai repetition character (ๆ).
"""
import re


# Thai number words.
# THAI_ONES is indexed by a single digit 0-9; index 0 is the empty string, so
# the word for zero is not stored in this table.
THAI_ONES = ["", "หนึ่ง", "สอง", "สาม", "สี่", "ห้า", "หก", "เจ็ด", "แปด", "เก้า"]
# THAI_TENS is indexed by the tens digit 0-9; index 0 is the empty string and
# index 2 holds the dedicated form "ยี่สิบ" instead of "สอง" + "สิบ".
THAI_TENS = ["", "สิบ", "ยี่สิบ", "สามสิบ", "สี่สิบ", "ห้าสิบ", "หกสิบ", "เจ็ดสิบ", "แปดสิบ", "เก้าสิบ"]


def _num_to_thai_under_hundred(num: int) -> str:
    """
    Convert numbers 0-99 to Thai text.
    
    :param int num: Number to convert (0-99)
    :return: Thai text representation
    :rtype: str
    """
    if num == 0:
        return "ศูนย์"
    elif num < 10:
        return THAI_ONES[num]
    elif num < 20:
        if num == 10:
            return "สิบ"
        elif num == 11:
            return "สิบเอ็ด"
        else:
            return "สิบ" + THAI_ONES[num % 10]
    elif num < 100:
        tens = num // 10
        ones = num % 10
        result = THAI_TENS[tens]
        if ones == 1:
            result += "เอ็ด"
        elif ones > 1:
            result += THAI_ONES[ones]
        return result
    return ""


def _num_to_thai_under_thousand(num: int) -> str:
    """
    Convert numbers 0-999 to Thai text.
    
    :param int num: Number to convert (0-999)
    :return: Thai text representation
    :rtype: str
    """
    if num < 100:
        return _num_to_thai_under_hundred(num)
    
    hundreds = num // 100
    remainder = num % 100
    
    if hundreds == 1:
        result = "หนึ่งร้อย"
    elif hundreds == 2:
        result = "สองร้อย"
    else:
        result = THAI_ONES[hundreds] + "ร้อย"
    
    if remainder > 0:
        result += _num_to_thai_under_hundred(remainder)
    
    return result


def num_to_thai(num_str: str) -> str:
    """
    Convert number string to Thai text.

    Supports integers and decimals. The integer part is read as a whole
    number; each digit after the decimal point is read out individually.
    An empty integer part (e.g. ".5") is read as just the point and the
    fractional digits.

    :param str num_str: Number string to convert (e.g., "123", "1234", "12.5")
    :return: Thai text representation, or ``num_str`` unchanged when it
        cannot be parsed as a number (including malformed decimals such as
        "1.2.3" or "1.x")
    :rtype: str

    Examples:
        >>> num_to_thai("0")
        'ศูนย์'
        >>> num_to_thai("123")
        'หนึ่งร้อยยี่สิบสาม'
        >>> num_to_thai("1000")
        'หนึ่งพัน'
        >>> num_to_thai("-0.5")
        'ลบศูนย์จุดห้า'
        >>> num_to_thai("1.2.3")
        '1.2.3'
    """
    # Handle decimal numbers
    if '.' in num_str:
        integer_part, _, decimal_part = num_str.partition('.')
        # Validate both halves before producing any text so that malformed
        # input is returned untouched instead of raising half-way through.
        try:
            decimal_digits = [int(digit) for digit in decimal_part]
            integer_value = int(integer_part) if integer_part else None
        except ValueError:
            return num_str  # Return original if cannot convert

        if integer_value is None:
            result = ""
        else:
            result = num_to_thai(str(integer_value))
            # int("-0") is 0, so the minus sign of values such as "-0.5"
            # would otherwise be silently dropped.
            if (
                integer_value == 0
                and any(decimal_digits)
                and integer_part.lstrip().startswith('-')
            ):
                result = "ลบ" + result

        result += "จุด"
        result += "".join(
            "ศูนย์" if digit == 0 else THAI_ONES[digit]
            for digit in decimal_digits
        )
        return result

    # Convert to integer
    try:
        num = int(num_str)
    except ValueError:
        return num_str  # Return original if cannot convert

    if num == 0:
        return "ศูนย์"

    if num < 0:
        return "ลบ" + num_to_thai(str(-num))

    # One million and above: spell the count of millions recursively, then
    # whatever is left below one million.
    if num >= 1000000:
        millions, remainder = divmod(num, 1000000)
        result = num_to_thai(str(millions)) + "ล้าน"
        if remainder > 0:
            result += num_to_thai(str(remainder))
        return result

    # Below one million: peel off one digit per place value, skipping zero
    # digits, and hand the last three digits to the under-thousand helper.
    parts = []
    remainder = num
    for place_value, place_word in ((100000, "แสน"), (10000, "หมื่น"), (1000, "พัน")):
        digit, remainder = divmod(remainder, place_value)
        if digit > 0:
            parts.append(THAI_ONES[digit] + place_word)
    if remainder > 0:
        parts.append(_num_to_thai_under_thousand(remainder))
    return "".join(parts)
def expand_maiyamok(text: str) -> str:
    """
    Expand the Thai repetition mark (ๆ) by duplicating the text before it.

    Each ๆ is replaced by a copy of the last unbroken run of Thai-block
    characters found in the output built so far. When no such run exists
    yet, the ๆ is simply dropped.

    :param str text: Text containing ๆ character
    :return: Text with ๆ expanded
    :rtype: str

    Examples:
        >>> expand_maiyamok("ช้าๆ")
        'ช้าช้า'
        >>> expand_maiyamok("ดีๆ")
        'ดีดี'
    """
    if 'ๆ' not in text:
        return text

    thai_run = re.compile(r'[ก-๙]+')
    pieces = []
    for char in text:
        if char != 'ๆ':
            pieces.append(char)
            continue
        # Repeat the most recent run of Thai characters emitted so far; the
        # mark itself is never copied into the output.
        runs = thai_run.findall(''.join(pieces))
        if runs:
            pieces.append(runs[-1])
    return ''.join(pieces)
def preprocess_text(text: str, expand_numbers: bool = True, expand_maiyamok_char: bool = True) -> str:
    """
    Prepare Thai text for TTS: expand ๆ and spell out numbers.

    The repetition mark is expanded first; numbers are converted afterwards
    on the already-expanded text.

    :param str text: Input text to preprocess
    :param bool expand_numbers: Whether to convert numbers to Thai text (default: True)
    :param bool expand_maiyamok_char: Whether to expand ๆ character (default: True)
    :return: Preprocessed text
    :rtype: str

    Examples:
        >>> preprocess_text("ฉันมี 123 บาท")
        'ฉันมี หนึ่งร้อยยี่สิบสาม บาท'
        >>> preprocess_text("ดีๆ")
        'ดีดี'
        >>> preprocess_text("มี 5 คนๆ")
        'มี ห้า คนคน'
    """
    processed = expand_maiyamok(text) if expand_maiyamok_char else text

    if not expand_numbers:
        return processed

    # Optional minus sign, a run of digits, then an optional fractional part
    # (matches e.g. -5, 123, 123.45).
    number_pattern = re.compile(r'-?\d+(?:\.\d+)?')
    return number_pattern.sub(lambda found: num_to_thai(found.group()), processed)