# Source code for pythainlp.translate.core

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""Translation."""

from __future__ import annotations

import re
from typing import TYPE_CHECKING, Optional, Union

if TYPE_CHECKING:
    from pythainlp.translate.en_th import EnThTranslator, ThEnTranslator
    from pythainlp.translate.small100 import Small100Translator
    from pythainlp.translate.th_fr import ThFrTranslator
    from pythainlp.translate.zh_th import ThZhTranslator, ZhThTranslator


def _prepare_text_with_exclusions(
    text: str, exclude_words: Optional[list[str]]
) -> tuple[str, dict[str, str]]:
    """Replace excluded words with placeholders.

    :param str text: input text
    :param list[str] exclude_words: words to exclude from translation
    :return: tuple of (modified text, placeholder mapping)
    :rtype: tuple[str, dict[str, str]]

    Note: For text that contains spaces (for example, English sentences),
    this function attempts to match whole tokens delimited by whitespace
    and common punctuation characters. If the text contains no spaces at
    all (as in many sentences in languages without explicit word
    boundaries, such as Thai), it will match the exact exclude string
    anywhere it appears using simple substring replacement.
    """
    if not exclude_words:
        return text, {}

    placeholder_map = {}
    modified_text = text

    # Remove duplicates while preserving order
    seen = set()
    unique_words = []
    for word in exclude_words:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)

    # Sort by length (longest first) to handle overlapping words correctly
    # For example, if we have ["cat", "category"], we want to replace
    # "category" first
    sorted_words = sorted(unique_words, key=len, reverse=True)

    for i, word in enumerate(sorted_words):
        # Use a placeholder that is very unlikely to appear in natural text
        # and includes special markers to avoid conflicts
        placeholder = f"<<<PYTHAINLP_EXCLUDE_{i}>>>"
        placeholder_map[placeholder] = word

        # Escape the word to handle special regex characters
        escaped_word = re.escape(word)

        # Try token boundary matching for space-separated languages.
        # A token boundary is:
        # - the start or end of the string, or
        # - a delimiter character such as whitespace or common punctuation.
        # This allows matching words like "cat" in "I love cat.".
        delimiter_chars = r"\s" + re.escape(
            ".,!?;:'\"()[]{}<>/\\|`~@#$%^&*-+=""''、,。!?;:()【】《》"
        )
        pattern = (
            fr"(?:(?<=^)|(?<=[{delimiter_chars}]))"
            f"{escaped_word}"
            fr"(?:(?=$)|(?=[{delimiter_chars}]))"
        )

        # Check if there's a match with token boundaries
        if re.search(pattern, modified_text):
            # Use token boundary matching for space-separated text
            modified_text = re.sub(pattern, placeholder, modified_text)
        elif " " not in modified_text:
            # For languages without spaces (like Thai), use simple replacement.
            # Only do this if the text does not contain spaces, indicating
            # it is likely a non-space-separated language.
            modified_text = modified_text.replace(word, placeholder)

    return modified_text, placeholder_map


def _restore_excluded_words(
    translated_text: str, placeholder_map: dict[str, str]
) -> str:
    """Restore excluded words from placeholders.

    :param str translated_text: translated text with placeholders
    :param dict[str, str] placeholder_map: mapping of placeholders to
                                           original words
    :return: text with original words restored
    :rtype: str
    """
    if not placeholder_map:
        return translated_text

    result = translated_text
    # Sort by placeholder to ensure consistent replacement order
    for placeholder in sorted(placeholder_map.keys()):
        original_word = placeholder_map[placeholder]
        # Direct replacement since placeholders are very specific
        result = result.replace(placeholder, original_word)

    return result


class Translate:
    """Machine Translation"""

    def __init__(
        self,
        src_lang: str,
        target_lang: str,
        engine: str = "default",
        use_gpu: bool = False,
    ) -> None:
        """:param str src_lang: source language
        :param str target_lang: target language
        :param str engine: machine translation engine
        :param bool use_gpu: load model using GPU (Default is False)

        **Options for engine**
            * *default* - The default engine for each language.
            * *small100* - A multilingual machine translation model
              (covering 100 languages)

        **Options for source & target language**
            * *th* - *en* - Thai to English
            * *en* - *th* - English to Thai
            * *th* - *zh* - Thai to Chinese
            * *zh* - *th* - Chinese to Thai
            * *th* - *fr* - Thai to French
            * *th* - *xx* - Thai to xx (xx is language code).
              It uses small100 model.
            * *xx* - *th* - xx to Thai (xx is language code).
              It uses small100 model.

        :Example:

        Translate text from Thai to English::

            from pythainlp.translate import Translate

            th2en = Translate("th", "en")

            th2en.translate("ฉันรักแมว")
            # output: I love cat.

        Translate text with excluded words::

            th2en.translate("ฉันรักแมว", exclude_words=["แมว"])
            # output: I love แมว.
        """
        # Annotation only; the attribute is assigned by load_model().
        self.model: Union[
            Small100Translator,
            ThEnTranslator,
            EnThTranslator,
            ThZhTranslator,
            ZhThTranslator,
            ThFrTranslator,
        ]
        self.engine: str = engine
        self.src_lang: str = src_lang
        self.use_gpu: bool = use_gpu
        self.target_lang: str = target_lang
        self.load_model()

    def load_model(self) -> None:
        """Instantiate the translator backend and store it in ``self.model``.

        The ``small100`` engine is selected whenever ``self.engine`` is
        ``"small100"``, regardless of the language pair. Any other engine
        value selects a backend from the source/target language pair.
        Backend modules are imported only when they are selected.

        :raises ValueError: if the engine is not ``small100`` and the
                            language pair has no backend
        """
        src_lang = self.src_lang
        target_lang = self.target_lang
        use_gpu = self.use_gpu

        if self.engine == "small100":
            from pythainlp.translate.small100 import Small100Translator

            self.model = Small100Translator(use_gpu)
        elif src_lang == "th" and target_lang == "en":
            from pythainlp.translate.en_th import ThEnTranslator

            self.model = ThEnTranslator(use_gpu)
        elif src_lang == "en" and target_lang == "th":
            from pythainlp.translate.en_th import EnThTranslator

            self.model = EnThTranslator(use_gpu)
        elif src_lang == "th" and target_lang == "zh":
            from pythainlp.translate.zh_th import ThZhTranslator

            self.model = ThZhTranslator(use_gpu)
        elif src_lang == "zh" and target_lang == "th":
            from pythainlp.translate.zh_th import ZhThTranslator

            self.model = ZhThTranslator(use_gpu)
        elif src_lang == "th" and target_lang == "fr":
            from pythainlp.translate.th_fr import ThFrTranslator

            self.model = ThFrTranslator(use_gpu)
        else:
            raise ValueError("Not support language!")

    def translate(
        self, text: str, exclude_words: Optional[list[str]] = None
    ) -> str:
        """Translate text

        :param str text: input text in source language
        :param list[str] exclude_words: words to exclude from translation
                                        (optional)
        :return: translated text in target language
        :rtype: str
        """
        if self.engine == "small100":
            # The small100 backend is additionally given the target
            # language on each call.
            return self.model.translate(  # type: ignore[call-arg]
                text, tgt_lang=self.target_lang, exclude_words=exclude_words
            )
        return self.model.translate(text, exclude_words=exclude_words)


def word_translate(
    word: str, src: str, target: str, engine: str = "word2word"
) -> Optional[list[str]]:
    """Translate word from source language to target language.

    :param str word: text
    :param str src: src language
    :param str target: target language
    :param str engine: Word translate engine
                       (the default engine is word2word)
    :return: return list word translate or None
    :rtype: Union[List[str], None]
    :raises NotImplementedError: if ``engine`` is not ``"word2word"``

    :Example:

    Translate word from Thai to English::

        from pythainlp.translate import word_translate

        print(word_translate("แมว", "th", "en"))
        # output: ['cat', 'cats', 'kitty', 'kitten', 'Cat']

    Translate word from English to Thai::

        from pythainlp.translate import word_translate

        print(word_translate("cat", "en", "th"))
        # output: ['แมว', 'แมวป่า', 'ข่วน', 'เลี้ยง', 'อาหาร']
    """
    if engine != "word2word":
        raise NotImplementedError(
            f"pythainlp.translate.word_translate isn't support {engine}."
        )

    # Imported here so the backend is only loaded when it is used.
    from .word2word_translate import translate

    return translate(word=word, src=src, target=target)