# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Profanity detection for the Thai language
"""
from __future__ import annotations

from pythainlp.corpus.common import thai_profanity_words, thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.util.trie import dict_trie


def contains_profanity(
    text: str, custom_words: set[str] | None = None, engine: str = "newmm"
) -> bool:
    """
    Check if the given text contains profanity words.

    :param str text: Thai text to check
    :param set custom_words: additional profanity words to check (default: None)
    :param str engine: tokenization engine (default: "newmm")
    :return: True if text contains profanity, False otherwise
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import contains_profanity

        print(contains_profanity("สวัสดีครับ"))
        # output: False

        print(contains_profanity("คำหยาบคาย"))
        # output: True if the word is in the profanity list

        # Add custom profanity words
        print(contains_profanity("คำใหม่", custom_words={"คำใหม่"}))
        # output: True
    """
    if not text:
        return False

    profanity_set = set(thai_profanity_words())
    if custom_words:
        profanity_set.update(custom_words)

    # Create custom dictionary that merges thai_words and profanity_set
    # for better tokenization
    custom_dict_set = set(thai_words())
    custom_dict_set.update(profanity_set)
    custom_dict = dict_trie(dict_source=custom_dict_set)

    tokens = word_tokenize(text, custom_dict=custom_dict, engine=engine)

    for token in tokens:
        if token in profanity_set:
            return True

    return False

def find_profanity(
    text: str, custom_words: set[str] | None = None, engine: str = "newmm"
) -> list[str]:
    """
    Find all profanity words in the given text.

    :param str text: Thai text to check
    :param set custom_words: additional profanity words to check (default: None)
    :param str engine: tokenization engine (default: "newmm")
    :return: list of profanity words found in the text
    :rtype: list[str]

    :Example:
    ::

        from pythainlp.util import find_profanity

        print(find_profanity("สวัสดีครับ"))
        # output: []

        print(find_profanity("text with profanity words"))
        # output: ['profanity_word1', 'profanity_word2']

        # Add custom profanity words
        print(find_profanity("คำใหม่", custom_words={"คำใหม่"}))
        # output: ['คำใหม่']
    """
    if not text:
        return []

    profanity_set = set(thai_profanity_words())
    if custom_words:
        profanity_set.update(custom_words)

    # Create custom dictionary that merges thai_words and profanity_set
    # for better tokenization
    custom_dict_set = set(thai_words())
    custom_dict_set.update(profanity_set)
    custom_dict = dict_trie(dict_source=custom_dict_set)

    tokens = word_tokenize(text, custom_dict=custom_dict, engine=engine)

    found_profanity = []
    for token in tokens:
        if token in profanity_set:
            found_profanity.append(token)

    return found_profanity

def censor_profanity(
    text: str,
    replacement: str = "*",
    custom_words: set[str] | None = None,
    engine: str = "newmm",
) -> str:
    """
    Replace profanity words in the text with a replacement character.

    :param str text: Thai text to censor
    :param str replacement: character to replace profanity with (default: "*")
    :param set custom_words: additional profanity words to censor (default: None)
    :param str engine: tokenization engine (default: "newmm")
    :return: text with profanity words censored
    :rtype: str

    :Example:
    ::

        from pythainlp.util import censor_profanity

        print(censor_profanity("สวัสดีครับ"))
        # output: สวัสดีครับ

        print(censor_profanity("text with profanity word"))
        # output: text with *** word

        # Add custom profanity words
        print(censor_profanity("คำใหม่", custom_words={"คำใหม่"}))
        # output: ******
    """
    if not text:
        return text

    profanity_set = set(thai_profanity_words())
    if custom_words:
        profanity_set.update(custom_words)

    # Create custom dictionary that merges thai_words and profanity_set
    # for better tokenization
    custom_dict_set = set(thai_words())
    custom_dict_set.update(profanity_set)
    custom_dict = dict_trie(dict_source=custom_dict_set)

    tokens = word_tokenize(
        text, custom_dict=custom_dict, engine=engine, keep_whitespace=True
    )

    censored_tokens = []
    for token in tokens:
        if token in profanity_set:
            censored_tokens.append(replacement * len(token))
        else:
            censored_tokens.append(token)

    return "".join(censored_tokens)
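

# A minimal manual smoke test for the three helpers above -- an
# illustrative sketch, not part of the published module. Results for
# plain input depend on the contents of the bundled profanity word list
# (thai_profanity_words()), so only the custom_words cases below have
# predictable output; the sample word "คำต้องห้าม" is hypothetical and is
# treated as profanity solely because it is passed via custom_words.
if __name__ == "__main__":
    sample = "ทดสอบ คำต้องห้าม"
    custom = {"คำต้องห้าม"}

    # Detection: True, because "คำต้องห้าม" is supplied via custom_words.
    print(contains_profanity(sample, custom_words=custom))

    # Extraction: lists each profanity token found, e.g. ['คำต้องห้าม'],
    # assuming the tokenizer keeps the custom word as a single token.
    print(find_profanity(sample, custom_words=custom))

    # Censoring: each profanity token is replaced by "*" repeated to the
    # token's length; other tokens and whitespace pass through unchanged.
    print(censor_profanity(sample, custom_words=custom))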