# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Profanity detection for the Thai language
"""
from __future__ import annotations

from pythainlp.corpus.common import thai_profanity_words, thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.util.trie import dict_trie


def contains_profanity(
    text: str, custom_words: set[str] | None = None, engine: str = "newmm"
) -> bool:
    """
    Check if the given text contains profanity words.

    :param str text: Thai text to check
    :param set custom_words: additional profanity words to check (default: None)
    :param str engine: tokenization engine (default: "newmm")
    :return: True if text contains profanity, False otherwise
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import contains_profanity

        print(contains_profanity("สวัสดีครับ"))
        # output: False

        print(contains_profanity("คำหยาบคาย"))
        # output: True if the word is in the profanity list

        # Add custom profanity words
        print(contains_profanity("คำใหม่", custom_words={"คำใหม่"}))
        # output: True
    """
    if not text:
        return False

    profanity_set = set(thai_profanity_words())
    if custom_words:
        profanity_set.update(custom_words)

    # Create custom dictionary that merges thai_words and profanity_set
    # for better tokenization
    custom_dict_set = set(thai_words())
    custom_dict_set.update(profanity_set)
    custom_dict = dict_trie(dict_source=custom_dict_set)

    tokens = word_tokenize(text, custom_dict=custom_dict, engine=engine)

    for token in tokens:
        if token in profanity_set:
            return True

    return False

def find_profanity(
    text: str, custom_words: set[str] | None = None, engine: str = "newmm"
) -> list[str]:
    """
    Find all profanity words in the given text.

    :param str text: Thai text to check
    :param set custom_words: additional profanity words to check (default: None)
    :param str engine: tokenization engine (default: "newmm")
    :return: list of profanity words found in the text
    :rtype: list[str]

    :Example:
    ::

        from pythainlp.util import find_profanity

        print(find_profanity("สวัสดีครับ"))
        # output: []

        print(find_profanity("text with profanity words"))
        # output: ['profanity_word1', 'profanity_word2']

        # Add custom profanity words
        print(find_profanity("คำใหม่", custom_words={"คำใหม่"}))
        # output: ['คำใหม่']
    """
    if not text:
        return []

    profanity_set = set(thai_profanity_words())
    if custom_words:
        profanity_set.update(custom_words)

    # Create custom dictionary that merges thai_words and profanity_set
    # for better tokenization
    custom_dict_set = set(thai_words())
    custom_dict_set.update(profanity_set)
    custom_dict = dict_trie(dict_source=custom_dict_set)

    tokens = word_tokenize(text, custom_dict=custom_dict, engine=engine)

    found_profanity = []
    for token in tokens:
        if token in profanity_set:
            found_profanity.append(token)

    return found_profanity

def censor_profanity(
    text: str,
    replacement: str = "*",
    custom_words: set[str] | None = None,
    engine: str = "newmm",
) -> str:
    """
    Replace profanity words in the text with a replacement character.

    :param str text: Thai text to censor
    :param str replacement: character to replace profanity with (default: "*")
    :param set custom_words: additional profanity words to censor (default: None)
    :param str engine: tokenization engine (default: "newmm")
    :return: text with profanity words censored
    :rtype: str

    :Example:
    ::

        from pythainlp.util import censor_profanity

        print(censor_profanity("สวัสดีครับ"))
        # output: สวัสดีครับ

        print(censor_profanity("text with profanity word"))
        # output: text with *** word

        # Add custom profanity words
        print(censor_profanity("คำใหม่", custom_words={"คำใหม่"}))
        # output: ******
    """
    if not text:
        return text

    profanity_set = set(thai_profanity_words())
    if custom_words:
        profanity_set.update(custom_words)

    # Create custom dictionary that merges thai_words and profanity_set
    # for better tokenization
    custom_dict_set = set(thai_words())
    custom_dict_set.update(profanity_set)
    custom_dict = dict_trie(dict_source=custom_dict_set)

    tokens = word_tokenize(
        text, custom_dict=custom_dict, engine=engine, keep_whitespace=True
    )

    censored_tokens = []
    for token in tokens:
        if token in profanity_set:
            censored_tokens.append(replacement * len(token))
        else:
            censored_tokens.append(token)

    return "".join(censored_tokens)
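

# A minimal manual smoke test for the three helpers above -- an
# illustrative sketch, not part of the published module. Results for
# plain input depend on the contents of the bundled profanity word list
# (thai_profanity_words()), so only the custom_words cases below have
# predictable output; the sample word "คำต้องห้าม" is hypothetical and is
# treated as profanity solely because it is passed via custom_words.
if __name__ == "__main__":
    sample = "ทดสอบ คำต้องห้าม"
    custom = {"คำต้องห้าม"}

    # Detection: True, because "คำต้องห้าม" is supplied via custom_words.
    print(contains_profanity(sample, custom_words=custom))

    # Extraction: lists each profanity token found, e.g. ['คำต้องห้าม'],
    # assuming the tokenizer keeps the custom word as a single token.
    print(find_profanity(sample, custom_words=custom))

    # Censoring: each profanity token is replaced by "*" repeated to the
    # token's length; other tokens and whitespace pass through unchanged.
    print(censor_profanity(sample, custom_words=custom))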