# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Profanity detection for Thai language
"""
from __future__ import annotations
from pythainlp.corpus.common import thai_profanity_words, thai_words
from pythainlp.tokenize import word_tokenize
from pythainlp.util.trie import dict_trie


def contains_profanity(
    text: str, custom_words: set[str] | None = None, engine: str = "newmm"
) -> bool:
    """
    Check if the given text contains profanity words.

    :param str text: Thai text to check
    :param set custom_words: additional profanity words to check (default: None)
    :param str engine: tokenization engine (default: "newmm")
    :return: True if the text contains profanity, False otherwise
    :rtype: bool

    :Example:
    ::

        from pythainlp.util import contains_profanity

        print(contains_profanity("สวัสดีครับ"))
        # output: False

        print(contains_profanity("คำหยาบคาย"))
        # output: True if the word is in the profanity list

        # Add custom profanity words
        print(contains_profanity("คำใหม่", custom_words={"คำใหม่"}))
        # output: True
    """
    if not text:
        return False

    profanity_set = set(thai_profanity_words())
    if custom_words:
        profanity_set.update(custom_words)

    # Merge thai_words() and profanity_set into one dictionary so the
    # tokenizer can segment profanity words as whole tokens.
    custom_dict_set = set(thai_words())
    custom_dict_set.update(profanity_set)
    custom_dict = dict_trie(dict_source=custom_dict_set)

    tokens = word_tokenize(text, custom_dict=custom_dict, engine=engine)

    return any(token in profanity_set for token in tokens)
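
# Performance note: contains_profanity(), find_profanity(), and
# censor_profanity() each rebuild the dictionary trie from the full word
# list on every call. For repeated checks without custom_words, caching a
# default trie is a reasonable optimization (a minimal sketch only;
# _default_trie is a hypothetical helper, not part of this module's API):
#
#     from functools import lru_cache
#
#     @lru_cache(maxsize=1)
#     def _default_trie():
#         words = set(thai_words()) | set(thai_profanity_words())
#         return dict_trie(dict_source=words)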


def find_profanity(
    text: str, custom_words: set[str] | None = None, engine: str = "newmm"
) -> list[str]:
    """
    Find all profanity words in the given text.

    :param str text: Thai text to check
    :param set custom_words: additional profanity words to check (default: None)
    :param str engine: tokenization engine (default: "newmm")
    :return: list of profanity words found in the text, in order of appearance
    :rtype: list[str]

    :Example:
    ::

        from pythainlp.util import find_profanity

        print(find_profanity("สวัสดีครับ"))
        # output: []

        print(find_profanity("text with profanity words"))
        # output: ['profanity_word1', 'profanity_word2']

        # Add custom profanity words
        print(find_profanity("คำใหม่", custom_words={"คำใหม่"}))
        # output: ['คำใหม่']
    """
    if not text:
        return []

    profanity_set = set(thai_profanity_words())
    if custom_words:
        profanity_set.update(custom_words)

    # Merge thai_words() and profanity_set into one dictionary so the
    # tokenizer can segment profanity words as whole tokens.
    custom_dict_set = set(thai_words())
    custom_dict_set.update(profanity_set)
    custom_dict = dict_trie(dict_source=custom_dict_set)

    tokens = word_tokenize(text, custom_dict=custom_dict, engine=engine)

    return [token for token in tokens if token in profanity_set]
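
# Note: find_profanity() repeats a word for every occurrence, e.g.
# (a sketch; "คำใหม่" is only flagged here because it is passed in
# custom_words, not because it is in the default profanity corpus):
#
#     find_profanity("คำใหม่ คำใหม่", custom_words={"คำใหม่"})
#     # -> ['คำใหม่', 'คำใหม่']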


def censor_profanity(
    text: str,
    replacement: str = "*",
    custom_words: set[str] | None = None,
    engine: str = "newmm",
) -> str:
    """
    Replace each profanity word in the text with the replacement character,
    repeated to match the word's length.

    :param str text: Thai text to censor
    :param str replacement: character used to replace each character of a
        profanity word (default: "*")
    :param set custom_words: additional profanity words to censor (default: None)
    :param str engine: tokenization engine (default: "newmm")
    :return: text with profanity words censored
    :rtype: str

    :Example:
    ::

        from pythainlp.util import censor_profanity

        print(censor_profanity("สวัสดีครับ"))
        # output: สวัสดีครับ

        print(censor_profanity("text with profanity word"))
        # output: text with *** word

        # Add custom profanity words
        print(censor_profanity("คำใหม่", custom_words={"คำใหม่"}))
        # output: ******
    """
    if not text:
        return text

    profanity_set = set(thai_profanity_words())
    if custom_words:
        profanity_set.update(custom_words)

    # Merge thai_words() and profanity_set into one dictionary so the
    # tokenizer can segment profanity words as whole tokens.
    custom_dict_set = set(thai_words())
    custom_dict_set.update(profanity_set)
    custom_dict = dict_trie(dict_source=custom_dict_set)

    # keep_whitespace=True preserves spacing, so the censored text keeps
    # the same layout as the input.
    tokens = word_tokenize(
        text, custom_dict=custom_dict, engine=engine, keep_whitespace=True
    )

    censored_tokens = []
    for token in tokens:
        if token in profanity_set:
            # Mask the whole token, one replacement character per character
            censored_tokens.append(replacement * len(token))
        else:
            censored_tokens.append(token)

    return "".join(censored_tokens)