# Source code for pythainlp.soundex.metasound
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Thai soundex - MetaSound system
References:
Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with
a Statistical Ontology for Analysing Names Given in Accordance
with Thai Astrology.
https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
"""
# Characters that metasound() keeps from its input: the Thai consonants plus
# the thanthakhat mark. Every other character is dropped before encoding.
_CONS_THANTHAKHAT = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ์"
# Thanthakhat mark; metasound() blanks it together with the consonant
# right before it (the pair is called "karan" there).
_THANTHAKHAT = "์"  # \u0e4c

# Sound groups. In metasound(), every kept character after the first one is
# replaced by the digit of the first group that contains it; a character
# found in no group is replaced by "0".
_C1 = "กขฃคฆฅ"  # sound K -> coded letter 1
_C2 = "จฉชฌซฐทฒดฎตสศษ"  # D -> 2
_C3 = "ฟฝพผภบป"  # B -> 3
_C4 = "ง"  # NG -> 4
# NOTE(review): "ฦ" is listed here but is not part of _CONS_THANTHAKHAT, so
# metasound() filters it out before this group is ever consulted.
_C5 = "ลฬรนณฦญ"  # N -> 5
_C6 = "ม"  # M -> 6
_C7 = "ย"  # Y -> 7
_C8 = "ว"  # W -> 8
# [docs]
def metasound(text: str, length: int = 4) -> str:
    """
    This function converts Thai text into phonetic code with the
    matching technique called **MetaSound**
    [#metasound]_ (combination between Soundex and Metaphone algorithms).
    MetaSound algorithm was developed specifically for the Thai language.

    Only Thai consonants and the thanthakhat mark are taken from the input.
    The first kept character is retained as is; every following one is
    replaced by the digit of its sound group (``0`` when it belongs to no
    group). A karan (thanthakhat and the consonant before it) is blanked
    and therefore coded as ``0`` in both positions. The code is finally
    truncated or right-padded with ``0`` to exactly *length* characters.

    :param str text: Thai text
    :param int length: preferred length of the MetaSound code (default is 4)
    :return: MetaSound for the given text; an empty string if *text* is
             empty or not a string, or if *length* is not positive
    :rtype: str

    :Example:
    ::

        from pythainlp.soundex.metasound import metasound

        metasound("ลัก")
        # output: 'ล100'

        metasound("รัก")
        # output: 'ร100'

        metasound("รักษ์")
        # output: 'ร100'

        metasound("บูรณการ", 5)
        # output: 'บ5515'

        metasound("บูรณการ", 6)
        # output: 'บ55150'

        metasound("บูรณการ", 4)
        # output: 'บ551'
    """
    if not text or not isinstance(text, str):
        return ""

    # A non-positive length cannot hold any code. Without this guard a
    # negative value would make the slice below trim consonants from the
    # END of the word and skip the padding, giving a wrong-length result.
    if length <= 0:
        return ""

    # keep only consonants and thanthakhat
    chars = [ch for ch in text if ch in _CONS_THANTHAKHAT]

    # blank out karan (thanthakhat and the consonant before it); the blanks
    # keep their positions and are coded "0" below because no group holds " "
    # NOTE: when the karan covers position 0, the blank stays as the first
    # character of the result, since the first position is never encoded.
    for i, ch in enumerate(chars):
        if ch == _THANTHAKHAT:
            if i > 0:
                chars[i - 1] = " "
            chars[i] = " "

    # retain first consonant, encode the rest
    chars = chars[:length]
    # Checked in this order, so the first matching group wins.
    groups = (
        (_C1, "1"),
        (_C2, "2"),
        (_C3, "3"),
        (_C4, "4"),
        (_C5, "5"),
        (_C6, "6"),
        (_C7, "7"),
        (_C8, "8"),
    )
    for i in range(1, len(chars)):
        ch = chars[i]
        chars[i] = next(
            (code for group, code in groups if ch in group), "0"
        )

    # pad short codes with "0" up to the requested length
    return "".join(chars).ljust(length, "0")