# Source code for pythainlp.soundex.metasound
# -*- coding: utf-8 -*-
"""
Thai soundex - MetaSound system
References:
Snae & Brückner. (2009). Novel Phonetic Name Matching Algorithm with
a Statistical Ontology for Analysing Names Given in Accordance
with Thai Astrology.
https://pdfs.semanticscholar.org/3983/963e87ddc6dfdbb291099aa3927a0e3e4ea6.pdf
"""
# Characters that survive the initial filtering step: Thai consonants
# plus the thanthakhat mark.
_CONS_THANTHAKHAT = "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮ์"
_THANTHAKHAT = "์"  # \u0e4c
_C1 = "กขฃคฆฅ"  # sound K -> coded letter 1
_C2 = "จฉชฌซฐทฒดฎตสศษ"  # D -> 2
_C3 = "ฟฝพผภบป"  # B -> 3
_C4 = "ง"  # NG -> 4
_C5 = "ลฬรนณฦญ"  # N -> 5
_C6 = "ม"  # M -> 6
_C7 = "ย"  # Y -> 7
_C8 = "ว"  # W -> 8

# Consonant -> digit lookup, built once from the sound classes above.
# The classes are disjoint, so each consonant maps to exactly one digit.
_CODE_BY_CONSONANT = {
    ch: str(code)
    for code, group in enumerate(
        (_C1, _C2, _C3, _C4, _C5, _C6, _C7, _C8), start=1
    )
    for ch in group
}


def metasound(text: str, length: int = 4) -> str:
    """
    Convert Thai text into a phonetic code with the matching technique
    called **MetaSound** [#metasound]_ (a combination of the Soundex and
    Metaphone algorithms), developed specifically for the Thai language.

    Only the characters listed in ``_CONS_THANTHAKHAT`` are considered.
    The first retained character is kept as is; every following one is
    replaced by the digit of its sound class (``1``-``8``), or ``0`` when
    it belongs to no class. A karan (a thanthakhat together with the
    consonant before it) is not dropped: both positions are kept as
    placeholders and therefore coded as ``0``. The result is truncated or
    right-padded with ``0`` to exactly ``length`` characters.

    :param str text: Thai text
    :param int length: preferred length of the MetaSound code (default is 4)
    :return: MetaSound for the given text; an empty string if ``text`` is
             empty or not a ``str``, or if ``length`` is not positive
    :rtype: str

    :Example:
    ::

        from pythainlp.soundex.metasound import metasound

        metasound("ลัก")
        # output: 'ล100'

        metasound("รัก")
        # output: 'ร100'

        metasound("รักษ์")
        # output: 'ร100'

        metasound("บูรณการ", 5)
        # output: 'บ5515'

        metasound("บูรณการ", 6)
        # output: 'บ55150'

        metasound("บูรณการ", 4)
        # output: 'บ551'
    """
    # A negative length would make the slice below drop characters from
    # the end instead of limiting the code size, so treat it like zero.
    if not text or not isinstance(text, str) or length <= 0:
        return ""

    # keep only consonants and thanthakhat
    kept = [ch for ch in text if ch in _CONS_THANTHAKHAT]

    # Blank out every karan: a position becomes a placeholder when it is
    # a thanthakhat itself or is immediately followed by one. The ""
    # sentinel gives the last character a "no follower" neighbour.
    blanked = [
        " " if _THANTHAKHAT in (ch, following) else ch
        for ch, following in zip(kept, kept[1:] + [""])
    ]

    # retain first consonant, encode the rest (placeholders fall to "0")
    head = blanked[:1]
    tail = [_CODE_BY_CONSONANT.get(ch, "0") for ch in blanked[1:length]]

    return "".join(head + tail).ljust(length, "0")