# Source code for pythainlp.soundex.sound

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from typing import List
import panphon
import panphon.distance
from pythainlp.transliterate import pronunciate, transliterate
from pythainlp.tokenize import word_tokenize

# Module-level panphon helpers, created once when this module is imported.
# _ft is used by audio_vector to turn an IPA string into feature vectors.
_ft = panphon.FeatureTable()
# _dst is used by word_approximation to compute weighted feature edit distances.
_dst = panphon.distance.Distance()

def _clean_ipa(ipa: str) -> str:
    """
    Clean IPA by removing tones and space between phonetic codes

    :param str ipa: IPA text
    :return: IPA with tones removed from the text
    :rtype: str
    """
    return ipa.replace("˩˩˦","").replace("˥˩","").replace("˨˩","").replace("˦˥","").replace("˧","").replace("˧","").replace(" .",".").replace(". ",".").strip()

def word2audio(word: str) -> str:
    """
    Convert word to IPA

    The input is split with ``word_tokenize``; each token is passed through
    ``pronunciate`` (``w2p`` engine), the result is transliterated with the
    ``thaig2p`` engine and cleaned by ``_clean_ipa``. The per-token strings
    are then joined with ``.``.

    :param str word: Thai word
    :return: IPA with tones removed from the text
    :rtype: str

    :Example:
    ::

        from pythainlp.soundex.sound import word2audio

        word2audio("น้ำ")
        # output : 'n aː m .'
    """
    tokens = word_tokenize(word)
    # pronunciate runs first; its output is what gets transliterated to IPA.
    phones = [pronunciate(token, engine="w2p") for token in tokens]
    ipa_parts = [
        _clean_ipa(transliterate(phone, engine="thaig2p")) for phone in phones
    ]
    return ".".join(ipa_parts)

def audio_vector(word: str) -> List[List[int]]:
    """
    Convert audio to vector list

    The word is first converted to tone-less IPA with ``word2audio`` and the
    IPA string is then handed to the module-level panphon feature table,
    requesting numeric feature values.

    :param str word: Thai word
    :return: List of features from panphon
    :rtype: List[List[int]]

    :Example:
    ::

        from pythainlp.soundex.sound import audio_vector

        audio_vector("น้ำ")
        # output : [[-1, 1, 1, -1, -1, -1, ...]]
    """
    ipa = word2audio(word)
    return _ft.word_to_vector_list(ipa, numeric=True)

def word_approximation(word: str, list_word: List[str]) -> List[float]:
    """
    Thai Word Approximation

    Both the query word and every candidate are converted to tone-less IPA
    with ``word2audio``; the result holds one weighted feature edit distance
    per candidate, in the same order as ``list_word``.

    :param str word: Thai word
    :param List[str] list_word: Thai words to compare against ``word``
    :return: List of distances, one per word in ``list_word``
             (The smaller the value, the closer)
    :rtype: List[float]

    :Example:
    ::

        from pythainlp.soundex.sound import word_approximation

        word_approximation("รถ", ["รด", "รส", "รม", "น้ำ"])
        # output : [0.0, 0.0, 3.875, 8.375]
    """
    target_ipa = word2audio(word)
    candidate_ipas = [word2audio(candidate) for candidate in list_word]
    return [
        _dst.weighted_feature_edit_distance(target_ipa, candidate_ipa)
        for candidate_ipa in candidate_ipas
    ]