# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Thai-English Cross-Language Transliterated Word Retrieval
using Soundex Technique
References:
Prayut Suwanvisat, Somchai Prasitjutrakul.Thai-English Cross-Language Transliterated Word Retrieval using Soundex Technique. In 1998 [cited 2022 Sep 8]. Available from: https://www.cp.eng.chula.ac.th/~somchai/spj/papers/ThaiText/ncsec98-clir.pdf
"""
from pythainlp import thai_characters
_C0 = "AEIOUHWYอ"
_C1 = "BFPVบฝฟปผพภว"
_C2 = "CGJKQSXZขฃคฅฆฉขฌกจซศษส"
_C3 = "DTฎดฏตฐฑฒถทธ"
_C4 = "Lลฬ"
_C5 = "MNมณน"
_C6 = "Rร"
_C7 = "AEIOUอ"
_C8 = "Hหฮ"
_C1_1 = "Wว"
_C9 = "Yยญ"
_C52 = "ง"
[docs]def prayut_and_somchaip(text: str, length: int = 4) -> str:
"""
This function converts English-Thai Cross-Language Transliterated Word into
phonetic code with the mactching technique called **Soundex** [#prayut_and_somchaip]_.
:param str text: English-Thai Cross-Language Transliterated Word
:param int length: preferred length of the Soundex code (default is 4)
:return: Soundex for the given text
:rtype: str
:Example:
::
from pythainlp.soundex.prayut_and_somchaip import prayut_and_somchaip
prayut_and_somchaip("king", 2)
# output: '52'
prayut_and_somchaip("คิง", 2)
# output: '52'
"""
if not text or not isinstance(text, str):
return ""
text = text.upper()
# keep only consonants (English-Thai)
chars = []
for ch in text:
if ch in thai_characters + "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
chars.append(ch)
i = 0
while i < len(chars):
if i == 0 and chars[i] in _C0:
chars[i] = "0"
elif chars[i] in _C1:
chars[i] = "1"
elif chars[i] in _C2:
chars[i] = "2"
elif chars[i] in _C3:
chars[i] = "3"
elif chars[i] in _C4:
chars[i] = "4"
elif chars[i] in _C5:
chars[i] = "5"
elif chars[i] in _C6:
chars[i] = "6"
elif chars[i] in _C52:
chars[i] = "52"
elif chars[i] in _C7 and i != 0:
chars[i] = "7"
elif chars[i] in _C8 and i != 0:
chars[i] = "8"
elif chars[i] in _C1_1 and i != 0:
chars[i] = "1"
elif chars[i] in _C9 and i != 0:
chars[i] = "9"
else:
chars[i] = None
i += 1
chars = list("".join([i for i in chars if i is not None]))
return "".join(chars[-length:])