Source code for pythainlp.ancient.aksonhan

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from pythainlp.util import Trie
from pythainlp import thai_consonants, thai_tonemarks
from pythainlp.tokenize import Tokenizer
from pythainlp.corpus import thai_orst_words


# Lookup table: AksonHan spelling of a fragment -> replacement fragment that
# uses mai han-akat ("ั") instead of the doubled consonant.
_dict_aksonhan = {}
for _consonant in thai_consonants:
    # "รร" is still written that way in current Thai, so "ร" gets no entries.
    if _consonant != "ร":
        for _tone in thai_tonemarks:
            _dict_aksonhan.update(
                {
                    # consonant + tone mark + same consonant
                    _consonant + _tone + _consonant: "ั" + _tone + _consonant,
                    # doubled consonant + tone mark + same consonant
                    _consonant + _consonant + _tone + _consonant: (
                        _consonant + "ั" + _tone + _consonant
                    ),
                }
            )
        # plain doubled consonant
        _dict_aksonhan[_consonant + _consonant] = "ั" + _consonant

# Set of the table's keys, used for membership tests.
_set_aksonhan = set(_dict_aksonhan)

# The tokenizer dictionary holds every AksonHan pattern plus each single
# consonant.
_trie = Trie([*_dict_aksonhan, *thai_consonants])
_tokenizer = Tokenizer(custom_dict=_trie, engine="mm")

# Words returned by thai_orst_words(); a word found here is left unchanged.
_dict_thai = set(thai_orst_words())


def aksonhan_to_current(word: str) -> str:
    """
    Convert an AksonHan word to its current Thai spelling.

    AksonHan (อักษรหัน) writes down two consonants for the
    spelling of the /a/ vowels (สระ อะ). Today, รร is an AksonHan
    word that is still used in Thai.

    Words shorter than three characters, and words already present in
    the Thai word list, are returned unchanged.

    :param str word: Thai word
    :return: the word rewritten in its current Thai spelling
    :rtype: str

    :Example:
    ::

        from pythainlp.ancient import aksonhan_to_current

        print(aksonhan_to_current("จกก"))
        # output: จัก

        print(aksonhan_to_current("บงงคบบ"))
        # output: บังคับ

        print(aksonhan_to_current("สรรเพชญ"))  # รร is still used.
        # output: สรรเพชญ
    """
    # Too short to be processed: returned as-is.
    if len(word) < 3:
        return word
    # The whole word is itself a known AksonHan pattern.
    if word in _set_aksonhan:
        return _dict_aksonhan[word]
    # Already a current Thai word: leave it alone.
    if word in _dict_thai:
        return word
    # Otherwise split the word with the pattern tokenizer and replace each
    # AksonHan segment; segments without a table entry pass through unchanged.
    return "".join(
        _dict_aksonhan.get(segment, segment)
        for segment in _tokenizer.word_tokenize(word)
    )