Source code for pythainlp.tools.misspell

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np

# Keyboard layout tables used to find physically adjacent keys.
# Each list holds four strings, one string per keyboard row; the index of a
# character inside its string is its column on that row.
#
# NOTE(review): the Thai tables look like they are listed from the bottom
# keyboard row upwards (their last row carries "/", "_" and, when shifted, the
# Thai digits), whereas the English tables start at the number row and go
# down. find_neighbour_locations applies the same row offsets to both
# languages, so confirm that this difference in row order is intended.
# NOTE(review): presumably the Thai tables follow the Kedmanee layout - confirm.

# Thai keys typed without the shift modifier.
THAI_CHARACTERS_WITHOUT_SHIFT = [
    "ผปแอิืทมใฝ",
    "ฟหกดเ้่าสวง",
    "ๆไำพะัีรนยบลฃ",
    "ๅ/_ภถุึคตจขช",
]

# Thai keys typed with the shift modifier held.
THAI_CHARACTERS_WITH_SHIFT = [
    "()ฉฮฺ์?ฒฬฦ",
    "ฤฆฏโฌ็๋ษศซ.",
    '๐"ฎฑธํ๊ณฯญฐ,',
    "+๑๒๓๔ู฿๕๖๗๘๙",
]

# English (QWERTY) keys typed without the shift modifier, number row first.
ENGLISH_CHARACTERS_WITHOUT_SHIFT = [
    "1234567890-=",
    "qwertyuiop[]\\",
    "asdfghjkl;'",
    "zxcvbnm,./",
]

# English (QWERTY) keys typed with the shift modifier held, number row first.
ENGLISH_CHARACTERS_WITH_SHIFT = [
    "!@#$%^&*()_+",
    "QWERTYUIOP{}|",
    'ASDFGHJKL:"',
    "ZXCVBNM<>?",
]


# Combined lookup table: index 0 is Thai, index 1 is English.
# Each entry is a flat list of eight rows: rows 0-3 are the unshifted layer
# and rows 4-7 are the shifted layer, so a flat row index ``ix`` decomposes
# into ``ix // 4`` (shift flag) and ``ix % 4`` (row within the layer).
ALL_CHARACTERS = [
    THAI_CHARACTERS_WITHOUT_SHIFT + THAI_CHARACTERS_WITH_SHIFT,
    ENGLISH_CHARACTERS_WITHOUT_SHIFT + ENGLISH_CHARACTERS_WITH_SHIFT,
]


def search_location_of_character(char: str):
    """
    Find where a character sits in the keyboard tables.

    The Thai table is scanned first, then the English one; the first row
    containing ``char`` wins.

    :param str char: character to look up
    :return: ``(language_ix, is_shift, row, pos)`` where ``language_ix`` is
             the index into ``ALL_CHARACTERS``, ``is_shift`` is 0 for the
             unshifted layer and 1 for the shifted layer, ``row`` is the row
             within that layer and ``pos`` is the column within the row;
             ``None`` when no row contains ``char``
    :rtype: tuple or None
    """
    for language_ix in (0, 1):
        layout_rows = ALL_CHARACTERS[language_ix]
        for flat_ix, keys in enumerate(layout_rows):
            column = keys.find(char)
            if column == -1:
                continue
            # Rows 0-3 are the unshifted layer, rows 4-7 the shifted layer.
            shift_layer, row_in_layer = divmod(flat_ix, 4)
            return (language_ix, shift_layer, row_in_layer, column)
    return None


def find_neighbour_locations(
    loc: tuple,
    char: str,
    kernel: List = [(-1, -1), (-1, 0), (1, 1), (0, 1), (0, -1), (1, 0)],
):
    """
    List the key positions adjacent to a given key position.

    Each ``(row_offset, pos_offset)`` pair in ``kernel`` is added to the
    row and column of ``loc``. An offset position is kept only when its row
    is one of the four rows of the layer and its column is a valid index
    into that row. The language and shift layer are never changed.

    :param tuple loc: ``(language_ix, is_shift, row, pos)`` as returned by
                      ``search_location_of_character``
    :param str char: the character located at ``loc``; it is copied
                     unchanged into every returned tuple
    :param list kernel: ``(row_offset, pos_offset)`` pairs to probe.
                        The default list is only read, never mutated.
    :return: list of ``(language_ix, is_shift, row, pos, char)`` tuples,
             one per valid neighbouring position
    :rtype: list
    """
    language_ix, is_shift, row, pos = loc

    valid_neighbours = []
    for kr, ks in kernel:
        _row, _pos = row + kr, pos + ks
        if not 0 <= _row <= 3:
            continue

        row_length = len(ALL_CHARACTERS[language_ix][is_shift * 4 + _row])
        # Strict upper bound: ``_pos == row_length`` is one past the last
        # key of the row and would raise IndexError when dereferenced.
        if 0 <= _pos < row_length:
            valid_neighbours.append((language_ix, is_shift, _row, _pos, char))

    return valid_neighbours


def find_misspell_candidates(char: str, verbose: bool = False):
    """
    Collect the characters on the keys adjacent to the key for ``char``.

    The character is located with ``search_location_of_character`` and its
    neighbouring positions are obtained from ``find_neighbour_locations``;
    the characters found at those positions (same language, same shift
    layer) are returned.

    :param str char: character to find neighbouring keys for
    :param bool verbose: accepted for backward compatibility; it currently
                         has no effect
    :return: characters on the neighbouring keys, or ``None`` when ``char``
             is not present in any keyboard table
    :rtype: list or None
    """
    loc = search_location_of_character(char)
    if loc is None:
        return None

    candidates = []
    for language_ix, is_shift, row, pos, _ in find_neighbour_locations(
        loc, char
    ):
        try:
            neighbour = ALL_CHARACTERS[language_ix][is_shift * 4 + row][pos]
        except IndexError:
            # The reported position does not exist on that row (rows have
            # different lengths), so there is no key to take.
            continue
        candidates.append(neighbour)

    return candidates


def misspell(sentence: str, ratio: float = 0.05):
    """
    Simulate some misspellings of the input sentence.
    The number of misspelled locations is governed by ratio.

    Randomly chosen characters are replaced by a character from a
    neighbouring key on the keyboard. Characters that have no entry in the
    keyboard tables are left unchanged, so fewer characters than requested
    may actually change.

    :param str sentence: sentence to be misspelled
    :param float ratio: fraction of the characters to misspell
                        (0.05 means about 5 per 100 characters); the
                        count is ``floor(len(sentence) * ratio)``.
                        Defaults to 0.05.

    :return: sentence containing some misspelled characters
    :rtype: str

    :Example:
    ::

        from pythainlp.tools.misspell import misspell

        sentence = "ภาษาไทยปรากฏครั้งแรกในพุทธศักราช 1826"

        misspell(sentence, ratio=0.1)
        # possible output: ภาษาไทยปรากฏครั้งแรกในกุทธศักราช 1727
    """
    num_misspells = np.floor(len(sentence) * ratio).astype(int)
    # Distinct positions, so no character is replaced twice.
    positions = np.random.choice(
        len(sentence), size=num_misspells, replace=False
    )

    # convert strings to array of characters
    misspelled = list(sentence)
    for pos in positions:
        potential_candidates = find_misspell_candidates(sentence[pos])
        # None: character is not on the keyboard tables; empty list: no
        # usable neighbour (np.random.choice would raise on it).
        if not potential_candidates:
            continue

        candidate = np.random.choice(potential_candidates)

        misspelled[pos] = candidate

    return "".join(misspelled)