Source code for

# -*- coding: utf-8 -*-
"""
Spell checker, using Peter Norvig's algorithm.
The spelling dictionary can be customized.
The default spelling dictionary is based on the Thai National Corpus.

Based on Peter Norvig's Python code from
http://norvig.com/spell-correct.html
"""
from collections import Counter
from string import digits
from typing import Callable, List, Set, Tuple

from pythainlp import thai_digits, thai_letters
from pythainlp.corpus import tnc
from pythainlp.util import isthaichar

def _no_filter(word: str) -> bool:
    return True

def _is_thai_and_not_num(word: str) -> bool:
    for ch in word:
        if ch != "." and not isthaichar(ch):
            return False
        if ch in digits or ch in thai_digits:
            return False
    return True

def _keep(
    word_freq: int,
    min_freq: int,
    min_len: int,
    max_len: int,
    dict_filter: Callable[[str], bool],
) -> Callable[[str], bool]:
    Keep only Thai words with at least min_freq frequency
    and has length between min_len and max_len characters
    if not word_freq or word_freq[1] < min_freq:
        return False

    word = word_freq[0]
    if not word or len(word) < min_len or len(word) > max_len or word[0] == ".":
        return False

    return dict_filter(word)

def _edits1(word: str) -> Set[str]:
    """
    Return a set of words with edit distance of 1 from the input word.

    Candidates are produced by deleting one character, swapping two
    adjacent characters, replacing one character with a letter from
    thai_letters, or inserting a letter from thai_letters at any position.

    :param str word: The input word
    :return: set of candidate strings (not checked against any dictionary)
    """
    # Every way to cut the word in two, including the empty head and tail
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]

    deletes = {head + tail[1:] for head, tail in splits if tail}
    transposes = {
        head + tail[1] + tail[0] + tail[2:]
        for head, tail in splits
        if len(tail) > 1
    }
    replaces = {
        head + c + tail[1:] for head, tail in splits if tail for c in thai_letters
    }
    inserts = {head + c + tail for head, tail in splits for c in thai_letters}

    return deletes | transposes | replaces | inserts

def _edits2(word: str) -> Set[str]:
    """
    Return a set of words with edit distance of 2 from the input word.

    Built by applying _edits1 to every result of _edits1(word).

    :param str word: The input word
    :return: set of candidate strings (not checked against any dictionary)
    """
    return {e2 for e1 in _edits1(word) for e2 in _edits1(e1)}

class NorvigSpellChecker:
    """
    Spell checker using Peter Norvig's algorithm over a word-frequency
    dictionary.
    """

    def __init__(
        self,
        custom_dict: List[Tuple[str, int]] = None,
        min_freq: int = 2,
        min_len: int = 2,
        max_len: int = 40,
        dict_filter: Callable[[str], bool] = _is_thai_and_not_num,
    ):
        """
        Initialize Peter Norvig's spell checker object

        :param list custom_dict: A list of tuple (word, frequency) to create
                                 a spelling dictionary. Default is from
                                 Thai National Corpus (around 40,000 words).
        :param int min_freq: Minimum frequency of a word to keep (default = 2)
        :param int min_len: Minimum length (in characters) of a word to keep
                            (default = 2)
        :param int max_len: Maximum length (in characters) of a word to keep
                            (default = 40)
        :param func dict_filter: A function to filter the dictionary.
                                 Default filter removes any word with number
                                 or non-Thai characters.
                                 If no filter is required, use None.
        """
        if not custom_dict:  # default, use Thai National Corpus
            custom_dict = tnc.word_freqs()

        if not dict_filter:
            dict_filter = _no_filter

        # filter word list
        custom_dict = [
            word_freq
            for word_freq in custom_dict
            if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
        ]

        self.__WORDS = Counter(dict(custom_dict))
        self.__WORDS_TOTAL = sum(self.__WORDS.values())
        if self.__WORDS_TOTAL < 1:
            # Nothing survived filtering (or frequencies were non-positive);
            # prob() checks for this zero total before dividing
            self.__WORDS_TOTAL = 0

    def dictionary(self) -> List[Tuple[str, int]]:
        """
        Return the spelling dictionary currently used by this spell checker

        :return: list of (word, frequency) tuples
        """
        # Materialize the view so the result matches the declared list type
        return list(self.__WORDS.items())

    def known(self, words: List[str]) -> List[str]:
        """
        Return a list of given words that are found in the spelling dictionary

        :param list words: An iterable of words to check if they are in
                           the spelling dictionary
        :return: list of the words that are in the dictionary
        """
        return [w for w in words if w in self.__WORDS]

    def prob(self, word: str) -> float:
        """
        Return probability of an input word, according to the spelling
        dictionary

        :param str word: A word to check its probability of occurrence
        :return: frequency of the word divided by the total frequency of
                 the dictionary; 0.0 if the dictionary is empty
        """
        if not self.__WORDS_TOTAL:
            # Empty dictionary: avoid dividing by zero
            return 0.0

        return self.__WORDS[word] / self.__WORDS_TOTAL

    def freq(self, word: str) -> int:
        """
        Return frequency of an input word, according to the spelling
        dictionary

        :param str word: A word to check its frequency
        :return: frequency of the word, 0 if it is not in the dictionary
        """
        return self.__WORDS[word]

    def spell(self, word: str) -> List[str]:
        """
        Return a list of possible words, according to edit distance of 1
        and 2, sorted by frequency of word occurrence in the spelling
        dictionary

        :param str word: A word to check its spelling
        :return: list of candidate words, most frequent first; the input
                 word itself if no candidate is known; an empty list if
                 the input word is empty
        """
        if not word:
            return []

        # First non-empty stage wins: the word itself, then distance 1,
        # then distance 2, and finally the unknown word unchanged
        candidates = (
            self.known([word])
            or self.known(_edits1(word))
            or self.known(_edits2(word))
            or [word]
        )
        candidates.sort(key=self.freq, reverse=True)

        return candidates

    def correct(self, word: str) -> str:
        """
        Return the most possible word, using the probability from the
        spelling dictionary

        :param str word: A word to correct its spelling
        :return: the most frequent candidate, or an empty string if the
                 input word is empty
        """
        if not word:
            return ""

        return self.spell(word)[0]
# Shared module-level checker built with every constructor default; with no
# custom_dict given, the constructor loads word frequencies via tnc.word_freqs().
DEFAULT_SPELL_CHECKER = NorvigSpellChecker()