# -*- coding: utf-8 -*-
"""
Spell checker, using Peter Norvig's algorithm.
Spelling dictionary can be customized.
Default spelling dictionary is based on Thai National Corpus.
Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html
"""
from collections import Counter
from string import digits
from typing import Callable, ItemsView, Iterable, List, Optional, Set, Tuple

from pythainlp import thai_digits, thai_letters
from pythainlp.corpus import tnc
from pythainlp.util import isthaichar
def _no_filter(word: str) -> bool:
return True
def _is_thai_and_not_num(word: str) -> bool:
    """
    Dictionary filter that rejects words with digits or non-Thai characters.

    A character is acceptable when it is either a full stop (".") or is
    recognised by ``isthaichar``, and it appears in neither ``string.digits``
    nor ``thai_digits``. An empty word has no offending character and is
    therefore accepted.

    :param str word: candidate word
    :return: True if every character of the word is acceptable
    :rtype: bool
    """
    return all(
        (ch == "." or isthaichar(ch))
        and ch not in digits
        and ch not in thai_digits
        for ch in word
    )
def _keep(
word_freq: int,
min_freq: int,
min_len: int,
max_len: int,
dict_filter: Callable[[str], bool],
) -> Callable[[str], bool]:
"""
Keep only Thai words with at least min_freq frequency
and has length between min_len and max_len characters
"""
if not word_freq or word_freq[1] < min_freq:
return False
word = word_freq[0]
if not word or len(word) < min_len or len(word) > max_len or word[0] == ".":
return False
return dict_filter(word)
def _edits1(word: str) -> Set[str]:
    """
    Build every string that is one edit away from the input word.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement of one character by a letter from
    ``thai_letters``, or an insertion of a letter from ``thai_letters``.

    :param str word: the word to edit
    :return: set of all strings produced by exactly one such edit
    :rtype: Set[str]
    """
    candidates: Set[str] = set()

    # Visit every cut point, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion: a letter placed at the cut point.
        candidates.update(head + letter + tail for letter in thai_letters)

        if not tail:
            continue  # nothing right of the cut to delete, replace or swap

        rest = tail[1:]
        # Deletion: drop the character right after the cut.
        candidates.add(head + rest)
        # Replacement: substitute the character right after the cut.
        candidates.update(head + letter + rest for letter in thai_letters)

        if len(tail) > 1:
            # Transposition: swap the two characters right after the cut.
            candidates.add(head + tail[1] + tail[0] + tail[2:])

    return candidates
def _edits2(word: str) -> Set[str]:
    """
    Build every string reachable by applying ``_edits1`` twice.

    :param str word: the word to edit
    :return: union of ``_edits1(e)`` over every ``e`` in ``_edits1(word)``
    :rtype: Set[str]
    """
    two_away: Set[str] = set()
    for one_away in _edits1(word):
        two_away.update(_edits1(one_away))
    return two_away
class NorvigSpellChecker:
    """
    Spell checker that ranks candidates within edit distance 1 and 2 of a
    word by their frequency in a spelling dictionary.
    """

    def __init__(
        self,
        custom_dict: Optional[List[Tuple[str, int]]] = None,
        min_freq: int = 2,
        min_len: int = 2,
        max_len: int = 40,
        dict_filter: Optional[Callable[[str], bool]] = _is_thai_and_not_num,
    ):
        """
        Initialize Peter Norvig's spell checker object

        :param custom_dict: A list of tuple (word, frequency) to create a
                            spelling dictionary. Default is from Thai
                            National Corpus (around 40,000 words).
        :param int min_freq: Minimum frequency of a word to keep (default = 2)
        :param int min_len: Minimum length (in characters) of a word to keep
                            (default = 2)
        :param int max_len: Maximum length (in characters) of a word to keep
                            (default = 40)
        :param func dict_filter: A function to filter the dictionary. Default
                                 filter removes any word with number or
                                 non-Thai characters. If no filter is
                                 required, use None.
        """
        if not custom_dict:  # default, use Thai National Corpus
            custom_dict = tnc.word_freqs()

        if not dict_filter:
            dict_filter = _no_filter

        # filter word list
        custom_dict = [
            word_freq
            for word_freq in custom_dict
            if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
        ]

        self.__WORDS = Counter(dict(custom_dict))
        self.__WORDS_TOTAL = sum(self.__WORDS.values())
        # A non-positive total is normalised to 0; prob() treats 0 as
        # "no usable dictionary" instead of dividing by it.
        if self.__WORDS_TOTAL < 1:
            self.__WORDS_TOTAL = 0

    def dictionary(self) -> ItemsView[str, int]:
        """
        Return the spelling dictionary currently used by this spell checker

        :return: view of the (word, frequency) pairs kept after filtering
        """
        return self.__WORDS.items()

    def known(self, words: Iterable[str]) -> List[str]:
        """
        Return a list of given words that found in the spelling dictionary

        :param words: An iterable of words to check if they are in the
                      spelling dictionary
        :return: the given words that are in the dictionary, in the order
                 they were iterated
        """
        return [w for w in words if w in self.__WORDS]

    def prob(self, word: str) -> float:
        """
        Return probability of an input word, according to the spelling dictionary

        :param str word: A word to check its probability of occurrence
        :return: frequency of the word divided by the total frequency of the
                 dictionary, or 0.0 when that total is zero
        """
        # An empty (fully filtered-out) dictionary has a total of 0; report
        # zero probability rather than raising ZeroDivisionError.
        if not self.__WORDS_TOTAL:
            return 0.0
        return self.__WORDS[word] / self.__WORDS_TOTAL

    def freq(self, word: str) -> int:
        """
        Return frequency of an input word, according to the spelling dictionary

        :param str word: A word to check its frequency
        :return: frequency of the word, 0 if it is not in the dictionary
        """
        return self.__WORDS[word]

    def spell(self, word: str) -> List[str]:
        """
        Return a list of possible words, according to edit distance of 1 and 2,
        sorted by frequency of word occurrence in the spelling dictionary

        The word itself is preferred if it is known, then known words at
        edit distance 1, then known words at edit distance 2; if none is
        known the input word is returned as the only candidate.

        :param str word: A word to check its spelling
        :return: candidate words, most frequent first; an empty list if the
                 input word is empty
        """
        if not word:
            # Always hand back a list so callers can rely on the return type.
            return []

        candidates = (
            self.known([word])
            or self.known(_edits1(word))
            or self.known(_edits2(word))
            or [word]
        )
        candidates.sort(key=self.freq, reverse=True)

        return candidates

    def correct(self, word: str) -> str:
        """
        Return the most possible word, using the probability from the spelling dictionary

        :param str word: A word to correct its spelling
        :return: the first candidate from spell(), or an empty string if the
                 input word is empty
        """
        if not word:
            return ""

        return self.spell(word)[0]
# Module-level checker constructed with every default argument; because no
# custom_dict is passed, building it reads the word frequencies from
# tnc.word_freqs() when this module is imported.
DEFAULT_SPELL_CHECKER = NorvigSpellChecker()