# -*- coding: utf-8 -*-
"""
Spell checker, using Peter Norvig's algorithm.
Spelling dictionary can be customized.
Default spelling dictionary is based on Thai National Corpus.
Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html
"""
from collections import Counter
from string import digits
from typing import (
Callable,
Dict,
ItemsView,
Iterable,
List,
Optional,
Set,
Tuple,
Union,
)
from pythainlp import thai_digits, thai_letters
from pythainlp.corpus import tnc
from pythainlp.util import isthaichar
def _no_filter(word: str) -> bool:
return True
def _is_thai_and_not_num(word: str) -> bool:
    """
    Dictionary filter that rejects words containing digits or
    characters other than "." that `isthaichar` does not accept.

    :param str word: candidate word
    :return: True if every character is either "." or accepted by
             `isthaichar`, and no character appears in `thai_digits`
             or in ASCII `string.digits`; an empty word yields True
    :rtype: bool
    """

    def _char_ok(char: str) -> bool:
        # A period is tolerated anywhere; anything else must pass isthaichar.
        if not (char == "." or isthaichar(char)):
            return False
        # Digits (Thai or ASCII) disqualify the whole word.
        return char not in thai_digits and char not in digits

    return all(_char_ok(char) for char in word)
def _keep(
word_freq: Tuple[str, int],
min_freq: int,
min_len: int,
max_len: int,
dict_filter: Callable[[str], bool],
) -> bool:
"""
Checks whether a given word has the required minimum frequency of min_freq
and its character length is between min_len and max_len (inclusive).
"""
if not word_freq or word_freq[1] < min_freq:
return False
word = word_freq[0]
if not (word and min_len <= len(word) <= max_len and word[0] != "."):
return False
return dict_filter(word)
def _edits1(word: str) -> Set[str]:
    """
    Builds every string obtainable from the input word by one edit:
    deleting a character, swapping two adjacent characters, replacing
    a character with one from `thai_letters`, or inserting one
    character from `thai_letters`.

    :param str word: the input word
    :return: set of strings one edit away from the input word
    :rtype: set[str]
    """
    results: Set[str] = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every split point, including the end.
        results.update(head + letter + tail for letter in thai_letters)

        if not tail:
            continue

        # Deletion: drop the first character of the tail.
        results.add(head + tail[1:])

        # Replacement: substitute the first character of the tail.
        results.update(head + letter + tail[1:] for letter in thai_letters)

        # Transposition: needs at least two characters in the tail.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])

    return results
def _edits2(word: str) -> Set[str]:
    """
    Builds every string obtainable by applying `_edits1` twice in
    succession to the input word.

    :param str word: the input word
    :return: set of strings two single-character edits away
    :rtype: set[str]
    """
    return {
        twice_edited
        for once_edited in _edits1(word)
        for twice_edited in _edits1(once_edited)
    }
def _convert_custom_dict(
custom_dict: Union[
Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]
],
min_freq: int,
min_len: int,
max_len: int,
dict_filter: Optional[Callable[[str], bool]],
) -> List[Tuple[str, int]]:
"""
Converts a custom dictionary to a list of (str, int) tuples
"""
if isinstance(custom_dict, dict):
custom_dict = [(word, freq) for word, freq in custom_dict.items()]
i = iter(custom_dict)
first_member = next(i)
if isinstance(first_member, str):
# create tuples of a word with frequency equal to 1,
# and filter word list
custom_dict = [
(word, 1)
for word in custom_dict
if _keep((word, 1), 1, min_len, max_len, dict_filter)
]
elif isinstance(first_member, tuple):
# filter word list
custom_dict = [
word_freq
for word_freq in custom_dict
if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
]
else:
raise TypeError(
"custom_dict must be either Dict[str, int], "
"Iterable[Tuple[str, int]], or Iterable[str]"
)
return custom_dict
class NorvigSpellChecker:
    """
    Spell checker based on Peter Norvig's algorithm: candidate
    corrections are generated by edit distance and ranked by their
    frequency in a spelling dictionary.
    """

    def __init__(
        self,
        custom_dict: Optional[
            Union[Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]]
        ] = None,
        min_freq: int = 2,
        min_len: int = 2,
        max_len: int = 40,
        dict_filter: Optional[Callable[[str], bool]] = _is_thai_and_not_num,
    ):
        """
        Initializes Peter Norvig's spell checker object.
        Spelling dictionary can be customized.
        By default, spelling dictionary is from
        `Thai National Corpus <http://www.arts.chula.ac.th/ling/tnc/>`_

        Basically, Norvig's spell checker will choose the most likely
        spelling correction given a word by searching for candidate
        corrected words based on edit distance.
        Then, it selects the candidate with
        the highest word occurrence probability.

        :param custom_dict: A custom spelling dictionary. This can be:
                            (1) a dictionary (`dict`), with words (`str`)
                            as keys and frequencies (`int`) as values;
                            (2) an iterable (list, tuple, or set) of word
                            (`str`) and frequency (`int`) tuples:
                            `(str, int)`; or
                            (3) an iterable of just words (`str`), without
                            frequencies -- in this case `1` will be
                            assigned to every word.
                            If omitted or empty, the default is from
                            Thai National Corpus (around 40,000 words).
        :param int min_freq: Minimum frequency of a word to keep (default = 2)
        :param int min_len: Minimum length (in characters) of a word to keep
                            (default = 2)
        :param int max_len: Maximum length (in characters) of a word to keep
                            (default = 40)
        :param func dict_filter: A function to filter the dictionary.
                                 Default filter removes any word
                                 with number or non-Thai characters.
                                 If no filter is required, use None.
        """
        if not custom_dict:  # default, use Thai National Corpus
            custom_dict = tnc.word_freqs()

        if not dict_filter:
            dict_filter = _no_filter

        custom_dict = _convert_custom_dict(
            custom_dict, min_freq, min_len, max_len, dict_filter
        )

        self.__WORDS = Counter(dict(custom_dict))
        self.__WORDS += Counter()  # remove zero and negative counts
        self.__WORDS_TOTAL = sum(self.__WORDS.values())

    def dictionary(self) -> ItemsView[str, int]:
        """
        Returns the spelling dictionary currently used by this spell checker

        :return: view of (word, frequency) pairs of this instance's
                 spelling dictionary
        :rtype: ItemsView[str, int]

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            dictionary= [("หวาน", 30), ("มะนาว", 2), ("แอบ", 3223)]

            checker = NorvigSpellChecker(custom_dict=dictionary)
            checker.dictionary()
            # output: dict_items([('หวาน', 30), ('มะนาว', 2), ('แอบ', 3223)])
        """
        return self.__WORDS.items()

    def known(self, words: Iterable[str]) -> List[str]:
        """
        Returns a list of the given words that are found
        in the spelling dictionary

        :param list[str] words: A list of words to check if they exist
                                in the spelling dictionary

        :return: the given words that are in the spelling dictionary,
                 in the order they were given
        :rtype: list[str]

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.known(["เพยน", "เพล", "เพลง"])
            # output: ['เพล', 'เพลง']

            checker.known(['ยกไ', 'ไฟล์ม'])
            # output: []

            checker.known([])
            # output: []
        """
        return [w for w in words if w in self.__WORDS]

    def prob(self, word: str) -> float:
        """
        Returns the probability of an input word,
        according to the spelling dictionary

        :param str word: A word to check its probability of occurrence

        :return: word occurrence probability; 0.0 if the spelling
                 dictionary is empty
        :rtype: float

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.prob("ครัช")
            # output: 0.0

            checker.prob("รัก")
            # output: 0.0006959172792052158

            checker.prob("น่ารัก")
            # output: 9.482306849763902e-05
        """
        # Filtering can leave the dictionary empty; avoid dividing by zero.
        if not self.__WORDS_TOTAL:
            return 0.0
        return self.__WORDS[word] / self.__WORDS_TOTAL

    def freq(self, word: str) -> int:
        """
        Returns the frequency of an input word,
        according to the spelling dictionary

        :param str word: A word to check its frequency

        :return: frequency of the given word in the spelling dictionary;
                 0 if the word is not in the dictionary
        :rtype: int

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.freq("ปัญญา")
            # output: 3639

            checker.freq("บิญชา")
            # output: 0
        """
        return self.__WORDS[word]

    def spell(self, word: str) -> List[str]:
        """
        Returns a list of all correctly-spelled words whose spelling
        is similar to the given word by edit distance metrics.
        The returned list of words will be sorted by the decreasing
        order of word frequencies in the word spelling dictionary.

        First, if the input word is spelled correctly,
        this method returns the list of exactly one word which is itself.
        Next, this method looks for a list of all correctly-spelled words
        whose edit distance value is 1 from the input word.
        If there is no such word, then the search expands to
        a list of words whose edit distance value is 2.
        And if that still fails, the list of input word is returned.
        An empty input word yields `[""]`.

        :param str word: A word to check its spelling

        :return: list of possible correct words within 1 or 2 edit distance
                 and sorted by frequency of word occurrence in the
                 spelling dictionary in descending order.
        :rtype: list[str]

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.spell("เส้นตรบ")
            # output: ['เส้นตรง']

            checker.spell("ครัช")
            # output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน',
            # 'วรัช', 'ครัส', 'ปรัช', 'บรัช', 'ครัง',
            #'คัช', 'คลัช', 'ครัย', 'ครัด']
        """
        if not word:
            return [""]

        # `or` short-circuits, so the costlier distance-2 search runs
        # only when neither the word itself nor any distance-1 edit
        # is in the dictionary.
        candidates = (
            self.known([word])
            or self.known(_edits1(word))
            or self.known(_edits2(word))
            or [word]
        )
        candidates.sort(key=self.freq, reverse=True)

        return candidates

    def correct(self, word: str) -> str:
        """
        Returns the most possible word, using the probability from
        the spelling dictionary

        An empty word is returned as an empty string, and a word that
        parses as a number is returned unchanged.

        :param str word: A word to correct its spelling

        :return: the correct spelling of the given word
        :rtype: str

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.correct("ปัญชา")
            # output: 'ปัญหา'

            checker.correct("บิญชา")
            # output: 'บัญชา'

            checker.correct("มิตรภาบ")
            # output: 'มิตรภาพ'
        """
        if not word:
            return ""

        # Numbers are not spell-checked: if the word parses as a float
        # (when it contains ".") or as an int, hand it back untouched.
        try:
            if "." in word:
                float(word)
            else:
                int(word)
        except ValueError:
            pass
        else:
            return word

        return self.spell(word)[0]