# -*- coding: utf-8 -*-
"""
Spell checker, using Peter Norvig algorithm.
Spelling dictionary can be customized.
Default spelling dictionary is based on Thai National Corpus.
Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html
"""
from collections import Counter
from string import digits
from typing import (
Callable,
Dict,
ItemsView,
Iterable,
List,
Optional,
Set,
Tuple,
Union,
)
from pythainlp import thai_digits, thai_letters
from pythainlp.corpus import tnc
from pythainlp.util import isthaichar
def _no_filter(word: str) -> bool:
return True
def _is_thai_and_not_num(word: str) -> bool:
for ch in word:
if ch != "." and not isthaichar(ch):
return False
if ch in thai_digits or ch in digits:
return False
return True
def _keep(
word_freq: Tuple[str, int],
min_freq: int,
min_len: int,
max_len: int,
dict_filter: Callable[[str], bool],
) -> bool:
"""
Checks whether a given word has the required minimum frequency min_freq
and its character length is between min_len and max_len (inclusive).
"""
if not word_freq or word_freq[1] < min_freq:
return False
word = word_freq[0]
if not (word and min_len <= len(word) <= max_len and word[0] != "."):
return False
return dict_filter(word)
def _edits1(word: str) -> Set[str]:
"""
Returns a set of words with an edit distance of 1 from the input word
"""
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
replaces = [L + c + R[1:] for L, R in splits if R for c in thai_letters]
inserts = [L + c + R for L, R in splits for c in thai_letters]
return set(deletes + transposes + replaces + inserts)
def _edits2(word: str) -> Set[str]:
"""
Returns a set of words with an edit distance of 2 from the input word
"""
return set(e2 for e1 in _edits1(word) for e2 in _edits1(e1))
def _convert_custom_dict(
custom_dict: Union[
Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]
],
min_freq: int,
min_len: int,
max_len: int,
dict_filter: Optional[Callable[[str], bool]],
) -> List[Tuple[str, int]]:
"""
Converts a custom dictionary to a list of (str, int) tuples
"""
if isinstance(custom_dict, dict):
custom_dict = list(custom_dict.items())
i = iter(custom_dict)
first_member = next(i)
if isinstance(first_member, str):
# create tuples of a word with frequency equaling 1,
# and filter word list
custom_dict = [
(word, 1)
for word in custom_dict
if _keep((word, 1), 1, min_len, max_len, dict_filter)
]
elif isinstance(first_member, tuple):
# filter word list
custom_dict = [
word_freq
for word_freq in custom_dict
if _keep(word_freq, min_freq, min_len, max_len, dict_filter)
]
else:
raise TypeError(
"custom_dict must be either Dict[str, int], "
"Iterable[Tuple[str, int]], or Iterable[str]"
)
return custom_dict
[docs]
class NorvigSpellChecker:
[docs]
def __init__(
self,
custom_dict: Union[
Dict[str, int], Iterable[str], Iterable[Tuple[str, int]]
] = None,
min_freq: int = 2,
min_len: int = 2,
max_len: int = 40,
dict_filter: Optional[Callable[[str], bool]] = _is_thai_and_not_num,
):
"""
Initializes Peter Norvig's spell checker object.
Spelling dictionary can be customized.
By default, spelling dictionary is from
`Thai National Corpus <http://www.arts.chula.ac.th/ling/tnc/>`_
Basically, Norvig's spell checker will choose the most likely
corrected spelling given a word by searching for candidates of
corrected words based on edit distance.
Then, it selects the candidate with
the highest word occurrence probability.
:param str custom_dict: A custom spelling dictionary. This can be:
(1) a dictionary (`dict`), with words (`str`)
as keys and frequencies (`int`) as values;
(2) an iterable (list, tuple, or set) of words
(`str`) and frequency (`int`) tuples:
`(str, int)`; or
(3) an iterable of just words (`str`), without
frequencies -- in this case `1` will be
assigned to every words.
Default is from Thai National Corpus (around
40,000 words).
:param int min_freq: Minimum frequency of a word to keep (default = 2)
:param int min_len: Minimum length (in characters) of a word to keep
(default = 2)
:param int max_len: Maximum length (in characters) of a word to keep
(default = 40)
:param func dict_filter: A function to filter the dictionary.
Default filter removes any word
with numbers or non-Thai characters.
If no filter is required, use None.
"""
if not custom_dict: # default, use Thai National Corpus
# TODO: #680 change the dict
custom_dict = [(i, j) for i, j in tnc.word_freqs()]
if not dict_filter:
dict_filter = _no_filter
custom_dict = _convert_custom_dict(
custom_dict, min_freq, min_len, max_len, dict_filter
)
self.__WORDS = Counter(dict(custom_dict))
self.__WORDS += Counter() # remove zero and negative counts
self.__WORDS_TOTAL = sum(self.__WORDS.values())
[docs]
def dictionary(self) -> ItemsView[str, int]:
"""
Returns the spelling dictionary currently used by this spell checker
:return: spelling dictionary of this instance
:rtype: list[tuple[str, int]]
:Example:
::
from pythainlp.spell import NorvigSpellChecker
dictionary= [("หวาน", 30), ("มะนาว", 2), ("แอบ", 3223)]
checker = NorvigSpellChecker(custom_dict=dictionary)
checker.dictionary()
# output: dict_items([('หวาน', 30), ('มะนาว', 2), ('แอบ', 3223)])
"""
return self.__WORDS.items()
[docs]
def known(self, words: Iterable[str]) -> List[str]:
"""
Returns a list of given words found in the spelling dictionary
:param list[str] words: A list of words to check if they exist
in the spelling dictionary
:return: intersection of the given word list and words
in the spelling dictionary
:rtype: list[str]
:Example:
::
from pythainlp.spell import NorvigSpellChecker
checker = NorvigSpellChecker()
checker.known(["เพยน", "เพล", "เพลง"])
# output: ['เพล', 'เพลง']
checker.known(['ยกไ', 'ไฟล์ม'])
# output: []
checker.known([])
# output: []
"""
return list(w for w in words if w in self.__WORDS)
[docs]
def prob(self, word: str) -> float:
"""
Returns the probability of an input word,
according to the spelling dictionary
:param str word: A word to check occurrence probability of
:return: word occurrence probability
:rtype: float
:Example:
::
from pythainlp.spell import NorvigSpellChecker
checker = NorvigSpellChecker()
checker.prob("ครัช")
# output: 0.0
checker.prob("รัก")
# output: 0.0006959172792052158
checker.prob("น่ารัก")
# output: 9.482306849763902e-05
"""
return self.__WORDS[word] / self.__WORDS_TOTAL
[docs]
def freq(self, word: str) -> int:
"""
Returns the frequency of an input word,
according to the spelling dictionary
:param str word: A word to check frequency of
:return: frequency of the given word in the spelling dictionary
:rtype: int
:Example:
::
from pythainlp.spell import NorvigSpellChecker
checker = NorvigSpellChecker()
checker.freq("ปัญญา")
# output: 3639
checker.freq("บิญชา")
# output: 0
"""
return self.__WORDS[word]
[docs]
def spell(self, word: str) -> List[str]:
"""
Returns a list of all correctly-spelled words whose spelling
is similar to the given word by edit distance metrics.
The returned list of words will be sorted by decreasing
order of word frequencies in the word spelling dictionary.
First, if the input word is spelled correctly,
this method returns a list of exactly one word which is itself.
Next, this method looks for a list of all correctly spelled words
whose edit distance value is 1 from the input word.
If there is no such word, then the search expands to
a list of words whose edit distance value is 2.
And if that still fails, the list of input words is returned.
:param str word: A word to check spelling of
:return: list of possibly correct words within 1 or 2 edit distance
and sorted by frequency of word occurrence in the
spelling dictionary in descending order.
:rtype: list[str]
:Example:
::
from pythainlp.spell import NorvigSpellChecker
checker = NorvigSpellChecker()
checker.spell("เส้นตรบ")
# output: ['เส้นตรง']
checker.spell("ครัช")
# output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน',
# 'วรัช', 'ครัส', 'ปรัช', 'บรัช', 'ครัง',
#'คัช', 'คลัช', 'ครัย', 'ครัด']
"""
if not word:
return [""]
candidates = (
self.known([word])
or self.known(_edits1(word))
or self.known(_edits2(word))
or [word]
)
candidates.sort(key=self.freq, reverse=True)
return candidates
[docs]
def correct(self, word: str) -> str:
"""
Returns the most possible word, using the probability from
the spelling dictionary
:param str word: A word to correct spelling of
:return: the correct spelling of the given word
:rtype: str
:Example:
::
from pythainlp.spell import NorvigSpellChecker
checker = NorvigSpellChecker()
checker.correct("ปัญชา")
# output: 'ปัญหา'
checker.correct("บิญชา")
# output: 'บัญชา'
checker.correct("มิตรภาบ")
# output: 'มิตรภาพ'
"""
if not word:
return ""
# Check for numeric type
try:
if "." in word:
float(word)
else:
int(word)
return word
except ValueError:
pass
return self.spell(word)[0]