Source code for pythainlp.corpus.util

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Tools for creating word lists.

Code is based on the work of Korakot Chaovavanich.

:See also:
    * `Facebook post \
        <https://www.facebook.com/groups/colab.thailand/permalink/1667821073393244>`_
    * `Google Colab \
        <https://colab.research.google.com/drive/19kY2jCHONuxmTJM0U8PIE_I5OK1rO-x_>`_
"""

from collections import Counter
from typing import Callable, Iterable, Iterator, List, Set, Tuple

from pythainlp.corpus import thai_words
from pythainlp.tokenize import newmm
from pythainlp.util import Trie


def index_pairs(words: List[str]) -> Iterator[Tuple[int, int]]:
    """
    Yield (beginning, ending) character index pairs, one for each word
    """
    i = 0
    for w in words:
        yield i, i + len(w)
        i += len(w)
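
# Illustrative note (an addition, not in the original module):
# index_pairs() maps a tokenization to the character spans it induces,
# so two tokenizations of the same string agree on a word exactly when
# they produce the same (start, end) pair. For example:
#
#     >>> list(index_pairs(["ขอ", "ขอบคุณ"]))
#     [(0, 2), (2, 8)]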


def find_badwords(
    tokenize: Callable[[str], List[str]],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Find words that do not work well with the `tokenize` function
    for the provided `training_data`.

    :param Callable[[str], List[str]] tokenize: a tokenize function
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: words that are considered to make `tokenize` perform badly
    :rtype: Set[str]
    """
    right = Counter()
    wrong = Counter()

    for train_words in training_data:
        train_set = set(index_pairs(train_words))
        test_words = tokenize("".join(train_words))
        test_pairs = index_pairs(test_words)
        for w, p in zip(test_words, test_pairs):
            if p in train_set:
                right[w] += 1
            else:
                wrong[w] += 1

    # if a word is wrong more often than right, consider it a bad word
    bad_words = []
    for w, count in wrong.items():
        if count > right[w]:
            bad_words.append(w)

    return set(bad_words)
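
# A usage sketch for find_badwords() (added for illustration; the toy
# tokenizer and training data below are hypothetical). A character-level
# tokenizer almost never reproduces the reference spans, so the
# characters it emits get flagged as bad words:
#
#     >>> def char_tokenize(text):
#     ...     return list(text)
#     >>> sorted(find_badwords(char_tokenize, [["AB", "C"], ["AB", "AB"]]))
#     ['A', 'B']
#
# "C" is kept because its span (2, 3) matches the reference segmentation
# of "ABC", so right["C"] > wrong["C"].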
def revise_wordset(
    tokenize: Callable[[str], List[str]],
    orig_words: Iterable[str],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words to improve the tokenization performance of
    a dictionary-based `tokenize` function.

    `orig_words` will be used as a base set for the dictionary.
    Words that do not perform well with `training_data` will be removed.
    The remaining words will be returned.

    :param Callable[[str], List[str]] tokenize: a tokenize function, can be\
        any function that takes a string as input and returns a List[str]
    :param Iterable[str] orig_words: words used by the tokenize function,\
        to be used as a base for revision
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: a revised set of words, with words that make `tokenize`\
        perform badly removed
    :rtype: Set[str]

    :Example:
    ::

        from pythainlp.corpus import thai_words
        from pythainlp.corpus.util import revise_wordset
        from pythainlp.tokenize.longest import segment
        from pythainlp.util import Trie

        base_words = thai_words()
        more_words = {
            "ถวิล อุดล",
            "ทองอินทร์ ภูริพัฒน์",
            "เตียง ศิริขันธ์",
            "จำลอง ดาวเรือง",
        }
        base_words = base_words.union(more_words)
        dict_trie = Trie(base_words)

        tokenize = lambda text: segment(text, dict_trie)

        training_data = [
            [str, str, str, ...],
            [str, str, str, str, ...],
            ...
        ]

        revised_words = revise_wordset(tokenize, base_words, training_data)
    """
    bad_words = find_badwords(tokenize, training_data)
    return set(orig_words) - bad_words
def revise_newmm_default_wordset(
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words to improve the tokenization performance of
    `pythainlp.tokenize.newmm`, a dictionary-based tokenizer and
    the default tokenizer for PyThaiNLP.

    Words from `pythainlp.corpus.thai_words()` will be used as a base set
    for the dictionary. Words that do not perform well with `training_data`
    will be removed. The remaining words will be returned.

    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: a revised set of words, with words that make the tokenizer\
        perform badly removed
    :rtype: Set[str]
    """
    orig_words = thai_words()
    trie = Trie(orig_words)

    def tokenize(text):
        return newmm.segment(text, custom_dict=trie)

    revised_words = revise_wordset(tokenize, orig_words, training_data)

    return revised_words
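
if __name__ == "__main__":
    # A minimal end-to-end sketch (added for illustration; the file name
    # "train.txt" and its pipe-delimited format are assumptions, not part
    # of the original module). Each line of the file is expected to hold
    # one correctly tokenized sentence, e.g. "คน|นอน|บน|เตียง".
    with open("train.txt", encoding="utf-8") as f:
        training_data = [
            line.strip().split("|") for line in f if line.strip()
        ]

    revised = revise_newmm_default_wordset(training_data)
    print(f"kept {len(revised)} of {len(thai_words())} words")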