# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Tool for creating word lists
Code is based on the work of Korakot Chaovavanich.
:See also:
    * `Facebook post \
        <https://www.facebook.com/groups/colab.thailand/permalink/1667821073393244>`_
    * `Google Colab \
        <https://colab.research.google.com/drive/19kY2jCHONuxmTJM0U8PIE_I5OK1rO-x_>`_
"""

from collections import Counter
from typing import Callable, Iterable, Iterator, List, Set, Tuple

from pythainlp.corpus import thai_words
from pythainlp.tokenize import newmm
from pythainlp.util import Trie


def index_pairs(words: List[str]) -> Iterator[Tuple[int, int]]:
    """
    Yield the (begin, end) character index pair of each word in the list.
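
    :Example:
    ::

        list(index_pairs(["แมว", "กิน", "ปลา"]))
        # -> [(0, 3), (3, 6), (6, 9)]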
    """
    i = 0
    for w in words:
        yield i, i + len(w)
        i += len(w)


def find_badwords(
    tokenize: Callable[[str], List[str]],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Find words that do not work well with the `tokenize` function
    for the provided `training_data`.

    :param Callable[[str], List[str]] tokenize: a tokenize function
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: words that are considered to make `tokenize` perform badly
    :rtype: Set[str]
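
    :Example:
    ::

        # a minimal sketch: a hypothetical fixed-width "tokenizer" that
        # naively splits the text every 3 characters, checked against
        # one hand-segmented training sample
        def tokenize3(text):
            return [text[i:i + 3] for i in range(0, len(text), 3)]

        find_badwords(tokenize3, [["หมา", "กิน", "ข้าว"]])
        # "หมา" and "กิน" keep their gold boundaries, but "ข้าว" is split
        # into "ข้า" + "ว", so those two tokens are flagged as bad:
        # -> {"ข้า", "ว"}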
    """
    right = Counter()
    wrong = Counter()
    for train_words in training_data:
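        # (begin, end) boundary pairs of the gold-standard segmentation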
        train_set = set(index_pairs(train_words))
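        # re-tokenize the concatenated text with the function under test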
        test_words = tokenize("".join(train_words))
        test_pairs = index_pairs(test_words)
        for w, p in zip(test_words, test_pairs):
            if p in train_set:
                right[w] += 1
            else:
                wrong[w] += 1
    # if wrong is more than right, then it's a bad word
    bad_words = []
    for w, count in wrong.items():
        if count > right[w]:
            bad_words.append(w)
    return set(bad_words)


def revise_wordset(
    tokenize: Callable[[str], List[str]],
    orig_words: Iterable[str],
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words that could improve tokenization performance of
    a dictionary-based `tokenize` function.
    `orig_words` will be used as a base set for the dictionary.
    Words that do not performed well with `training_data` will be removed.
    The remaining words will be returned.
    :param Callable[[str], List[str]] tokenize: a tokenize function, can be\
        any function that takes a string as input and returns a List[str]
    :param Iterable[str] orig_words: words used by the tokenize function,\
        to be used as a base for revision
    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: a revised set of words, with poorly performing words removed
    :rtype: Set[str]

    :Example:
    ::

        from pythainlp.corpus import thai_words
        from pythainlp.corpus.util import revise_wordset
        from pythainlp.tokenize.longest import segment
        from pythainlp.util import Trie

        base_words = thai_words()
        more_words = {
            "ถวิล อุดล", "ทองอินทร์ ภูริพัฒน์", "เตียง ศิริขันธ์", "จำลอง ดาวเรือง"
        }
        base_words = base_words.union(more_words)
        dict_trie = Trie(base_words)

        tokenize = lambda text: segment(text, dict_trie)

        training_data = [
            [str, str, str, ...],
            [str, str, str, str, ...],
            ...
        ]

        revised_words = revise_wordset(tokenize, base_words, training_data)
    """
    bad_words = find_badwords(tokenize, training_data)
    return set(orig_words) - bad_words


def revise_newmm_default_wordset(
    training_data: Iterable[Iterable[str]],
) -> Set[str]:
    """
    Revise a set of words to improve the tokenization performance of
    `pythainlp.tokenize.newmm`, the dictionary-based default tokenizer
    for PyThaiNLP.

    Words from `pythainlp.corpus.thai_words()` will be used as a base set
    for the dictionary. Words that do not perform well with `training_data`
    will be removed. The remaining words will be returned.

    :param Iterable[Iterable[str]] training_data: tokenized text, to be used\
        as a training set
    :return: a revised set of words, with poorly performing words removed
    :rtype: Set[str]
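
    :Example:
    ::

        # a minimal sketch; the training lists here are hypothetical
        # stand-ins for a real hand-segmented corpus
        training_data = [
            ["ผม", "รัก", "ภาษา", "ไทย"],
            ["เรา", "ชอบ", "ภาษา", "ไทย"],
        ]
        revised_words = revise_newmm_default_wordset(training_data)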
    """
    orig_words = thai_words()
    trie = Trie(orig_words)
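
    # newmm tokenizer bound to a trie built from the original word set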
    def tokenize(text):
        return newmm.segment(text, custom_dict=trie)
    revised_words = revise_wordset(tokenize, orig_words, training_data)
    return revised_words