# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, unicode_literals
import codecs
import re
from pythainlp.corpus.thaisyllable import get_data as syllable_dict
from pythainlp.corpus.thaiword import get_data as word_dict
from six.moves import zip
from marisa_trie import Trie
DEFAULT_DICT_TRIE = Trie(word_dict())
def word_tokenize(text, engine="newmm", whitespaces=True):
    """
    :param str text: the text to be tokenized
    :param str engine: the engine used to tokenize the text
    :param bool whitespaces: True to keep whitespaces in the output; they commonly mark the end of a sentence or phrase in Thai. Set to False to strip them out.
    :Parameters for engine:
        * newmm - Maximum Matching algorithm + TCC
        * icu - IBM ICU
        * longest-matching - Longest matching
        * mm - Maximum Matching algorithm
        * pylexto - LexTo
        * deepcut - Deep Neural Network
        * wordcutpy - wordcutpy (https://github.com/veer66/wordcutpy)
    :return: A list of words tokenized from the text

    **Example**::

        from pythainlp.tokenize import word_tokenize
        text = 'ผมรักคุณนะครับโอเคบ่พวกเราเป็นคนไทยรักภาษาไทยภาษาบ้านเกิด'
        a = word_tokenize(text, engine='icu')  # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอ', 'เค', 'บ่', 'พวก', 'เรา', 'เป็น', 'คน', 'ไทย', 'รัก', 'ภาษา', 'ไทย', 'ภาษา', 'บ้าน', 'เกิด']
        b = word_tokenize(text, engine='dict')  # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']
        c = word_tokenize(text, engine='mm')  # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']
        d = word_tokenize(text, engine='pylexto')  # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']
        e = word_tokenize(text, engine='newmm')  # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คนไทย', 'รัก', 'ภาษาไทย', 'ภาษา', 'บ้านเกิด']
        g = word_tokenize(text, engine='wordcutpy')  # ['ผม', 'รัก', 'คุณ', 'นะ', 'ครับ', 'โอเค', 'บ่', 'พวกเรา', 'เป็น', 'คน', 'ไทย', 'รัก', 'ภาษา', 'ไทย', 'ภาษา', 'บ้านเกิด']
    """
if engine == "icu":
from .pyicu import segment
elif engine == "multi_cut" or engine == "mm":
from .multi_cut import segment
elif engine == "newmm" or engine == "onecut":
from .newmm import mmcut as segment
elif engine == "longest-matching":
from .longest import segment
elif engine == "pylexto":
from .pylexto import segment
elif engine == "deepcut":
from .deepcut import segment
elif engine == "wordcutpy":
from .wordcutpy import segment
else:
raise Exception("Error: Unknown engine: {}".format(engine))
if not whitespaces:
return [i.strip(" ") for i in segment(text) if i.strip(" ")]
return segment(text)
def dict_word_tokenize(text, custom_dict_trie, engine="newmm"):
    """
    :meth:`dict_word_tokenize` tokenizes words based on the dictionary you provide, which must be in the form of a trie data structure.
    :param str text: the text to be tokenized
    :param dict custom_dict_trie: a trie built with create_custom_dict_trie
    :param str engine: the tokenization engine to use (newmm, wordcutpy, mm, longest-matching)
    :return: A list of words tokenized from the text.

    **Example**::

        >>> from pythainlp.tokenize import dict_word_tokenize, create_custom_dict_trie
        >>> listword = ['แมว', 'ดี']
        >>> data_dict = create_custom_dict_trie(listword)
        >>> dict_word_tokenize("แมวดีดีแมว", data_dict)
        ['แมว', 'ดี', 'ดี', 'แมว']
    """
if engine == "newmm" or engine == "onecut":
from .newmm import mmcut as segment
elif engine == "mm" or engine == "multi_cut":
from .multi_cut import segment
elif engine == "longest-matching":
from .longest import segment
elif engine == "wordcutpy":
from .wordcutpy import segment
return segment(text, custom_dict_trie.keys())
else:
raise Exception("Error: Unknown engine: {}".format(engine))
return segment(text, custom_dict_trie)
def sent_tokenize(text, engine="whitespace+newline"):
    """
    This function does not yet automatically recognize where a sentence actually ends. Rather, it splits the text wherever whitespace or a newline is found.
    :param str text: the text to be tokenized
    :param str engine: choose between 'whitespace' (split on spaces only) and 'whitespace+newline' (split on any whitespace, including newlines)
    :return: a list of text fragments, split at whitespace or newlines.
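
    **Example** (a minimal usage sketch; the output follows directly from whitespace splitting)::

        >>> from pythainlp.tokenize import sent_tokenize
        >>> sent_tokenize("ฉันไปโรงเรียน เธอไปตลาด")
        ['ฉันไปโรงเรียน', 'เธอไปตลาด']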
"""
if engine == "whitespace":
sentences = re.split(r' +', text, re.U)
else:
sentences = text.split()
return sentences
def subword_tokenize(text, engine="tcc"):
    """
    :param str text: text to be tokenized
    :param str engine: the subword engine; currently only 'tcc', which uses the Thai Character Cluster rules to segment text into its smallest unambiguous units, is supported.
    :return: a list of tokenized strings.
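
    **Example** (an illustrative sketch; the exact clusters are determined by the TCC rules)::

        >>> from pythainlp.tokenize import subword_tokenize
        >>> subword_tokenize("ประเทศไทย")  # e.g. ['ป', 'ระ', 'เท', 'ศ', 'ไท', 'ย']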
"""
from .tcc import tcc
return tcc(text)
def isthai(text, check_all=False):
    """
    :param str text: the input string
    :param bool check_all: if True, also check every character individually
    :return: a dictionary whose "thai" key gives the percentage of characters that are Thai; if check_all is True, an additional "check_all" key holds a tuple pairing each character with True or False.
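
    **Example** (a minimal sketch; the percentages follow directly from the character counts)::

        >>> from pythainlp.tokenize import isthai
        >>> isthai("ไทย")
        {'thai': 100.0}
        >>> isthai("ไทยA", check_all=True)
        {'thai': 75.0, 'check_all': (('ไ', True), ('ท', True), ('ย', True), ('A', False))}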
"""
isthais = []
num_isthai = 0
for ch in text:
ch_val = ord(ch)
if ch_val >= 3584 and ch_val <= 3711:
num_isthai += 1
if check_all:
isthais.append(True)
else:
if check_all:
isthais.append(False)
thai_percent = (num_isthai / len(text)) * 100
if check_all:
chars = list(text)
isthai_pairs = tuple(zip(chars, isthais))
data = {"thai": thai_percent, "check_all": isthai_pairs}
else:
data = {"thai": thai_percent}
return data
def syllable_tokenize(text):
    """
    :param str text: the input string to be tokenized
    :return: a list of syllable strings
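
    **Example** (an illustrative sketch; the exact split depends on the bundled syllable dictionary)::

        >>> from pythainlp.tokenize import syllable_tokenize
        >>> syllable_tokenize("ภาษาไทย")  # e.g. ['ภา', 'ษา', 'ไทย']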
"""
syllables = []
if text:
words = word_tokenize(text)
trie = create_custom_dict_trie(custom_dict_source=syllable_dict())
for word in words:
syllables.extend(dict_word_tokenize(text=word, custom_dict_trie=trie))
return syllables
def create_custom_dict_trie(custom_dict_source):
    """
    Create a custom dictionary trie for use with the dict_word_tokenize() function. For more information on the trie data structure, see: https://marisa-trie.readthedocs.io/en/latest/index.html
    :param string/list custom_dict_source: a list of vocabularies, or a path to a source file
    :return: a trie created from the custom dictionary input
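
    **Example** (a minimal sketch; 'custom_words.txt' is a hypothetical file with one word per line)::

        >>> from pythainlp.tokenize import create_custom_dict_trie
        >>> trie = create_custom_dict_trie(["แมว", "ดี"])
        >>> trie_from_file = create_custom_dict_trie("custom_words.txt")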
"""
if type(custom_dict_source) is str:
# Receive a file path of the custom dict to read
with codecs.open(custom_dict_source, "r", encoding="utf8") as f:
_vocabs = f.read().splitlines()
return Trie(_vocabs)
elif isinstance(custom_dict_source, (list, tuple, set)):
# Received a sequence type object of vocabs
return Trie(custom_dict_source)
else:
raise TypeError(
"Type of custom_dict_source must be either str (path to source file) or collections"
)
class Tokenizer:
    def __init__(self, custom_dict=None):
        """
        Initialize the tokenizer object.
        :param str custom_dict: a file path or a list of vocabularies used to create a trie (default: the standard LEXiTRON word list)
        :return: trie_dict - a trie of the dictionary words, used by the tokenizing engine
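
        **Example** (a minimal sketch, reusing the word list from the dict_word_tokenize example above)::

            >>> from pythainlp.tokenize import Tokenizer
            >>> tokenizer = Tokenizer(custom_dict=["แมว", "ดี"])
            >>> tokenizer.word_tokenize("แมวดีดีแมว")
            ['แมว', 'ดี', 'ดี', 'แมว']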
"""
if custom_dict:
if type(custom_dict) is list:
self.trie_dict = Trie(custom_dict)
elif type(custom_dict) is str:
with codecs.open(custom_dict, "r", encoding="utf8") as f:
vocabs = f.read().splitlines()
self.trie_dict = Trie(vocabs)
else:
self.trie_dict = Trie(word_dict())
def word_tokenize(self, text, engine="newmm"):
from .newmm import mmcut as segment
return segment(text, self.trie_dict)