Source code for pythainlp.tokenize.multi_cut

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Multi cut -- Thai word segmentation with maximum matching.
Original code from Korakot Chaovavanich.

:See Also:
    * `Facebook post \
        <https://www.facebook.com/groups/408004796247683/permalink/431283740586455/>`_
    * `GitHub Gist \
        <https://gist.github.com/korakot/fe26c65dc9eed467f4497f784a805716>`_
"""

import re
from collections import defaultdict
from typing import Iterator, List

from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
from pythainlp.util import Trie


class LatticeString(str):
    """String that keeps possible tokenizations"""

    def __new__(cls, value, multi=None, in_dict=True):
        return str.__new__(cls, value)

    def __init__(self, value, multi=None, in_dict=True):
        self.unique = True
        if multi:
            self.multi = list(multi)
            if len(self.multi) > 1:
                self.unique = False
        else:
            self.multi = [value]

        self.in_dict = in_dict  # whether the string is in the dictionary
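
# A minimal sketch of how LatticeString behaves; the Thai strings below are
# illustrative values, not tokenizer output:
#
#     s = LatticeString("ตากลม", ["ตา/กลม", "ตาก/ลม"])
#     str(s)      # "ตากลม" -- behaves like a plain str
#     s.multi     # ["ตา/กลม", "ตาก/ลม"] -- every kept tokenization
#     s.unique    # False -- more than one tokenization exists
#     s.in_dict   # True -- default; False marks out-of-dictionary spans
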
_RE_NONTHAI = r"""(?x)
[-a-zA-Z]+|        # Latin characters
\d+([,\.]\d+)*|    # number
[ \t]+|            # space
\r?\n              # newline
"""
_PAT_NONTHAI = re.compile(_RE_NONTHAI)


def _multicut(
    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
) -> Iterator[LatticeString]:
    """Return LatticeString"""
    if not custom_dict:
        custom_dict = DEFAULT_WORD_DICT_TRIE

    len_text = len(text)
    words_at = defaultdict(list)  # main data structure

    def serialize(p, p2):  # helper function
        for w in words_at[p]:
            p_ = p + len(w)
            if p_ == p2:
                yield w
            elif p_ < p2:
                for path in serialize(p_, p2):
                    yield w + "/" + path

    q = {0}
    last_p = 0  # last position for yield
    while min(q) < len_text:
        p = min(q)
        q -= {p}  # q.pop, but for set

        for w in custom_dict.prefixes(text[p:]):
            words_at[p].append(w)
            q.add(p + len(w))

        len_q = len(q)
        if len_q == 1:
            q0 = min(q)
            yield LatticeString(text[last_p:q0], serialize(last_p, q0))
            last_p = q0
        elif len_q == 0:  # len(q) == 0 means not found in dictionary
            m = _PAT_NONTHAI.match(text[p:])
            if m:  # non-Thai token
                i = p + m.span()[1]
            else:  # non-Thai token, find minimum skip
                for i in range(p, len_text):
                    ww = custom_dict.prefixes(text[i:])
                    m = _PAT_NONTHAI.match(text[i:])
                    if ww or m:
                        break
                else:
                    i = len_text
            w = text[p:i]
            words_at[p].append(w)
            yield LatticeString(w, in_dict=False)
            last_p = i
            q.add(i)
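
# Worked example (hedged -- the exact readings depend on the dictionary in
# use): for the classic ambiguous string "ตากลม", which can be read as
# "ตา/กลม" (round eyes) or "ตาก/ลม" (to take in the breeze), _multicut()
# yields a single LatticeString spanning the ambiguous region:
#
#     spans = list(_multicut("ตากลม"))
#     spans[0]        # "ตากลม"
#     spans[0].multi  # e.g. ["ตา/กลม", "ตาก/ลม"]
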
def mmcut(text: str) -> List[str]:
    """Return a single tokenization, resolving each ambiguous span by
    taking the reading with the fewest words (maximum matching)."""
    res = []
    for w in _multicut(text):
        mm = min(w.multi, key=lambda x: x.count("/"))
        res.extend(mm.split("/"))
    return res
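
# Usage sketch (token boundaries depend on the dictionary; the output shown
# is illustrative):
#
#     mmcut("ตากลม")  # picks the reading with the fewest "/" separators,
#                     # e.g. ["ตา", "กลม"]
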
def _combine(ww: List[LatticeString]) -> Iterator[str]:
    if ww == []:
        yield ""
    else:
        w = ww[0]
        for tail in _combine(ww[1:]):
            if w.unique:
                yield w + "|" + tail
            else:
                for m in w.multi:
                    yield m.replace("/", "|") + "|" + tail
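
# Illustration with an assumed input: given one ambiguous span, _combine()
# expands every reading into a "|"-delimited string. Note the trailing "|"
# contributed by the base case:
#
#     s = LatticeString("ตากลม", ["ตา/กลม", "ตาก/ลม"])
#     list(_combine([s]))  # ["ตา|กลม|", "ตาก|ลม|"]
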
def segment(
    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
) -> List[str]:
    """Dictionary-based maximum matching word segmentation.

    :param text: text to be tokenized
    :type text: str
    :param custom_dict: tokenization dictionary,\
        defaults to DEFAULT_WORD_DICT_TRIE
    :type custom_dict: Trie, optional
    :return: list of segmented tokens
    :rtype: List[str]
    """
    if not text or not isinstance(text, str):
        return []

    return list(_multicut(text, custom_dict=custom_dict))
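
# Usage sketch (tokens shown are illustrative; actual boundaries depend on
# the dictionary):
#
#     from pythainlp.tokenize.multi_cut import segment
#     segment("ผมรักคุณ")  # e.g. ["ผม", "รัก", "คุณ"]
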
def find_all_segment(
    text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE
) -> List[str]:
    """Get all possible segment variations.

    :param text: input string to be tokenized
    :type text: str
    :param custom_dict: tokenization dictionary,\
        defaults to DEFAULT_WORD_DICT_TRIE
    :type custom_dict: Trie, optional
    :return: list of segment variations
    :rtype: List[str]
    """
    if not text or not isinstance(text, str):
        return []

    ww = list(_multicut(text, custom_dict=custom_dict))

    return list(_combine(ww))
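
# Usage sketch (the set of variations depends on the dictionary):
#
#     find_all_segment("ตากลม")  # e.g. ["ตา|กลม|", "ตาก|ลม|"]
#
# Each variation is one "|"-delimited string; split on "|" and drop the
# trailing empty element to recover a token list.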