Source code for pythainlp.ulmfit.preprocess

# -*- coding: utf-8 -*-
"""
Preprocessing for ULMFiT
"""
import html
import re
from typing import Collection, List

import emoji

_TK_UNK = "xxunk"
_TK_REP = "xxrep"
_TK_WREP = "xxwrep"
_TK_END = "xxend"
_TK_URL = "xxurl"


def replace_url(text: str) -> str:
    """
        Replace url in `text` with TK_URL

        :param str text: text to replace url

        :return: text where urls  are replaced
        :rtype: str

        :Example:

            >>> from pythainlp.ulmfit import replace_url
            >>> replace_url("go to github.com")
            go to xxurl
    """
    URL_PATTERN = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
    return re.sub(URL_PATTERN, _TK_URL, text)


def fix_html(text: str) -> str:
    """
    Replace HTML entities and artifacts in `text`. (code from `fastai`)

    :param str text: text with HTML strings to replace

    :return: text where HTML strings are replaced
    :rtype: str

    :Example:

        >>> from pythainlp.ulmfit import fix_html
        >>> fix_html("Anbsp;amp;nbsp;B @.@ ")
        'A & B.'
    """
    re1 = re.compile(r" +")
    text = (
        text.replace("#39;", "'")
        .replace("amp;", "&")
        .replace("#146;", "'")
        .replace("nbsp;", " ")
        .replace("#36;", "$")
        .replace("\\n", "\n")
        .replace("quot;", "'")
        .replace("<br />", "\n")
        .replace('\\"', '"')
        .replace("<unk>", _TK_UNK)
        .replace(" @.@ ", ".")
        .replace(" @-@ ", "-")
        .replace(" @,@ ", ",")
        .replace("\\", " \\ ")
    )
    return re1.sub(" ", html.unescape(text))


def rm_useless_spaces(text: str) -> str:
    """Remove multiple spaces in `text`. (code from `fastai`)"""
    return re.sub(" {2,}", " ", text)


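# Illustrative example (not part of the upstream docstring); the output follows
# from the " {2,}" pattern above:
#     >>> rm_useless_spaces("quick   brown  fox")
#     'quick brown fox'

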
def spec_add_spaces(text: str) -> str:
    """Add spaces around /, #, and newline characters in `text`. (code from `fastai`)"""
    return re.sub(r"([/#\n])", r" \1 ", text)


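# Illustrative example (not part of the upstream docstring); a space is added
# on each side of every matched character:
#     >>> spec_add_spaces("10/20#tag")
#     '10 / 20 # tag'

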
def replace_rep_after(text: str) -> str:
    """
    Replace character-level repetitions in `text`, placing the marker after
    the repetition. This prevents a case such as 'น้อยยยยยยยย' becoming
    'น้อ xxrep 8 ย'; instead, the word is retained as 'น้อย xxrep 8'.

    :param str text: input text in which to replace character repetitions

    :return: text with the repetition token **xxrep** and a counter placed
             after the repeated character
    :rtype: str

    :Example:

        >>> from pythainlp.ulmfit import replace_rep_after
        >>>
        >>> text = "กาาาาาาา"
        >>> replace_rep_after(text)
        'กาxxrep7 '
    """

    def _replace_rep(m):
        c, cc = m.groups()
        return f"{c}{_TK_REP}{len(cc)+1} "

    re_rep = re.compile(r"(\S)(\1{3,})")
    return re_rep.sub(_replace_rep, text)


def replace_wrep_post(toks: Collection[str]) -> List[str]:
    """
    Replace repetitive words post-tokenization; fastai `replace_wrep` does not
    work well with Thai.

    :param list[str] toks: list of tokens

    :return: list of tokens where the **xxwrep** token and a counter are added
             in front of repetitive words
    :rtype: list[str]

    :Example:

        >>> from pythainlp.ulmfit import replace_wrep_post
        >>>
        >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
        >>> replace_wrep_post(toks)
        ['กา', 'xxwrep', '3', 'น้ำ']
    """
    previous_word = None
    rep_count = 0
    res = []
    for current_word in toks + [_TK_END]:
        if current_word == previous_word:
            rep_count += 1
        elif (current_word != previous_word) & (rep_count > 0):
            res += [_TK_WREP, str(rep_count), previous_word]
            rep_count = 0
        else:
            res.append(previous_word)
        previous_word = current_word

    return res[1:]


def rm_useless_newlines(text: str) -> str:
    """Remove multiple newlines in `text`."""
    return re.sub(r"[\n]{2,}", " ", text)


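# Illustrative example (not part of the upstream docstring); runs of two or
# more newlines collapse into a single space:
#     >>> rm_useless_newlines("บรรทัดแรก\n\n\nบรรทัดต่อไป")
#     'บรรทัดแรก บรรทัดต่อไป'

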
def rm_brackets(text: str) -> str:
    """Remove all empty brackets and artifacts within brackets from `text`."""
    # remove empty brackets
    new_line = re.sub(r"\(\)", "", text)
    new_line = re.sub(r"\{\}", "", new_line)
    new_line = re.sub(r"\[\]", "", new_line)
    # brackets containing only punctuation
    new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line)
    new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line)
    new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line)
    # artifacts after (
    new_line = re.sub(
        r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
    )
    new_line = re.sub(
        r"(?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
    )
    new_line = re.sub(
        r"(?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line
    )
    # artifacts before )
    new_line = re.sub(
        r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))", "", new_line
    )
    new_line = re.sub(
        r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})", "", new_line
    )
    new_line = re.sub(
        r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])", "", new_line
    )
    return new_line


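# Illustrative example (not part of the upstream docstring); the leading '?'
# and trailing '!' inside the brackets are treated as artifacts and removed:
#     >>> rm_brackets("(?ทดสอบ!)")
#     '(ทดสอบ)'

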
def ungroup_emoji(toks: Collection[str]) -> List[str]:
    """
    Ungroup Zero Width Joiner (ZWJ) emojis.
    See https://emojipedia.org/emoji-zwj-sequence/
    """
    res = []
    for tok in toks:
        if emoji.emoji_count(tok) == len(tok):
            res.extend(list(tok))
        else:
            res.append(tok)

    return res


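# Illustrative example (not part of the upstream docstring); a token made up
# entirely of emoji is split into individual characters, other tokens are kept:
#     >>> ungroup_emoji(["ขอบคุณ", "😂😂😂"])
#     ['ขอบคุณ', '😂', '😂', '😂']

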
def lowercase_all(toks: Collection[str]) -> List[str]:
    """
    Lowercase all English words; English words embedded in Thai text rarely
    carry meaningful capitalization.
    """
    return [tok.lower() for tok in toks]


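# Illustrative example (not part of the upstream docstring); non-Latin
# characters are unaffected by str.lower():
#     >>> lowercase_all(["Bangkok", "NLP", "ภาษาไทย"])
#     ['bangkok', 'nlp', 'ภาษาไทย']

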
def replace_rep_nonum(text: str) -> str:
    """
    Replace character-level repetitions in `text`, placing the marker after
    the repetition. This prevents a case such as 'น้อยยยยยยยย' becoming
    'น้อ xxrep ย'; instead, the word is retained as 'น้อย xxrep '.

    :param str text: input text in which to replace character repetitions

    :return: text with the repetition token **xxrep** placed after the
             repeated character
    :rtype: str

    :Example:

        >>> from pythainlp.ulmfit import replace_rep_nonum
        >>>
        >>> text = "กาาาาาาา"
        >>> replace_rep_nonum(text)
        'กา xxrep '
    """

    def _replace_rep(m):
        c, _ = m.groups()
        return f"{c} {_TK_REP} "

    re_rep = re.compile(r"(\S)(\1{3,})")
    return re_rep.sub(_replace_rep, text)


def replace_wrep_post_nonum(toks: Collection[str]) -> List[str]:
    """
    Replace repetitive words post-tokenization; fastai `replace_wrep` does not
    work well with Thai.

    :param list[str] toks: list of tokens

    :return: list of tokens where the **xxwrep** token is added in front of
             repetitive words
    :rtype: list[str]

    :Example:

        >>> from pythainlp.ulmfit import replace_wrep_post_nonum
        >>>
        >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
        >>> replace_wrep_post_nonum(toks)
        ['กา', 'xxwrep', 'น้ำ']
    """
    previous_word = None
    rep_count = 0
    res = []
    for current_word in toks + [_TK_END]:
        if current_word == previous_word:
            rep_count += 1
        elif (current_word != previous_word) & (rep_count > 0):
            res += [_TK_WREP, previous_word]
            rep_count = 0
        else:
            res.append(previous_word)
        previous_word = current_word

    return res[1:]


def remove_space(toks: Collection[str]) -> List[str]:
    """
    Remove whitespace-only tokens, e.g. for bag-of-words models.

    :param list[str] toks: list of tokens

    :return: list of tokens where space tokens (" ") are filtered out and
             surrounding whitespace is stripped from the remaining tokens
    :rtype: list[str]
    """
    res = []
    for t in toks:
        t = t.strip()
        if t:
            res.append(t)

    return res
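

# Illustrative pipeline sketch (not part of the library source): one possible
# ordering of the rules above, applying the text-level rules before
# tokenization and the token-level rules after. The tokenizer call and the
# exact ordering are assumptions, not the library's official pipeline.
#
#     >>> from pythainlp.tokenize import word_tokenize
#     >>> text = "ไปเที่ยว github.com มาาาาา 😂😂"
#     >>> for rule in (fix_html, replace_url, rm_brackets, rm_useless_newlines,
#     ...              rm_useless_spaces, spec_add_spaces, replace_rep_nonum):
#     ...     text = rule(text)
#     >>> toks = word_tokenize(text)
#     >>> for rule in (ungroup_emoji, lowercase_all,
#     ...              replace_wrep_post_nonum, remove_space):
#     ...     toks = rule(toks)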