# Source code for pythainlp.ulmfit.core

# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Universal Language Model Fine-tuning for Text Classification (ULMFiT).
"""
import collections
from typing import Callable, Collection

import numpy as np
import torch
from pythainlp.corpus import get_corpus_path
from pythainlp.tokenize import THAI2FIT_TOKENIZER
from pythainlp.ulmfit.preprocess import (
    fix_html,
    lowercase_all,
    remove_space,
    replace_rep_after,
    replace_rep_nonum,
    replace_url,
    replace_wrep_post,
    replace_wrep_post_nonum,
    rm_brackets,
    rm_useless_newlines,
    rm_useless_spaces,
    spec_add_spaces,
    ungroup_emoji,
)
from pythainlp.util import reorder_vowels

# Torch device used by this module: CUDA when it is available, else the CPU.
device = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

# Corpus names handed to get_corpus_path() to resolve the pretrained LSTM
# files referenced below.
_MODEL_NAME_LSTM = "wiki_lm_lstm"
_ITOS_NAME_LSTM = "wiki_itos_lstm"


# Pretrained model paths
# NOTE: both paths are resolved through get_corpus_path() at import time.
THWIKI_LSTM = {
    "wgts_fname": get_corpus_path(_MODEL_NAME_LSTM),
    "itos_fname": get_corpus_path(_ITOS_NAME_LSTM),
}

# Preprocessing rules for Thai text
# dense features
# Pre-rules are applied, in order, before tokenization; post-rules are
# applied, in order, after tokenization.
pre_rules_th = [
    replace_rep_after,
    fix_html,
    reorder_vowels,
    spec_add_spaces,
    rm_useless_spaces,
    rm_useless_newlines,
    rm_brackets,
    replace_url,
]
post_rules_th = [replace_wrep_post, ungroup_emoji, lowercase_all]

# sparse features
# Each sparse list drops the first dense rule (replace_rep_after and
# replace_wrep_post respectively) and appends its "nonum" counterpart;
# the sparse post-rules additionally end with remove_space.
pre_rules_th_sparse = [*pre_rules_th[1:], replace_rep_nonum]
post_rules_th_sparse = [
    *post_rules_th[1:],
    replace_wrep_post_nonum,
    remove_space,
]


def process_thai(
    text: str,
    pre_rules: Collection = pre_rules_th_sparse,
    tok_func: Callable = THAI2FIT_TOKENIZER.word_tokenize,
    post_rules: Collection = post_rules_th_sparse,
) -> Collection[str]:
    """
    Process Thai texts for models (with sparse features as default)

    The text is passed through every pre-rule in order, tokenized with
    **tok_func**, and the token list is then passed through every
    post-rule in order.

    :param str text: text to be cleaned
    :param list[func] pre_rules: rules to apply before tokenization.
    :param func tok_func: tokenization function (by default, **tok_func**
                          is ``THAI2FIT_TOKENIZER.word_tokenize``)
    :param list[func] post_rules: rules to apply after tokenization

    :return: a list of cleaned tokenized texts
    :rtype: list[str]

    :Note:
      - The default **pre-rules** consist of :func:`fix_html`,
        :func:`pythainlp.util.reorder_vowels`, :func:`spec_add_spaces`,
        :func:`rm_useless_spaces`, :func:`rm_useless_newlines`,
        :func:`rm_brackets`, :func:`replace_url`
        and :func:`replace_rep_nonum`.
      - The default **post-rules** consist of :func:`ungroup_emoji`,
        :func:`lowercase_all`, :func:`replace_wrep_post_nonum`,
        and :func:`remove_space`.

    :Example:

        1. Use default pre-rules and post-rules:

        >>> from pythainlp.ulmfit import process_thai
        >>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp;     "
        >>> process_thai(text)
        ['บ้าน', 'xxrep', '   ', 'อยู่', 'xxwrep', 'นาน', '😂', '🤣',
        '😃', '😄', '😅', 'pythainlp', '&']

        2. Modify pre_rules and post_rules arguments with
           rules provided in :mod:`pythainlp.ulmfit`:

        >>> from pythainlp.ulmfit import (
            process_thai,
            replace_rep_after,
            fix_html,
            ungroup_emoji,
            replace_wrep_post,
            remove_space)
        >>>
        >>> text = "บ้านนนนน () อยู่นานนานนาน 😂🤣😃😄😅 PyThaiNLP amp;     "
        >>> process_thai(text,
                         pre_rules=[replace_rep_after, fix_html],
                         post_rules=[ungroup_emoji,
                                     replace_wrep_post,
                                     remove_space]
                        )
        ['บ้าน', 'xxrep', '5', '()', 'อยู่', 'xxwrep', '2', 'นาน', '😂', '🤣',
         '😃', '😄', '😅', 'PyThaiNLP', '&']
    """
    res = text
    # Pre-rules operate on the raw string.
    for rule in pre_rules:
        res = rule(res)
    res = tok_func(res)
    # Post-rules operate on the tokenizer's output.
    for rule in post_rules:
        res = rule(res)
    return res
def document_vector(text: str, learn, data, agg: str = "mean"):
    """
    Vectorize Thai input text into a single document vector
    using a :class:`fastai` language model and data bunch.

    The text is tokenized with ``THAI2FIT_TOKENIZER``, numericalized with
    ``data.vocab``, passed through the encoder of ``learn.model[0]``, and
    the encoder output is aggregated along axis 0 (presumably the token
    axis -- one row per token).

    :param str text: text to be vectorized with :class:`fastai` language
                     model.
    :param learn: :class:`fastai` language model learner
    :param data: :class:`fastai` data bunch
    :param str agg: name of aggregation method for word embeddings.
                    The available methods are "mean" and "sum"

    :return: :class:`numpy.ndarray` document vector based on the encoder
             of the model (the original documentation states a size of
             400, matching ``emb_sz=400`` in the example below)
    :rtype: :class:`numpy.ndarray`
    :raises ValueError: if **agg** is neither "mean" nor "sum"; raised
                        before any tokenization or model work is done

    :Example:

        >>> from pythainlp.ulmfit import document_vector
        >>> from fastai import *
        >>> from fastai.text import *
        >>>
        >>> # Load Data Bunch
        >>> data = load_data(MODEL_PATH, 'thwiki_lm_data.pkl')
        >>>
        >>> # Initialize language_model_learner
        >>> config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1,
             qrnn=False, tie_weights=True, out_bias=True, output_p=0.25,
             hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)
        >>> trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)
        >>> learn = language_model_learner(data, AWD_LSTM, config=config,
                                           pretrained=False, **trn_args)
        >>> document_vector('วันนี้วันดีปีใหม่', learn, data)

    :See Also:
        * A notebook showing how to train `ulmfit` language model and its
          usage, `Jupyter Notebook
          <https://github.com/cstorm125/thai2fit/blob/master/thwiki_lm/word2vec_examples.ipynb>`_

    """
    # Fail fast: reject an unsupported aggregation before tokenizing the
    # text, moving the encoder to the device, or running the model.
    if agg not in ("mean", "sum"):
        raise ValueError("Aggregate by mean or sum")

    tokens = THAI2FIT_TOKENIZER.word_tokenize(text)
    token_ids = torch.tensor(
        data.vocab.numericalize(tokens), requires_grad=False
    ).to(device)
    encoder = learn.model[0].encoder.to(device)
    # Bring the result back to the CPU and out of autograd so it can be
    # converted to a NumPy array.
    embeddings = encoder(token_ids).cpu().detach().numpy()

    if agg == "mean":
        return embeddings.mean(0)
    return embeddings.sum(0)
def merge_wgts(em_sz, wgts, itos_pre, itos_new):
    """
    Insert new vocab into an existing model's weights named `wgts`,
    using the average embedding for vocab that is not in the pretrained
    vocab.

    For every token of **itos_new**, the embedding row of the same token in
    the pretrained vocab **itos_pre** is reused when present; otherwise the
    mean of all pretrained embedding rows is used. The resulting matrix is
    stored under the keys ``"0.encoder.weight"``,
    ``"0.encoder_dp.emb.weight"`` and ``"1.decoder.weight"`` of **wgts**
    (each as an independent tensor). **wgts** is modified in place and
    also returned.

    :param int em_sz: embedding size
    :param wgts: torch model weights; must contain ``"0.encoder.weight"``
    :param list itos_pre: pretrained list of vocab
    :param list itos_new: list of new vocab

    :return: merged torch model weights (the same mapping object as
             **wgts**)

    :Example:
    ::

        from pythainlp.ulmfit import merge_wgts
        import torch

        wgts = {'0.encoder.weight': torch.randn(5,3)}
        itos_pre = ["แมว", "คน", "หนู"]
        itos_new = ["ปลา", "เต่า", "นก"]
        em_sz = 3

        merge_wgts(em_sz, wgts, itos_pre, itos_new)
        # output:
        # {'0.encoder.weight': tensor([[0.5952, 0.4453, 0.0011],
        # [0.5952, 0.4453, 0.0011],
        # [0.5952, 0.4453, 0.0011]]),
        # '0.encoder_dp.emb.weight': tensor([[0.5952, 0.4453, 0.0011],
        # [0.5952, 0.4453, 0.0011],
        # [0.5952, 0.4453, 0.0011]]),
        # '1.decoder.weight': tensor([[0.5952, 0.4453, 0.0011],
        # [0.5952, 0.4453, 0.0011],
        # [0.5952, 0.4453, 0.0011]])}
    """
    vocab_size = len(itos_new)
    # detach()/cpu() make the conversion work for tensors that require
    # grad or live on a CUDA device; a bare .numpy() raises for those.
    enc_wgts = wgts["0.encoder.weight"].detach().cpu().numpy()

    # Average weight of encoding
    row_m = enc_wgts.mean(0)

    # Token -> row index in the pretrained embedding matrix. A plain dict
    # with .get() avoids inserting an entry for every unknown token.
    stoi_pre = {tok: idx for idx, tok in enumerate(itos_pre)}

    # New embedding based on classification dataset
    new_w = np.zeros((vocab_size, em_sz), dtype=np.float32)
    for i, tok in enumerate(itos_new):
        pre_idx = stoi_pre.get(tok, -1)
        # Use pretrained embedding if present; else use the average
        new_w[i] = enc_wgts[pre_idx] if pre_idx >= 0 else row_m

    # torch.tensor() copies its input, so the three entries below do not
    # share storage with each other or with new_w.
    wgts["0.encoder.weight"] = torch.tensor(new_w)
    wgts["0.encoder_dp.emb.weight"] = torch.tensor(new_w)
    wgts["1.decoder.weight"] = torch.tensor(new_w)

    return wgts