Source code for pythainlp.wsd.core

# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
from typing import List, Tuple, Union

from pythainlp.tokenize import Tokenizer
from pythainlp.util.trie import Trie
from pythainlp.corpus import thai_wsd_dict

_wsd_dict = thai_wsd_dict()
_mean_all = {}

for i, j in zip(_wsd_dict["word"], _wsd_dict["meaning"]):
    _mean_all[i] = j

_all_word = set(list(_mean_all.keys()))
_TRIE = Trie(list(_all_word))
_word_cut = Tokenizer(custom_dict=_TRIE)

_MODEL = None


class _SentenceTransformersModel:
    def __init__(
        self,
        model: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
        device: str = "cpu",
    ):
        from sentence_transformers import SentenceTransformer

        self.device = device
        self.model_name = model
        self.model = SentenceTransformer(self.model_name, device=self.device)

    def change_device(self, device: str):
        from sentence_transformers import SentenceTransformer

        self.device = device
        self.model = SentenceTransformer(self.model_name, device=self.device)

    def get_score(self, sentences1: str, sentences2: str) -> float:
        from sentence_transformers import util

        embedding_1 = self.model.encode(sentences1, convert_to_tensor=True)
        embedding_2 = self.model.encode(sentences2, convert_to_tensor=True)
        return 1 - util.pytorch_cos_sim(embedding_1, embedding_2)[0][0].item()


[docs]def get_sense(
    sentence: str,
    word: str,
    device: str = "cpu",
    custom_dict: dict = dict(),
    custom_tokenizer: Tokenizer = _word_cut,
) -> List[Tuple[str, float]]:
    """
    Get word sense from the sentence.
    This function will get definition and distance from context in sentence.
    
    :param str sentence: Thai sentence
    :param str word: Thai word
    :param str device: device for running model on.
    :param dict custom_dict: Thai dictionary {"word":["definition",..]}
    :param Tokenizer custom_tokenizer: Tokenizer used to tokenize words in \
        sentence.
    :return: a list of definitions and distances (1 - cos_sim) or \
        an empty list (if word is not in the dictionary)
    :rtype: List[Tuple[str, float]]
    
    We get the ideas from `Context-Aware Semantic Similarity Measurement for \
        Unsupervised Word Sense Disambiguation \
        <https://arxiv.org/abs/2305.03520>`_ to build get_sense function.

    Use Thai dictionary from wiktionary.
    See `thai_dict <https://pythainlp.github.io/pythainlp-corpus/thai_dict.html>`_.
    
    Use sentence transformers model from \
        `sentence-transformers/paraphrase-multilingual-mpnet-base-v2 \
        <https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2>`_ \
        for unsupervised word sense disambiguation.
    
    :Example:
    ::

        from pythainlp.wsd import get_sense
        print(get_sense("เขากำลังอบขนมคุกกี้","คุกกี้"))
        # output:
        # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',
        #   0.0974416732788086),
        #  ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',
        #   0.09319090843200684)]

        print(get_sense("เว็บนี้ต้องการคุกกี้ในการทำงาน","คุกกี้"))
        # output:
        # [('โปรแกรมคอมพิวเตอร์ใช้ในทางอินเทอร์เน็ตสำหรับเก็บข้อมูลของผู้ใช้งาน',
        #   0.1005704402923584),
        #  ('ชื่อขนมชนิดหนึ่งจำพวกขนมเค้ก แต่ทำเป็นชิ้นเล็ก ๆ แบน ๆ แล้วอบให้กรอบ',
        #   0.12473666667938232)]
    """
    global _MODEL
    if not custom_dict:
        custom_dict = _mean_all

    w = custom_tokenizer.word_tokenize(sentence)
    if word not in set(custom_dict.keys()) or word not in sentence:
        return []

    if not _MODEL:
        _MODEL = _SentenceTransformersModel(device=device)
    if _MODEL.device != device:
        _MODEL.change_device(device=device)

    temp_mean = custom_dict[word]
    temp = []
    for i in temp_mean:
        _temp_2 = []
        for j in w:
            if j == word:
                j = (
                    word
                    + f" ({word} ความหมาย '"
                    + i.replace("(", "").replace(")", "")
                    + "') "
                )
            _temp_2.append(j)
        temp.append((i, _MODEL.get_score(sentence, "".join(_temp_2))))

    return temp