# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
import gzip
import json
from typing import TYPE_CHECKING, Any, Optional
if TYPE_CHECKING:
from numpy.typing import NDArray
[docs]
class GzipModel:
"""This class is a re-implementation of
“Low-Resource” Text Classification: A Parameter-Free Classification Method
with Compressors (Jiang et al., Findings 2023)
:param Optional[list] training_data: list [(text_sample,label)].
Default is None.
:param str model_path: Path for loading model (if you saved the model).
Default is empty string.
"""
cx2_list: list[int]
training_data: "NDArray[Any]"
[docs]
def __init__(
self,
training_data: Optional[list[tuple[str, str]]] = None,
model_path: str = "",
) -> None:
import numpy as np
if model_path:
self.load(model_path)
else:
self.training_data = np.array(training_data)
self.cx2_list = self.train()
[docs]
def train(self) -> list[int]:
temp_list = []
for i in range(len(self.training_data)):
temp_list.append(
len(gzip.compress(self.training_data[i][0].encode("utf-8")))
)
return temp_list
[docs]
def predict(self, x1: str, k: int = 1) -> str:
"""Predict the label for the given text.
:param str x1: the text that we want to predict label for
:param int k: number of nearest neighbors to consider (default: 1)
:return: predicted label
:rtype: str
:Example:
::
from pythainlp.classify import GzipModel
training_data = [
("รายละเอียดตามนี้เลยค่าา ^^", "Neutral"),
("กลัวพวกมึงหาย อดกินบาบิก้อน", "Neutral"),
("บริการแย่มากก เป็นหมอได้ไง😤", "Negative"),
("ขับรถแย่มาก", "Negative"),
("ดีนะครับ", "Positive"),
("ลองแล้วรสนี้อร่อย... ชอบๆ", "Positive"),
("ฉันรู้สึกโกรธ เวลามือถือแบตหมด", "Negative"),
("เธอภูมิใจที่ได้ทำสิ่งดี ๆ และดีใจกับเด็ก ๆ", "Positive"),
("นี่เป็นบทความหนึ่ง", "Neutral"),
]
model = GzipModel(training_data)
print(model.predict("ฉันดีใจ", k=1))
# output: Positive
"""
import numpy as np
cx1 = len(gzip.compress(x1.encode("utf-8")))
disance_from_x1 = []
for i in range(len(self.cx2_list)):
x2 = self.training_data[i][0]
cx2 = self.cx2_list[i]
x1x2 = "".join([x1, x2])
cx1x2 = len(gzip.compress(x1x2.encode("utf-8")))
# normalized compression distance
ncd = (cx1x2 - min(cx1, cx2)) / max(cx1, cx2)
disance_from_x1.append(ncd)
sorted_idx = np.argsort(np.array(disance_from_x1))
top_k_class = self.training_data[sorted_idx[:k], 1]
_, counts = np.unique(top_k_class, return_counts=True)
predict_class = str(top_k_class[counts.argmax()])
return predict_class
[docs]
def save(self, path: str) -> None:
""":param str path: path to save model"""
with open(path, "w", encoding="utf-8") as f:
json.dump(
{
"training_data": self.training_data.tolist(),
"cx2_list": self.cx2_list,
},
f,
ensure_ascii=False,
)
[docs]
def load(self, path: str) -> None:
""":param str path: path to load model"""
import numpy as np
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
self.cx2_list = data["cx2_list"]
self.training_data = np.array(data["training_data"])