# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
Thai unigram word frequency from OSCAR Corpus (words tokenized using ICU)
Credit: Korakot Chaovavanich
https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/
"""
__all__ = ["word_freqs", "unigram_word_freqs"]
from collections import defaultdict
from typing import List, Tuple
from pythainlp.corpus import get_corpus_path
_OSCAR_FILENAME = "oscar_icu"
[docs]
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from OSCAR Corpus (words tokenized using ICU)

    The corpus file is read as comma-separated text. Its first line is
    skipped (presumably a header row), lines with fewer than two fields
    are ignored, and entries whose word field contains a double quote
    are dropped. The single-space token is reported as ``"<s/>"``.

    :return: (word, frequency) pairs in file order; an empty list if the
        corpus path cannot be resolved
    :rtype: List[Tuple[str, int]]
    :raises ValueError: if a frequency field is not a valid integer
    """
    freqs: List[Tuple[str, int]] = []
    path = get_corpus_path(_OSCAR_FILENAME)
    if not path:
        return freqs

    with open(str(path), "r", encoding="utf-8-sig") as f:
        # Skip the first line; the default keeps an empty file from raising.
        next(f, None)
        # Iterate the file lazily instead of loading every line up front.
        for line in f:
            fields = line.strip().split(",")
            if len(fields) < 2:
                continue
            # strip() eats a leading space, so the space token must be
            # detected on the raw line before it collapses to "".
            if line.split(",", 1)[0] == " ":
                freqs.append(("<s/>", int(fields[1])))
            elif '"' not in fields[0]:
                freqs.append((fields[0], int(fields[1])))

    return freqs
[docs]
def unigram_word_freqs() -> dict[str, int]:
    """
    Get unigram word frequency from OSCAR Corpus (words tokenized using ICU)

    The corpus file is read as comma-separated text. Its first line is
    skipped (presumably a header row), lines with fewer than two fields
    are ignored, and entries whose word field contains a double quote
    are dropped. The single-space token is stored under ``"<s/>"``.
    If a word occurs on several lines, the last one wins.

    :return: mapping of word to frequency (a ``defaultdict(int)``); empty
        if the corpus path cannot be resolved
    :rtype: dict[str, int]
    :raises ValueError: if a frequency field is not a valid integer
    """
    freqs: dict[str, int] = defaultdict(int)
    path = get_corpus_path(_OSCAR_FILENAME)
    if not path:
        return freqs

    with open(str(path), "r", encoding="utf-8-sig") as fh:
        # Skip the first line; the default keeps an empty file from raising.
        next(fh, None)
        # Iterate the file lazily instead of loading every line up front.
        for line in fh:
            fields = line.strip().split(",")
            # A line without a separator has no frequency field; without
            # this guard the word itself would be parsed as the count.
            if len(fields) < 2:
                continue
            # strip() eats a leading space, so the space token must be
            # detected on the raw line before it collapses to "".
            if line.split(",", 1)[0] == " ":
                freqs["<s/>"] = int(fields[-1])
            elif '"' not in fields[0]:
                freqs[fields[0]] = int(fields[-1])

    return freqs