# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Thai unigram word frequencies from the OSCAR corpus (words tokenized with ICU)
Credit: Korakot Chaovavanich
https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/
"""
__all__ = ["word_freqs", "unigram_word_freqs"]
from collections import defaultdict
from typing import List, Tuple
from pythainlp.corpus import get_corpus_path
_FILENAME = "oscar_icu"
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from OSCAR Corpus (icu word tokenize)

    The corpus file located via ``get_corpus_path(_FILENAME)`` is read as
    comma-separated lines. The first line is always skipped (presumably a
    header row; confirm against the data file).

    Lines with fewer than two comma-separated fields are ignored, and lines
    whose first field contains a double quote are skipped. The token made
    of a single space is reported as ``"<s/>"``.

    :return: (word, frequency) pairs in file order
    :rtype: List[Tuple[str, int]]
    :raises ValueError: if the second field of an accepted line is not an
        integer
    """
    freqs = []
    path = get_corpus_path(_FILENAME)
    with open(path, "r", encoding="utf-8-sig") as f:
        # Skip the first line; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            # Detect the single-space token on the raw line: strip() below
            # removes leading whitespace, so the first field itself can
            # never compare equal to " ".
            is_space_token = line.startswith(" ,")
            fields = line.strip().split(",")
            if len(fields) < 2:
                continue
            if is_space_token:
                freqs.append(("<s/>", int(fields[1])))
            elif '"' not in fields[0]:
                freqs.append((fields[0], int(fields[1])))
    return freqs
def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from OSCAR Corpus (icu word tokenize)

    The corpus file located via ``get_corpus_path(_FILENAME)`` is read as
    comma-separated lines. The first line is always skipped (presumably a
    header row; confirm against the data file).

    Lines with fewer than two comma-separated fields are ignored, and lines
    whose first field contains a double quote are skipped. The token made
    of a single space is stored under the key ``"<s/>"``. If a word occurs
    on several lines, the last occurrence wins.

    :return: mapping of word to frequency; missing words default to ``0``
    :rtype: defaultdict
    :raises ValueError: if the last field of an accepted line is not an
        integer
    """
    path = get_corpus_path(_FILENAME)
    freqs = defaultdict(int)
    with open(path, "r", encoding="utf-8-sig") as fh:
        # Skip the first line; the default keeps an empty file from raising.
        next(fh, None)
        for line in fh:
            # Detect the single-space token on the raw line: strip() below
            # removes leading whitespace, so the first field itself can
            # never compare equal to " ".
            is_space_token = line.startswith(" ,")
            fields = line.strip().split(",")
            # Blank lines or lines without a comma carry no count; without
            # this guard int() would be applied to the word itself.
            if len(fields) < 2:
                continue
            # The count is taken from the last field, as this function
            # always has.
            if is_space_token:
                freqs["<s/>"] = int(fields[-1])
            elif '"' not in fields[0]:
                freqs[fields[0]] = int(fields[-1])
    return freqs