Source code for pythainlp.corpus

# -*- coding: utf-8 -*-

import os
from typing import NoReturn, Union
from urllib.request import urlopen

import requests
from pythainlp.tools import get_full_data_path, get_pythainlp_path
from tinydb import Query, TinyDB
from tqdm import tqdm

# Remote and local corpus databases

# Directory (under the PyThaiNLP data path) that holds the corpus files
# read by get_corpus().
_CORPUS_DIRNAME = "corpus"
_CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)

# Remote catalog; download() fetches it and parses it as JSON.
_CORPUS_DB_URL = (
    "https://raw.githubusercontent.com/"
    + "PyThaiNLP/pythainlp-corpus/"
    + "master/db.json"
)

# Local TinyDB file that records which corpora have been downloaded.
_CORPUS_DB_FILENAME = "db.json"
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)

# Make sure the local database file exists.  The TinyDB instance is only
# needed for that side effect, so close it immediately instead of leaving
# the underlying file handle open for the lifetime of the process.
if not os.path.exists(_CORPUS_DB_PATH):
    TinyDB(_CORPUS_DB_PATH).close()


def corpus_path() -> str:
    """
    Get the corpus directory path

    :return: value of the module-level ``_CORPUS_PATH`` constant
    :rtype: str
    """
    return _CORPUS_PATH


def corpus_db_url() -> str:
    """
    Get the URL of the remote corpus database

    :return: value of the module-level ``_CORPUS_DB_URL`` constant
    :rtype: str
    """
    return _CORPUS_DB_URL


def corpus_db_path() -> str:
    """
    Get the path of the local corpus database file

    :return: value of the module-level ``_CORPUS_DB_PATH`` constant
    :rtype: str
    """
    return _CORPUS_DB_PATH


def get_corpus(filename: str) -> frozenset:
    """
    Read a corpus file and return its lines as a frozenset

    :param str filename: name of the file inside the corpus directory
    :return: the distinct lines of the file, without line terminators
    :rtype: frozenset
    """
    corpus_file = os.path.join(corpus_path(), filename)

    # "utf-8-sig" drops a leading byte-order mark if the file has one
    with open(corpus_file, "r", encoding="utf-8-sig") as fh:
        return frozenset(fh.read().splitlines())
def get_corpus_path(name: str) -> Union[str, None]:
    """
    Get the local path of a downloaded corpus

    The corpus is looked up by name in the local corpus database. If it is
    registered but its file is missing on disk, :func:`download` is called
    to fetch it before the path is returned.

    :param str name: corpus name
    :return: path to the corpus file, or None if the name is not registered
             in the local database
    :rtype: str or None
    """
    db = TinyDB(corpus_db_path())
    try:
        records = db.search(Query().name == name)
    finally:
        # Always release the database, including when nothing matched;
        # download() below opens its own handle on the same file.
        db.close()

    if not records:
        return None

    path = get_full_data_path(records[0]["file"])
    if not os.path.exists(path):
        download(name)

    return path
def _download(url: str, dst: str) -> int:
    """
    Download a file into the PyThaiNLP data directory

    The whole file is always fetched. Resuming is deliberately not
    attempted: an existing file at the destination may be an older version
    of the corpus rather than a partial download, and appending to it
    would corrupt the result.

    The data is streamed into a temporary ``.part`` file that replaces the
    destination only after the transfer has completed, so a failed download
    does not clobber a previously installed file.

    :param str url: URL of the file to download
    :param str dst: file name, resolved with ``get_full_data_path``
    :return: size in bytes reported by the server's Content-Length header,
             or -1 if the header is absent
    :rtype: int
    :raises requests.HTTPError: if the server answers with an error status
    """
    # Ask for the size first (used for the progress bar and the return
    # value) and close the response instead of leaking the connection.
    with urlopen(url) as response:
        file_size = int(response.info().get("Content-Length", -1))

    # assumes get_full_data_path returns a str path -- TODO confirm
    target = get_full_data_path(dst)
    partial = target + ".part"

    pbar = tqdm(
        # tqdm shows an open-ended counter when the total is unknown
        total=file_size if file_size >= 0 else None,
        unit="B",
        unit_scale=True,
        desc=url.split("/")[-1],
    )
    try:
        with requests.get(url, stream=True) as req:
            # Do not store an HTML error page as if it were the corpus
            req.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
                        # The last chunk is usually shorter than 1024 bytes
                        pbar.update(len(chunk))
        os.replace(partial, target)
    finally:
        pbar.close()
        # Only still present if the transfer failed part-way
        if os.path.exists(partial):
            os.remove(partial)

    return file_size
def download(name: str, force: bool = False) -> None:
    """
    Download a corpus and register it in the local corpus database

    The remote catalog is fetched first. If ``name`` is not listed there,
    nothing happens. If the corpus is not installed yet, it is downloaded
    and recorded. If it is already installed, the user is asked to confirm
    an update (or a re-download when the installed version is current)
    unless ``force`` is set.

    :param str name: corpus name
    :param bool force: skip the interactive confirmation when the corpus
                       is already installed
    """
    catalog = requests.get(corpus_db_url()).json()
    if name not in catalog:
        return

    entry = catalog[name]
    print("Download:", name)

    db = TinyDB(corpus_db_path())
    try:
        corpus = Query()
        by_name = corpus.name == name
        installed = db.search(by_name)

        if not installed:
            print(name + " " + entry["version"])
            _download(entry["download"], entry["file_name"])
            db.insert(
                {
                    "name": name,
                    "version": entry["version"],
                    "file": entry["file_name"],
                }
            )
            return

        # TinyDB queries must be combined with "&"; Python's "and" would
        # evaluate to the right-hand query alone and ignore the name.
        up_to_date = db.search(
            by_name & (corpus.version == entry["version"])
        )
        if not up_to_date:
            print("Alert: New version is ready to be updated.")
        else:
            print("Redownload")
        print(
            "from "
            + name
            + " "
            + installed[0]["version"]
            + " update to "
            + name
            + " "
            + entry["version"]
        )

        yes_no = "y"
        if not force:
            yes_no = str(input("yes or no (y / n) : ")).lower()

        if yes_no == "y":
            _download(entry["download"], entry["file_name"])
            # Keep the stored file name in step with the catalog so that
            # get_corpus_path() resolves to the file just written.
            db.update(
                {"version": entry["version"], "file": entry["file_name"]},
                by_name,
            )
    finally:
        # Release the database even if the download or the prompt fails
        db.close()
def remove(name: str) -> bool:
    """
    Remove a corpus file and its entry in the local corpus database

    The file path is taken from the database record itself, so a corpus
    whose file is already missing is simply unregistered; it is not
    downloaded again just to be deleted.

    :param str name: corpus name
    :return: True if the corpus was registered and has been removed,
             False if no corpus with that name is registered
    :rtype: bool
    """
    db = TinyDB(corpus_db_path())
    try:
        by_name = Query().name == name
        records = db.search(by_name)
        if not records:
            return False

        path = get_full_data_path(records[0]["file"])
        if os.path.exists(path):
            os.remove(path)

        db.remove(by_name)
        return True
    finally:
        db.close()
# NOTE(review): this import sits at the end of the module, presumably
# because pythainlp.corpus.common relies on names defined above --
# confirm before moving it to the top of the file.
from pythainlp.corpus.common import (
    countries,
    provinces,
    thai_negations,
    thai_stopwords,
    thai_syllables,
    thai_words,
)

# Names exported by ``from pythainlp.corpus import *``
__all__ = [
    "corpus_path",
    "corpus_db_path",
    "corpus_db_url",
    "countries",
    "download",
    "get_corpus",
    "get_corpus_path",
    "provinces",
    "remove",
    "thai_negations",
    "thai_stopwords",
    "thai_syllables",
    "thai_words",
]