Source code for pythainlp.corpus

# -*- coding: utf-8 -*-
import hashlib
import os
from typing import NoReturn, Union
from urllib.request import urlopen

import requests
from pythainlp.tools import get_full_data_path, get_pythainlp_path
from requests.exceptions import HTTPError
from tinydb import Query, TinyDB
from tqdm import tqdm

# Remote and local corpus databases

# Directory that get_corpus() reads corpus files from.
_CORPUS_DIRNAME = "corpus"
_CORPUS_PATH = os.path.join(get_pythainlp_path(), _CORPUS_DIRNAME)

# Remote catalog (JSON) describing the downloadable corpora.
_CORPUS_DB_URL = (
    "https://raw.githubusercontent.com/"
    + "PyThaiNLP/pythainlp-corpus/"
    + "2.1/db.json"
)

# Local TinyDB file recording which corpora have been downloaded.
_CORPUS_DB_FILENAME = "db.json"
_CORPUS_DB_PATH = get_full_data_path(_CORPUS_DB_FILENAME)

# Create a local corpus database if it does not already exist.
# Opening the database is what creates the file; close the handle right
# away so that importing this module does not leave the file open.
if not os.path.exists(_CORPUS_DB_PATH):
    TinyDB(_CORPUS_DB_PATH).close()


def corpus_path() -> str:
    """Return the directory that :func:`get_corpus` reads corpus files from."""
    return _CORPUS_PATH


def corpus_db_url() -> str:
    """Return the URL of the remote corpus catalog (``db.json``)."""
    return _CORPUS_DB_URL


def corpus_db_path() -> str:
    """Return the path of the local corpus database file."""
    return _CORPUS_DB_PATH


def get_corpus_db_detail(name: str) -> dict:
    """
    Look up a corpus record in the local corpus database.

    :param str name: corpus name
    :return: the first record whose ``name`` field equals *name*,
             or an empty dict if there is no such record
    :rtype: dict
    """
    db = TinyDB(corpus_db_path())
    try:
        res = db.search(Query().name == name)
    finally:
        # Release the database file even if the lookup raises.
        db.close()

    return res[0] if res else {}


def get_corpus(filename: str) -> frozenset:
    """
    Read a corpus file and return its lines as a frozenset.

    The available file names are listed in `this file
    <https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json>`_.

    :param string filename: filename of the corpus to be read

    :return: :mod:`frozenset` consisting of the lines in the file
    :rtype: :mod:`frozenset`

    :Example:
    ::

        from pythainlp.corpus import get_corpus

        get_corpus('negations_th.txt')
        # output:
        # frozenset({'แต่', 'ไม่'})

        get_corpus('ttc_freq.txt')
        # output:
        # frozenset({'โดยนัยนี้\\t1',
        #    'ตัวบท\\t10',
        #    'หยิบยื่น\\t3',
        #    'เอย\\t555',
        #    'ค้าน\\t69',
        #    'เหนี่ยง\\t3',
        #    'ชงฆ์\\t3',
        #     ...})
    """
    corpus_file = os.path.join(corpus_path(), filename)

    # "utf-8-sig" drops a leading byte-order mark if the file has one.
    with open(corpus_file, "r", encoding="utf-8-sig") as fh:
        return frozenset(fh.read().splitlines())
def get_corpus_path(name: str) -> Union[str, None]:
    """
    Get corpus path.

    The corpus is looked up by name in the local corpus database. If it is
    registered there but its file is missing from disk, :func:`download`
    is called for it before the path is returned.

    :param string name: corpus name

    :return: path to the corpus file, or **None** if the corpus is not
             registered in the local corpus database
    :rtype: str

    :Example:

    If the corpus already exists::

        from pythainlp.corpus import get_corpus_path

        print(get_corpus_path('ttc'))
        # output: /root/pythainlp-data/ttc_freq.txt

    If the corpus has not been downloaded yet::

        from pythainlp.corpus import download, get_corpus_path

        print(get_corpus_path('wiki_lm_lstm'))
        # output: None

        download('wiki_lm_lstm')
        # output:
        # Download: wiki_lm_lstm
        # wiki_lm_lstm 0.32
        # thwiki_lm.pth?dl=1: 1.05GB [00:25, 41.5MB/s]
        # /root/pythainlp-data/thwiki_model_lstm.pth

        print(get_corpus_path('wiki_lm_lstm'))
        # output: /root/pythainlp-data/thwiki_model_lstm.pth
    """
    db = TinyDB(corpus_db_path())
    try:
        # Query once and reuse the result.
        records = db.search(Query().name == name)
    finally:
        # Close before any download: download() opens its own handle on the
        # same database file, and it may raise (e.g. on a hash mismatch).
        db.close()

    if not records:
        return None

    path = get_full_data_path(records[0]["file"])
    if not os.path.exists(path):
        # Registered locally but the file is gone: try to fetch it again.
        download(name)

    return path
def _download(url: str, dst: str) -> int:
    """
    Download a file into the data directory, showing a progress bar.

    :param str url: URL of the file to download
    :param str dst: destination file name; resolved to a full path with
                    ``get_full_data_path``
    :return: size in bytes as announced by the ``Content-Length`` header,
             or -1 if the server does not send that header
    :rtype: int
    """
    _CHUNK_SIZE = 1024 * 64

    # A single streamed request provides both the size header and the body,
    # so no separate (and never closed) urlopen() call is needed.
    with requests.get(url, stream=True) as r:
        file_size = int(r.headers.get("Content-Length", -1))
        # tqdm accepts total=None when the size is unknown.
        total = file_size if file_size >= 0 else None

        with open(get_full_data_path(dst), "wb") as f, tqdm(
            total=total
        ) as pbar:
            for chunk in r.iter_content(chunk_size=_CHUNK_SIZE):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))

    return file_size


def _check_hash(dst: str, md5: str) -> None:
    """
    Verify the MD5 checksum of a downloaded file.

    Nothing is checked when *md5* is empty or ``"-"``.

    :param str dst: file name; resolved to a full path with
                    ``get_full_data_path``
    :param str md5: expected MD5 hex digest of the file
    :raises Exception: if the digest of the file differs from *md5*
    """
    if not md5 or md5 == "-":
        return

    _BLOCK_SIZE = 1024 * 1024

    # Hash incrementally: downloaded models can be too large to read into
    # memory in one piece.
    digest = hashlib.md5()
    with open(get_full_data_path(dst), "rb") as f:
        for block in iter(lambda: f.read(_BLOCK_SIZE), b""):
            digest.update(block)

    if md5 != digest.hexdigest():
        raise Exception("Hash does not match expected.")
def download(name: str, force: bool = False) -> None:
    """
    Download corpus.

    The available corpus names can be seen in this file:
    https://github.com/PyThaiNLP/pythainlp-corpus/blob/master/db.json

    Progress and errors are reported with ``print``. If the remote catalog
    cannot be fetched, the function prints the reason and returns without
    changing anything.

    :param string name: corpus name
    :param bool force: download even if the corpus is already registered
                       in the local corpus database
    :raises Exception: if the downloaded file does not match the MD5
                       checksum given in the catalog

    :Example:
    ::

        from pythainlp.corpus import download

        download('wiki_lm_lstm', force=True)
        # output:
        # Corpus: wiki_lm_lstm
        # - Downloading: wiki_lm_lstm 0.1
        # thwiki_lm.pth:  26%|██▌       | 114k/434k [00:00<00:00, 690kB/s]

    By default, downloaded corpus and model will be saved in
    ``$HOME/pythainlp-data/``
    (e.g. ``/Users/bact/pythainlp-data/wiki_lm_lstm.pth``).
    """
    try:
        response = requests.get(corpus_db_url())
        # requests.get() does not raise on 4xx/5xx by itself; without this
        # call the HTTPError handler below could never run.
        response.raise_for_status()
    except HTTPError as http_err:
        print(f"Cannot download corpus data from: {corpus_db_url()}")
        print(f"HTTP error occurred: {http_err}")
        return
    except Exception as err:
        print(f"Cannot download corpus data from: {corpus_db_url()}")
        print(f"Non-HTTP error occurred: {err}")
        return

    corpus_data = response.json()

    if name not in corpus_data:
        print("Corpus not found:", name)
        return

    corpus = corpus_data[name]
    print("Corpus:", name)

    local_db = TinyDB(corpus_db_path())
    try:
        query = Query()
        found = local_db.search(query.name == name)

        if force or not found:
            # Not in the local database yet, or a re-download was requested
            print(f"- Downloading: {name} {corpus['version']}")
            _download(corpus["download"], corpus["file_name"])
            _check_hash(corpus["file_name"], corpus["md5"])

            if found:
                # Refresh the file name together with the version so the
                # record keeps pointing at the file that was just written.
                local_db.update(
                    {
                        "version": corpus["version"],
                        "file": corpus["file_name"],
                    },
                    query.name == name,
                )
            else:
                local_db.insert(
                    {
                        "name": name,
                        "version": corpus["version"],
                        "file": corpus["file_name"],
                    }
                )
        else:
            # Compare against the record of *this* corpus. (Joining two
            # TinyDB queries with Python's ``and`` keeps only the second
            # one, which would match any corpus with the same version.)
            current_ver = found[0].get("version")
            if current_ver == corpus["version"]:
                # Already has the same version
                print("- Already up to date.")
            else:
                # Has the corpus but different version
                print(f"- Existing version: {current_ver}")
                print(f"- New version available: {corpus['version']}")
                print("- Use download(data_name, force=True) to update")
    finally:
        # Always release the database file, including when the download
        # or the hash check raises.
        local_db.close()
def remove(name: str) -> bool:
    """
    Remove corpus.

    Deletes the corpus file (if it is still on disk) and removes the
    corpus record from the local corpus database.

    :param string name: corpus name
    :return: **True** if the corpus is found and successfully removed.
             Otherwise, it returns **False**.
    :rtype: bool

    :Example:
    ::

        from pythainlp.corpus import remove, get_corpus_path, get_corpus

        print(remove('ttc'))
        # output: True

        print(get_corpus_path('ttc'))
        # output: None

        get_corpus('ttc')
        # output:
        # FileNotFoundError: [Errno 2] No such file or directory:
        # '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc'
    """
    db = TinyDB(corpus_db_path())
    try:
        query = Query()
        data = db.search(query.name == name)
        if not data:
            return False

        # Resolve the path from the record itself. Going through
        # get_corpus_path() would download a missing file only to delete it.
        path = get_full_data_path(data[0]["file"])
        try:
            os.remove(path)
        except FileNotFoundError:
            # File already gone; the stale record still has to be removed.
            pass

        db.remove(query.name == name)
        return True
    finally:
        db.close()
# NOTE(review): this import sits at the bottom of the module, after the
# functions above are defined -- presumably because pythainlp.corpus.common
# imports from this module; confirm before moving it to the top.
from pythainlp.corpus.common import (
    countries,
    provinces,
    thai_female_names,
    thai_male_names,
    thai_negations,
    thai_stopwords,
    thai_syllables,
    thai_words,
)

# Names exported by ``from pythainlp.corpus import *``.
__all__ = [
    "corpus_path",
    "corpus_db_path",
    "corpus_db_url",
    "countries",
    "download",
    "get_corpus",
    "get_corpus_path",
    "provinces",
    "remove",
    "thai_female_names",
    "thai_male_names",
    "thai_negations",
    "thai_stopwords",
    "thai_syllables",
    "thai_words",
    "get_corpus_db_detail",
]