# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CRFCut - Thai sentence segmenter.
Thai sentence segmentation using conditional random field,
default model trained on TED dataset
Performance:
- ORCHID - space-correct accuracy 87% vs 95% state-of-the-art
(Zhou et al, 2016; https://www.aclweb.org/anthology/C16-1031.pdf)
- TED dataset - space-correct accuracy 82%
See development notebooks at https://github.com/vistec-AI/ted_crawler;
POS features are not used due to unreliable POS tagging available
"""
import os
from typing import List, Union

import pycrfsuite

from pythainlp.corpus import corpus_path
from pythainlp.tokenize import word_tokenize
_ENDERS = {
    # sentence-ending particles
    "ครับ",
    "ค่ะ",
    "คะ",
    "นะคะ",
    "นะ",
    "จ้ะ",
    "จ้า",
    "จ๋า",
    "ฮะ",
    # sentence-ending words
    "ๆ",
    "ได้",
    "แล้ว",
    "ด้วย",
    "เลย",
    "มาก",
    "น้อย",
    "กัน",
    "เช่นกัน",
    "เท่านั้น",
    "อยู่",
    "ลง",
    "ขึ้น",
    "มา",
    "ไป",
    "ไว้",
    "เอง",
    "อีก",
    "ใหม่",
    "จริงๆ",
    "บ้าง",
    "หมด",
    "ทีเดียว",
    "เดียว",
    # demonstratives
    "นั้น",
    "นี้",
    "เหล่านี้",
    "เหล่านั้น",
    # question words
    "อย่างไร",
    "ยังไง",
    "หรือไม่",
    "มั้ย",
    "ไหน",
    "ไหม",
    "อะไร",
    "ทำไม",
    "เมื่อไหร่",
    "เมื่อไร",
}
_STARTERS = {
    # pronouns
    "ผม",
    "ฉัน",
    "ดิฉัน",
    "ชั้น",
    "คุณ",
    "มัน",
    "เขา",
    "เค้า",
    "เธอ",
    "เรา",
    "พวกเรา",
    "พวกเขา",
    "กู",
    "มึง",
    "แก",
    "ข้าพเจ้า",
    # connectors
    "และ",
    "หรือ",
    "แต่",
    "เมื่อ",
    "ถ้า",
    "ใน",
    "ด้วย",
    "เพราะ",
    "เนื่องจาก",
    "ซึ่ง",
    "ไม่",
    "ตอนนี้",
    "ทีนี้",
    "ดังนั้น",
    "เพราะฉะนั้น",
    "ฉะนั้น",
    "ตั้งแต่",
    "ในที่สุด",
    "ก็",
    "กับ",
    "แก่",
    "ต่อ",
    # demonstratives
    "นั้น",
    "นี้",
    "เหล่านี้",
    "เหล่านั้น",
}
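
# Load the pre-trained CRF model shipped with the PyThaiNLP corpus data.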
_CRFCUT_DATA_FILENAME = "sentenceseg_crfcut.model"
_tagger = pycrfsuite.Tagger()
_tagger.open(os.path.join(corpus_path(), _CRFCUT_DATA_FILENAME))
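

# segment() below calls extract_features(), which is defined earlier in the
# full module but not shown in this excerpt. The sketch below is a
# reconstruction for illustration: it pads the token list, then emits
# windowed n-gram features plus "ender"/"starter" lexicon flags in the
# list-of-lists-of-strings format that pycrfsuite expects. The shipped model
# was trained on the original feature template, so treat any detail here
# that differs from the PyThaiNLP source as an assumption.
def extract_features(
    doc: List[str], window: int = 2, max_n_gram: int = 3
) -> List[List[str]]:
    # Pad both ends so every real token has a full context window.
    doc = ["xxpad"] * window + doc + ["xxpad"] * window

    # Lexicon flags for every (padded) token.
    doc_ender = ["ender" if w in _ENDERS else "normal" for w in doc]
    doc_starter = ["starter" if w in _STARTERS else "normal" for w in doc]

    doc_features = []
    for i in range(window, len(doc) - window):
        word_features = ["bias"]
        # n-grams of tokens and lexicon flags inside the window around token i.
        for n_gram in range(1, min(max_n_gram + 1, 2 + window * 2)):
            for j in range(i - window, i + window + 2 - n_gram):
                position = f"{n_gram}_{j - i}_{j - i + n_gram}"
                word_ = "|".join(doc[j : j + n_gram])
                ender_ = "|".join(doc_ender[j : j + n_gram])
                starter_ = "|".join(doc_starter[j : j + n_gram])
                word_features.append(f"word_{position}={word_}")
                word_features.append(f"ender_{position}={ender_}")
                word_features.append(f"starter_{position}={starter_}")
        doc_features.append(word_features)

    return doc_features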


def segment(text: Union[str, List[str]]) -> List[str]:
    """
    CRF-based sentence segmentation.

    :param text: text string, or list of tokens, to be segmented into sentences
    :return: list of sentences, segmented from the text
    """
    if isinstance(text, str):
        toks = word_tokenize(text)
    else:
        toks = text

    feat = extract_features(toks)
    labs = _tagger.tag(feat)
    labs[-1] = "E"  # make sure the last sentence gets cut

    sentences = []
    sentence = ""
    for i, w in enumerate(toks):
        sentence = sentence + w
        if labs[i] == "E":
            sentences.append(sentence)
            sentence = ""

    return sentences
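

# A minimal usage sketch (not part of the original module); the exact cut
# points depend on the trained model, so the output is illustrative only.
if __name__ == "__main__":
    sample = "ผมไปเที่ยวที่เชียงใหม่มาครับอากาศดีมากเลย"  # hypothetical example input
    print(segment(sample))
    # segment() also accepts a pre-tokenized list of words:
    print(segment(word_tokenize(sample)))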