# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2024 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0
"""
CRFCut - Thai sentence segmenter.
Thai sentence segmentation using conditional random field,
with default model trained on TED dataset
Performance:
- ORCHID - space-correct accuracy 87% vs 95% state-of-the-art
(Zhou et al, 2016; https://www.aclweb.org/anthology/C16-1031.pdf)
- TED dataset - space-correct accuracy 82%
See development notebooks at https://github.com/vistec-AI/ted_crawler;
POS features are not used due to unreliable POS tagging available
"""

import os
from typing import List

import pycrfsuite

from pythainlp.corpus import corpus_path
from pythainlp.tokenize import word_tokenize

_ENDERS = {
# ending honorifics
"ครับ",
"ค่ะ",
"คะ",
"นะคะ",
"นะ",
"จ้ะ",
"จ้า",
"จ๋า",
"ฮะ",
# enders
"ๆ",
"ได้",
"แล้ว",
"ด้วย",
"เลย",
"มาก",
"น้อย",
"กัน",
"เช่นกัน",
"เท่านั้น",
"อยู่",
"ลง",
"ขึ้น",
"มา",
"ไป",
"ไว้",
"เอง",
"อีก",
"ใหม่",
"จริงๆ",
"บ้าง",
"หมด",
"ทีเดียว",
"เดียว",
# demonstratives
"นั้น",
"นี้",
"เหล่านี้",
"เหล่านั้น",
# questions
"อย่างไร",
"ยังไง",
"หรือไม่",
"มั้ย",
"ไหน",
"ไหม",
"อะไร",
"ทำไม",
"เมื่อไหร่",
"เมื่อไร",
}

_STARTERS = {
# pronouns
"ผม",
"ฉัน",
"ดิฉัน",
"ชั้น",
"คุณ",
"มัน",
"เขา",
"เค้า",
"เธอ",
"เรา",
"พวกเรา",
"พวกเขา",
"กู",
"มึง",
"แก",
"ข้าพเจ้า",
# connectors
"และ",
"หรือ",
"แต่",
"เมื่อ",
"ถ้า",
"ใน",
"ด้วย",
"เพราะ",
"เนื่องจาก",
"ซึ่ง",
"ไม่",
"ตอนนี้",
"ทีนี้",
"ดังนั้น",
"เพราะฉะนั้น",
"ฉะนั้น",
"ตั้งแต่",
"ในที่สุด",
"ก็",
"กับ",
"แก่",
"ต่อ",
# demonstratives
"นั้น",
"นี้",
"เหล่านี้",
"เหล่านั้น",
}

_CRFCUT_DATA_FILENAME = "sentenceseg_crfcut.model"
_tagger = pycrfsuite.Tagger()
_tagger.open(os.path.join(corpus_path(), _CRFCUT_DATA_FILENAME))
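

# NOTE: segment() below calls extract_features(), which is defined
# earlier in the full module but is not shown in this excerpt. The
# following is a minimal illustrative sketch, assuming a simple
# windowed feature scheme (surrounding tokens plus _ENDERS/_STARTERS
# membership). The real extractor must produce the exact feature
# strings that the bundled CRF model was trained on, so this sketch
# is not a drop-in replacement.
def extract_features(doc: List[str], window: int = 2) -> List[List[str]]:
    """Sketch: build one feature list per token for pycrfsuite.Tagger.tag()."""
    # Pad so that every token has a full window of neighbors
    padded = ["xxpad"] * window + doc + ["xxpad"] * window
    features = []
    for i in range(window, len(padded) - window):
        word_features = []
        for j in range(i - window, i + window + 1):
            rel = j - i  # relative position within the window
            tok = padded[j]
            word_features.append("word_{}={}".format(rel, tok))
            word_features.append(
                "ender_{}={}".format(
                    rel, "ender" if tok in _ENDERS else "normal"
                )
            )
            word_features.append(
                "starter_{}={}".format(
                    rel, "starter" if tok in _STARTERS else "normal"
                )
            )
        features.append(word_features)
    return features
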
def segment(text: str) -> List[str]:
    """
    CRF-based sentence segmentation.

    :param str text: text to be split into sentences; a pre-tokenized
                     list of words is also accepted
    :return: list of sentences, split from the input text
    """
    if isinstance(text, str):
        toks = word_tokenize(text)
    else:
        toks = text  # already a list of word tokens

    if not toks:  # nothing to segment
        return []

    feat = extract_features(toks)
    labs = _tagger.tag(feat)
    labs[-1] = "E"  # make sure the last sentence is closed

    for idx, _ in enumerate(toks):
        # Force a sentence break after terminal punctuation
        if toks[idx].strip().endswith(("!", ".", "?")):
            labs[idx] = "E"
        # Do not treat a space or empty string at the start of the text,
        # or right after a sentence end, as a sentence end itself
        elif (idx == 0 or labs[idx - 1] == "E") and toks[idx].strip() == "":
            labs[idx] = "I"

    sentences = []
    sentence = ""
    for i, tok in enumerate(toks):
        sentence = sentence + tok
        # Skip empty strings so they do not appear in the output
        if labs[i] == "E" and sentence != "":
            sentences.append(sentence)
            sentence = ""

    return sentences
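

# Example usage (a sketch; the actual sentence boundaries depend on the
# bundled CRF model and on the behavior of word_tokenize):
if __name__ == "__main__":
    example = "ฉันไปโรงเรียนทุกวัน แล้วคุณล่ะทำอะไรบ้าง"
    for i, sent in enumerate(segment(example)):
        print(i, repr(sent))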