# Source code for pythainlp.ner

# -*- coding: utf-8 -*-
from __future__ import absolute_import,unicode_literals
# NLP
import re
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from pythainlp.corpus import stopwords
thaicut="newmm" # word tokenizer: engine name passed to word_tokenize
# CRF
try:
    import sklearn_crfsuite
except ImportError:
    from pythainlp.tools import install_package
    install_package('sklearn-crfsuite')
    import sklearn_crfsuite
# FILE
import glob
import codecs
from pythainlp.corpus import get_file,download

# Rebind the imported ``stopwords`` name to the value returned by
# ``stopwords.words('thai')``; ``is_stopword`` tests membership against it.
stopwords = stopwords.words('thai')


def isThai(chr):
    """Tell whether a single character lies in the Thai code-point range.

    :param str chr: exactly one character; it is handed straight to ``ord``
    :return: ``True`` when the code point is between 3584 and 3711
             inclusive (U+0E00..U+0E7F), ``False`` otherwise
    """
    return 3584 <= ord(chr) <= 3711
def isThaiWord(word):
    """Tell whether ``word`` consists only of Thai characters and periods.

    Each element of ``word`` is checked with ``isThai``; a ``'.'`` is
    accepted as well. Scanning stops at the first element that is neither.
    An empty ``word`` yields ``True``.

    :param str word: the token to inspect
    :return: ``True`` when every character is Thai or ``'.'``, else ``False``
    """
    return all(isThai(ch) or ch == '.' for ch in word)

def is_stopword(word): # check whether the word is a stopword
    """Return whether ``word`` is contained in the module-level ``stopwords``."""
    return word in stopwords
def doc2features(doc, i):
    """Build the CRF feature dict for the token at position ``i`` of ``doc``.

    :param list doc: sequence of token records; index 0 of each record is
                     the word and index 1 is its part-of-speech tag
    :param int i: position in ``doc`` of the token to describe
    :return: dict of feature name -> value covering the token itself, the
             previous token (or ``'BOS'`` when there is none) and the next
             token (or ``'EOS'`` when there is none)
    """
    current = doc[i]
    word = current[0]
    postag = current[1]

    # Features describing the token itself.
    features = {
        'word.word': word,
        'word.stopword': is_stopword(word),
        'word.isthai': isThaiWord(word),
        'word.isspace': word.isspace(),
        'postag': postag,
        'word.isdigit()': word.isdigit()
    }
    # Extra marker for five-digit tokens; the key is absent otherwise.
    if features['word.isdigit()'] and len(word) == 5:
        features['word.islen5'] = True

    # Features from the previous token, or the beginning-of-sequence marker.
    if i > 0:
        before = doc[i - 1]
        prevword = before[0]
        prev_postag = before[1]
        features.update({
            'word.prevword': prevword,
            'word.previsspace': prevword.isspace(),
            'word.previsthai': isThaiWord(prevword),
            'word.prevstopword': is_stopword(prevword),
            'word.prepostag': prev_postag,
            'word.prevwordisdigit': prevword.isdigit(),
        })
    else:
        features['BOS'] = True

    # Features from the next token, or the end-of-sequence marker.
    if i < len(doc) - 1:
        after = doc[i + 1]
        nextword = after[0]
        next_postag = after[1]
        features.update({
            'word.nextword': nextword,
            'word.nextisspace': nextword.isspace(),
            'word.nextpostag': next_postag,
            'word.nextisthai': isThaiWord(nextword),
            'word.nextstopword': is_stopword(nextword),
            'word.nextwordisdigit': nextword.isdigit(),
        })
    else:
        features['EOS'] = True

    return features

class thainer:
    """Thai named-entity tagger built on a ``sklearn_crfsuite.CRF`` model."""

    def __init__(self):
        """
        Thai NER

        Look up the ``'thainer'`` corpus file, downloading it first when
        ``get_file`` returns ``None``, and create the CRF object with that
        path as its ``model_filename``.
        """
        self.data_path = get_file('thainer')
        if self.data_path is None:
            download('thainer')
            self.data_path = get_file('thainer')
        self.crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=500,
            all_possible_transitions=True,
            model_filename=self.data_path)

    def get_ner(self, text, postag=True):
        """
        Get NER from Thai NER.

        The text is tokenized with the ``thaicut`` engine, POS-tagged with
        the ``'perceptron'`` engine, converted to features and labelled by
        the CRF. The intermediate results are kept on the instance as
        ``word_cut``, ``list_word``, ``X_test`` and ``y_``.

        :param string text: thai text
        :param boolean postag: get postag (True) or get not postag (False)

        :return: list NER.

        **Example**::

            >>> from pythainlp.ner import thainer
            >>> ner=thainer()
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
            [('วันที่', 'JSBR', 'O'), (' ', 'NCMN', 'O'),
             ('15', 'NCNM', 'B-DATE'), (' ', 'NCMN', 'I-DATE'),
             ('ก.ย.', 'CMTR', 'I-DATE'), (' ', 'NCMN', 'I-DATE'),
             ('61', 'NCNM', 'I-DATE'), (' ', 'NCMN', 'O'),
             ('ทดสอบ', 'VACT', 'O'), ('ระบบ', 'NCMN', 'O'),
             ('เวลา', 'NCMN', 'O'), (' ', 'NCMN', 'O'),
             ('14', 'NCNM', 'B-TIME'), (':', 'PUNC', 'I-TIME'),
             ('49', 'NCNM', 'I-TIME'), (' ', 'NCMN', 'I-TIME'),
             ('น.', 'CMTR', 'I-TIME')]
            >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.",postag=False)
            [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
             ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'),
             (' ', 'O'), ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'),
             (' ', 'O'), ('14', 'B-TIME'), (':', 'I-TIME'),
             ('49', 'I-TIME'), (' ', 'I-TIME'), ('น.', 'I-TIME')]
        """
        self.word_cut = word_tokenize(text, engine=thaicut)
        self.list_word = pos_tag(self.word_cut, engine='perceptron')
        # Pair every token with the tag found at the same index of the
        # tagger output (index 1 of each tagger record).
        self.X_test = self.extract_features(
            [(data, self.list_word[i][1])
             for i, data in enumerate(self.word_cut)])
        self.y_ = self.crf.predict_single(self.X_test)
        if postag:
            return [(self.word_cut[i], self.list_word[i][1], data)
                    for i, data in enumerate(self.y_)]
        return [(self.word_cut[i], data) for i, data in enumerate(self.y_)]

    def extract_features(self, doc):
        """Return one ``doc2features`` dict per position of ``doc``.

        :param list doc: token records as accepted by ``doc2features``
        :return: list of feature dicts, in the same order as ``doc``
        """
        return [doc2features(doc, i) for i in range(len(doc))]

    def get_labels(self, doc):
        """Return the third field of every ``(token, postag, tag)`` record.

        :param list doc: iterable of 3-item records
        :return: list of the ``tag`` values, in order
        """
        return [tag for (token, postag, tag) in doc]

    def get_model(self):
        """Return the ``sklearn_crfsuite.CRF`` object created in ``__init__``."""
        return self.crf