Interactive online version available on Binder and Google Colab.

PyThaiNLP Get Started

Code examples for basic functions in PyThaiNLP https://github.com/PyThaiNLP/pythainlp

[1]:
# # pip install required modules
# # uncomment if running from colab
# # see list of modules in `requirements` and `extras`
# # in https://github.com/PyThaiNLP/pythainlp/blob/dev/setup.py

#!pip install -q python-crfsuite
#!pip install -q torch
#!pip install -q pythainlp
#!pip install -q epitran

Import PyThaiNLP

[2]:
import pythainlp

pythainlp.__version__
[2]:
'2.2.1'

Thai Characters

PyThaiNLP provides ready-to-use Thai character sets (e.g. Thai consonants, vowels, tone marks, symbols) as strings for convenience. There are also a few utility functions to test whether a string is Thai or not.

[3]:
pythainlp.thai_characters
[3]:
'āļāļ‚āļƒāļ„āļ…āļ†āļ‡āļˆāļ‰āļŠāļ‹āļŒāļāļŽāļāļāļ‘āļ’āļ“āļ”āļ•āļ–āļ—āļ˜āļ™āļšāļ›āļœāļāļžāļŸāļ āļĄāļĒāļĢāļĨāļ§āļĻāļĐāļŠāļŦāļŽāļ­āļŪāļĪāļĶāļ°āļąāļēāļģāļīāļĩāļķāļ·āļļāļđāđ€āđāđ‚āđƒāđ„āđ…āđāđ‡āđˆāđ‰āđŠāđ‹āļŊāļšāđ†āđŒāđāđŽāđāđšāđ›āđāđ‘āđ’āđ“āđ”āđ•āđ–āđ—āđ˜āđ™āļŋ'
[4]:
len(pythainlp.thai_characters)
[4]:
88
[5]:
pythainlp.thai_consonants
[5]:
'āļāļ‚āļƒāļ„āļ…āļ†āļ‡āļˆāļ‰āļŠāļ‹āļŒāļāļŽāļāļāļ‘āļ’āļ“āļ”āļ•āļ–āļ—āļ˜āļ™āļšāļ›āļœāļāļžāļŸāļ āļĄāļĒāļĢāļĨāļ§āļĻāļĐāļŠāļŦāļŽāļ­āļŪ'
[6]:
len(pythainlp.thai_consonants)
[6]:
44
[7]:
"āđ”" in pythainlp.thai_digits  # check if Thai digit "4" is in the character set
[7]:
True

Checking whether a string contains Thai characters, and in what proportion

[8]:
import pythainlp.util

pythainlp.util.isthai("āļ")
[8]:
True
[9]:
pythainlp.util.isthai("(āļ.āļž.)")
[9]:
False
[10]:
pythainlp.util.isthai("(āļ.āļž.)", ignore_chars=".()")
[10]:
True

countthai() returns the percentage of Thai characters in the text. It ignores non-alphabetic characters by default.

[11]:
pythainlp.util.countthai("āļ§āļąāļ™āļ­āļēāļ—āļīāļ•āļĒāđŒāļ—āļĩāđˆ 24 āļĄāļĩāļ™āļēāļ„āļĄ 2562")
[11]:
100.0

You can specify characters to be ignored using the ignore_chars= parameter.

[12]:
pythainlp.util.countthai("āļ§āļąāļ™āļ­āļēāļ—āļīāļ•āļĒāđŒāļ—āļĩāđˆ 24 āļĄāļĩāļ™āļēāļ„āļĄ 2562", ignore_chars="")
[12]:
67.85714285714286

Collation

Sorting strings according to Thai dictionary order.

[13]:
from pythainlp.util import collate

thai_words = ["āļ„āđ‰āļ­āļ™", "āļāļĢāļ°āļ”āļēāļĐ", "āļāļĢāļĢāđ„āļāļĢ", "āđ„āļ‚āđˆ", "āļœāđ‰āļēāđ„āļŦāļĄ"]
collate(thai_words)
[13]:
['āļāļĢāļĢāđ„āļāļĢ', 'āļāļĢāļ°āļ”āļēāļĐ', 'āđ„āļ‚āđˆ', 'āļ„āđ‰āļ­āļ™', 'āļœāđ‰āļēāđ„āļŦāļĄ']
[14]:
collate(thai_words, reverse=True)
[14]:
['āļœāđ‰āļēāđ„āļŦāļĄ', 'āļ„āđ‰āļ­āļ™', 'āđ„āļ‚āđˆ', 'āļāļĢāļ°āļ”āļēāļĐ', 'āļāļĢāļĢāđ„āļāļĢ']

Date/Time Format and Spellout

Date/Time Format

Get Thai day and month names with Thai Buddhist Era (B.E.). Use formatting directives similar to datetime.strftime().

[15]:
import datetime
from pythainlp.util import thai_strftime

fmt = "%Aāļ—āļĩāđˆ %-d %B āļž.āļĻ. %Y āđ€āļ§āļĨāļē %H:%M āļ™. (%a %d-%b-%y)"
date = datetime.datetime(1976, 10, 6, 1, 40)

thai_strftime(date, fmt)
[15]:
'āļ§āļąāļ™āļžāļļāļ˜āļ—āļĩāđˆ 6 āļ•āļļāļĨāļēāļ„āļĄ āļž.āļĻ. 2519 āđ€āļ§āļĨāļē 01:40 āļ™. (āļž 06-āļ•.āļ„.-19)'

From version 2.2, these modifiers can be applied right before the main directive:

  • - (minus) Do not pad a numeric result string (also available in version 2.1)

  • _ (underscore) Pad a numeric result string with spaces

  • 0 (zero) Pad a numeric result string with zeros

  • ^ Convert alphabetic characters in the result string to upper case

  • # Swap the case of the result string

  • O (letter o) Use the locale’s alternative numeric symbols (Thai digits)

[16]:
thai_strftime(date, "%d %b %y")
[16]:
'06 āļ•.āļ„. 19'
[17]:
thai_strftime(date, "%d %b %Y")
[17]:
'06 āļ•.āļ„. 2519'
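For instance, the "-" and "O" modifiers from the list above can be combined with these directives. A quick sketch, reusing the date object from earlier (the exact output depends on your PyThaiNLP version):

# Sketch: "-" drops zero padding, "O" switches to Thai digits
print(thai_strftime(date, "%-d %b %y"))   # day without the leading zero
print(thai_strftime(date, "%Od %b %Oy"))  # day and year in Thai digits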

Time Spellout

Note: ``thai_time()`` will be renamed to ``time_to_thaiword()`` in version 2.2.
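Given the planned rename, a defensive import keeps code working across versions (a sketch; it assumes the new name is importable from pythainlp.util in 2.2+):

# Sketch: prefer the new name, fall back to the old one
try:
    from pythainlp.util import time_to_thaiword as thai_time  # 2.2+
except ImportError:
    from pythainlp.util import thai_time  # pre-2.2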

[18]:
from pythainlp.util import thai_time

thai_time("00:14:29")
[18]:
'āļĻāļđāļ™āļĒāđŒāļ™āļēāļŽāļīāļāļēāļŠāļīāļšāļŠāļĩāđˆāļ™āļēāļ—āļĩāļĒāļĩāđˆāļŠāļīāļšāđ€āļāđ‰āļēāļ§āļīāļ™āļēāļ—āļĩ'

The spellout style can be chosen using the fmt parameter. It can be 24h, 6h, or m6h. Try them yourself.

[19]:
thai_time("00:14:29", fmt="6h")
[19]:
'āđ€āļ—āļĩāđˆāļĒāļ‡āļ„āļ·āļ™āļŠāļīāļšāļŠāļĩāđˆāļ™āļēāļ—āļĩāļĒāļĩāđˆāļŠāļīāļšāđ€āļāđ‰āļēāļ§āļīāļ™āļēāļ—āļĩ'

The precision of the spellout can be chosen as well, using the precision parameter. It can be m for minute-level precision, s for second-level precision, or None to read out only the non-zero values.

[20]:
thai_time("00:14:29", precision="m")
[20]:
'āļĻāļđāļ™āļĒāđŒāļ™āļēāļŽāļīāļāļēāļŠāļīāļšāļŠāļĩāđˆāļ™āļēāļ—āļĩ'
[21]:
print(thai_time("8:17:00", fmt="6h"))
print(thai_time("8:17:00", fmt="m6h", precision="s"))
print(thai_time("18:30:01", fmt="m6h", precision="m"))
print(thai_time("13:30:01", fmt="6h", precision="m"))
āļŠāļ­āļ‡āđ‚āļĄāļ‡āđ€āļŠāđ‰āļēāļŠāļīāļšāđ€āļˆāđ‡āļ”āļ™āļēāļ—āļĩ
āđāļ›āļ”āđ‚āļĄāļ‡āļŠāļīāļšāđ€āļˆāđ‡āļ”āļ™āļēāļ—āļĩāļĻāļđāļ™āļĒāđŒāļ§āļīāļ™āļēāļ—āļĩ
āļŦāļāđ‚āļĄāļ‡āļ„āļĢāļķāđˆāļ‡
āļšāđˆāļēāļĒāđ‚āļĄāļ‡āļ„āļĢāļķāđˆāļ‡

We can also pass datetime and time objects to thai_time().

[22]:
import datetime

time = datetime.time(13, 14, 15)
thai_time(time)
[22]:
'āļŠāļīāļšāļŠāļēāļĄāļ™āļēāļŽāļīāļāļēāļŠāļīāļšāļŠāļĩāđˆāļ™āļēāļ—āļĩāļŠāļīāļšāļŦāđ‰āļēāļ§āļīāļ™āļēāļ—āļĩ'
[23]:
time = datetime.datetime(10, 11, 12, 13, 14, 15)
thai_time(time, fmt="6h", precision="m")
[23]:
'āļšāđˆāļēāļĒāđ‚āļĄāļ‡āļŠāļīāļšāļŠāļĩāđˆāļ™āļēāļ—āļĩ'

Tokenization and Segmentation

At sentence, word, and sub-word levels.

Sentence

The default sentence tokenizer is “crfcut”. The tokenization engine can be chosen using the engine= parameter.

[24]:
from pythainlp import sent_tokenize

text = ("āļžāļĢāļ°āļĢāļēāļŠāļšāļąāļāļāļąāļ•āļīāļ˜āļĢāļĢāļĄāļ™āļđāļāļāļēāļĢāļ›āļāļ„āļĢāļ­āļ‡āđāļœāđˆāļ™āļ”āļīāļ™āļŠāļĒāļēāļĄāļŠāļąāđˆāļ§āļ„āļĢāļēāļ§ āļžāļļāļ—āļ˜āļĻāļąāļāļĢāļēāļŠ āđ’āđ”āđ—āđ• "
        "āđ€āļ›āđ‡āļ™āļĢāļąāļāļ˜āļĢāļĢāļĄāļ™āļđāļāļ‰āļšāļąāļšāļŠāļąāđˆāļ§āļ„āļĢāļēāļ§ āļ‹āļķāđˆāļ‡āļ–āļ·āļ­āļ§āđˆāļēāđ€āļ›āđ‡āļ™āļĢāļąāļāļ˜āļĢāļĢāļĄāļ™āļđāļāļ‰āļšāļąāļšāđāļĢāļāđāļŦāđˆāļ‡āļĢāļēāļŠāļ­āļēāļ“āļēāļˆāļąāļāļĢāļŠāļĒāļēāļĄ "
        "āļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰āđ€āļĄāļ·āđˆāļ­āļ§āļąāļ™āļ—āļĩāđˆ 27 āļĄāļīāļ–āļļāļ™āļēāļĒāļ™ āļž.āļĻ. 2475 "
        "āđ‚āļ”āļĒāđ€āļ›āđ‡āļ™āļœāļĨāļžāļ§āļ‡āļŦāļĨāļąāļ‡āļāļēāļĢāļ›āļāļīāļ§āļąāļ•āļīāđ€āļĄāļ·āđˆāļ­āļ§āļąāļ™āļ—āļĩāđˆ 24 āļĄāļīāļ–āļļāļ™āļēāļĒāļ™ āļž.āļĻ. 2475 āđ‚āļ”āļĒāļ„āļ“āļ°āļĢāļēāļĐāļŽāļĢ")

print("default (crfcut):")
print(sent_tokenize(text))
print("\nwhitespace+newline:")
print(sent_tokenize(text, engine="whitespace+newline"))
default (crfcut):
['āļžāļĢāļ°āļĢāļēāļŠāļšāļąāļāļāļąāļ•āļīāļ˜āļĢāļĢāļĄāļ™āļđāļāļāļēāļĢāļ›āļāļ„āļĢāļ­āļ‡āđāļœāđˆāļ™āļ”āļīāļ™āļŠāļĒāļēāļĄāļŠāļąāđˆāļ§āļ„āļĢāļēāļ§ āļžāļļāļ—āļ˜āļĻāļąāļāļĢāļēāļŠ āđ’āđ”āđ—āđ• āđ€āļ›āđ‡āļ™āļĢāļąāļāļ˜āļĢāļĢāļĄāļ™āļđāļāļ‰āļšāļąāļšāļŠāļąāđˆāļ§āļ„āļĢāļēāļ§ ', 'āļ‹āļķāđˆāļ‡āļ–āļ·āļ­āļ§āđˆāļēāđ€āļ›āđ‡āļ™āļĢāļąāļāļ˜āļĢāļĢāļĄāļ™āļđāļāļ‰āļšāļąāļšāđāļĢāļāđāļŦāđˆāļ‡āļĢāļēāļŠāļ­āļēāļ“āļēāļˆāļąāļāļĢāļŠāļĒāļēāļĄ ', 'āļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰āđ€āļĄāļ·āđˆāļ­āļ§āļąāļ™āļ—āļĩāđˆ 27 āļĄāļīāļ–āļļāļ™āļēāļĒāļ™ āļž.āļĻ. 2475 ', 'āđ‚āļ”āļĒāđ€āļ›āđ‡āļ™āļœāļĨāļžāļ§āļ‡āļŦāļĨāļąāļ‡āļāļēāļĢāļ›āļāļīāļ§āļąāļ•āļīāđ€āļĄāļ·āđˆāļ­āļ§āļąāļ™āļ—āļĩāđˆ 24 āļĄāļīāļ–āļļāļ™āļēāļĒāļ™ āļž.āļĻ. 2475 āđ‚āļ”āļĒāļ„āļ“āļ°āļĢāļēāļĐāļŽāļĢ']

whitespace+newline:
['āļžāļĢāļ°āļĢāļēāļŠāļšāļąāļāļāļąāļ•āļīāļ˜āļĢāļĢāļĄāļ™āļđāļāļāļēāļĢāļ›āļāļ„āļĢāļ­āļ‡āđāļœāđˆāļ™āļ”āļīāļ™āļŠāļĒāļēāļĄāļŠāļąāđˆāļ§āļ„āļĢāļēāļ§', 'āļžāļļāļ—āļ˜āļĻāļąāļāļĢāļēāļŠ', 'āđ’āđ”āđ—āđ•', 'āđ€āļ›āđ‡āļ™āļĢāļąāļāļ˜āļĢāļĢāļĄāļ™āļđāļāļ‰āļšāļąāļšāļŠāļąāđˆāļ§āļ„āļĢāļēāļ§', 'āļ‹āļķāđˆāļ‡āļ–āļ·āļ­āļ§āđˆāļēāđ€āļ›āđ‡āļ™āļĢāļąāļāļ˜āļĢāļĢāļĄāļ™āļđāļāļ‰āļšāļąāļšāđāļĢāļāđāļŦāđˆāļ‡āļĢāļēāļŠāļ­āļēāļ“āļēāļˆāļąāļāļĢāļŠāļĒāļēāļĄ', 'āļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰āđ€āļĄāļ·āđˆāļ­āļ§āļąāļ™āļ—āļĩāđˆ', '27', 'āļĄāļīāļ–āļļāļ™āļēāļĒāļ™', 'āļž.āļĻ.', '2475', 'āđ‚āļ”āļĒāđ€āļ›āđ‡āļ™āļœāļĨāļžāļ§āļ‡āļŦāļĨāļąāļ‡āļāļēāļĢāļ›āļāļīāļ§āļąāļ•āļīāđ€āļĄāļ·āđˆāļ­āļ§āļąāļ™āļ—āļĩāđˆ', '24', 'āļĄāļīāļ–āļļāļ™āļēāļĒāļ™', 'āļž.āļĻ.', '2475', 'āđ‚āļ”āļĒāļ„āļ“āļ°āļĢāļēāļĐāļŽāļĢ']

Word

The default word tokenizer (“newmm”) uses a maximum matching algorithm.

[25]:
from pythainlp import word_tokenize

text = "āļāđ‡āļˆāļ°āļĢāļđāđ‰āļ„āļ§āļēāļĄāļŠāļąāđˆāļ§āļĢāđ‰āļēāļĒāļ—āļĩāđˆāļ—āļģāđ„āļ§āđ‰     āđāļĨāļ°āļ„āļ‡āļˆāļ°āđ„āļĄāđˆāļĒāļ­āļĄāđƒāļŦāđ‰āļ—āļģāļ™āļēāļšāļ™āļŦāļĨāļąāļ‡āļ„āļ™ "

print("default (newmm):")
print(word_tokenize(text))
print("\nnewmm and keep_whitespace=False:")
print(word_tokenize(text, keep_whitespace=False))
default (newmm):
['āļāđ‡', 'āļˆāļ°', 'āļĢāļđāđ‰āļ„āļ§āļēāļĄ', 'āļŠāļąāđˆāļ§āļĢāđ‰āļēāļĒ', 'āļ—āļĩāđˆ', 'āļ—āļģ', 'āđ„āļ§āđ‰', '     ', 'āđāļĨāļ°', 'āļ„āļ‡āļˆāļ°', 'āđ„āļĄāđˆ', 'āļĒāļ­āļĄāđƒāļŦāđ‰', 'āļ—āļģāļ™āļēāļšāļ™āļŦāļĨāļąāļ‡āļ„āļ™', ' ']

newmm and keep_whitespace=False:
['āļāđ‡', 'āļˆāļ°', 'āļĢāļđāđ‰āļ„āļ§āļēāļĄ', 'āļŠāļąāđˆāļ§āļĢāđ‰āļēāļĒ', 'āļ—āļĩāđˆ', 'āļ—āļģ', 'āđ„āļ§āđ‰', 'āđāļĨāļ°', 'āļ„āļ‡āļˆāļ°', 'āđ„āļĄāđˆ', 'āļĒāļ­āļĄāđƒāļŦāđ‰', 'āļ—āļģāļ™āļēāļšāļ™āļŦāļĨāļąāļ‡āļ„āļ™']

Other algorithms can be chosen. We can also create a tokenizer with a custom dictionary.

[3]:
from pythainlp import word_tokenize, Tokenizer

text = "āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™āļ‰āļšāļąāļšāļ›āļĢāļąāļšāļ›āļĢāļļāļ‡āđƒāļŦāļĄāđˆāļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰āđāļĨāđ‰āļ§"

print("newmm  :", word_tokenize(text))  # default engine is "newmm"
print("longest:", word_tokenize(text, engine="longest"))

words = ["āđāļĢāļ‡āļ‡āļēāļ™"]
custom_tokenizer = Tokenizer(words)
print("newmm (custom dictionary):", custom_tokenizer.word_tokenize(text))
newmm  : ['āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™', 'āļ‰āļšāļąāļš', 'āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡', 'āđƒāļŦāļĄāđˆ', 'āļ›āļĢāļ°āļāļēāļĻ', 'āđƒāļŠāđ‰āđāļĨāđ‰āļ§']
longest: ['āļāļŽāļŦāļĄāļēāļĒāđāļĢāļ‡āļ‡āļēāļ™', 'āļ‰āļšāļąāļš', 'āļ›āļĢāļąāļšāļ›āļĢāļļāļ‡', 'āđƒāļŦāļĄāđˆ', 'āļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰', 'āđāļĨāđ‰āļ§']
newmm (custom dictionary): ['āļāļŽāļŦāļĄāļēāļĒ', 'āđāļĢāļ‡āļ‡āļēāļ™', 'āļ‰āļšāļąāļšāļ›āļĢāļąāļšāļ›āļĢāļļāļ‡āđƒāļŦāļĄāđˆāļ›āļĢāļ°āļāļēāļĻāđƒāļŠāđ‰āđāļĨāđ‰āļ§']

The default word tokenizer uses a word list from pythainlp.corpus.common.thai_words(). We can get that list, add/remove words, and create a new tokenizer from the modified list.

[4]:
from pythainlp.corpus.common import thai_words
from pythainlp import Tokenizer

text = "āļ™āļīāļĒāļēāļĒāļ§āļīāļ—āļĒāļēāļĻāļēāļŠāļ•āļĢāđŒāļ‚āļ­āļ‡āđ„āļ­āđāļ‹āļ„ āļ­āļŠāļīāļĄāļ­āļŸ"

print("default dictionary:", word_tokenize(text))

words = set(thai_words())  # thai_words() returns frozenset
words.add("āđ„āļ­āđāļ‹āļ„")  # Isaac
words.add("āļ­āļŠāļīāļĄāļ­āļŸ")  # Asimov
custom_tokenizer = Tokenizer(words)
print("custom dictionary :", custom_tokenizer.word_tokenize(text))
default dictionary: ['āļ™āļīāļĒāļēāļĒ', 'āļ§āļīāļ—āļĒāļēāļĻāļēāļŠāļ•āļĢāđŒ', 'āļ‚āļ­āļ‡', 'āđ„āļ­āđāļ‹āļ„', ' ', 'āļ­āļŠāļī', 'āļĄāļ­', 'āļŸ']
custom dictionary : ['āļ™āļīāļĒāļēāļĒ', 'āļ§āļīāļ—āļĒāļēāļĻāļēāļŠāļ•āļĢāđŒ', 'āļ‚āļ­āļ‡', 'āđ„āļ­āđāļ‹āļ„', ' ', 'āļ­āļŠāļīāļĄāļ­āļŸ']

Alternatively, we can create a dictionary trie with pythainlp.util.Trie() and pass it to the default tokenizer.

[5]:
from pythainlp.corpus.common import thai_words
from pythainlp.util import Trie

text = "ILO87 āļ§āđˆāļēāļ”āđ‰āļ§āļĒāđ€āļŠāļĢāļĩāļ āļēāļžāđƒāļ™āļāļēāļĢāļŠāļĄāļēāļ„āļĄāđāļĨāļ°āļāļēāļĢāļ„āļļāđ‰āļĄāļ„āļĢāļ­āļ‡āļŠāļīāļ—āļ˜āļīāđƒāļ™āļāļēāļĢāļĢāļ§āļĄāļ•āļąāļ§ ILO98 āļ§āđˆāļēāļ”āđ‰āļ§āļĒāļŠāļīāļ—āļ˜āļīāđƒāļ™āļāļēāļĢāļĢāļ§āļĄāļ•āļąāļ§āđāļĨāļ°āļāļēāļĢāļĢāđˆāļ§āļĄāđ€āļˆāļĢāļˆāļēāļ•āđˆāļ­āļĢāļ­āļ‡"

print("default dictionary:", word_tokenize(text))

new_words = {"ILO87", "ILO98", "āļāļēāļĢāļĢāđˆāļ§āļĄāđ€āļˆāļĢāļˆāļēāļ•āđˆāļ­āļĢāļ­āļ‡", "āļŠāļīāļ—āļ˜āļīāđƒāļ™āļāļēāļĢāļĢāļ§āļĄāļ•āļąāļ§", "āđ€āļŠāļĢāļĩāļ āļēāļžāđƒāļ™āļāļēāļĢāļŠāļĄāļēāļ„āļĄ", "āđāļĢāļ‡āļ‡āļēāļ™āļŠāļąāļĄāļžāļąāļ™āļ˜āđŒ"}
words = new_words.union(thai_words())

custom_dictionary_trie = Trie(words)
print("custom dictionary :", word_tokenize(text, custom_dict=custom_dictionary_trie))
default dictionary: ['ILO', '87', ' ', 'āļ§āđˆāļēāļ”āđ‰āļ§āļĒ', 'āđ€āļŠāļĢāļĩāļ āļēāļž', 'āđƒāļ™', 'āļāļēāļĢāļŠāļĄāļēāļ„āļĄ', 'āđāļĨāļ°', 'āļāļēāļĢ', 'āļ„āļļāđ‰āļĄāļ„āļĢāļ­āļ‡', 'āļŠāļīāļ—āļ˜āļī', 'āđƒāļ™', 'āļāļēāļĢ', 'āļĢāļ§āļĄāļ•āļąāļ§', ' ', 'ILO', '98', ' ', 'āļ§āđˆāļēāļ”āđ‰āļ§āļĒ', 'āļŠāļīāļ—āļ˜āļī', 'āđƒāļ™', 'āļāļēāļĢ', 'āļĢāļ§āļĄāļ•āļąāļ§', 'āđāļĨāļ°', 'āļāļēāļĢ', 'āļĢāđˆāļ§āļĄ', 'āđ€āļˆāļĢāļˆāļē', 'āļ•āđˆāļ­āļĢāļ­āļ‡']
custom dictionary : ['ILO87', ' ', 'āļ§āđˆāļēāļ”āđ‰āļ§āļĒ', 'āđ€āļŠāļĢāļĩāļ āļēāļžāđƒāļ™āļāļēāļĢāļŠāļĄāļēāļ„āļĄ', 'āđāļĨāļ°', 'āļāļēāļĢ', 'āļ„āļļāđ‰āļĄāļ„āļĢāļ­āļ‡', 'āļŠāļīāļ—āļ˜āļīāđƒāļ™āļāļēāļĢāļĢāļ§āļĄāļ•āļąāļ§', ' ', 'ILO98', ' ', 'āļ§āđˆāļēāļ”āđ‰āļ§āļĒ', 'āļŠāļīāļ—āļ˜āļīāđƒāļ™āļāļēāļĢāļĢāļ§āļĄāļ•āļąāļ§', 'āđāļĨāļ°', 'āļāļēāļĢāļĢāđˆāļ§āļĄāđ€āļˆāļĢāļˆāļēāļ•āđˆāļ­āļĢāļ­āļ‡']

Testing different tokenization engines

[29]:
speedtest_text = """
āļ„āļĢāļšāļĢāļ­āļš 14 āļ›āļĩ āļ•āļēāļāđƒāļš āđ€āļŠāđ‰āļēāļ§āļąāļ™āļ™āļąāđ‰āļ™ 25 āļ•.āļ„. 2547 āļœāļđāđ‰āļŠāļļāļĄāļ™āļļāļĄāļŠāļēāļĒāļāļ§āđˆāļē 1,370 āļ„āļ™
āļ–āļđāļāđ‚āļĒāļ™āļ‚āļķāđ‰āļ™āļĢāļ–āļĒāļĩāđ€āļ­āđ‡āļĄāļ‹āļĩ 22 āļŦāļĢāļ·āļ­ 24 āļ„āļąāļ™ āļ™āļ­āļ™āļ‹āđ‰āļ­āļ™āļāļąāļ™āļ„āļąāļ™āļĨāļ° 4-5 āļŠāļąāđ‰āļ™ āđ€āļ”āļīāļ™āļ—āļēāļ‡āļˆāļēāļāļŠāļ–āļēāļ™āļĩāļ•āļģāļĢāļ§āļˆāļ•āļēāļāđƒāļš āđ„āļ›āđ„āļāļĨ 150 āļāļīāđ‚āļĨāđ€āļĄāļ•āļĢ
āđ„āļ›āļ–āļķāļ‡āļ„āđˆāļēāļĒāļ­āļīāļ‡āļ„āļĒāļļāļ—āļ˜āļšāļĢāļīāļŦāļēāļĢ āđƒāļŠāđ‰āđ€āļ§āļĨāļēāļāļ§āđˆāļē 6 āļŠāļąāđˆāļ§āđ‚āļĄāļ‡ / āđƒāļ™āļ­āļĩāļāļ„āļ”āļĩāļ—āļĩāđˆāļāļēāļ•āļīāļŸāđ‰āļ­āļ‡āļĢāđ‰āļ­āļ‡āļĢāļąāļ āļ„āļ”āļĩāļˆāļšāļĨāļ‡āļ—āļĩāđˆāļāļēāļĢāļ›āļĢāļ°āļ™āļĩāļ›āļĢāļ°āļ™āļ­āļĄāļĒāļ­āļĄāļ„āļ§āļēāļĄ
āļāļĢāļ°āļ—āļĢāļ§āļ‡āļāļĨāļēāđ‚āļŦāļĄāļˆāđˆāļēāļĒāļ„āđˆāļēāļŠāļīāļ™āđ„āļŦāļĄāļ—āļ”āđāļ—āļ™āļĢāļ§āļĄ 42 āļĨāđ‰āļēāļ™āļšāļēāļ—āđƒāļŦāđ‰āļāļąāļšāļāļēāļ•āļīāļœāļđāđ‰āđ€āļŠāļĩāļĒāļŦāļēāļĒ 79 āļĢāļēāļĒ
āļ›āļīāļ”āļŦāļĩāļšāđāļĨāļ°āļ™āļąāļšāļ„āļ°āđāļ™āļ™āđ€āļŠāļĢāđ‡āļˆāđāļĨāđ‰āļ§ āļ—āļĩāđˆāļŦāļ™āđˆāļ§āļĒāđ€āļĨāļ·āļ­āļāļ•āļąāđ‰āļ‡āļ—āļĩāđˆ 32 āđ€āļ‚āļ• 13 āđāļ‚āļ§āļ‡āļŦāļąāļ§āļŦāļĄāļēāļ āđ€āļ‚āļ•āļšāļēāļ‡āļāļ°āļ›āļī āļāļĢāļļāļ‡āđ€āļ—āļžāļĄāļŦāļēāļ™āļ„āļĢ
āļœāļđāđ‰āļŠāļĄāļąāļ„āļĢ āļŠ.āļŠ. āđāļĨāļ°āļ•āļąāļ§āđāļ—āļ™āļžāļĢāļĢāļ„āļāļēāļĢāđ€āļĄāļ·āļ­āļ‡āļˆāļēāļāļŦāļĨāļēāļĒāļžāļĢāļĢāļ„āļ•āđˆāļēāļ‡āļĄāļēāđ€āļāđ‰āļēāļŠāļąāļ‡āđ€āļāļ•āļāļēāļĢāļ™āļąāļšāļ„āļ°āđāļ™āļ™āļ­āļĒāđˆāļēāļ‡āđƒāļāļĨāđ‰āļŠāļīāļ” āđ‚āļ”āļĒ
āļāļīāļ•āļīāļ āļąāļŠāļĢāđŒ āđ‚āļŠāļ•āļīāđ€āļ”āļŠāļēāļŠāļąāļĒāļ™āļąāļ™āļ•āđŒ āļˆāļēāļāļžāļĢāļĢāļ„āļžāļĨāļąāļ‡āļ›āļĢāļ°āļŠāļēāļĢāļąāļ āđāļĨāļ°āļžāļĢāļīāļĐāļāđŒ āļ§āļąāļŠāļĢāļŠāļīāļ™āļ˜āļļ āļˆāļēāļāļžāļĢāļĢāļ„āļ›āļĢāļ°āļŠāļēāļ˜āļīāļ›āļąāļ•āļĒāđŒāđ„āļ”āđ‰āļ„āļ°āđāļ™āļ™
96 āļ„āļ°āđāļ™āļ™āđ€āļ—āđˆāļēāļāļąāļ™
āđ€āļŠāđ‰āļēāļ§āļąāļ™āļ­āļēāļ—āļīāļ•āļĒāđŒāļ—āļĩāđˆ 21 āđ€āļĄāļĐāļēāļĒāļ™ 2019 āļ‹āļķāđˆāļ‡āđ€āļ›āđ‡āļ™āļ§āļąāļ™āļ­āļĩāļŠāđ€āļ•āļ­āļĢāđŒ āļ§āļąāļ™āļŠāļģāļ„āļąāļāļ‚āļ­āļ‡āļŠāļēāļ§āļ„āļĢāļīāļŠāļ•āđŒ
āđ€āļāļīāļ”āđ€āļŦāļ•āļļāļĢāļ°āđ€āļšāļīāļ”āļ•āđˆāļ­āđ€āļ™āļ·āđˆāļ­āļ‡āđƒāļ™āđ‚āļšāļŠāļ–āđŒāļ„āļĢāļīāļŠāļ•āđŒāđāļĨāļ°āđ‚āļĢāļ‡āđāļĢāļĄāļ­āļĒāđˆāļēāļ‡āļ™āđ‰āļ­āļĒ 7 āđāļŦāđˆāļ‡āđƒāļ™āļ›āļĢāļ°āđ€āļ—āļĻāļĻāļĢāļĩāļĨāļąāļ‡āļāļē
āļĄāļĩāļœāļđāđ‰āđ€āļŠāļĩāļĒāļŠāļĩāļ§āļīāļ•āđāļĨāđ‰āļ§āļ­āļĒāđˆāļēāļ‡āļ™āđ‰āļ­āļĒ 156 āļ„āļ™ āđāļĨāļ°āļšāļēāļ”āđ€āļˆāđ‡āļšāļŦāļĨāļēāļĒāļĢāđ‰āļ­āļĒāļ„āļ™ āļĒāļąāļ‡āđ„āļĄāđˆāļĄāļĩāļ‚āđ‰āļ­āļĄāļđāļĨāļ§āđˆāļēāļœāļđāđ‰āļāđˆāļ­āđ€āļŦāļ•āļļāļĄāļēāļˆāļēāļāļāđˆāļēāļĒāđƒāļ”
āļˆāļĩāļ™āļāļģāļŦāļ™āļ”āļˆāļąāļ”āļāļēāļĢāļ›āļĢāļ°āļŠāļļāļĄāļ‚āđ‰āļ­āļĢāļīāđ€āļĢāļīāđˆāļĄāļŠāļēāļĒāđāļ–āļšāđāļĨāļ°āđ€āļŠāđ‰āļ™āļ—āļēāļ‡āđƒāļ™āļŠāđˆāļ§āļ‡āļ›āļĨāļēāļĒāļŠāļąāļ›āļ”āļēāļŦāđŒāļ™āļĩāđ‰ āļ›āļąāļāļāļīāđˆāļ‡āļĒāļ·āļ™āļĒāļąāļ™āļ§āđˆāļē
āļ­āļ āļīāļĄāļŦāļēāđ‚āļ„āļĢāļ‡āļāļēāļĢāđ€āļŠāļ·āđˆāļ­āļĄāđ‚āļĨāļāļ‚āļ­āļ‡āļˆāļĩāļ™āđ„āļĄāđˆāđƒāļŠāđˆāđ€āļ„āļĢāļ·āđˆāļ­āļ‡āļĄāļ·āļ­āđāļœāđˆāļ­āļīāļ—āļ˜āļīāļžāļĨ āđāļ•āđˆāļĒāļīāļ™āļ”āļĩāļĢāļąāļšāļŸāļąāļ‡āļ‚āđ‰āļ­āļ§āļīāļˆāļēāļĢāļ“āđŒ āđ€āļŠāđˆāļ™ āļ›āļĢāļ°āđ€āļ”āđ‡āļ™āļāļąāļšāļ”āļąāļāļŦāļ™āļĩāđ‰āļŠāļīāļ™
āđāļĨāļ°āļ„āļ§āļēāļĄāđ„āļĄāđˆāđ‚āļ›āļĢāđˆāļ‡āđƒāļŠ āļĢāļąāļāļšāļēāļĨāļ›āļąāļāļāļīāđˆāļ‡āļšāļ­āļāļ§āđˆāļē āđ€āļ§āļ—āļĩāļ›āļĢāļ°āļŠāļļāļĄ Belt and Road Forum āđƒāļ™āļŠāđˆāļ§āļ‡āļ§āļąāļ™āļ—āļĩāđˆ 25-27 āđ€āļĄāļĐāļēāļĒāļ™
āļ–āļ·āļ­āđ€āļ›āđ‡āļ™āļ‡āļēāļ™āļāļēāļĢāļ—āļđāļ•āļ—āļĩāđˆāļŠāļģāļ„āļąāļāļ—āļĩāđˆāļŠāļļāļ”āļ‚āļ­āļ‡āļˆāļĩāļ™āđƒāļ™āļ›āļĩ 2019
"""
[30]:
# Speed test: Calling "longest" engine through word_tokenize wrapper
%time tokens = word_tokenize(speedtest_text, engine="longest")
CPU times: user 253 ms, sys: 2.27 ms, total: 256 ms
Wall time: 255 ms
[31]:
# Speed test: Calling "newmm" engine through word_tokenize wrapper
%time tokens = word_tokenize(speedtest_text, engine="newmm")
CPU times: user 3.4 ms, sys: 60 Âĩs, total: 3.46 ms
Wall time: 3.47 ms
[32]:
# Speed test: Calling "newmm-safe" engine through word_tokenize wrapper
%time tokens = word_tokenize(speedtest_text, engine="newmm-safe")
CPU times: user 4.08 ms, sys: 88 Âĩs, total: 4.16 ms
Wall time: 4.15 ms
[33]:
#!pip install attacut
# Speed test: Calling "attacut" engine through word_tokenize wrapper
%time tokens = word_tokenize(speedtest_text, engine="attacut")
CPU times: user 833 ms, sys: 174 ms, total: 1.01 s
Wall time: 576 ms

Get all possible segmentations

[34]:
from pythainlp.tokenize.multi_cut import find_all_segment, mmcut, segment

find_all_segment("āļĄāļĩāļ„āļ§āļēāļĄāđ€āļ›āđ‡āļ™āđ„āļ›āđ„āļ”āđ‰āļ­āļĒāđˆāļēāļ‡āđ„āļĢāļšāđ‰āļēāļ‡")
[34]:
['āļĄāļĩ|āļ„āļ§āļēāļĄ|āđ€āļ›āđ‡āļ™|āđ„āļ›|āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡|āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄ|āđ€āļ›āđ‡āļ™āđ„āļ›|āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡|āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄ|āđ€āļ›āđ‡āļ™āđ„āļ›āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡|āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄāđ€āļ›āđ‡āļ™āđ„āļ›|āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡|āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄāđ€āļ›āđ‡āļ™āđ„āļ›āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡|āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄ|āđ€āļ›āđ‡āļ™|āđ„āļ›|āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄ|āđ€āļ›āđ‡āļ™āđ„āļ›|āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄ|āđ€āļ›āđ‡āļ™āđ„āļ›āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄāđ€āļ›āđ‡āļ™āđ„āļ›|āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄāđ€āļ›āđ‡āļ™āđ„āļ›āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢ|āļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄ|āđ€āļ›āđ‡āļ™|āđ„āļ›|āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢāļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄ|āđ€āļ›āđ‡āļ™āđ„āļ›|āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢāļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄ|āđ€āļ›āđ‡āļ™āđ„āļ›āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢāļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄāđ€āļ›āđ‡āļ™āđ„āļ›|āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢāļšāđ‰āļēāļ‡|',
 'āļĄāļĩ|āļ„āļ§āļēāļĄāđ€āļ›āđ‡āļ™āđ„āļ›āđ„āļ”āđ‰|āļ­āļĒāđˆāļēāļ‡āđ„āļĢāļšāđ‰āļēāļ‡|']
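The mmcut() function imported above picks a single segmentation using maximum matching; a quick sketch:

# Sketch: pick one segmentation by maximum matching
print(mmcut("āļĄāļĩāļ„āļ§āļēāļĄāđ€āļ›āđ‡āļ™āđ„āļ›āđ„āļ”āđ‰āļ­āļĒāđˆāļēāļ‡āđ„āļĢāļšāđ‰āļēāļ‡"))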

Subword, syllable, and Thai Character Cluster (TCC)

Tokenization can also be done at the subword level, using either syllables or Thai Character Clusters (TCC).

Subword tokenization

The default subword tokenization engine is tcc, which uses the Thai Character Cluster (TCC) as the subword unit.

[35]:
from pythainlp import subword_tokenize

subword_tokenize("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ")  # default subword unit is TCC
[35]:
['āļ›', 'āļĢāļ°', 'āđ€āļ—', 'āļĻ', 'āđ„āļ—', 'āļĒ']

Syllable tokenization

The default syllable tokenization engine is dict, which uses the newmm word tokenization engine with a custom dictionary that contains known Thai syllables.

[36]:
from pythainlp.tokenize import syllable_tokenize

text = "āļ­āļąāļšāļ”āļļāļĨāđ€āļĨāļēāļ° āļ­āļĩāļ‹āļ­āļĄāļđāļ‹āļ­ āļŠāļĄāļ­āļ‡āļšāļ§āļĄāļĢāļļāļ™āđāļĢāļ‡"

syllable_tokenize(text)  # default engine is "dict"
[36]:
['āļ­āļąāļš',
 'āļ”āļļāļĨ',
 'āđ€āļĨāļēāļ°',
 ' ',
 'āļ­āļĩ',
 'āļ‹āļ­',
 'āļĄāļđ',
 'āļ‹āļ­',
 ' ',
 'āļŠāļĄāļ­āļ‡',
 'āļšāļ§āļĄ',
 'āļĢāļļāļ™',
 'āđāļĢāļ‡']

The external `ssg <https://github.com/ponrawee/ssg>`__ engine can also be used. Note that the ssg engine does not emit whitespace as separate tokens; spaces are attached to adjacent tokens.

[37]:
syllable_tokenize(text, engine="ssg")  # use "ssg" for syllable
[37]:
['āļ­āļąāļš', 'āļ”āļļāļĨ', 'āđ€āļĨāļēāļ°', ' āļ­āļĩ', 'āļ‹āļ­', 'āļĄāļđ', 'āļ‹āļ­ ', 'āļŠāļĄāļ­āļ‡', 'āļšāļ§āļĄ', 'āļĢāļļāļ™', 'āđāļĢāļ‡']

Low-level subword operations

These low-level TCC operations can be useful for some pre-processing tasks, such as checking whether it is safe to cut a string at a certain point, or finding typos.

[38]:
from pythainlp.tokenize import tcc

tcc.segment("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ")
[38]:
['āļ›', 'āļĢāļ°', 'āđ€āļ—', 'āļĻ', 'āđ„āļ—', 'āļĒ']
[39]:
tcc.tcc_pos("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ")  # return positions
[39]:
{1, 3, 5, 6, 8, 9}
[40]:
for ch in tcc.tcc("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ"):  # TCC generator
    print(ch, end='-')
āļ›-āļĢāļ°-āđ€āļ—-āļĻ-āđ„āļ—-āļĒ-
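For example, tcc_pos() can be used to check whether a character index is a safe cut point, i.e. one that does not split a cluster. A sketch based on the positions shown above:

# Sketch: test cut points against TCC boundaries
valid_cuts = tcc.tcc_pos("āļ›āļĢāļ°āđ€āļ—āļĻāđ„āļ—āļĒ")  # {1, 3, 5, 6, 8, 9}
print(4 in valid_cuts)  # False: cutting at 4 would split the cluster "āđ€āļ—"
print(5 in valid_cuts)  # True: "āļ›āļĢāļ°āđ€āļ—" | "āļĻāđ„āļ—āļĒ" keeps clusters intact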

Transliteration

There are two types of transliteration here: romanization and phonetic transliteration.

  • Romanization will render Thai words in the Latin alphabet using the Royal Thai General System of Transcription (RTGS).

    • Two engines are supported here: a simple royin engine (default) and a more accurate thai2rom engine.

  • Transliteration here, in PyThaiNLP context, means the sound representation of a string.

    • Two engines are supported here: ipa (International Phonetic Alphabet system, using Epitran) (default) and icu (International Components for Unicode, using PyICU).

[41]:
from pythainlp.transliterate import romanize

romanize("āđāļĄāļ§")  # output: 'maeo'
[41]:
'maeo'
[42]:
romanize("āļ āļēāļžāļĒāļ™āļ•āļĢāđŒ")  # output: 'phapn' (*obviously wrong)
[42]:
'phapn'
[43]:
from pythainlp.transliterate import transliterate

transliterate("āđāļĄāļ§")  # output: 'mɛːw'
Update Corpus...
Corpus: thai-g2p
- Already up to date.
[43]:
'm ɛː w ˧'
[44]:
transliterate("āļ āļēāļžāļĒāļ™āļ•āļĢāđŒ")  # output: 'pĘ°aːpjanot'
[44]:
'pĘ° aː pĖš ËĨËĐ . pĘ° a ËĶËĨ . j o n ˧'

Normalization

normalize() removes zero-width spaces (ZWSP and ZWNJ), duplicated spaces, repeating vowels, and dangling characters. It also reorders vowels and tone marks while removing repeating vowels.

[45]:
from pythainlp.util import normalize

normalize("āđ€āđ€āļ›āļĨāļ") == "āđāļ›āļĨāļ"  # āđ€ āđ€ āļ› āļĨ āļ  vs āđ āļ› āļĨ āļ
[45]:
True

The string below contains a non-standard order of Thai characters: Sara Aa (a following vowel) + Mai Ek (an upper tone mark). normalize() will reorder them to Mai Ek + Sara Aa.

[46]:
text = "āđ€āļāļēāđˆ"
normalize(text)
[46]:
'āđ€āļāđˆāļē'

This can be useful for string matching, including tokenization.

[47]:
from pythainlp import word_tokenize

text = "āđ€āļāđ‡āļšāļ§āļąāļ™āļ™āđ‰āļĩ āļžāļĢāđˆāļļāļ‡āļ™āđ‰āļĩāļāđ‡āđ€āļāļēāđˆ"

print("tokenize immediately:")
print(word_tokenize(text))
print("\nnormalize, then tokenize:")
print(word_tokenize(normalize(text)))
tokenize immediately:
['āđ€āļāđ‡āļš', 'āļ§āļąāļ™', 'āļ™āđ‰āļĩ', ' ', 'āļžāļĢāđˆāļļāļ‡āļ™āđ‰āļĩ', 'āļāđ‡', 'āđ€āļāļē', 'āđˆ']

normalize, then tokenize:
['āđ€āļāđ‡āļš', 'āļ§āļąāļ™āļ™āļĩāđ‰', ' ', 'āļžāļĢāļļāđˆāļ‡āļ™āļĩāđ‰', 'āļāđ‡', 'āđ€āļāđˆāļē']

The string below contains repeating vowels (multiple Sara A in a row). normalize() will keep only one of them. This can be used to reduce spelling variations, which is useful for classification tasks.

[48]:
normalize("āđ€āļāļ°āļ°āļ°")
[48]:
'āđ€āļāļ°'

Internally, normalize() is just a series of function calls like this:

text = remove_zw(text)
text = remove_dup_spaces(text)
text = remove_repeat_vowels(text)
text = remove_dangling(text)

If you don’t like the behavior of the default normalize(), you can call the functions shown above, plus remove_tonemark() and reorder_vowels(), individually from pythainlp.util to build your own normalization.
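For example, a custom normalizer that keeps repeating vowels but still cleans everything else might look like this (a sketch using the functions named above):

from pythainlp.util import remove_zw, remove_dup_spaces, remove_dangling

def my_normalize(text):
    # like normalize(), but without remove_repeat_vowels()
    text = remove_zw(text)
    text = remove_dup_spaces(text)
    return remove_dangling(text)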

Digit conversion

Thai text sometimes uses Thai digits. This can reduce performance for classification and searching. PyThaiNLP provides a few utility functions to deal with this.

[49]:
from pythainlp.util import arabic_digit_to_thai_digit, thai_digit_to_arabic_digit, digit_to_text

text = "āļ‰āļļāļāđ€āļ‰āļīāļ™āļ—āļĩāđˆāļĒāļļāđ‚āļĢāļ›āđ€āļĢāļĩāļĒāļ 112 āđ‘āđ‘āđ’"

arabic_digit_to_thai_digit(text)
[49]:
'āļ‰āļļāļāđ€āļ‰āļīāļ™āļ—āļĩāđˆāļĒāļļāđ‚āļĢāļ›āđ€āļĢāļĩāļĒāļ āđ‘āđ‘āđ’ āđ‘āđ‘āđ’'
[50]:
thai_digit_to_arabic_digit(text)
[50]:
'āļ‰āļļāļāđ€āļ‰āļīāļ™āļ—āļĩāđˆāļĒāļļāđ‚āļĢāļ›āđ€āļĢāļĩāļĒāļ 112 112'
[51]:
digit_to_text(text)
[51]:
'āļ‰āļļāļāđ€āļ‰āļīāļ™āļ—āļĩāđˆāļĒāļļāđ‚āļĢāļ›āđ€āļĢāļĩāļĒāļ āļŦāļ™āļķāđˆāļ‡āļŦāļ™āļķāđˆāļ‡āļŠāļ­āļ‡ āļŦāļ™āļķāđˆāļ‡āļŦāļ™āļķāđˆāļ‡āļŠāļ­āļ‡'

Soundex

“Soundex is a phonetic algorithm for indexing names by sound.” (Wikipedia) PyThaiNLP provides three kinds of Thai soundex: lk82, udom83, and metasound.

[52]:
from pythainlp.soundex import lk82, metasound, udom83

# check equivalence
print(lk82("āļĢāļ–") == lk82("āļĢāļ”"))
print(udom83("āļ§āļĢāļĢ") == udom83("āļ§āļąāļ™"))
print(metasound("āļ™āļž") == metasound("āļ™āļ "))
True
True
True
[53]:
texts = ["āļšāļđāļĢāļ“āļ°", "āļšāļđāļĢāļ“āļāļēāļĢ", "āļĄāļąāļ", "āļĄāļąāļ„", "āļĄāļĢāļĢāļ„", "āļĨāļąāļ", "āļĢāļąāļ", "āļĢāļąāļāļĐāđŒ", ""]
for text in texts:
    print(
        "{} - lk82: {} - udom83: {} - metasound: {}".format(
            text, lk82(text), udom83(text), metasound(text)
        )
    )
āļšāļđāļĢāļ“āļ° - lk82: āļšE400 - udom83: āļš930000 - metasound: āļš550
āļšāļđāļĢāļ“āļāļēāļĢ - lk82: āļšE419 - udom83: āļš931900 - metasound: āļš551
āļĄāļąāļ - lk82: āļĄ1000 - udom83: āļĄ100000 - metasound: āļĄ100
āļĄāļąāļ„ - lk82: āļĄ1000 - udom83: āļĄ100000 - metasound: āļĄ100
āļĄāļĢāļĢāļ„ - lk82: āļĄ1000 - udom83: āļĄ310000 - metasound: āļĄ551
āļĨāļąāļ - lk82: āļĢ1000 - udom83: āļĢ100000 - metasound: āļĨ100
āļĢāļąāļ - lk82: āļĢ1000 - udom83: āļĢ100000 - metasound: āļĢ100
āļĢāļąāļāļĐāđŒ - lk82: āļĢ1000 - udom83: āļĢ100000 - metasound: āļĢ100
 - lk82:  - udom83:  - metasound:
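Since similar-sounding words share a soundex key, the key can serve as an index, as in the definition above. A sketch:

from collections import defaultdict
from pythainlp.soundex import lk82

index = defaultdict(list)
for word in ["āļĢāļąāļ", "āļĢāļąāļāļĐāđŒ", "āļĨāļąāļ"]:
    index[lk82(word)].append(word)
print(dict(index))  # all three share the key 'āļĢ1000' (see the table above)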

Spellchecking

The default spellchecker uses Peter Norvig’s algorithm together with word frequencies from the Thai National Corpus (TNC).

spell() returns a list of all possible spellings.

[54]:
from pythainlp import spell

spell("āđ€āļŦāļĨāļ·āļĒāļĄ")
[54]:
['āđ€āļŦāļĨāļĩāļĒāļĄ', 'āđ€āļŦāļĨāļ·āļ­āļĄ']

correct() returns the most likely spelling.

[55]:
from pythainlp import correct

correct("āđ€āļŦāļĨāļ·āļĒāļĄ")
[55]:
'āđ€āļŦāļĨāļĩāļĒāļĄ'

Spellchecking - Custom dictionary and word frequency

A custom dictionary can be provided when creating a spellchecker.

When creating a NorvigSpellChecker object, you can pass a custom dictionary to the custom_dict parameter.

custom_dict can be:

  • a dictionary (dict), with words (str) as keys and frequencies (int) as values; or

  • a list, a tuple, or a set of (word, frequency) tuples; or

  • a list, a tuple, or a set of just words, without their frequencies – in this case a frequency of 1 will be assigned to every word.

[56]:
from pythainlp.spell import NorvigSpellChecker

user_dict = [("āđ€āļŦāļĨāļĩāļĒāļĄ", 50), ("āđ€āļŦāļĨāļ·āļ­āļĄ", 1000), ("āđ€āļŦāļĨāļĩāļĒāļ§", 1000000)]
checker = NorvigSpellChecker(custom_dict=user_dict)

checker.spell("āđ€āļŦāļĨāļ·āļĒāļĄ")
[56]:
['āđ€āļŦāļĨāļ·āļ­āļĄ', 'āđ€āļŦāļĨāļĩāļĒāļĄ']

As you can see, our version of NorvigSpellChecker gives edit distance priority over word frequency.
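The other two accepted custom_dict forms, sketched with the same words:

# Sketch: dict form (word -> frequency) and plain-word form (each word gets frequency 1)
checker_from_dict = NorvigSpellChecker(custom_dict={"āđ€āļŦāļĨāļĩāļĒāļĄ": 50, "āđ€āļŦāļĨāļ·āļ­āļĄ": 1000})
checker_from_words = NorvigSpellChecker(custom_dict=["āđ€āļŦāļĨāļĩāļĒāļĄ", "āđ€āļŦāļĨāļ·āļ­āļĄ"])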

You can use word frequencies from Thai National Corpus and Thai Textbook Corpus as well.

By default, NorvigSpellChecker uses Thai National Corpus.

[57]:
from pythainlp.corpus import ttc  # Thai Textbook Corpus

checker = NorvigSpellChecker(custom_dict=ttc.word_freqs())

checker.spell("āđ€āļŦāļĨāļ·āļĒāļĄ")
[57]:
['āđ€āļŦāļĨāļ·āļ­āļĄ']
[58]:
checker.correct("āđ€āļŦāļĨāļ·āļĒāļĄ")
[58]:
'āđ€āļŦāļĨāļ·āļ­āļĄ'

To check the current dictionary of a spellchecker:

[59]:
list(checker.dictionary())[1:10]
[59]:
[('āļžāļīāļ˜āļĩāđ€āļ›āļīāļ”', 18),
 ('āđ„āļŠāđ‰āļāļĢāļ­āļ', 40),
 ('āļ›āļĨāļīāļ‡', 6),
 ('āđ€āļ•āđ‡āļ‡', 13),
 ('āļ‚āļ­āļšāļ„āļļāļ“', 356),
 ('āļ›āļĢāļ°āļŠāļēāļ™', 84),
 ('āļĢāļģāđ„āļĢ', 11),
 ('āļĢāđˆāļ§āļĄāļ—āđ‰āļ­āļ‡', 4),
 ('āļāļąāļāļĄāļ°āļ‚āļēāļĄ', 3)]

We can also apply conditions and a filter function to the dictionary when creating a spellchecker.

[60]:
checker = NorvigSpellChecker()  # use default filter (remove any word with number or non-Thai character)
len(checker.dictionary())
[60]:
39963
[61]:
checker = NorvigSpellChecker(min_freq=5, min_len=2, max_len=15)
len(checker.dictionary())
[61]:
30376
[62]:
checker_no_filter = NorvigSpellChecker(dict_filter=None)  # use no filter
len(checker_no_filter.dictionary())
[62]:
66209
[63]:
def remove_yamok(word):
    return "āđ†" not in word  # keep only words without the repetition mark (mai yamok)

checker_custom_filter = NorvigSpellChecker(dict_filter=remove_yamok)  # use custom filter
len(checker_custom_filter.dictionary())
[63]:
66204

Part-of-Speech Tagging

[64]:
from pythainlp.tag import pos_tag, pos_tag_sents

pos_tag(["āļāļēāļĢ","āđ€āļ”āļīāļ™āļ—āļēāļ‡"])
[64]:
[('āļāļēāļĢ', 'FIXN'), ('āđ€āļ”āļīāļ™āļ—āļēāļ‡', 'VACT')]
[65]:
sents = [["āļ›āļĢāļ°āļāļēāļĻāļŠāļģāļ™āļąāļāļ™āļēāļĒāļāļŊ", " ", "āđƒāļŦāđ‰",
    " ", "'āļžāļĨ.āļ—.āļŠāļĢāļĢāđ€āļŠāļĢāļīāļ āđāļāđ‰āļ§āļāļģāđ€āļ™āļīāļ”'", " ", "āļžāđ‰āļ™āļˆāļēāļāļ•āļģāđāļŦāļ™āđˆāļ‡",
    " ", "āļœāļđāđ‰āļ—āļĢāļ‡āļ„āļļāļ“āļ§āļļāļ’āļīāļžāļīāđ€āļĻāļĐ", "āļāļ­āļ‡āļ—āļąāļžāļšāļ", " ", "āļāļĢāļ°āļ—āļĢāļ§āļ‡āļāļĨāļēāđ‚āļŦāļĄ"],
    ["āđāļĨāļ°", "āđāļ•āđˆāļ‡āļ•āļąāđ‰āļ‡", "āđƒāļŦāđ‰", "āđ€āļ›āđ‡āļ™", "'āļ­āļ˜āļīāļšāļ”āļĩāļāļĢāļĄāļ›āļĢāļ°āļŠāļēāļŠāļąāļĄāļžāļąāļ™āļ˜āđŒ'"]]

pos_tag_sents(sents)
[65]:
[[('āļ›āļĢāļ°āļāļēāļĻāļŠāļģāļ™āļąāļāļ™āļēāļĒāļāļŊ', 'NCMN'),
  (' ', 'PUNC'),
  ('āđƒāļŦāđ‰', 'JSBR'),
  (' ', 'PUNC'),
  ("'āļžāļĨ.āļ—.āļŠāļĢāļĢāđ€āļŠāļĢāļīāļ āđāļāđ‰āļ§āļāļģāđ€āļ™āļīāļ”'", 'NCMN'),
  (' ', 'PUNC'),
  ('āļžāđ‰āļ™āļˆāļēāļāļ•āļģāđāļŦāļ™āđˆāļ‡', 'NCMN'),
  (' ', 'PUNC'),
  ('āļœāļđāđ‰āļ—āļĢāļ‡āļ„āļļāļ“āļ§āļļāļ’āļīāļžāļīāđ€āļĻāļĐ', 'NCMN'),
  ('āļāļ­āļ‡āļ—āļąāļžāļšāļ', 'NCMN'),
  (' ', 'PUNC'),
  ('āļāļĢāļ°āļ—āļĢāļ§āļ‡āļāļĨāļēāđ‚āļŦāļĄ', 'NCMN')],
 [('āđāļĨāļ°', 'JCRG'),
  ('āđāļ•āđˆāļ‡āļ•āļąāđ‰āļ‡', 'VACT'),
  ('āđƒāļŦāđ‰', 'JSBR'),
  ('āđ€āļ›āđ‡āļ™', 'VSTA'),
  ("'āļ­āļ˜āļīāļšāļ”āļĩāļāļĢāļĄāļ›āļĢāļ°āļŠāļēāļŠāļąāļĄāļžāļąāļ™āļ˜āđŒ'", 'NCMN')]]

Named-Entity Tagging

The tagger uses the BIO scheme:

  • B - beginning of entity

  • I - inside entity

  • O - outside entity

[1]:
#!pip3 install pythainlp[ner]
from pythainlp.tag.thainer import ThaiNameTagger

ner = ThaiNameTagger()
ner.get_ner("24 āļĄāļī.āļĒ. 2563 āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļšāđ€āļ§āļĨāļē 6:00 āļ™. āđ€āļ”āļīāļ™āļ—āļēāļ‡āļˆāļēāļāļ‚āļ™āļŠāđˆāļ‡āļāļĢāļļāļ‡āđ€āļ—āļžāđƒāļāļĨāđ‰āļ–āļ™āļ™āļāļģāđāļžāļ‡āđ€āļžāļŠāļĢ āđ„āļ›āļˆāļąāļ‡āļŦāļ§āļąāļ”āļāļģāđāļžāļ‡āđ€āļžāļŠāļĢ āļ•āļąāđ‹āļ§āļĢāļēāļ„āļē 297 āļšāļēāļ—")
[1]:
[('24', 'NUM', 'B-DATE'),
 (' ', 'PUNCT', 'I-DATE'),
 ('āļĄāļī.āļĒ.', 'NOUN', 'I-DATE'),
 (' ', 'PUNCT', 'O'),
 ('2563', 'NUM', 'O'),
 (' ', 'PUNCT', 'O'),
 ('āļ—āļ”āļŠāļ­āļšāļĢāļ°āļšāļš', 'PART', 'O'),
 ('āđ€āļ§āļĨāļē', 'NOUN', 'O'),
 (' ', 'PUNCT', 'O'),
 ('6:00', 'NUM', 'B-TIME'),
 (' ', 'PUNCT', 'I-TIME'),
 ('āļ™.', 'NOUN', 'I-TIME'),
 (' ', 'PUNCT', 'O'),
 ('āđ€āļ”āļīāļ™āļ—āļēāļ‡', 'VERB', 'O'),
 ('āļˆāļēāļ', 'ADP', 'O'),
 ('āļ‚āļ™āļŠāđˆāļ‡', 'NOUN', 'B-ORGANIZATION'),
 ('āļāļĢāļļāļ‡āđ€āļ—āļž', 'NOUN', 'I-ORGANIZATION'),
 ('āđƒāļāļĨāđ‰', 'ADJ', 'O'),
 ('āļ–āļ™āļ™', 'NOUN', 'B-LOCATION'),
 ('āļāļģāđāļžāļ‡āđ€āļžāļŠāļĢ', 'NOUN', 'I-LOCATION'),
 (' ', 'PUNCT', 'O'),
 ('āđ„āļ›', 'AUX', 'O'),
 ('āļˆāļąāļ‡āļŦāļ§āļąāļ”', 'VERB', 'B-LOCATION'),
 ('āļāļģāđāļžāļ‡āđ€āļžāļŠāļĢ', 'NOUN', 'I-LOCATION'),
 (' ', 'PUNCT', 'O'),
 ('āļ•āļąāđ‹āļ§', 'NOUN', 'O'),
 ('āļĢāļēāļ„āļē', 'NOUN', 'O'),
 (' ', 'PUNCT', 'O'),
 ('297', 'NUM', 'B-MONEY'),
 (' ', 'PUNCT', 'I-MONEY'),
 ('āļšāļēāļ—', 'NOUN', 'I-MONEY')]
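For downstream use, the BIO tags can be collapsed into labeled entity spans. A minimal sketch (not part of PyThaiNLP):

def group_entities(tagged):
    """Merge consecutive B-/I- tokens from get_ner() output into (text, label) spans."""
    entities, current, label = [], "", None
    for word, _pos, tag in tagged:
        if tag.startswith("B-"):
            if current:
                entities.append((current, label))
            current, label = word, tag[2:]
        elif tag.startswith("I-") and current:
            current += word
        else:
            if current:
                entities.append((current, label))
            current, label = "", None
    if current:
        entities.append((current, label))
    return entities

# e.g. the output above would yield [('24 āļĄāļī.āļĒ.', 'DATE'), ('6:00 āļ™.', 'TIME'), ...]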

Word Vector

[67]:
import pythainlp.word_vector

pythainlp.word_vector.similarity("āļ„āļ™", "āļĄāļ™āļļāļĐāļĒāđŒ")
[67]:
0.2504981
[68]:
pythainlp.word_vector.doesnt_match(["āļ„āļ™", "āļĄāļ™āļļāļĐāļĒāđŒ", "āļšāļļāļ„āļ„āļĨ", "āđ€āļˆāđ‰āļēāļŦāļ™āđ‰āļēāļ—āļĩāđˆ", "āđ„āļāđˆ"])
/usr/local/lib/python3.7/site-packages/gensim/models/keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
[68]:
'āđ„āļāđˆ'

Number Spell Out

[69]:
from pythainlp.util import bahttext

bahttext(1234567890123.45)
[69]:
'āļŦāļ™āļķāđˆāļ‡āļĨāđ‰āļēāļ™āļŠāļ­āļ‡āđāļŠāļ™āļŠāļēāļĄāļŦāļĄāļ·āđˆāļ™āļŠāļĩāđˆāļžāļąāļ™āļŦāđ‰āļēāļĢāđ‰āļ­āļĒāļŦāļāļŠāļīāļšāđ€āļˆāđ‡āļ”āļĨāđ‰āļēāļ™āđāļ›āļ”āđāļŠāļ™āđ€āļāđ‰āļēāļŦāļĄāļ·āđˆāļ™āļŦāļ™āļķāđˆāļ‡āļĢāđ‰āļ­āļĒāļĒāļĩāđˆāļŠāļīāļšāļŠāļēāļĄāļšāļēāļ—āļŠāļĩāđˆāļŠāļīāļšāļŦāđ‰āļēāļŠāļ•āļēāļ‡āļ„āđŒ'

bahttext() will round the satang part.

[70]:
bahttext(1.909)
[70]:
'āļŦāļ™āļķāđˆāļ‡āļšāļēāļ—āđ€āļāđ‰āļēāļŠāļīāļšāđ€āļ­āđ‡āļ”āļŠāļ•āļēāļ‡āļ„āđŒ'