Source code for pythainlp.summarize

# -*- coding: utf-8 -*-

from __future__ import absolute_import, unicode_literals

from collections import defaultdict
from heapq import nlargest
from string import punctuation

from pythainlp.corpus import stopwords
from pythainlp.tokenize import sent_tokenize, word_tokenize


class FrequencySummarizer:
    """
    Extractive summarizer that ranks sentences by normalized word frequency.

    Words are counted over all sentences (stop words and ASCII punctuation
    excluded), each count is divided by the highest count, and words whose
    normalized frequency falls at or beyond the cut-offs are discarded.
    A sentence's score is the sum of the remaining frequencies of its words.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        """
        :param float min_cut: words with normalized frequency <= this value
            are ignored
        :param float max_cut: words with normalized frequency >= this value
            are ignored
        """
        self.__min_cut = min_cut
        self.__max_cut = max_cut
        self.__stopwords = set(stopwords.words("thai") + list(punctuation))

    def __compute_frequencies(self, word_tokenized_sents):
        """
        Build the normalized frequency table used to score sentences.

        :param word_tokenized_sents: iterable of sentences, each one an
            iterable of word tokens
        :return: mapping of word -> frequency relative to the most frequent
            word, restricted to words strictly between the two cut-offs;
            empty when there is no countable word
        """
        word_freqs = defaultdict(int)
        for sent in word_tokenized_sents:
            for word in sent:
                if word not in self.__stopwords:
                    word_freqs[word] += 1

        # No countable word (empty input, or only stop words/punctuation):
        # max() on an empty sequence would raise ValueError, so stop here.
        if not word_freqs:
            return word_freqs

        max_freq = float(max(word_freqs.values()))
        # Iterate over a snapshot of the keys because entries are deleted
        # from the mapping inside the loop.
        for w in list(word_freqs):
            word_freqs[w] = word_freqs[w] / max_freq
            if word_freqs[w] >= self.__max_cut or word_freqs[w] <= self.__min_cut:
                del word_freqs[w]

        return word_freqs

    def __rank(self, ranking, n):
        """
        Return the keys of ``ranking`` with the ``n`` highest values,
        highest first.
        """
        return nlargest(n, ranking, key=ranking.get)

    def summarize(self, text, n, tokenizer):
        """
        Select up to ``n`` sentences of ``text`` with the highest scores.

        :param str text: text to be summarized
        :param int n: maximum number of sentences to return
        :param str tokenizer: word tokenizer name passed to ``word_tokenize``
        :return: selected sentences ordered by descending score; sentences
            that contain no scored word are never selected, so fewer than
            ``n`` sentences (possibly none) may be returned
        """
        sents = sent_tokenize(text)
        word_tokenized_sents = [word_tokenize(sent, tokenizer) for sent in sents]
        self.__freq = self.__compute_frequencies(word_tokenized_sents)
        ranking = defaultdict(int)

        # Only sentences holding at least one scored word get an entry.
        for i, sent in enumerate(word_tokenized_sents):
            for w in sent:
                if w in self.__freq:
                    ranking[i] += self.__freq[w]
        summaries_idx = self.__rank(ranking, n)

        return [sents[j] for j in summaries_idx]


def summarize_text(text, n, engine="frequency", tokenizer="newmm"):
    """
    Thai text summarization

    :param str text: text to be summarized
    :param int n: number of sentences to be included in the summary
    :param str engine: text summarization engine
    :param str tokenizer: word tokenizer
    :return List[str] summary: list of selected sentences
    """
    if engine == "frequency":
        return FrequencySummarizer().summarize(text, n, tokenizer)

    # if engine not found, return first n sentences
    return sent_tokenize(text)[:n]