# Source code for pythainlp.benchmarks.metrics

# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Evaluation metrics for Thai text generation tasks.

This module provides pure Python implementations of common evaluation
metrics (BLEU, ROUGE) that handle Thai text tokenization automatically.
"""
from __future__ import annotations

import math
from collections import Counter
from typing import Union


def _get_ngrams(tokens: list[str], n: int) -> list[tuple[str, ...]]:
    """
    Get n-grams from a list of tokens.

    :param list[str] tokens: list of tokens
    :param int n: n-gram size

    :return: list of n-grams
    :rtype: list[tuple[str, ...]]
    """
    return [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]


def _calculate_precision_recall_fmeasure(
    overlap: int, hyp_count: int, ref_count: int
) -> tuple[float, float, float]:
    """
    Calculate precision, recall, and F-measure.

    :param int overlap: number of overlapping items
    :param int hyp_count: number of items in hypothesis
    :param int ref_count: number of items in reference

    :return: precision, recall, and F-measure
    :rtype: tuple[float, float, float]
    """
    precision = overlap / hyp_count if hyp_count > 0 else 0.0
    recall = overlap / ref_count if ref_count > 0 else 0.0

    if precision + recall > 0:
        fmeasure = 2 * precision * recall / (precision + recall)
    else:
        fmeasure = 0.0

    return precision, recall, fmeasure


def _lcs_length(x: list[str], y: list[str]) -> int:
    """
    Calculate the length of the longest common subsequence (LCS).

    :param list[str] x: first sequence
    :param list[str] y: second sequence

    :return: length of LCS
    :rtype: int
    """
    m, n = len(x), len(y)
    # Create a 2D array to store LCS lengths
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if x[i - 1] == y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    return dp[m][n]


def bleu_score(
    references: Union[list[str], list[list[str]]],
    hypotheses: list[str],
    tokenize: str = "newmm",
    lowercase: bool = False,
    max_ngram: int = 4,
    smooth: bool = True,
) -> dict[str, float]:
    """
    Calculate BLEU score for Thai text with automatic tokenization.

    This is a pure Python implementation of BLEU (Bilingual Evaluation
    Understudy) metric that automatically tokenizes Thai text using
    PyThaiNLP before calculating the score.

    :param list[str] | list[list[str]] references: reference translations.
        Can be:

        - A list of strings (one reference per hypothesis)
        - A list of lists of strings (multiple references per hypothesis)
    :param list[str] hypotheses: hypothesis translations to evaluate
    :param str tokenize: tokenization engine to use (default: "newmm").
        See :func:`pythainlp.tokenize.word_tokenize` for available engines.
    :param bool lowercase: whether to lowercase text before evaluation
        (default: False)
    :param int max_ngram: maximum n-gram order (default: 4)
    :param bool smooth: whether to use smoothing for zero counts
        (default: True)

    :return: dictionary with 'bleu' (percentage), 'precisions' (list of
        per-order precisions), 'bp' (brevity penalty), 'length_ratio',
        'hyp_length', and 'ref_length'
    :rtype: dict[str, float]

    :Example:
    ::

        from pythainlp.benchmarks import bleu_score

        references = ["สวัสดีครับ วันนี้อากาศดีมาก"]
        hypotheses = ["สวัสดีค่ะ วันนี้อากาศดี"]
        score = bleu_score(references, hypotheses)
        print(f"BLEU score: {score['bleu']:.2f}")

    ::

        # Multiple references per hypothesis
        references = [
            ["สวัสดีครับ", "สวัสดีค่ะ"],  # two refs for first hypothesis
            ["ลาก่อนครับ", "ลาก่อนค่ะ"],  # two refs for second hypothesis
        ]
        hypotheses = ["สวัสดี", "ลาก่อน"]
        score = bleu_score(references, hypotheses)
    """
    from pythainlp.tokenize import word_tokenize

    # Normalize references to list-of-lists form (one list per hypothesis).
    if references and isinstance(references[0], str):
        refs_normalized: list[list[str]] = [[ref] for ref in references]
    else:
        refs_normalized = references  # type: ignore[assignment]

    def _tokenize_text(text: str) -> list[str]:
        """Tokenize a single text, optionally lowercasing tokens."""
        tokens = word_tokenize(text, engine=tokenize, keep_whitespace=False)
        if lowercase:
            tokens = [token.lower() for token in tokens]
        return tokens

    hyp_tokens_list = [_tokenize_text(hyp) for hyp in hypotheses]
    refs_tokens_list = [
        [_tokenize_text(ref) for ref in refs] for refs in refs_normalized
    ]

    total_hyp_length = 0
    total_ref_length = 0
    clipped_counts = [0] * max_ngram
    total_counts = [0] * max_ngram

    # NOTE: zip() pairs hypotheses with references positionally; extra
    # entries on either side are silently ignored.
    for hyp_tokens, ref_tokens_group in zip(hyp_tokens_list, refs_tokens_list):
        total_hyp_length += len(hyp_tokens)

        # Use the reference length closest to the hypothesis length
        # (standard BLEU "best match length").
        ref_lengths = [len(ref) for ref in ref_tokens_group]
        closest_ref_len = min(
            ref_lengths, key=lambda ref_len: abs(ref_len - len(hyp_tokens))
        )
        total_ref_length += closest_ref_len

        for n in range(1, max_ngram + 1):
            hyp_ngrams = _get_ngrams(hyp_tokens, n)
            hyp_ngram_counts = Counter(hyp_ngrams)

            # Take, for each n-gram, the maximum count over all references.
            max_ref_counts: Counter[tuple[str, ...]] = Counter()
            for ref_tokens in ref_tokens_group:
                ref_ngram_counts = Counter(_get_ngrams(ref_tokens, n))
                for ngram, count in ref_ngram_counts.items():
                    max_ref_counts[ngram] = max(max_ref_counts[ngram], count)

            # Clip hypothesis counts by the reference maxima.
            clipped_count = 0
            for ngram, count in hyp_ngram_counts.items():
                clipped_count += min(count, max_ref_counts[ngram])

            clipped_counts[n - 1] += clipped_count
            total_counts[n - 1] += len(hyp_ngrams)

    # Brevity penalty. Guard total_hyp_length == 0 (all hypotheses empty
    # after tokenization), which previously raised ZeroDivisionError;
    # an empty hypothesis scores 0.
    if total_hyp_length == 0:
        bp = 0.0
    elif total_hyp_length < total_ref_length:
        bp = math.exp(1 - total_ref_length / total_hyp_length)
    else:
        bp = 1.0

    # Modified n-gram precisions, optionally smoothed for zero counts.
    precisions = []
    for i in range(max_ngram):
        if total_counts[i] > 0:
            if smooth and clipped_counts[i] == 0:
                precision = 1.0 / (2 * total_counts[i])
            else:
                precision = clipped_counts[i] / total_counts[i]
        else:
            precision = 0.0
        precisions.append(precision)

    # BLEU = bp * geometric mean of precisions; zero if any precision is 0.
    if all(p > 0 for p in precisions):
        log_precisions = [math.log(p) for p in precisions]
        geo_mean = math.exp(sum(log_precisions) / max_ngram)
        bleu = bp * geo_mean
    else:
        bleu = 0.0

    return {
        "bleu": bleu * 100,  # Return as percentage
        "precisions": precisions,
        "bp": bp,
        "length_ratio": total_hyp_length / total_ref_length
        if total_ref_length > 0
        else 0.0,
        "hyp_length": total_hyp_length,
        "ref_length": total_ref_length,
    }
def rouge_score(
    reference: str,
    hypothesis: str,
    tokenize: str = "newmm",
    rouge_types: list[str] | None = None,
) -> dict[str, tuple[float, float, float]]:
    """
    Calculate ROUGE scores for Thai text with automatic tokenization.

    This is a pure Python implementation of ROUGE (Recall-Oriented
    Understudy for Gisting Evaluation) metric that automatically
    tokenizes Thai text using PyThaiNLP.

    Supported ROUGE types:

    - rouge1: unigram-based scoring
    - rouge2: bigram-based scoring
    - rougeL: longest common subsequence-based scoring

    :param str reference: reference text
    :param str hypothesis: hypothesis text to evaluate
    :param str tokenize: tokenization engine to use (default: "newmm").
        See :func:`pythainlp.tokenize.word_tokenize` for available engines.
    :param list[str] | None rouge_types: list of ROUGE types to calculate.
        Default is ["rouge1", "rouge2", "rougeL"]. Unrecognized types are
        silently skipped.

    :return: dictionary mapping ROUGE type to (precision, recall, fmeasure)
    :rtype: dict[str, tuple[float, float, float]]

    :Example:
    ::

        from pythainlp.benchmarks import rouge_score

        reference = "สวัสดีครับ วันนี้อากาศดีมาก"
        hypothesis = "สวัสดีค่ะ วันนี้อากาศดี"
        scores = rouge_score(reference, hypothesis)
        print(f"ROUGE-1 F-measure: {scores['rouge1'][2]:.4f}")
        print(f"ROUGE-2 F-measure: {scores['rouge2'][2]:.4f}")
        print(f"ROUGE-L F-measure: {scores['rougeL'][2]:.4f}")
    """
    from pythainlp.tokenize import word_tokenize

    if rouge_types is None:
        rouge_types = ["rouge1", "rouge2", "rougeL"]

    ref_words = word_tokenize(reference, engine=tokenize, keep_whitespace=False)
    hyp_words = word_tokenize(hypothesis, engine=tokenize, keep_whitespace=False)

    scores: dict[str, tuple[float, float, float]] = {}

    for kind in rouge_types:
        if kind == "rouge1":
            # Unigram overlap via multiset intersection.
            matched = sum((Counter(ref_words) & Counter(hyp_words)).values())
            scores[kind] = _calculate_precision_recall_fmeasure(
                matched, len(hyp_words), len(ref_words)
            )
        elif kind == "rouge2":
            # Bigram overlap via multiset intersection.
            ref_pairs = _get_ngrams(ref_words, 2)
            hyp_pairs = _get_ngrams(hyp_words, 2)
            matched = sum((Counter(ref_pairs) & Counter(hyp_pairs)).values())
            scores[kind] = _calculate_precision_recall_fmeasure(
                matched, len(hyp_pairs), len(ref_pairs)
            )
        elif kind == "rougeL":
            # Longest common subsequence of the two token streams.
            matched = _lcs_length(ref_words, hyp_words)
            scores[kind] = _calculate_precision_recall_fmeasure(
                matched, len(hyp_words), len(ref_words)
            )

    return scores
def word_error_rate(
    reference: str,
    hypothesis: str,
    tokenize: str = "newmm",
) -> float:
    """
    Calculate Word Error Rate (WER) for Thai text with automatic
    tokenization.

    Word Error Rate is a common metric for evaluating speech recognition
    and machine translation systems. It measures the minimum number of
    word-level edits (insertions, deletions, substitutions) needed to
    transform the hypothesis into the reference, normalized by the
    reference length.

    WER = (S + D + I) / N

    where:

    - S = number of substitutions
    - D = number of deletions
    - I = number of insertions
    - N = number of words in reference

    :param str reference: reference text
    :param str hypothesis: hypothesis text to evaluate
    :param str tokenize: tokenization engine to use (default: "newmm").
        See :func:`pythainlp.tokenize.word_tokenize` for available engines.

    :return: word error rate as a float (0.0 = perfect, >1.0 = very poor);
        ``inf`` when the reference is empty but the hypothesis is not
    :rtype: float

    :Example:
    ::

        from pythainlp.benchmarks import word_error_rate

        reference = "สวัสดีครับ วันนี้อากาศดีมาก"
        hypothesis = "สวัสดีค่ะ วันนี้อากาศดี"
        wer = word_error_rate(reference, hypothesis)
        print(f"WER: {wer:.4f}")
    """
    from pythainlp.tokenize import word_tokenize

    ref_tokens = word_tokenize(reference, engine=tokenize, keep_whitespace=False)
    hyp_tokens = word_tokenize(hypothesis, engine=tokenize, keep_whitespace=False)

    r = len(ref_tokens)
    h = len(hyp_tokens)

    # Early exit for an empty reference: avoids building the DP rows and
    # matches the original semantics (0.0 for both empty, inf otherwise).
    if r == 0:
        return 0.0 if h == 0 else float('inf')

    # Levenshtein distance with two rolling rows: O(h) memory instead of
    # the full O(r*h) matrix; results are identical.
    prev = list(range(h + 1))
    for i in range(1, r + 1):
        curr = [i] + [0] * h
        for j in range(1, h + 1):
            if ref_tokens[i - 1] == hyp_tokens[j - 1]:
                curr[j] = prev[j - 1]
            else:
                # min of substitution, insertion, deletion costs
                curr[j] = 1 + min(prev[j - 1], curr[j - 1], prev[j])
        prev = curr

    return prev[h] / r
def character_error_rate(
    reference: str,
    hypothesis: str,
) -> float:
    """
    Calculate Character Error Rate (CER) for Thai text.

    Character Error Rate is a metric for evaluating speech recognition
    and optical character recognition (OCR) systems. It measures the
    minimum number of character-level edits (insertions, deletions,
    substitutions) needed to transform the hypothesis into the reference,
    normalized by the reference length.

    CER = (S + D + I) / N

    where:

    - S = number of substitutions
    - D = number of deletions
    - I = number of insertions
    - N = number of characters in reference

    :param str reference: reference text
    :param str hypothesis: hypothesis text to evaluate

    :return: character error rate as a float (0.0 = perfect, >1.0 = very
        poor); ``inf`` when the reference is empty but the hypothesis is not
    :rtype: float

    :Example:
    ::

        from pythainlp.benchmarks import character_error_rate

        reference = "สวัสดีครับ"
        hypothesis = "สวัสดีค่ะ"
        cer = character_error_rate(reference, hypothesis)
        print(f"CER: {cer:.4f}")
    """
    # Work with characters directly (no tokenization needed).
    r = len(reference)
    h = len(hypothesis)

    # Early exit for an empty reference: avoids building the DP rows and
    # matches the original semantics (0.0 for both empty, inf otherwise).
    if r == 0:
        return 0.0 if h == 0 else float('inf')

    # Levenshtein distance with two rolling rows: O(h) memory instead of
    # the full O(r*h) matrix; results are identical.
    prev = list(range(h + 1))
    for i in range(1, r + 1):
        curr = [i] + [0] * h
        for j in range(1, h + 1):
            if reference[i - 1] == hypothesis[j - 1]:
                curr[j] = prev[j - 1]
            else:
                # min of substitution, insertion, deletion costs
                curr[j] = 1 + min(prev[j - 1], curr[j - 1], prev[j])
        prev = curr

    return prev[h] / r