# Source code for aac_metrics.functional.rouge_l

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging
from typing import Callable, TypedDict, Union

import numpy as np
import torch
from torch import Tensor

from aac_metrics.utils.checks import check_metric_inputs

# Mapping with a single "rouge_l" key holding the score tensor.
ROUGELScores = TypedDict("ROUGELScores", {"rouge_l": Tensor})
# Pair of score mappings; `_rouge_l_compute` returns them as (corpus-level, sentence-level).
ROUGELOuts = tuple[ROUGELScores, ROUGELScores]


# Module-level logger named after this module.
pylog = logging.getLogger(__name__)



# [docs]
def rouge_l(
    candidates: list[str],
    mult_references: list[list[str]],
    return_all_scores: bool = True,
    *,
    beta: float = 1.2,
    tokenizer: Callable[[str], list[str]] = str.split,
) -> Union[ROUGELOuts, Tensor]:
    """Compute the ROUGE-L (Recall-Oriented Understudy for Gisting Evaluation) metric.

    Each candidate is scored against its own list of references, then the
    per-sentence scores are averaged into a single corpus-level score.

    - Paper: https://aclanthology.org/W04-1013.pdf
    - Original Author: Ramakrishna Vedantam <vrama91@vt.edu>
    - Original implementation: https://github.com/tylin/coco-caption

    :param candidates: The sentences to evaluate.
    :param mult_references: For each candidate, the list of target sentences.
    :param return_all_scores: If True, return a tuple (corpus scores, sentence scores).
        If False, return only the corpus-level score as a scalar tensor.
        defaults to True.
    :param beta: Weight given to recall in the combined f-score. defaults to 1.2.
    :param tokenizer: Callable splitting a sentence into a list of words. defaults to str.split.
    :returns: A tuple of corpus and sentence scores, or a scalar tensor with the corpus score.
    """
    # Start from a fresh accumulator: this functional entry point keeps no state between calls.
    sentence_scores = _rouge_l_update(
        candidates,
        mult_references,
        beta,
        tokenizer,
        prev_rouge_l_scores=[],
    )
    return _rouge_l_compute(sentence_scores, return_all_scores=return_all_scores)



def _rouge_l_update(
    candidates: list[str],
    mult_references: list[list[str]],
    beta: float,
    tokenizer: Callable[[str], list[str]],
    prev_rouge_l_scores: list[float],
) -> list[float]:
    """Score a batch of candidates and append the results to a running list.

    The inputs are first passed to ``check_metric_inputs``. One ROUGE-L score
    is then computed per (candidate, references) pair.

    :param candidates: The sentences to evaluate.
    :param mult_references: For each candidate, the list of target sentences.
    :param beta: Weight given to recall in the combined f-score.
    :param tokenizer: Callable splitting a sentence into a list of words.
    :param prev_rouge_l_scores: Scores accumulated so far. Extended IN PLACE with the new scores.
    :returns: The same ``prev_rouge_l_scores`` object, now including the scores of this batch.
    """
    check_metric_inputs(candidates, mult_references)

    # Score the whole batch before touching the accumulator, so a failure
    # part-way through leaves the previous scores untouched.
    batch_scores = []
    for cand, refs in zip(candidates, mult_references):
        batch_scores.append(__calc_score(cand, refs, beta, tokenizer))

    prev_rouge_l_scores += batch_scores
    return prev_rouge_l_scores


def _rouge_l_compute(
    rouge_l_scs: list[float],
    return_all_scores: bool,
) -> Union[ROUGELOuts, Tensor]:
    """Aggregate per-sentence ROUGE-L scores into the final outputs.

    :param rouge_l_scs: One ROUGE-L score per evaluated sentence.
    :param return_all_scores: If True, return a tuple of two dicts keyed by "rouge_l":
        the first holds the mean score, the second holds the per-sentence scores.
        If False, return only the mean score.
    :returns: The (corpus, sentences) dicts, or the mean score as a scalar tensor.
    """
    # Note: the mean is deliberately taken with numpy, because np.mean and
    # torch.mean can give very small differences.
    sents_np = np.array(rouge_l_scs)
    corpus_pt = torch.as_tensor(sents_np.mean())
    sents_pt = torch.from_numpy(sents_np)

    if not return_all_scores:
        return corpus_pt

    corpus_outs = {"rouge_l": corpus_pt}
    sents_outs = {"rouge_l": sents_pt}
    return corpus_outs, sents_outs  # type: ignore


def __calc_score(
    candidate: str,
    references: list[str],
    beta: float,
    tokenizer: Callable[[str], list[str]] = str.split,
) -> float:
    """Compute the ROUGE-L score of one candidate against its references.

    For every reference, precision is LCS / (number of candidate tokens) and
    recall is LCS / (number of reference tokens), where LCS is the length of
    the longest common subsequence of the two token lists. The best precision
    and the best recall over all references are then combined into an f-score
    weighted by ``beta``.

    :param candidate: The candidate sentence to evaluate.
    :param references: The reference sentences for the same audio. Must contain at least one sentence.
    :param beta: Weight given to recall in the combined f-score.
    :param tokenizer: Callable splitting a sentence into a list of words. defaults to str.split.
    :returns: The ROUGE-L score of the candidate (float). It is 0.0 when the candidate has
        no token or when the best precision or best recall is 0.
    """
    assert len(references) > 0, "references must contain at least one sentence"

    # split into tokens
    token_c = tokenizer(candidate)

    # Add Labbeti: returns 0 when candidate is empty.
    if len(token_c) == 0:
        return 0.0

    prec = []
    rec = []
    for reference in references:
        # split into tokens
        token_r = tokenizer(reference)

        if len(token_r) == 0:
            # An empty reference has nothing in common with the candidate.
            # Score it as zero instead of dividing by len(token_r) == 0.
            prec.append(0.0)
            rec.append(0.0)
            continue

        # compute the longest common subsequence
        lcs = __my_lcs(token_r, token_c)
        prec.append(lcs / len(token_c))
        rec.append(lcs / len(token_r))

    prec_max = max(prec)
    rec_max = max(rec)

    if prec_max == 0 or rec_max == 0:
        return 0.0

    beta_sq = beta**2
    return ((1 + beta_sq) * prec_max * rec_max) / (rec_max + beta_sq * prec_max)


def __my_lcs(string: list[str], sub: list[str]) -> int:
    """
    Compute the length of the longest common subsequence of two token lists.

    :param string: tokens of the first sentence.
    :param sub: tokens of the second sentence. The arguments are swapped internally
        when needed, so the caller does not have to pass the shorter list here.
    :returns: the length (int) of the longest common subsequence between the two lists.
    Note: only the length is returned, not the subsequence itself.
    """
    # Rows follow the longer sequence, columns follow the shorter one.
    if len(string) < len(sub):
        longer, shorter = sub, string
    else:
        longer, shorter = string, sub

    n_rows = len(longer) + 1
    n_cols = len(shorter) + 1
    # table[r][c] is the LCS length of longer[:r] and shorter[:c].
    # Row 0 and column 0 stay at zero (empty prefix).
    table = [[0] * n_cols for _ in range(n_rows)]

    for row, tok_long in enumerate(longer, start=1):
        for col, tok_short in enumerate(shorter, start=1):
            if tok_long == tok_short:
                table[row][col] = table[row - 1][col - 1] + 1
            else:
                table[row][col] = max(table[row - 1][col], table[row][col - 1])

    return table[-1][-1]