Source code for src.metrics

import itertools
import numpy as np
from collections import Counter
from sklearn.metrics import classification_report


[docs]
class ClassificationMetrics:
    """
    Metrics for Classification Task.

    :param config_dict: Config Params Dictionary
    :type config_dict: dict
    """

    def __init__(self, config_dict):
        self.config_dict = config_dict
[docs]
    def get_metrics(self, references, predictions, target_names):
        """
        Function that returns Metrics using References, Predictions and Class Labels

        :param references: References, 1D array (num_samples,)
        :type references: numpy.ndarray
        :param predictions: Predictions, 2D array (num_samples, num_classes) with Probabilities
        :type predictions: numpy.ndarray
        :param target_names: Class Labels
        :type target_names: list
        :return: Metrics Dictionary
        :rtype: dict
        """
        predictions = np.argmax(predictions, axis=-1)
        labels = np.arange(len(target_names))

        clf_report = classification_report(
            references,
            predictions,
            labels=labels,
            target_names=target_names,
            output_dict=True,
            zero_division=0.0,
        )

        metric_dict = {}
        metric_dict["Accuracy"] = clf_report["accuracy"]
        for name in target_names + ["macro avg", "weighted avg"]:
            metric_dict[f"{name}-Precision"] = clf_report[name]["precision"]
            metric_dict[f"{name}-Recall"] = clf_report[name]["recall"]
            metric_dict[f"{name}-F1Score"] = clf_report[name]["f1-score"]

        return metric_dict
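# Usage sketch (illustrative, not part of the module): the class only stores the
# config dict, so an empty dict is enough here. With 4 samples over 3 assumed
# labels ["neg", "neu", "pos"]:
#
#   refs = np.array([0, 2, 1, 0])
#   probs = np.array([[0.8, 0.1, 0.1],
#                     [0.2, 0.1, 0.7],
#                     [0.1, 0.6, 0.3],
#                     [0.3, 0.4, 0.3]])
#   metrics = ClassificationMetrics({}).get_metrics(refs, probs, ["neg", "neu", "pos"])
#   # metrics["Accuracy"] == 0.75; per-class and macro/weighted P/R/F1 are also included.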
[docs]
class TextGenerationMetrics:
    """
    Metrics for Text Generation Task.

    :param config_dict: Config Params Dictionary
    :type config_dict: dict
    """

    def __init__(self, config_dict):
        self.config_dict = config_dict

        self.rouge_n_n = config_dict["train"]["rouge_n_n"]
        self.rouge_s_n = config_dict["train"]["rouge_s_n"]
        self.bleu_n = config_dict["train"]["bleu_n"]
        self.seq_len = config_dict["dataset"]["seq_len"]
[docs]
    def get_metrics(self, references, predictions):
        """
        Function that returns Metrics using References and Predictions

        :param references: References, 2D array (num_samples, seq_len)
        :type references: numpy.ndarray
        :param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
        :type predictions: numpy.ndarray
        :return: Metrics Dictionary
        :rtype: dict
        """
        rouge_n_p, rouge_n_r, rouge_n_f = self.rouge_n_score(
            references, predictions, self.rouge_n_n
        )
        rouge_l_p, rouge_l_r, rouge_l_f = self.rouge_l_score(references, predictions)
        rouge_s_p, rouge_s_r, rouge_s_f = self.rouge_s_score(
            references, predictions, self.rouge_s_n
        )

        return {
            f"BLEU-{self.bleu_n}": self.bleu_score(
                references, predictions, self.bleu_n
            ),
            "Perplexity": self.perplexity_score(predictions),
            "METEOR": self.meteor_score(references, predictions),
            f"ROUGE-N-{self.rouge_n_n}-Precision": rouge_n_p,
            f"ROUGE-N-{self.rouge_n_n}-Recall": rouge_n_r,
            f"ROUGE-N-{self.rouge_n_n}-F1score": rouge_n_f,
            "ROUGE-L-Precision": rouge_l_p,
            "ROUGE-L-Recall": rouge_l_r,
            "ROUGE-L-F1score": rouge_l_f,
            f"ROUGE-S-{self.rouge_s_n}-Precision": rouge_s_p,
            f"ROUGE-S-{self.rouge_s_n}-Recall": rouge_s_r,
            f"ROUGE-S-{self.rouge_s_n}-F1score": rouge_s_f,
            "CIDER": self.cider_score(references, predictions),
        }
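    # Usage sketch (illustrative, not part of the module): the config keys below
    # mirror the ones read in __init__; the concrete values are assumptions.
    #
    #   config = {
    #       "train": {"rouge_n_n": 2, "rouge_s_n": 2, "bleu_n": 2},
    #       "dataset": {"seq_len": 5},
    #   }
    #   refs = np.random.randint(0, 100, size=(8, 5))            # (num_samples, seq_len)
    #   probs = np.random.dirichlet(np.ones(100), size=(8, 5))   # (num_samples, seq_len, num_vocab)
    #   metrics = TextGenerationMetrics(config).get_metrics(refs, probs)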
[docs]
    def bleu_score(self, references, predictions, n=4):
        """
        BLEU Score

        :param references: References, 2D array (num_samples, seq_len)
        :type references: numpy.ndarray
        :param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
        :type predictions: numpy.ndarray
        :param n: Max number of N gram, defaults to 4
        :type n: int, optional
        :return: BLEU score
        :rtype: float
        """
        predictions = np.argmax(predictions, axis=-1)
        num_instances = references.shape[0]

        log_score_corpus = 0
        w = 1 / n
        for ref, pred in zip(references, predictions):
            ref_len, pred_len = len(ref), len(pred)
            # Brevity penalty in log space: 0 if the prediction is at least as
            # long as the reference, negative otherwise.
            bp = min(0, 1 - ref_len / pred_len)

            log_score = bp
            for i in range(1, n + 1):
                p, _, _ = self._get_metrics_ngram(ref, pred, i, True)
                # log(1 + p) keeps the term finite when the i-gram precision is 0.
                log_score += w * np.log(1 + p)
            log_score_corpus += log_score / num_instances

        return log_score_corpus
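    # Worked example (assumed inputs): one-hot probabilities act as a greedy
    # decode, so the prediction below is the token sequence [1, 2, 3, 3]. With
    # ref_len=5 and pred_len=4 the log brevity penalty is min(0, 1 - 5/4) = -0.25,
    # and each n-gram order adds w * log(1 + p_n) on top of it.
    #
    #   refs = np.array([[1, 2, 3, 4, 5]])
    #   probs = np.eye(6)[np.array([[1, 2, 3, 3]])]   # shape (1, 4, 6)
    #   tg_metrics.bleu_score(refs, probs, n=2)        # tg_metrics: a configured TextGenerationMetrics (assumed)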
[docs]
    def perplexity_score(self, predictions):
        """
        Perplexity Score

        :param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
        :type predictions: numpy.ndarray
        :return: Perplexity Score
        :rtype: float
        """
        # Use the highest probability at each position (greedy decode).
        predictions = np.max(predictions, axis=-1)

        probs_sum = -np.sum(np.log(predictions), axis=1) / self.seq_len
        probs_prod_inv_norm = np.exp(probs_sum)

        # probs_prod = np.prod(predictions, axis=1)
        # probs_prod_inv_norm = 1/np.pow(probs_prod, 1/self.seq_len)

        score = np.mean(probs_prod_inv_norm).item()
        return score
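    # Worked example (assumed inputs): the score is exp(-(1/seq_len) * sum_t log p_t)
    # over the per-position maxima. If every position has max probability 0.25 and
    # the configured seq_len matches the sequence length, the result is exactly 4.
    #
    #   probs = np.full((2, 5, 4), 0.25)     # seq_len = 5, uniform over 4 tokens
    #   tg_metrics.perplexity_score(probs)   # -> 4.0 (tg_metrics configured with seq_len=5, assumed)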
[docs]
    def meteor_score(self, references, predictions):
        # Placeholder: METEOR is not implemented yet; a constant is returned.
        predictions = np.argmax(predictions, axis=-1)
        return 0.5
[docs]
    def rouge_n_score(self, references, predictions, n=4):
        """
        ROUGE N Score

        :param references: References, 2D array (num_samples, seq_len)
        :type references: numpy.ndarray
        :param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
        :type predictions: numpy.ndarray
        :param n: N gram size, defaults to 4
        :type n: int, optional
        :return: ROUGE N Precision, Recall, F1 score
        :rtype: float, float, float
        """
        predictions = np.argmax(predictions, axis=-1)
        num_instances = references.shape[0]

        precision, recall, f1score = 0, 0, 0
        for ref, pred in zip(references, predictions):
            p, r, f = self._get_metrics_ngram(ref, pred, n)
            precision += p
            recall += r
            f1score += f

        return (
            precision / num_instances,
            recall / num_instances,
            f1score / num_instances,
        )
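    # Worked example (assumed inputs): with n=2, ref [1, 2, 3, 4] has bigrams
    # {"1 2", "2 3", "3 4"} and the prediction [1, 2, 3, 5] has {"1 2", "2 3", "3 5"};
    # 2 of 3 overlap, so precision = recall = 2/3 for that pair.
    #
    #   refs = np.array([[1, 2, 3, 4]])
    #   probs = np.eye(6)[np.array([[1, 2, 3, 5]])]   # one-hot, shape (1, 4, 6)
    #   tg_metrics.rouge_n_score(refs, probs, n=2)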
[docs]
    def rouge_l_score(self, references, predictions):
        """
        ROUGE L Score

        :param references: References, 2D array (num_samples, seq_len)
        :type references: numpy.ndarray
        :param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
        :type predictions: numpy.ndarray
        :return: ROUGE L Precision, Recall, F1 score
        :rtype: float, float, float
        """
        predictions = np.argmax(predictions, axis=-1)
        num_instances = references.shape[0]

        precision, recall, f1score = 0, 0, 0
        for ref, pred in zip(references, predictions):
            lcs = self._lcs(ref, pred)
            p, r = lcs / len(pred), lcs / len(ref)

            precision += p
            recall += r
            f1score += 2 * p * r / (p + r + 1e-8)

        return (
            precision / num_instances,
            recall / num_instances,
            f1score / num_instances,
        )
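    # Worked example (assumed inputs): ref [1, 2, 3, 4] and pred [1, 3, 4, 5]
    # share the longest common subsequence [1, 3, 4] of length 3, so
    # precision = recall = 3/4 and F1 is approximately 0.75 for that pair.
    #
    #   refs = np.array([[1, 2, 3, 4]])
    #   probs = np.eye(6)[np.array([[1, 3, 4, 5]])]
    #   tg_metrics.rouge_l_score(refs, probs)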
[docs]
    def rouge_s_score(self, references, predictions, n=4):
        """
        ROUGE S Score

        :param references: References, 2D array (num_samples, seq_len)
        :type references: numpy.ndarray
        :param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
        :type predictions: numpy.ndarray
        :param n: Skip window; the second token of a skip-bigram is taken at most n positions ahead, defaults to 4
        :type n: int, optional
        :return: ROUGE S Precision, Recall, F1 score
        :rtype: float, float, float
        """
        predictions = np.argmax(predictions, axis=-1)
        num_instances = references.shape[0]

        precision, recall, f1score = 0, 0, 0
        for ref, pred in zip(references, predictions):
            ref_bi = [
                " ".join([str(ref[j]), str(ref[j + 1])]) for j in range(len(ref) - 1)
            ]
            pred_skip_bi = [
                [
                    " ".join([str(pred[i]), str(pred[k])])
                    for k in range(i + 1, min(len(pred) - 1, i + n + 1))
                ]
                for i in range(len(pred) - 1)
            ]
            pred_skip_bi = list(itertools.chain.from_iterable(pred_skip_bi))

            n_unq_ngrams = len(set(ref_bi) & set(pred_skip_bi))
            p, r = n_unq_ngrams / len(pred_skip_bi), n_unq_ngrams / len(ref_bi)

            precision += p
            recall += r
            f1score += 2 * p * r / (p + r + 1e-8)

        return (
            precision / num_instances,
            recall / num_instances,
            f1score / num_instances,
        )
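    # Worked example (assumed inputs): with n=2 and a prediction decoding to
    # [1, 3, 2, 4], the generated skip-bigrams are "1 3", "1 2" and "3 2"
    # (as written, the code never pairs the last token). Only "1 2" matches a
    # plain bigram of the reference [1, 2, 3, 4], so precision = recall = 1/3.
    #
    #   refs = np.array([[1, 2, 3, 4]])
    #   probs = np.eye(6)[np.array([[1, 3, 2, 4]])]
    #   tg_metrics.rouge_s_score(refs, probs, n=2)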
[docs]
    def cider_score(self, references, predictions):
        # Placeholder: CIDEr is not implemented yet; a constant is returned.
        predictions = np.argmax(predictions, axis=-1)
        return 0.5
    def _get_metrics_ngram(self, ref, pred, n, clip=False):
        """
        Obtain Precision, Recall and F1 score of NGRAM

        :param ref: Reference Sentence, 1D Array (seq_len,)
        :type ref: numpy.ndarray
        :param pred: Predicted Sentence, 1D Array (seq_len,)
        :type pred: numpy.ndarray
        :param n: Number of tokens per n-gram
        :type n: int
        :param clip: Use clipped n-gram counts (as in BLEU) instead of unique n-gram overlap, defaults to False
        :type clip: bool, optional
        :return: Precision, Recall, F1 score
        :rtype: float, float, float
        """
        ref_n = [
            " ".join([str(ref[j + k]) for k in range(n)])
            for j in range(len(ref) - n + 1)
        ]
        pred_n = [
            " ".join([str(pred[j + k]) for k in range(n)])
            for j in range(len(pred) - n + 1)
        ]

        if clip:
            # Clipped counts: each predicted n-gram is credited at most as many
            # times as it occurs in the reference.
            n_common_ngrams = 0
            ref_cnt_dict = Counter(ref_n)
            pred_cnt_dict = Counter(pred_n)
            for k in pred_cnt_dict.keys():
                cnt_k = min(pred_cnt_dict[k], ref_cnt_dict.get(k, 0))
                n_common_ngrams += cnt_k
        else:
            n_common_ngrams = len(set(ref_n) & set(pred_n))

        p, r = n_common_ngrams / len(pred_n), n_common_ngrams / len(ref_n)
        f = 2 * p * r / (p + r + 1e-8)
        return p, r, f

    def _lcs(self, arr1, arr2):
        """
        Find Longest Common Subsequence

        :param arr1: First Array, 1D Array
        :type arr1: numpy.ndarray
        :param arr2: Second Array, 1D Array
        :type arr2: numpy.ndarray
        :return: Longest Common Subsequence Length
        :rtype: int
        """
        m = len(arr1)
        n = len(arr2)
        lcs_mat = np.zeros((m + 1, n + 1))

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if arr1[i - 1] == arr2[j - 1]:
                    lcs_mat[i][j] = lcs_mat[i - 1][j - 1] + 1
                else:
                    lcs_mat[i][j] = max(lcs_mat[i - 1][j], lcs_mat[i][j - 1])

        return int(lcs_mat[m][n])
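# Worked example for the private helpers (assumed inputs): the LCS of
# [1, 2, 3, 4] and [2, 4, 1, 3] has length 2 (e.g. [2, 3] or [1, 3]), and
# _get_metrics_ngram on [1, 2, 3] vs [1, 2, 4] with n=2 matches the single
# bigram "1 2" out of 2 on each side, giving precision = recall = 0.5.
#
#   tg_metrics._lcs(np.array([1, 2, 3, 4]), np.array([2, 4, 1, 3]))              # -> 2
#   tg_metrics._get_metrics_ngram(np.array([1, 2, 3]), np.array([1, 2, 4]), 2)   # -> (0.5, 0.5, ~0.5)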