import itertools
import numpy as np
from collections import Counter
from sklearn.metrics import classification_report
class ClassificationMetrics:
"""
Metrics for Classification Task.
:param config_dict: Config Params Dictionary
:type config_dict: dict
"""
def __init__(self, config_dict):
self.config_dict = config_dict
def get_metrics(self, references, predictions, target_names):
"""
Function that returns Metrics using References, Predictions and Class Labels
:param references: References, 1D array (num_samples,)
:type references: numpy.ndarray
:param predictions: Predictions, 2D array (num_samples, num_classes) with Probabilities
:type predictions: numpy.ndarray
:param target_names: Class Labels
:type target_names: list
:return: Metrics Dictionary
:rtype: dict
"""
predictions = np.argmax(predictions, axis=-1)
labels = np.arange(len(target_names))
clf_report = classification_report(
references,
predictions,
labels=labels,
target_names=target_names,
output_dict=True,
zero_division=0.0,
)
metric_dict = {}
metric_dict["Accuracy"] = clf_report["accuracy"]
for name in target_names + ["macro avg", "weighted avg"]:
metric_dict[f"{name}-Precision"] = clf_report[name]["precision"]
metric_dict[f"{name}-Recall"] = clf_report[name]["recall"]
metric_dict[f"{name}-F1Score"] = clf_report[name]["f1-score"]
return metric_dict
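    # A minimal usage sketch (the empty config dict is an assumption; the class
    # only stores config_dict, so any dict works here):
    #
    #   metrics = ClassificationMetrics({})
    #   references = np.array([0, 2, 1, 2])
    #   predictions = np.random.rand(4, 3)  # (num_samples, num_classes)
    #   predictions /= predictions.sum(axis=-1, keepdims=True)
    #   metric_dict = metrics.get_metrics(references, predictions, ["a", "b", "c"])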
class TextGenerationMetrics:
"""
Metrics for Text Generation Task.
:param config_dict: Config Params Dictionary
:type config_dict: dict
"""
def __init__(self, config_dict):
self.config_dict = config_dict
self.rouge_n_n = config_dict["train"]["rouge_n_n"]
self.rouge_s_n = config_dict["train"]["rouge_s_n"]
self.bleu_n = config_dict["train"]["bleu_n"]
self.seq_len = config_dict["dataset"]["seq_len"]
def get_metrics(self, references, predictions):
"""
        Function that returns Metrics using References and Predictions
:param references: References, 2D array (num_samples, seq_len)
:type references: numpy.ndarray
:param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
:type predictions: numpy.ndarray
:return: Metrics Dictionary
:rtype: dict
"""
rouge_n_p, rouge_n_r, rouge_n_f = self.rouge_n_score(
references, predictions, self.rouge_n_n
)
rouge_l_p, rouge_l_r, rouge_l_f = self.rouge_l_score(references, predictions)
rouge_s_p, rouge_s_r, rouge_s_f = self.rouge_s_score(
references, predictions, self.rouge_s_n
)
return {
f"BLEU-{self.bleu_n}": self.bleu_score(
references, predictions, self.bleu_n
),
"Perplexity": self.perplexity_score(predictions),
"METEOR": self.meteor_score(references, predictions),
f"ROUGE-N-{self.rouge_n_n}-Precision": rouge_n_p,
f"ROUGE-N-{self.rouge_n_n}-Recall": rouge_n_r,
f"ROUGE-N-{self.rouge_n_n}-F1score": rouge_n_f,
"ROUGE-L-Precision": rouge_l_p,
"ROUGE-L-Recall": rouge_l_r,
"ROUGE-L-F1score": rouge_l_f,
f"ROUGE-S-{self.rouge_s_n}-Precision": rouge_s_p,
f"ROUGE-S-{self.rouge_s_n}-Recall": rouge_s_r,
f"ROUGE-S-{self.rouge_s_n}-F1score": rouge_s_f,
"CIDER": self.cider_score(references, predictions),
}
def bleu_score(self, references, predictions, n=4):
"""
BLEU Score
:param references: References, 2D array (num_samples, seq_len)
:type references: numpy.ndarray
:param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
:type predictions: numpy.ndarray
        :param n: Maximum n-gram order, defaults to 4
:type n: int, optional
:return: BLEU score
:rtype: float
"""
predictions = np.argmax(predictions, axis=-1)
num_instances = references.shape[0]
log_score_corpus = 0
w = 1 / n
        for ref, pred in zip(references, predictions):
            ref_len, pred_len = len(ref), len(pred)
            # Log of the brevity penalty exp(min(0, 1 - r/c)) from standard BLEU
            bp = min(0, 1 - ref_len / pred_len)
            log_score = bp
            for i in range(1, n + 1):
                # Clipped i-gram precision
                p, _, _ = self._get_metrics_ngram(ref, pred, i, True)
                # log(1 + p) instead of log(p) avoids log(0) when an n-gram
                # order has no matches (a smoothing deviation from standard BLEU)
                log_score += w * np.log(1 + p)
            log_score_corpus += log_score / num_instances
return log_score_corpus
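    # Per instance, the score above is log BLEU:
    #   min(0, 1 - len(ref)/len(pred)) + sum_{i=1..n} (1/n) * log(1 + p_i)
    # with p_i the clipped i-gram precision; the corpus value is the mean over
    # instances. Note the method returns this log-domain value without exp().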
def perplexity_score(self, predictions):
"""
Perplixity Score
:param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
:type predictions: numpy.ndarray
:return: Perplixity Score
:rtype: float
"""
        # Probability of the greedy (argmax) token at each step
        predictions = np.max(predictions, axis=-1)
        # Log-space form of (prod_t p_t) ** (-1 / seq_len), the inverse
        # geometric mean of the per-step probabilities
        probs_sum = -np.sum(np.log(predictions), axis=1) / self.seq_len
        probs_prod_inv_norm = np.exp(probs_sum)
        score = np.mean(probs_prod_inv_norm).item()
return score
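    # Worked example: per-step probabilities [0.5, 0.25] with seq_len = 2 give
    # exp(-(log 0.5 + log 0.25) / 2) = sqrt(8) ≈ 2.83, the inverse geometric
    # mean of the step probabilities.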
    def meteor_score(self, references, predictions):
        # Placeholder: METEOR is not implemented yet; returns a constant 0.5
        predictions = np.argmax(predictions, axis=-1)
        return 0.5
def rouge_n_score(self, references, predictions, n=4):
"""
ROUGE N Score
        :param references: References, 2D array (num_samples, seq_len)
        :type references: numpy.ndarray
        :param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
        :type predictions: numpy.ndarray
        :param n: N-gram order, defaults to 4
        :type n: int, optional
        :return: ROUGE-N Precision, Recall and F1 score
        :rtype: float, float, float
"""
predictions = np.argmax(predictions, axis=-1)
num_instances = references.shape[0]
precision, recall, f1score = 0, 0, 0
for ref, pred in zip(references, predictions):
p, r, f = self._get_metrics_ngram(ref, pred, n)
precision += p
recall += r
f1score += f
return (
precision / num_instances,
recall / num_instances,
f1score / num_instances,
)
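    # Worked example with n = 2: ref = [1, 2, 3, 4] and pred = [1, 2, 4] give
    # bigram sets {"1 2", "2 3", "3 4"} and {"1 2", "2 4"}; the single overlap
    # yields precision = 1/2 and recall = 1/3.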
def rouge_l_score(self, references, predictions):
"""
ROUGE L Score
:param references: References, 2D array (num_samples, seq_len)
:type references: numpy.ndarray
:param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
:type predictions: numpy.ndarray
        :return: ROUGE-L Precision, Recall and F1 score
        :rtype: float, float, float
"""
predictions = np.argmax(predictions, axis=-1)
num_instances = references.shape[0]
precision, recall, f1score = 0, 0, 0
for ref, pred in zip(references, predictions):
lcs = self._lcs(ref, pred)
p, r = lcs / len(pred), lcs / len(ref)
precision += p
            recall += r
f1score += 2 * p * r / (p + r + 1e-8)
return (
precision / num_instances,
recall / num_instances,
f1score / num_instances,
)
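    # Worked example: ref = [1, 2, 3, 4] and pred = [1, 3, 4, 5] share the
    # longest common subsequence [1, 3, 4], so precision = recall = 3/4.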
def rouge_s_score(self, references, predictions, n=4):
"""
ROUGE S Score
:param references: References, 2D array (num_samples, seq_len)
:type references: numpy.ndarray
:param predictions: Predictions, 3D array (num_samples, seq_len, num_vocab) with Probabilities
:type predictions: numpy.ndarray
        :param n: Maximum distance between the two tokens of a skip-bigram, defaults to 4
        :type n: int, optional
        :return: ROUGE-S Precision, Recall and F1 score
        :rtype: float, float, float
"""
predictions = np.argmax(predictions, axis=-1)
num_instances = references.shape[0]
precision, recall, f1score = 0, 0, 0
for ref, pred in zip(references, predictions):
ref_bi = [
" ".join([str(ref[j]), str(ref[j + 1])]) for j in range(len(ref) - 1)
]
            pred_skip_bi = [
                [
                    " ".join([str(pred[i]), str(pred[k])])
                    # pair each token with those at most n positions ahead
                    for k in range(i + 1, min(len(pred), i + n + 1))
                ]
                for i in range(len(pred) - 1)
            ]
pred_skip_bi = list(itertools.chain.from_iterable(pred_skip_bi))
n_unq_ngrams = len(set(ref_bi) & set(pred_skip_bi))
p, r = n_unq_ngrams / len(pred_skip_bi), n_unq_ngrams / len(ref_bi)
precision += p
recall += r
f1score += 2 * p * r / (p + r + 1e-8)
return (
precision / num_instances,
recall / num_instances,
f1score / num_instances,
)
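    # Worked example with n = 2: pred = [1, 2, 3] yields skip-bigrams
    # "1 2", "1 3", "2 3" -- ordered pairs at most n positions apart.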
    def cider_score(self, references, predictions):
        # Placeholder: CIDEr is not implemented yet; returns a constant 0.5
        predictions = np.argmax(predictions, axis=-1)
        return 0.5
def _get_metrics_ngram(self, ref, pred, n, clip=False):
"""
Obtain Precision, Recall and F1 score of NGRAM
        :param ref: Reference Sentence, 1D Array (seq_len,)
        :type ref: numpy.ndarray
        :param pred: Predicted Sentence, 1D Array (seq_len,)
        :type pred: numpy.ndarray
        :param n: N-gram order (number of tokens per n-gram)
        :type n: int
        :param clip: Whether to clip n-gram counts as in BLEU, defaults to False
        :type clip: bool, optional
:return: Precision, Recall, F1 score
:rtype: float, float, float
"""
ref_n = [
" ".join([str(ref[j + k]) for k in range(n)])
for j in range(len(ref) - n + 1)
]
pred_n = [
" ".join([str(pred[j + k]) for k in range(n)])
for j in range(len(pred) - n + 1)
]
        if clip:
            # Clipped counts as in BLEU: each predicted n-gram counts at most
            # as many times as it occurs in the reference
            n_common_ngrams = 0
            ref_cnt_dict = Counter(ref_n)
            pred_cnt_dict = Counter(pred_n)
            for k in pred_cnt_dict.keys():
                cnt_k = min(pred_cnt_dict[k], ref_cnt_dict.get(k, 0))
                n_common_ngrams += cnt_k
else:
n_common_ngrams = len(set(ref_n) & set(pred_n))
p, r = n_common_ngrams / len(pred_n), n_common_ngrams / len(ref_n)
f = 2 * p * r / (p + r + 1e-8)
return p, r, f
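    # Worked example of clipping with n = 1: ref = [7, 8], pred = [7, 7, 7].
    # The predicted unigram "7" occurs 3 times but only once in the reference,
    # so the match count is clipped to 1 and precision = 1/3 rather than 1.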
def _lcs(self, arr1, arr2):
"""
Find Longest Common Subsequence
:param arr1: First Array, 1D Array
:type arr1: numpy.ndarray
:param arr2: Second Array, 1D Array
:type arr2: numpy.ndarray
:return: Longest Common Subsequence Length
:rtype: int
"""
m = len(arr1)
n = len(arr2)
        lcs_mat = np.zeros((m + 1, n + 1), dtype=int)
for i in range(1, m + 1):
for j in range(1, n + 1):
if arr1[i - 1] == arr2[j - 1]:
lcs_mat[i][j] = lcs_mat[i - 1][j - 1] + 1
else:
lcs_mat[i][j] = max(lcs_mat[i - 1][j], lcs_mat[i][j - 1])
        return int(lcs_mat[m][n])
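

# A minimal smoke test (the config values below are illustrative, not taken
# from any shipped config file; only the keys read in __init__ are assumed):
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    num_samples, seq_len, num_vocab = 4, 10, 20
    config_dict = {
        "train": {"rouge_n_n": 2, "rouge_s_n": 2, "bleu_n": 2},
        "dataset": {"seq_len": seq_len},
    }
    metrics = TextGenerationMetrics(config_dict)
    references = rng.integers(0, num_vocab, size=(num_samples, seq_len))
    # Random rows normalized to probability distributions over the vocabulary
    predictions = rng.random((num_samples, seq_len, num_vocab))
    predictions /= predictions.sum(axis=-1, keepdims=True)
    print(metrics.get_metrics(references, predictions))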