NVIDIA-NeMo
diff --git a/nemo_skills/evaluation/metrics/__init__.py b/nemo_skills/evaluation/metrics/__init__.py
Lines changed: 2 additions & 1 deletion
Lines changed: 2 additions & 1 deletion
diff --git a/nemo_skills/evaluation/metrics/answer_judgement_metrics.py b/nemo_skills/evaluation/metrics/answer_judgement_metrics.py
Lines changed: 57 additions & 77 deletions
Lines changed: 57 additions & 77 deletions
diff --git a/nemo_skills/evaluation/metrics/arena_metrics.py b/nemo_skills/evaluation/metrics/arena_metrics.py
Lines changed: 9 additions & 13 deletions
Lines changed: 9 additions & 13 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from nemo_skills.evaluation.metrics.base import default_formatting
 from nemo_skills.evaluation.metrics.compute_metrics import ComputeMetrics
-from nemo_skills.evaluation.metrics.utils import read_predictions
 from nemo_skills.evaluation.metrics.map_metrics import get_metrics
+from nemo_skills.evaluation.metrics.utils import read_predictions
@@ -12,47 +12,61 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from collections import Counter, defaultdict
-from typing import Union
 from nemo_skills.evaluation.metrics.base import BaseMetrics
 from nemo_skills.evaluation.metrics.utils import is_correct_judgement
 
 
 class AnswerJudgementMetrics(BaseMetrics):
-    def __init__(self):
-        self.reset()
+    def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
+        gt_judgement = is_correct_judgement(prediction['expected_judgement'])
+        pred_judgement = is_correct_judgement(prediction['judgement'])
 
-    def update_perf_dict(self, perf_dict, is_correct, is_fp, is_fn, invalid_count):
-        perf_dict["total_correct"] += float(is_correct)
-        perf_dict["fp_count"] += float(is_fp)
-        perf_dict["fn_count"] += float(is_fn)
-        perf_dict["invalid_count"] += float(invalid_count)
-    
-    def get_judgement_by_type(self, predictions, judgement_type: str, gt_judgement: bool) -> Union[bool, None]:
-        answers = [c for elem in predictions if (c:=is_correct_judgement(elem['judgement'])) is not None]
-        if len(answers) == 0:
-            return None
-        if judgement_type == "majority":
-            return Counter(answers).most_common(1)[0][0]
-        elif judgement_type == "pass":
-            for answer in answers:
-                if answer == gt_judgement:
-                    return answer
-            return answers[0]
-        else:
-            raise ValueError(f"Invalid judgement type: {judgement_type}")
-    
-    def get_judgement_metrics(self, pred_judgement, gt_judgement):
-        is_fp, is_fn = False, False
-        is_invalid = pred_judgement is None
-        is_correct = pred_judgement == gt_judgement
-        if not is_correct:
-            if pred_judgement == True:
-                is_fp = True
-            elif pred_judgement == False:
-                is_fn = True
-        return is_correct, is_fp, is_fn, is_invalid
-        
+        return {'correct_judgements': gt_judgement == pred_judgement}
+
+    def _update_fp_fn(self, metrics_dict, pred_judgement, gt_judgement, divide_by=1):
+        is_fp = pred_judgement is True and gt_judgement is False
+        is_fn = pred_judgement is False and gt_judgement is True
+        metrics_dict['false_positives'] += float(is_fp) / divide_by
+        metrics_dict['false_negatives'] += float(is_fn) / divide_by
+
+    def _update_score_metrics_for_majority(
+        self,
+        eval_dict: dict,
+        k: int,
+        score_method: str,
+        score_dicts: list[dict],
+        majority_score: bool | float | int,
+        majority_answer: str,
+        predictions: list[dict],
+        predicted_answers: list[str],
+    ):
+        assert score_method == 'correct_judgements'
+        # expected answer is always the same for all predictions, so just take the first one
+        gt_judgement = is_correct_judgement(predictions[0]['expected_judgement'])
+        self._update_fp_fn(eval_dict[f"majority@{k}"], majority_answer, gt_judgement)
+
+    def _update_score_metrics_for_pass(
+        self,
+        eval_dict: dict,
+        k: int,
+        score_method: str,
+        score_dicts: list[dict],
+        pass_score: bool | float | int,
+        predictions: list[dict],
+        predicted_answers: list[str] | None,
+    ):
+        assert score_method == 'correct_judgements'
+        # expected answer is always the same for all predictions, so just take the first one
+        gt_judgement = is_correct_judgement(predictions[0]['expected_judgement'])
+        pred_judgement = is_correct_judgement(predictions[0]['judgement'])
+        # if pass is not correct, it means all predictions are the same and wrong
+        if not pass_score:
+            self._update_fp_fn(eval_dict[f"pass@{k}"], pred_judgement, gt_judgement)
+
+        for pred in predictions[:k]:
+            gt_judgement = is_correct_judgement(pred['expected_judgement'])
+            pred_judgement = is_correct_judgement(pred['judgement'])
+            self._update_fp_fn(eval_dict[f"pass@1[{k}]"], pred_judgement, gt_judgement, divide_by=k)
 
     def update(self, predictions):
         """Updating the evaluation results with the current element.
@@ -61,47 +75,13 @@ def update(self, predictions):
             predictions (list[dict]): aggregated predictions across all generations.
                 The content of the file is benchmark specific.
         """
-        self.total += 1
-        gt_judgement = is_correct_judgement(predictions[0]['expected_judgement'])
-        if len(predictions) > 1:
-            # Majority@k, Pass@k, Pass@1[k]
-            for k in range(len(predictions), 0, -1):
-                pred_subset = predictions[:k]
-                majority_judgement = self.get_judgement_by_type(pred_subset, "majority", gt_judgement)
-                majority_metrics = self.get_judgement_metrics(majority_judgement, gt_judgement)
-                self.update_perf_dict(self.agg_mode_dict[f"majority@{k}"], *majority_metrics)
-
-                pass_judgement = self.get_judgement_by_type(pred_subset, "pass", gt_judgement)
-                pass_metrics = self.get_judgement_metrics(pass_judgement, gt_judgement)
-                self.update_perf_dict(self.agg_mode_dict[f"pass@{k}"], *pass_metrics)
-
-                pass1_k_metrics = [self.get_judgement_metrics(is_correct_judgement(prediction['judgement']), gt_judgement) for prediction in pred_subset]
-                avg_pass1_k_metrics = [sum(metrics) / len(metrics) for metrics in zip(*pass1_k_metrics)]
-                self.update_perf_dict(self.agg_mode_dict[f"pass@1[{k}]"], *avg_pass1_k_metrics)
-
-        # Greedy
-        if len(predictions) == 1:
-            per_sample_metrics = self.get_judgement_metrics(is_correct_judgement(predictions[0]['judgement']), gt_judgement)
-            self.update_perf_dict(self.agg_mode_dict["greedy"], *per_sample_metrics)
-            return
-
+        super().update(predictions)
+        predicted_answers = [is_correct_judgement(pred['judgement']) for pred in predictions]
+        self._compute_pass_at_k(predictions=predictions, predicted_answers=predicted_answers)
+        self._compute_majority_at_k(predictions=predictions, predicted_answers=predicted_answers)
 
     def get_metrics(self):
-        metrics_dict = {}
-        for agg_mode, agg_metric_dict in self.agg_mode_dict.items():
-            metrics_dict[agg_mode] = {"num_entries": self.total}
-
-            metrics_dict[agg_mode]["correct_judgements"] = (agg_metric_dict["total_correct"] / self.total) * 100.0
-            metrics_dict[agg_mode]["false_positives"] = (agg_metric_dict["fp_count"] / self.total) * 100.0
-            metrics_dict[agg_mode]["false_negatives"] = (agg_metric_dict["fn_count"] / self.total) * 100.0
-            metrics_dict[agg_mode]["invalid_judgements"] = (agg_metric_dict["invalid_count"] / self.total) * 100.0
-
-        return metrics_dict
-
-    def reset(self):
-        self.total = 0
-        self.agg_mode_dict = defaultdict(lambda: defaultdict(int))
-
-    def max_aggregations_to_print(self):
-        # majority + pass + pass@1[k]
-        return 1 + 1 + 1
+        # renaming no_answer to invalid_judgements
+        for agg_metric_dict in self.eval_dict.values():
+            agg_metric_dict["invalid_judgements"] = agg_metric_dict.pop("no_answer")
+        return super().get_metrics()
@@ -76,11 +76,10 @@ def update(self, predictions):
         """
         # this shouldn't do any heavy calculation, but just read the metric from existing json entry
         # all the heavy lifting should be done in the evaluation script
-        self.total += 1
+        super().update(predictions)
         self.scores.append([])
+        self.agg_mode = f"pass@{len(predictions)}"
         if len(predictions) > 1:
-            self.agg_mode = f"pass@{len(predictions)}"
-
             judge_scores = [self._get_judge_score(elem['judgement-gen-base']) for elem in predictions]
             # adding the best score out of all the generations
             possible_scores = ['A>>B', 'A>B', 'A=B', 'B>A', 'B>>A']
@@ -89,7 +88,7 @@ def update(self, predictions):
                 if any([score == possible_score for score in judge_scores]):
                     self.scores[-1].append(possible_score)
                     best_id = judge_scores.index(possible_score)
-                    self.lengths += len(predictions[best_id]['generation'])
+                    self.lengths += predictions[best_id].get('num_generated_tokens', 0)
                     break
             else:
                 self.scores[-1].append(None)  # in case judge didn't generate a valid score
@@ -101,15 +100,12 @@ def update(self, predictions):
                 if any([score == possible_score for score in judge_scores]):
                     self.scores[-1].append(possible_score)
                     best_id = judge_scores.index(possible_score)
-                    self.lengths += len(predictions[best_id]['generation'])
+                    self.lengths += predictions[best_id].get('num_generated_tokens', 0)
                     break
             else:
                 self.scores[-1].append(None)  # in case judge didn't generate a valid score
         else:
-            # Single prediction
-            self.agg_mode = "greedy"
-
-            self.lengths += len(predictions[0]['generation'])
+            self.lengths += predictions[0].get('num_generated_tokens', 0)
             self.scores[-1] = [
                 self._get_judge_score(predictions[0]['judgement-gen-base']),
                 self._get_judge_score(predictions[0]['judgement-base-gen']),
@@ -120,12 +116,12 @@ def get_metrics(self):
 
         metrics = {'num_entries': self.total}
         metrics.update(get_aggregate_score(self.scores))
-        metrics['avg_response_length'] = self.lengths / self.total
+        if self.lengths > 0:
+            metrics['avg_response_tokens'] = int(self.lengths / self.total)
         return {self.agg_mode: metrics}
 
     def reset(self):
+        super().reset()
         self.scores = []  # list of lists
         self.lengths = 0
-        self.total = 0
-        # Set automatically
-        self.agg_mode = "greedy"
+        self.agg_mode = "pass@1"