add csbench (#841)

KelvinDo183 · pbcong · web-flow · commit 48e8b5905fec · 2025-10-03T11:19:08.000+08:00
* add csbench

* run precommit

---------

Co-authored-by: pbcong <congphamba2005@gmail.com>
diff --git a/lmms_eval/tasks/csbench/csbench.yaml b/lmms_eval/tasks/csbench/csbench.yaml
@@ -0,0 +1,4 @@
+# Groups the two CSBench sub-tasks listed below under the single name `csbench`.
+group: csbench
+task:
+- csbench_mcq
+- csbench_assertion
diff --git a/lmms_eval/tasks/csbench/csbench_assertion.yaml b/lmms_eval/tasks/csbench/csbench_assertion.yaml
@@ -0,0 +1,25 @@
+# CSBench assertion (True/False) task configuration.
+dataset_path: lmms-lab/CSBench_Assertion
+dataset_kwargs:
+  # NOTE(review): presumably tells the dataset loader to use the stored auth token - confirm.
+  token: True
+test_split: assertion
+task: "csbench_assertion"
+
+# Prompt, gold-label and choice-list builders, all defined in utils.py of this task folder.
+doc_to_text: !function utils.csbench_assertion_doc_to_text
+doc_to_target: !function utils.csbench_doc_to_target
+doc_to_choice: !function utils.csbench_doc_to_choice
+
+# NOTE(review): utils.csbench_assertion_doc_to_text receives these kwargs but does not
+# read pre_prompt/post_prompt, so they currently have no effect on the prompt text.
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
+
+# utils.csbench_process_results returns {"accuracy": 1.0 or 0.0} per document;
+# the per-document scores are aggregated with the mean.
+metric_list:
+  - metric: accuracy
+    aggregation: mean
+    higher_is_better: true
+
+process_results: !function utils.csbench_process_results
+
+metadata:
+  version: 0.0
+
diff --git a/lmms_eval/tasks/csbench/csbench_mcq.yaml b/lmms_eval/tasks/csbench/csbench_mcq.yaml
@@ -0,0 +1,25 @@
+# CSBench four-option multiple-choice task configuration.
+dataset_path: lmms-lab/CSBench_MCQ
+dataset_kwargs:
+  # NOTE(review): presumably tells the dataset loader to use the stored auth token - confirm.
+  token: True
+test_split: mcq
+task: "csbench_mcq"
+
+# Prompt, gold-label and choice-list builders, all defined in utils.py of this task folder.
+doc_to_text: !function utils.csbench_mcq_doc_to_text
+doc_to_target: !function utils.csbench_doc_to_target
+doc_to_choice: !function utils.csbench_doc_to_choice
+
+# NOTE(review): utils.csbench_mcq_doc_to_text receives these kwargs but does not
+# read pre_prompt/post_prompt, so they currently have no effect on the prompt text.
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
+
+# utils.csbench_process_results returns {"accuracy": 1.0 or 0.0} per document;
+# the per-document scores are aggregated with the mean.
+metric_list:
+  - metric: accuracy
+    aggregation: mean
+    higher_is_better: true
+
+process_results: !function utils.csbench_process_results
+
+metadata:
+  version: 0.0
+
diff --git a/lmms_eval/tasks/csbench/utils.py b/lmms_eval/tasks/csbench/utils.py
@@ -0,0 +1,88 @@
+import random
+import re
+from typing import Dict, List, Tuple
+
+import numpy as np
+
+# Instruction header for True/False (assertion) items; it restricts the answer letter to A or B.
+assertion_prompt = """Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A or B."""
+
+# Instruction header for four-option items; it restricts the answer letter to A, B, C or D.
+mcq_prompt = """Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, or D."""
+
+
+def csbench_mcq_doc_to_text(doc: Dict, lmms_eval_specific_kwargs: Dict) -> str:
+    """Build the prompt for a four-option multiple-choice CSBench item.
+
+    Args:
+        doc: Dataset record; reads the "Question", "A", "B", "C" and "D" fields.
+        lmms_eval_specific_kwargs: Per-model prompt options passed by the
+            harness. Accepted for interface compatibility; not read here.
+
+    Returns:
+        The MCQ instruction header, then the question, then the four
+        labelled options, one per line.
+    """
+    # Use the A-D instruction header: the assertion header tells the model
+    # that only A or B are valid letters, which contradicts the four options.
+    options = "".join(f"{letter}: {doc[letter]}\n" for letter in ("A", "B", "C", "D"))
+    return f"{mcq_prompt}\nQuestion: {doc['Question']}\n{options}"
+
+
+def csbench_assertion_doc_to_text(doc: Dict, lmms_eval_specific_kwargs: Dict) -> str:
+    """Build the prompt for a True/False CSBench assertion item.
+
+    The statement is presented as a two-option question where A means True
+    and B means False.
+
+    Args:
+        doc: Dataset record; reads the "Question" field.
+        lmms_eval_specific_kwargs: Per-model prompt options passed by the
+            harness. Accepted for interface compatibility; not read here.
+
+    Returns:
+        The assertion instruction header, then the question, then the two
+        fixed options, one per line.
+    """
+    # Option lines start at column 0 so they use the same "X: ..." layout as
+    # the multiple-choice prompt (no stray leading space).
+    return f"{assertion_prompt}\nQuestion: {doc['Question']}\nA: True\nB: False\n"
+
+
+def csbench_doc_to_target(doc: Dict) -> str:
+    """Return the gold choice letter for a CSBench item.
+
+    Args:
+        doc: Dataset record; reads the "Format" and "Answer" fields.
+
+    Returns:
+        For "Multiple-choice" items, the answer letter upper-cased. For any
+        other format, "A" when the answer is true and "B" otherwise, which
+        matches the A: True / B: False options of the assertion prompt.
+    """
+    # str() tolerates answers stored as booleans instead of text.
+    answer = str(doc["Answer"]).strip()
+    if doc["Format"].strip() == "Multiple-choice":
+        return answer.upper()
+    # Compare case-insensitively so "true"/"TRUE" are not silently scored as False.
+    return "A" if answer.lower() == "true" else "B"
+
+
+def csbench_doc_to_choice(doc: Dict) -> List[str]:
+    """List the valid answer letters for a CSBench item.
+
+    Args:
+        doc: Dataset record; reads the "Format" field.
+
+    Returns:
+        ["A", "B", "C", "D"] for "Multiple-choice" items, ["A", "B"] for
+        every other format.
+    """
+    is_mcq = doc["Format"].strip() == "Multiple-choice"
+    return ["A", "B", "C", "D"] if is_mcq else ["A", "B"]
+
+
+def parse_multi_choice_response(response, all_choices):
+    """
+    Parse the prediction from the generated response.
+    Return the predicted choice letter e.g., A, B, C, D.
+
+    Resolution order:
+      1. The last explicit "Answer: X" statement, which is the format the
+         task prompts ask the model to end with.
+      2. Otherwise the choice mentioned last, trying the forms "(X)", then a
+         space-delimited " X ", then "X."; only the first form that yields
+         any hit is used.
+      3. Otherwise a random element of all_choices, so an unparseable
+         response is scored as a guess.
+
+    Args:
+        response: Raw text generated by the model.
+        all_choices: Valid choice letters, e.g. ["A", "B", "C", "D"].
+
+    Returns:
+        One element of all_choices.
+    """
+    # Strip surrounding whitespace and punctuation in a single pass; stripping
+    # one character at a time is order-dependent and leaves residue behind.
+    response = response.strip(" \t\r\n,.!?;:'")
+    response = " " + response + " "  # Pad so a letter at either edge matches " X "
+
+    if all_choices:
+        letters = "|".join(re.escape(choice) for choice in all_choices)
+        # Allow optional whitespace, markdown asterisks or an opening
+        # parenthesis before the letter; the lookahead rejects words such as
+        # "Answer: Both".
+        explicit = re.findall(rf"(?i:answer)\s*:[\s*(]*({letters})(?![A-Za-z])", response)
+        if explicit:
+            return explicit[-1]
+
+    for template in ("({})", " {} ", "{}."):
+        # Record positions with the same form that produced the match, so the
+        # "last mentioned wins" tie-break is meaningful for every form.
+        last_seen = {}
+        for choice in all_choices:
+            position = response.rfind(template.format(choice))
+            if position != -1:
+                last_seen[choice] = position
+        if last_seen:
+            return max(last_seen, key=last_seen.get)
+
+    # Nothing recognisable: fall back to a random guess.
+    return random.choice(all_choices)
+
+
+def csbench_process_results(doc: Dict, result: List[str]) -> Dict[str, float]:
+    """Score one model response against the gold answer of a CSBench item.
+
+    Args:
+        doc: Dataset record for the item being scored.
+        result: Model outputs for the item; only the first entry is inspected.
+
+    Returns:
+        {"accuracy": 1.0} when the parsed choice letter equals the gold
+        letter, {"accuracy": 0.0} otherwise.
+    """
+    valid_letters = csbench_doc_to_choice(doc)
+    predicted = parse_multi_choice_response(result[0], valid_letters)
+    expected = csbench_doc_to_target(doc)
+    return {"accuracy": 1.0 if predicted == expected else 0.0}