diff --git a/evaluation/data_processing/answer_extraction.py b/evaluation/data_processing/answer_extraction.py index c2691d9..14c0b23 100644 --- a/evaluation/data_processing/answer_extraction.py +++ b/evaluation/data_processing/answer_extraction.py @@ -127,7 +127,7 @@ def strip_string(string): string = string.replace("infinity", "\\infty") if "\\infty" not in string: string = string.replace("inf", "\\infty") - string = string.replace("+\\inity", "\\infty") + string = string.replace("+\\infty", "\\infty") # and # string = string.replace("and", "") @@ -305,7 +305,6 @@ def extract_ocwcourses_few_shot_answer(question, reasoning, task): patt = regex.search(r"final answer is (?P.*)\. I hope it is correct.", reasoning) if patt is None: pred = "[invalid]" - print(f"DEBUG >>>\n{reasoning}", flush=True) else: pred = patt.group('ans') return pred @@ -331,7 +330,6 @@ def extract_cmath_few_shot_test(question, reasoning, task): try: ans = [s for s in regex.findall(r'-?\d+\.?\d*', ans)][-1] except: - print(f"DEBUG CMATH: {reasoning}", flush=True) ans = "[invalid]" else: ans = extract_last_single_answer(question, reasoning, task) diff --git a/evaluation/eval/eval_script.py b/evaluation/eval/eval_script.py index 0718130..3ab13fd 100644 --- a/evaluation/eval/eval_script.py +++ b/evaluation/eval/eval_script.py @@ -19,9 +19,6 @@ def is_correct(item, pred_key='prediction', prec=1e-3): if is_correct(item_cpy, pred_key=pred_key, prec=prec): pred_matched.add(i) ans_matched.add(j) - if item_cpy[pred_key] == '2,3,4': - print(item, flush=True) - print("wtf", flush=True) return len(pred_matched) == len(pred) and len(ans_matched) == len(ans) elif isinstance(pred, str) and isinstance(ans, str): if '\\cup' in pred and '\\cup' in ans: @@ -40,8 +37,7 @@ def is_correct(item, pred_key='prediction', prec=1e-3): label = label or (ans and pred == ans) or math_equal(pred, ans) return label else: - print(item, flush=True) - raise NotImplementedError() + raise NotImplementedError(f"Unsupported types: pred={type(pred)}, ans={type(ans)}") def eval_math(item, pred_key='prediction', prec=1e-3): pred = item[pred_key]