diff --git a/astabench/cli.py b/astabench/cli.py
index f3296c39..753684e9 100644
--- a/astabench/cli.py
+++ b/astabench/cli.py
@@ -3,7 +3,7 @@
 import click
 
 from agenteval.cli import cli as ae_cli
-from agenteval.cli import eval_command, score_command
+from agenteval.cli import eval_command, score_command, edit_command, check_command, convert_command
 
 DEFAULT_CONFIG = "v1.0.0-dev1"
 SPLIT_NAMES = ["validation", "test"]
@@ -15,6 +15,11 @@ def get_config_path(config_name):
     return os.path.abspath(path)
 
 
+def add_astabench_interventions_registry(ctx, param, value):
+    """Click callback that always includes the astabench interventions registry."""
+    return tuple(set(value).union({"astabench:astabench.interventions"}))
+
+
 for cmd in (eval_command, score_command):
     for param in cmd.params:
         if isinstance(param, click.Option) and param.name == "config_path":
@@ -24,5 +29,12 @@ def get_config_path(config_name):
         elif isinstance(param, click.Option) and param.name == "split":
             param.type = click.Choice(SPLIT_NAMES, case_sensitive=False)
 
+
+for cmd in (edit_command, convert_command, check_command):
+    for param in cmd.params:
+        if isinstance(param, click.Option) and param.name == "registry":
+            param.callback = add_astabench_interventions_registry
+
+
 # Export the CLI
 cli = ae_cli
diff --git a/astabench/conversions.py b/astabench/conversions.py
new file mode 100644
index 00000000..070c5570
--- /dev/null
+++ b/astabench/conversions.py
@@ -0,0 +1,136 @@
+"""Convert leaderboard submissions from one suite config to another."""
+
+from agenteval.config import Task, load_suite_config
+from agenteval.interventions import Intervention
+from agenteval.leaderboard.models import LeaderboardSubmission
+from agenteval.score import TaskResult
+
+from astabench.cli import get_config_path
+
+
+# src HF config -> target HF config -> split -> original name -> target name
+TASK_NAME_ALIASES = {
+    "1.0.0-dev1": {
+        "1.0.0": {
+            "test": {
+                "paper_finder_test": "PaperFindingBench_test",
+                "paper_finder_litqa2_test": "LitQA2_FullText_Search_test",
+                "sqa_test": "ScholarQA_CS2_test",
+                "arxivdigestables_test": "ArxivDIGESTables_Clean_test",
+                "litqa2_test": "LitQA2_FullText_test",
+                "discoverybench_test": "DiscoveryBench_test",
+                "core_bench_test": "CORE_Bench_Hard_test",
+                "ds1000_test": "DS_1000_test",
+                "e2e_discovery_test": "E2E_Bench_test",
+                "e2e_discovery_hard_test": "E2E_Bench_Hard_test",
+                "super_test": "SUPER_Expert_test",
+            },
+            "validation": {
+                "arxivdigestables_validation": "ArxivDIGESTables_Clean_validation",
+                "sqa_dev": "ScholarQA_CS2_validation",
+                "litqa2_validation": "LitQA2_FullText_validation",
+                "paper_finder_validation": "PaperFindingBench_validation",
+                "paper_finder_litqa2_validation": "LitQA2_FullText_Search_validation",
+                "discoverybench_validation": "DiscoveryBench_validation",
+                "core_bench_validation": "CORE_Bench_Hard_validation",
+                "ds1000_validation": "DS_1000_validation",
+                "e2e_discovery_validation": "E2E_Bench_validation",
+                "e2e_discovery_hard_validation": "E2E_Bench_Hard_validation",
+                "super_validation": "SUPER_Expert_validation",
+            },
+        }
+    }
+}
+
+
+def convert_one_task_result(
+    result: TaskResult,
+    split: str,
+    src_hf_config: str,
+    target_hf_config: str,
+    target_tasks_by_name: dict[str, Task],
+) -> bool:
+    """Rename one task result in place; return True if anything changed."""
+    changed_something = False
+
+    original_task_name = result.task_name
+    if original_task_name in TASK_NAME_ALIASES.get(src_hf_config, {}).get(
+        target_hf_config, {}
+    ).get(split, {}):
+        new_task_name = TASK_NAME_ALIASES[src_hf_config][target_hf_config][split][
+            original_task_name
+        ]
+        result.task_name = new_task_name
+
+    final_task_name = result.task_name
+    if original_task_name != final_task_name:
+        changed_something = True
+
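+    # The checks below are deliberately non-fatal: unknown task names and
+    # missing primary metrics are only reported, not raised.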
+    if final_task_name not in target_tasks_by_name:
+        print(f"Unknown final task name {final_task_name}")
+    else:
+        expected_primary_metric_name = target_tasks_by_name[
+            final_task_name
+        ].primary_metric
+        if expected_primary_metric_name not in result.available_metrics():
+            print(
+                f"Expected {expected_primary_metric_name} as the primary metric"
+                f" for task {final_task_name}, but don't have it."
+            )
+
+    return changed_something
+
+
+def convert_task_results(
+    task_results: list[TaskResult],
+    split: str,
+    src_hf_config: str,
+    target_hf_config: str,
+    target_tasks_by_name: dict[str, Task],
+) -> bool:
+    """Convert every result in place; return True if anything changed."""
+    changed_something = False
+    for result in task_results:
+        # changes happen in place
+        changed_this_thing = convert_one_task_result(
+            result=result,
+            split=split,
+            src_hf_config=src_hf_config,
+            target_hf_config=target_hf_config,
+            target_tasks_by_name=target_tasks_by_name,
+        )
+        changed_something = changed_something or changed_this_thing
+
+    return changed_something
+
+
+def convert_lb_submission_to_astabench_config(
+    lb_submission: LeaderboardSubmission, config_name: str
+) -> bool:
+    """Migrate a submission to the named config; return True if anything changed."""
+    src_suite_config = lb_submission.suite_config
+    target_suite_config = load_suite_config(get_config_path(config_name))
+
+    target_tasks_by_name = target_suite_config.get_tasks_by_name(lb_submission.split)
+
+    # changes are made in place
+    changed_something = convert_task_results(
+        task_results=lb_submission.results,
+        split=lb_submission.split,
+        src_hf_config=src_suite_config.version,
+        target_hf_config=target_suite_config.version,
+        target_tasks_by_name=target_tasks_by_name,
+    )
+    if src_suite_config != target_suite_config:
+        lb_submission.suite_config = target_suite_config
+        changed_something = True
+
+    return changed_something
+
+
+convert_1_0_0_dev1_to_1_0_0 = Intervention(
+    eligible=lambda x: x.lb_submission.suite_config.version == "1.0.0-dev1",
+    transform=lambda x: convert_lb_submission_to_astabench_config(x, "v1.0.0"),
+)
diff --git a/astabench/edits.py b/astabench/edits.py
new file mode 100644
index 00000000..007c489a
--- /dev/null
+++ b/astabench/edits.py
@@ -0,0 +1,103 @@
+"""Edits that normalize submitter and openness metadata on submissions."""
+
+from agenteval.interventions import Intervention, LbSubmissionWithDetails
+from agenteval.leaderboard.models import LeaderboardSubmission
+
+
+EXPECTED_SUBMITTERS = {"Ai2", "Elicit", "SciSpace"}
+
+
+KNOWN_AI2_SUBMITTERS = {
+    "danyhai2",
+    "miked-ai",
+    "aakanksha19",
+    "aryeh_tiktinsky_ai2",
+    "varshak1",
+    "aps6992",
+    "pclark425",
+}
+
+
+ACCEPTABLE_OPENNESS = {
+    "Open source & open weights",
+    "Open source & closed weights",
+    "Closed source & API available",
+    "Closed source & UI only",
+}
+
+
+OPENNESS_MAPPING = {
+    "Open Source + Open Weights": "Open source & open weights",
+    "Open Source": "Open source & closed weights",
+    "API Available": "Closed source & API available",
+    "Closed": "Closed source & UI only",
+}
+
+
+def has_openness(
+    submission_with_details: LbSubmissionWithDetails, opennesses: set[str]
+) -> bool:
+    return submission_with_details.lb_submission.submission.openness in opennesses
+
+
+def has_submitter(
+    submission_with_details: LbSubmissionWithDetails, usernames: set[str]
+) -> bool:
+    return submission_with_details.lb_submission.submission.username in usernames
+
+
+def has_any_non_null_costs(
+    submission_with_details: LbSubmissionWithDetails,
+) -> bool:
+    """True if any task result reports model cost data."""
+    return any(
+        r.model_costs is not None
+        for r in submission_with_details.lb_submission.results
+    )
+
+
+def has_any_non_null_model_usages(
+    submission_with_details: LbSubmissionWithDetails,
+) -> bool:
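+    """True if any task result reports model usage data."""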
+    return any(
+        r.model_usages is not None
+        for r in submission_with_details.lb_submission.results
+    )
+
+
+def set_submitter(lb_submission: LeaderboardSubmission, new_submitter: str) -> bool:
+    """Return True if something changed."""
+    current_submitter = lb_submission.submission.username
+    lb_submission.submission.username = new_submitter
+    return current_submitter != lb_submission.submission.username
+
+
+def normalize_submitter(lb_submission: LeaderboardSubmission) -> bool:
+    """Return True if something changed."""
+    current_submitter = lb_submission.submission.username
+    agent_name = lb_submission.submission.agent_name
+    if current_submitter not in KNOWN_AI2_SUBMITTERS:
+        return False
+    if agent_name in {"Elicit", "SciSpace"}:
+        return set_submitter(lb_submission, agent_name)
+    return set_submitter(lb_submission, "Ai2")
+
+
+def normalize_openness(lb_submission: LeaderboardSubmission) -> bool:
+    """Return True if something changed."""
+    current_openness = lb_submission.submission.openness
+    replace_with = OPENNESS_MAPPING.get(current_openness, current_openness)
+    lb_submission.submission.openness = replace_with
+    return current_openness != lb_submission.submission.openness
+
+
+normalize_submitter_intervention = Intervention(
+    eligible=lambda x: not has_submitter(x, EXPECTED_SUBMITTERS),
+    transform=normalize_submitter,
+)
+
+normalize_openness_intervention = Intervention(
+    eligible=lambda x: not has_openness(x, ACCEPTABLE_OPENNESS),
+    transform=normalize_openness,
+)
diff --git a/astabench/interventions.py b/astabench/interventions.py
new file mode 100644
index 00000000..5cd27e75
--- /dev/null
+++ b/astabench/interventions.py
@@ -0,0 +1,41 @@
+"""Registry of astabench interventions, keyed by kind and config name."""
+
+from agenteval.interventions import (
+    CONVERSION_INTERVENTION_KIND,
+    EDIT_INTERVENTION_KIND,
+    Intervention,
+)
+
+from astabench.conversions import convert_1_0_0_dev1_to_1_0_0
+from astabench.edits import (
+    normalize_openness_intervention,
+    normalize_submitter_intervention,
+)
+
+
+# intervention kind -> config name -> intervention name -> Intervention
+INTERVENTIONS: dict[str, dict[str, dict[str, Intervention]]] = {
+    EDIT_INTERVENTION_KIND: {
+        "1.0.0": {
+            "normalize_submitter": normalize_submitter_intervention,
+            "normalize_openness": normalize_openness_intervention,
+        },
+        "1.0.0-dev1": {
+            "normalize_submitter": normalize_submitter_intervention,
+            "normalize_openness": normalize_openness_intervention,
+        },
+    },
+    CONVERSION_INTERVENTION_KIND: {
+        "1.0.0-dev1": {
+            "convert_1_0_0_dev1_to_1_0_0": convert_1_0_0_dev1_to_1_0_0,
+        }
+    },
+}
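+
+# Example lookup (a sketch; assumes the agenteval intervention protocol in
+# which `eligible` receives an LbSubmissionWithDetails and `transform`
+# receives its underlying LeaderboardSubmission, as in astabench.edits):
+#
+#     intervention = INTERVENTIONS[EDIT_INTERVENTION_KIND]["1.0.0"]["normalize_openness"]
+#     if intervention.eligible(submission_with_details):
+#         changed = intervention.transform(submission_with_details.lb_submission)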