astabench/cli.py (13 changes: 12 additions & 1 deletion)
@@ -3,7 +3,7 @@
 
 import click
 from agenteval.cli import cli as ae_cli
-from agenteval.cli import eval_command, score_command
+from agenteval.cli import eval_command, score_command, edit_command, check_command, convert_command
 
 DEFAULT_CONFIG = "v1.0.0-dev1"
 SPLIT_NAMES = ["validation", "test"]
@@ -15,6 +15,10 @@ def get_config_path(config_name):
     return os.path.abspath(path)
 
 
+def add_astabench_interventions_registry(ctx, param, value):
+    return tuple(set(value).union({"astabench:astabench.interventions"}))
+
+
 for cmd in (eval_command, score_command):
     for param in cmd.params:
         if isinstance(param, click.Option) and param.name == "config_path":
@@ -24,5 +28,12 @@ def get_config_path(config_name):
         elif isinstance(param, click.Option) and param.name == "split":
             param.type = click.Choice(SPLIT_NAMES, case_sensitive=False)
 
+
+for cmd in (edit_command, convert_command, check_command):
+    for param in cmd.params:
+        if isinstance(param, click.Option) and param.name == "registry":
+            param.callback = add_astabench_interventions_registry
+
+
 # Export the CLI
 cli = ae_cli
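The new callback unions the built-in "astabench:astabench.interventions" entry into whatever registries the caller passes on the edit/convert/check commands, so the astabench registry is always consulted. A minimal sketch of the effect (the extra registry value is hypothetical; since the callback goes through set(), output order is unspecified):

    # Hypothetical illustration of the callback's effect (not part of the PR):
    user_value = ("mypkg:my.extra.registry",)
    merged = add_astabench_interventions_registry(None, None, user_value)
    assert "astabench:astabench.interventions" in merged
    assert "mypkg:my.extra.registry" in merged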
astabench/conversions.py (new file: 125 additions)
from agenteval.config import Task, load_suite_config
from agenteval.interventions import Intervention
from agenteval.leaderboard.models import LeaderboardSubmission
from agenteval.score import TaskResult
from astabench.cli import get_config_path


# src HF config -> target HF config -> split -> original name -> target name
TASK_NAME_ALIASES = {
    "1.0.0-dev1": {
        "1.0.0": {
            "test": {
                "paper_finder_test": "PaperFindingBench_test",
                "paper_finder_litqa2_test": "LitQA2_FullText_Search_test",
                "sqa_test": "ScholarQA_CS2_test",
                "arxivdigestables_test": "ArxivDIGESTables_Clean_test",
                "litqa2_test": "LitQA2_FullText_test",
                "discoverybench_test": "DiscoveryBench_test",
                "core_bench_test": "CORE_Bench_Hard_test",
                "ds1000_test": "DS_1000_test",
                "e2e_discovery_test": "E2E_Bench_test",
                "e2e_discovery_hard_test": "E2E_Bench_Hard_test",
                "super_test": "SUPER_Expert_test",
            },
            "validation": {
                "arxivdigestables_validation": "ArxivDIGESTables_Clean_validation",
                "sqa_dev": "ScholarQA_CS2_validation",
                "litqa2_validation": "LitQA2_FullText_validation",
                "paper_finder_validation": "PaperFindingBench_validation",
                "paper_finder_litqa2_validation": "LitQA2_FullText_Search_validation",
                "discoverybench_validation": "DiscoveryBench_validation",
                "core_bench_validation": "CORE_Bench_Hard_validation",
                "ds1000_validation": "DS_1000_validation",
                "e2e_discovery_validation": "E2E_Bench_validation",
                "e2e_discovery_hard_validation": "E2E_Bench_Hard_validation",
                "super_validation": "SUPER_Expert_validation",
            },
        }
    }
}


def convert_one_task_result(
    result: TaskResult,
    split: str,
    src_hf_config: str,
    target_hf_config: str,
    target_tasks_by_name: dict[str, Task],
) -> bool:
    changed_something = False

    original_task_name = result.task_name
    if original_task_name in TASK_NAME_ALIASES.get(src_hf_config, {}).get(
        target_hf_config, {}
    ).get(split, {}):
        new_task_name = TASK_NAME_ALIASES[src_hf_config][target_hf_config][split][
            original_task_name
        ]
        result.task_name = new_task_name

    final_task_name = result.task_name
    if original_task_name != final_task_name:
        changed_something = True

    if final_task_name not in target_tasks_by_name:
        print(f"Unknown final task name {final_task_name}")
    else:
        expected_primary_metric_name = target_tasks_by_name[
            final_task_name
        ].primary_metric
        if expected_primary_metric_name not in result.available_metrics():
            print(
                f"Expected {expected_primary_metric_name} as the primary metric for task {final_task_name}, but don't have it."
            )

    return changed_something


def convert_task_results(
    task_results: list[TaskResult],
    split: str,
    src_hf_config: str,
    target_hf_config: str,
    target_tasks_by_name: dict[str, Task],
) -> bool:
    changed_something = False
    for result in task_results:
        # changes happen in place
        changed_this_result = convert_one_task_result(
            result=result,
            split=split,
            src_hf_config=src_hf_config,
            target_hf_config=target_hf_config,
            target_tasks_by_name=target_tasks_by_name,
        )
        changed_something = changed_something or changed_this_result

    return changed_something


def convert_lb_submission_to_astabench_config(
    lb_submission: LeaderboardSubmission, config_name: str
) -> bool:
    src_suite_config = lb_submission.suite_config
    target_suite_config = load_suite_config(get_config_path(config_name))

    target_tasks_by_name = target_suite_config.get_tasks_by_name(lb_submission.split)

    # changes are made in place
    changed_something = convert_task_results(
        task_results=lb_submission.results,
        split=lb_submission.split,
        src_hf_config=src_suite_config.version,
        target_hf_config=target_suite_config.version,
        target_tasks_by_name=target_tasks_by_name,
    )
    if src_suite_config != target_suite_config:
        lb_submission.suite_config = target_suite_config
        changed_something = True

    return changed_something


convert_1_0_0_dev1_to_1_0_0 = Intervention(
    eligible=lambda x: x.lb_submission.suite_config.version == "1.0.0-dev1",
    transform=lambda x: convert_lb_submission_to_astabench_config(x, "v1.0.0"),
)

Review comment (Contributor Author), on the Intervention above: "conversion rule example"
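To make the rename table concrete, here is a hypothetical, standalone illustration of the lookup that convert_one_task_result performs (not part of the PR):

    # Map a 1.0.0-dev1 validation task name to its 1.0.0 name, falling back
    # to the original name when no alias exists:
    old_name = "sqa_dev"
    new_name = (
        TASK_NAME_ALIASES.get("1.0.0-dev1", {})
        .get("1.0.0", {})
        .get("validation", {})
        .get(old_name, old_name)
    )
    assert new_name == "ScholarQA_CS2_validation"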
astabench/edits.py (new file: 98 additions)
from agenteval.interventions import Intervention, LbSubmissionWithDetails
from agenteval.leaderboard.models import LeaderboardSubmission


EXPECTED_SUBMITTERS = {"Ai2", "Elicit", "SciSpace"}


KNOWN_AI2_SUBMITTERS = {
    "danyhai2",
    "miked-ai",
    "aakanksha19",
    "aryeh_tiktinsky_ai2",
    "varshak1",
    "aps6992",
    "pclark425",
}


ACCEPTABLE_OPENNESS = {
    "Open source & open weights",
    "Open source & closed weights",
    "Closed source & API available",
    "Closed source & UI only",
}


OPENNESS_MAPPING = {
    "Open Source + Open Weights": "Open source & open weights",
    "Open Source": "Open source & closed weights",
    "API Available": "Closed source & API available",
    "Closed": "Closed source & UI only",
}


def has_openness(
    submission_with_details: LbSubmissionWithDetails, opennesses: set[str]
) -> bool:
    return submission_with_details.lb_submission.submission.openness in opennesses


def has_submitter(
    submission_with_details: LbSubmissionWithDetails, usernames: set[str]
) -> bool:
    return submission_with_details.lb_submission.submission.username in usernames


def has_any_non_null_costs(
    submission_with_details: LbSubmissionWithDetails,
) -> bool:
    return any(
        r.model_costs is not None
        for r in submission_with_details.lb_submission.results
    )


def has_any_non_null_model_usages(
    submission_with_details: LbSubmissionWithDetails,
) -> bool:
    return any(
        r.model_usages is not None
        for r in submission_with_details.lb_submission.results
    )


def set_submitter(lb_submission: LeaderboardSubmission, new_submitter: str) -> bool:
    """Return true if something changed."""
    current_submitter = lb_submission.submission.username
    lb_submission.submission.username = new_submitter
    return current_submitter != lb_submission.submission.username


def normalize_submitter(lb_submission: LeaderboardSubmission) -> bool:
    """Return true if something changed."""
    current_submitter = lb_submission.submission.username
    agent_name = lb_submission.submission.agent_name
    if current_submitter not in KNOWN_AI2_SUBMITTERS:
        # Submitters outside the known-Ai2 set are left unchanged.
        return False
    if agent_name in {"Elicit", "SciSpace"}:
        return set_submitter(lb_submission, agent_name)
    return set_submitter(lb_submission, "Ai2")


def normalize_openness(lb_submission: LeaderboardSubmission) -> bool:
    """Return true if something changed."""
    current_openness = lb_submission.submission.openness
    lb_submission.submission.openness = OPENNESS_MAPPING.get(
        current_openness, current_openness
    )
    return current_openness != lb_submission.submission.openness


normalize_submitter_intervention = Intervention(
    eligible=lambda x: not has_submitter(x, EXPECTED_SUBMITTERS),
    transform=normalize_submitter,
)

Review comment (Contributor Author), on the Intervention above: "edit example"

normalize_openness_intervention = Intervention(
    eligible=lambda x: not has_openness(x, ACCEPTABLE_OPENNESS),
    transform=normalize_openness,
)
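As a quick sanity check of the edit logic, a hypothetical walk-through of the openness normalization (standalone, not part of the PR):

    # A legacy label is rewritten to its canonical form...
    legacy = "Open Source + Open Weights"
    assert OPENNESS_MAPPING.get(legacy, legacy) == "Open source & open weights"
    assert OPENNESS_MAPPING.get(legacy, legacy) in ACCEPTABLE_OPENNESS

    # ...while a value already in canonical form passes through unchanged,
    # so normalize_openness reports no change for it.
    canonical = "Closed source & API available"
    assert OPENNESS_MAPPING.get(canonical, canonical) == canonical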
astabench/interventions.py (new file: 24 additions)
from agenteval.interventions import (
    CONVERSION_INTERVENTION_KIND,
    EDIT_INTERVENTION_KIND,
    Intervention,
)
from astabench.conversions import convert_1_0_0_dev1_to_1_0_0
from astabench.edits import normalize_submitter_intervention, normalize_openness_intervention


# intervention kind -> config name -> intervention name -> Intervention
INTERVENTIONS: dict[str, dict[str, dict[str, Intervention]]] = {
    EDIT_INTERVENTION_KIND: {
        "1.0.0": {
            "normalize_submitter": normalize_submitter_intervention,
            "normalize_openness": normalize_openness_intervention,
        },
        "1.0.0-dev1": {
            "normalize_submitter": normalize_submitter_intervention,
            "normalize_openness": normalize_openness_intervention,
        },
    },
    CONVERSION_INTERVENTION_KIND: {
        "1.0.0-dev1": {
            "convert_1_0_0_dev1_to_1_0_0": convert_1_0_0_dev1_to_1_0_0,
        }
    },
}
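Tying the pieces together: the registry string injected by cli.py ("astabench:astabench.interventions") points at this module, and the INTERVENTIONS table is keyed kind -> config name -> intervention name, per the comment above. A hypothetical lookup (assuming agenteval resolves interventions through this nesting, which the table's shape implies but the PR does not show):

    # Hypothetical resolution of the conversion rule (not part of the PR):
    rule = INTERVENTIONS[CONVERSION_INTERVENTION_KIND]["1.0.0-dev1"][
        "convert_1_0_0_dev1_to_1_0_0"
    ]
    # The rule's transform only runs on submissions whose suite_config.version
    # is "1.0.0-dev1", per its `eligible` predicate.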