-
Notifications
You must be signed in to change notification settings - Fork 18
[DO NOT MERGE] sticky edits and conversions PoC #101
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
ca16
wants to merge
9
commits into
main
Choose a base branch
from
chloea-repo-combo-test
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
97899b1
save
ca16 bbc727d
checkpoint 2
ca16 8f576c1
add astabench registry always
ca16 e572bce
closer to repair
ca16 a7dbaed
closer to repair
ca16 82bb9c4
rename repairs
ca16 c71753a
split out edits and add some more rules
ca16 79ea3a1
convert first pass
ca16 61a389d
chck first pass
ca16 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| from agenteval.config import Task, load_suite_config | ||
| from agenteval.interventions import Intervention | ||
| from agenteval.leaderboard.models import LeaderboardSubmission, Readme | ||
| from agenteval.score import TaskResult | ||
| from astabench.cli import get_config_path | ||
|
|
||
|
|
||
# Task renames to apply when results move from one suite-config version to
# another. Nesting, outermost first:
#   source HF config -> target HF config -> split -> {original name: target name}
TASK_NAME_ALIASES: dict[str, dict[str, dict[str, dict[str, str]]]] = {
    "1.0.0-dev1": {
        "1.0.0": {
            "test": {
                "paper_finder_test": "PaperFindingBench_test",
                "paper_finder_litqa2_test": "LitQA2_FullText_Search_test",
                "sqa_test": "ScholarQA_CS2_test",
                "arxivdigestables_test": "ArxivDIGESTables_Clean_test",
                "litqa2_test": "LitQA2_FullText_test",
                "discoverybench_test": "DiscoveryBench_test",
                "core_bench_test": "CORE_Bench_Hard_test",
                "ds1000_test": "DS_1000_test",
                "e2e_discovery_test": "E2E_Bench_test",
                "e2e_discovery_hard_test": "E2E_Bench_Hard_test",
                "super_test": "SUPER_Expert_test",
            },
            "validation": {
                "arxivdigestables_validation": "ArxivDIGESTables_Clean_validation",
                "sqa_dev": "ScholarQA_CS2_validation",
                "litqa2_validation": "LitQA2_FullText_validation",
                "paper_finder_validation": "PaperFindingBench_validation",
                "paper_finder_litqa2_validation": "LitQA2_FullText_Search_validation",
                "discoverybench_validation": "DiscoveryBench_validation",
                "core_bench_validation": "CORE_Bench_Hard_validation",
                "ds1000_validation": "DS_1000_validation",
                "e2e_discovery_validation": "E2E_Bench_validation",
                "e2e_discovery_hard_validation": "E2E_Bench_Hard_validation",
                "super_validation": "SUPER_Expert_validation",
            },
        },
    },
}
|
|
||
|
|
||
def convert_one_task_result(
    result: TaskResult,
    split: str,
    src_hf_config: str,
    target_hf_config: str,
    target_tasks_by_name: dict[str, Task],
):
    """Rename one task result in place to its target-config task name.

    The rename is looked up in ``TASK_NAME_ALIASES`` under
    ``src_hf_config -> target_hf_config -> split``; a result whose name has
    no alias there keeps its name. After the (possible) rename, a message is
    printed if the final name is not a key of ``target_tasks_by_name``, or
    if that task's ``primary_metric`` is not among
    ``result.available_metrics()``. Those checks only report; they never
    change the result.

    Returns:
        True if ``result.task_name`` ended up different from its starting
        value, False otherwise.
    """
    name_before = result.task_name

    # Any missing level (source config, target config, split) means "no aliases".
    split_aliases = (
        TASK_NAME_ALIASES.get(src_hf_config, {})
        .get(target_hf_config, {})
        .get(split, {})
    )
    if name_before in split_aliases:
        result.task_name = split_aliases[name_before]

    name_after = result.task_name
    was_renamed = name_before != name_after

    if name_after not in target_tasks_by_name:
        print(f"Unknown final task name {name_after}")
        return was_renamed

    primary_metric = target_tasks_by_name[name_after].primary_metric
    if primary_metric not in result.available_metrics():
        print(
            f"Expected {primary_metric} as the primary metric for task {name_after}, but don't have it."
        )

    return was_renamed
|
|
||
|
|
||
def convert_task_results(
    task_results: list[TaskResult],
    split: str,
    src_hf_config: str,
    target_hf_config: str,
    target_tasks_by_name: dict[str, Task],
):
    """Apply ``convert_one_task_result`` to every result, in place.

    Returns:
        True if at least one result was changed, False otherwise (including
        when ``task_results`` is empty).
    """
    # Build the full list first so every result is converted; feeding a lazy
    # generator to any() would stop at the first change.
    change_flags = [
        convert_one_task_result(
            result=task_result,
            split=split,
            src_hf_config=src_hf_config,
            target_hf_config=target_hf_config,
            target_tasks_by_name=target_tasks_by_name,
        )
        for task_result in task_results
    ]
    return any(change_flags)
|
|
||
|
|
||
def convert_lb_submission_to_astabench_config(lb_submission: LeaderboardSubmission, config_name: str) -> bool:
    """Convert a leaderboard submission, in place, to the named suite config.

    Loads the suite config found at ``get_config_path(config_name)``,
    renames the submission's task results to that config's task names, and
    replaces ``lb_submission.suite_config`` with the loaded config when the
    two differ.

    Returns:
        True if any task result was renamed or the suite config was
        replaced, False if the submission was left untouched.
    """
    old_config = lb_submission.suite_config
    new_config = load_suite_config(get_config_path(config_name))

    submission_split = lb_submission.split
    tasks_in_new_config = new_config.get_tasks_by_name(submission_split)

    # Results are modified in place.
    results_changed = convert_task_results(
        task_results=lb_submission.results,
        split=submission_split,
        src_hf_config=old_config.version,
        target_hf_config=new_config.version,
        target_tasks_by_name=tasks_in_new_config,
    )

    if old_config != new_config:
        lb_submission.suite_config = new_config
        return True

    return results_changed
|
|
||
|
|
||
def _is_1_0_0_dev1_submission(submission_with_details) -> bool:
    """Eligibility check: the submission's suite config version is 1.0.0-dev1."""
    return submission_with_details.lb_submission.suite_config.version == "1.0.0-dev1"


def _convert_to_v1_0_0(lb_submission) -> bool:
    """Transform: convert the submission in place to the "v1.0.0" config."""
    return convert_lb_submission_to_astabench_config(lb_submission, "v1.0.0")


# Conversion from suite config 1.0.0-dev1 to 1.0.0.
# NOTE(review): `eligible` reads `.lb_submission` off its argument, while
# `transform` hands its argument straight to a function annotated as taking a
# LeaderboardSubmission -- presumably Intervention passes the two callbacks
# different objects; confirm against agenteval.interventions.
convert_1_0_0_dev1_to_1_0_0 = Intervention(
    eligible=_is_1_0_0_dev1_submission,
    transform=_convert_to_v1_0_0,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
| from agenteval.leaderboard.models import LeaderboardSubmission | ||
| from agenteval.interventions import Intervention, LbSubmissionWithDetails, CONVERSION_INTERVENTION_KIND, EDIT_INTERVENTION_KIND | ||
|
|
||
|
|
||
# Submitter names that need no normalization: a submission whose username is
# already one of these is not eligible for normalize_submitter_intervention.
EXPECTED_SUBMITTERS: set[str] = {
    "Ai2",
    "Elicit",
    "SciSpace",
}

# Usernames that normalize_submitter rewrites (to the agent name for
# Elicit/SciSpace agents, otherwise to "Ai2").
KNOWN_AI2_SUBMITTERS: set[str] = {
    "danyhai2",
    "miked-ai",
    "aakanksha19",
    "aryeh_tiktinsky_ai2",
    "varshak1",
    "aps6992",
    "pclark425",
}

# Openness labels that need no normalization: a submission whose openness is
# one of these is not eligible for normalize_openness_intervention.
ACCEPTABLE_OPENNESS: set[str] = {
    "Open source & open weights",
    "Open source & closed weights",
    "Closed source & API available",
    "Closed source & UI only",
}

# Other openness label -> the acceptable label normalize_openness replaces it
# with. Every value here is a member of ACCEPTABLE_OPENNESS.
OPENNESS_MAPPING: dict[str, str] = {
    "Open Source + Open Weights": "Open source & open weights",
    "Open Source": "Open source & closed weights",
    "API Available": "Closed source & API available",
    "Closed": "Closed source & UI only",
}
|
|
||
|
|
||
def has_openness(
    submission_with_details: LbSubmissionWithDetails, opennesses: set[str]
) -> bool:
    """Return whether the submission's openness label is one of ``opennesses``."""
    submission = submission_with_details.lb_submission.submission
    return submission.openness in opennesses
|
|
||
|
|
||
def has_submitter(
    submission_with_details: LbSubmissionWithDetails, usernames: set[str]
) -> bool:
    """Return whether the submission's username is one of ``usernames``."""
    submission = submission_with_details.lb_submission.submission
    return submission.username in usernames
|
|
||
|
|
||
def has_any_non_null_costs(
    submission_with_details: LbSubmissionWithDetails,
) -> bool:
    """Return whether any task result of the submission has ``model_costs`` set.

    Returns False when the submission has no results, or when every
    result's ``model_costs`` is None.
    """
    # A generator (not a list) lets any() stop at the first non-null entry.
    return any(
        result.model_costs is not None
        for result in submission_with_details.lb_submission.results
    )
|
|
||
|
|
||
def has_any_non_null_model_usages(
    submission_with_details: LbSubmissionWithDetails,
) -> bool:
    """Return whether any task result of the submission has ``model_usages`` set.

    Returns False when the submission has no results, or when every
    result's ``model_usages`` is None.
    """
    # A generator (not a list) lets any() stop at the first non-null entry.
    return any(
        result.model_usages is not None
        for result in submission_with_details.lb_submission.results
    )
|
|
||
|
|
||
def set_submitter(lb_submission: LeaderboardSubmission, new_submitter: str) -> bool:
    """Overwrite the submission's username with ``new_submitter``, in place.

    Returns:
        True if the stored username differs from what it was before the
        call, False if it was already equal to the new value.
    """
    submission = lb_submission.submission
    previous_username = submission.username
    submission.username = new_submitter
    # Compare against the value read back after assignment.
    return previous_username != submission.username
|
|
||
|
|
||
def normalize_submitter(lb_submission: LeaderboardSubmission) -> bool:
    """Rewrite a known Ai2 username to its canonical submitter name, in place.

    If the current username is in ``KNOWN_AI2_SUBMITTERS``, it is replaced
    by the agent name when that is "Elicit" or "SciSpace", and by "Ai2"
    otherwise. Any other username is left untouched.

    Returns:
        True if the username changed, False otherwise.
    """
    submission = lb_submission.submission
    if submission.username not in KNOWN_AI2_SUBMITTERS:
        # Not a username this rule knows how to relabel: nothing changes.
        # (Previously this path hit `return to_return` with the variable
        # never assigned and raised UnboundLocalError.)
        return False

    agent_name = submission.agent_name
    if agent_name in {"Elicit", "SciSpace"}:
        return set_submitter(lb_submission, agent_name)
    return set_submitter(lb_submission, "Ai2")
|
|
||
|
|
||
def normalize_openness(lb_submission: LeaderboardSubmission) -> bool:
    """Replace the submission's openness label via ``OPENNESS_MAPPING``, in place.

    A label that is not a key of ``OPENNESS_MAPPING`` is written back
    unchanged.

    Returns:
        True if the stored openness differs from what it was before the
        call, False otherwise.
    """
    submission = lb_submission.submission
    previous_openness = submission.openness
    submission.openness = OPENNESS_MAPPING.get(previous_openness, previous_openness)
    return previous_openness != submission.openness
|
|
||
|
|
||
def _needs_submitter_normalization(
    submission_with_details: LbSubmissionWithDetails,
) -> bool:
    """Eligibility check: the username is not one of EXPECTED_SUBMITTERS."""
    return not has_submitter(submission_with_details, EXPECTED_SUBMITTERS)


# PR review note left on this definition by the author: "edit example".
normalize_submitter_intervention = Intervention(
    eligible=_needs_submitter_normalization,
    transform=normalize_submitter,
)
|
|
||
def _needs_openness_normalization(
    submission_with_details: LbSubmissionWithDetails,
) -> bool:
    """Eligibility check: the openness label is not one of ACCEPTABLE_OPENNESS."""
    return not has_openness(submission_with_details, ACCEPTABLE_OPENNESS)


normalize_openness_intervention = Intervention(
    eligible=_needs_openness_normalization,
    transform=normalize_openness,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| from agenteval.leaderboard.models import LeaderboardSubmission | ||
| from agenteval.interventions import Intervention, LbSubmissionWithDetails, CONVERSION_INTERVENTION_KIND, EDIT_INTERVENTION_KIND | ||
| from astabench.conversions import convert_1_0_0_dev1_to_1_0_0 | ||
| from astabench.edits import normalize_submitter_intervention, normalize_openness_intervention | ||
|
|
||
|
|
||
# Registry of the interventions defined for astabench, keyed as:
#   intervention kind -> config name -> intervention name -> Intervention
INTERVENTIONS: dict[str, dict[str, dict[str, Intervention]]] = {
    EDIT_INTERVENTION_KIND: {
        # Both config versions carry the same edits; the comprehension still
        # gives each version its own dict object.
        config_name: {
            "normalize_submitter": normalize_submitter_intervention,
            "normalize_openness": normalize_openness_intervention,
        }
        for config_name in ("1.0.0", "1.0.0-dev1")
    },
    CONVERSION_INTERVENTION_KIND: {
        "1.0.0-dev1": {
            "convert_1_0_0_dev1_to_1_0_0": convert_1_0_0_dev1_to_1_0_0,
        },
    },
}
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
conversion rule example