eliza/packages/benchmarks/orchestrator/scoring.py at develop · CrowWizard/eliza · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from __future__ import annotations

from collections.abc import Iterable
from pathlib import Path
from typing import Any

if __package__ == "orchestrator":
    from bench_cli_types import ScoreExtraction
    from registry import get_benchmark_registry, load_benchmark_result_json
else:
    from benchmarks.bench_cli_types import ScoreExtraction
    from benchmarks.registry import get_benchmark_registry, load_benchmark_result_json

from .types import ScoreSummary


def _flatten_pairs(obj: Any, prefix: str = "") -> Iterable[tuple[str, Any]]:
    if isinstance(obj, dict):
        for key, value in obj.items():
            path = f"{prefix}.{key}" if prefix else str(key)
            yield from _flatten_pairs(value, path)
    elif isinstance(obj, list):
        for idx, value in enumerate(obj):
            path = f"{prefix}[{idx}]"
            yield from _flatten_pairs(value, path)
    else:
        yield prefix, obj


def _coerce_number(value: Any) -> float | None:
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        stripped = value.strip().replace(",", "")
        if not stripped:
            return None
        try:
            return float(stripped)
        except ValueError:
            return None
    return None


# Key names probed by ``generic_score_extractor`` when looking for a headline
# score in a flattened result payload. Order matters: the extractor walks this
# tuple front to back and returns the first key that yields a numeric value,
# so earlier entries take priority over later ones. Matching is done with
# ``str.endswith`` on the flattened path, so a key also matches longer names
# that merely end with it.
GENERIC_SCORE_KEYS: tuple[str, ...] = (
    "overall_score",
    "score",
    "overall_success_rate",
    "overall_accuracy",
    "resolve_rate",
    "success_rate",
    "task_success_rate",
    "overall_step_accuracy",
    "accuracy",
    "final_reward",
    "max_net_worth",
)


def generic_score_extractor(result_path: Path) -> ScoreSummary:
    """Heuristically pull a headline score out of a benchmark result file.

    The payload loaded from *result_path* is flattened into ``path -> value``
    pairs. Each name in ``GENERIC_SCORE_KEYS`` is then tried in priority
    order; the first flattened path that ends with that name and whose value
    coerces to a number wins.

    Args:
        result_path: Location of the result file handed to
            ``load_benchmark_result_json``.

    Returns:
        A ``ScoreSummary`` with the detected score, ``higher_is_better=True``
        and the matched key recorded under ``metrics["auto_score_key"]``.
        The unit is ``"ratio"`` when the matched key contains ``rate``,
        ``accuracy`` or ``score``, otherwise ``None``. When nothing matches,
        every field of the summary is empty/``None``.
    """
    flattened = dict(_flatten_pairs(load_benchmark_result_json(result_path)))

    for score_key in GENERIC_SCORE_KEYS:
        for path, raw_value in flattened.items():
            if not path.endswith(score_key):
                continue

            parsed = _coerce_number(raw_value)
            if parsed is None:
                # Right name but not numeric; keep looking under this key.
                continue

            looks_like_ratio = any(
                marker in score_key for marker in ("rate", "accuracy", "score")
            )
            return ScoreSummary(
                score=parsed,
                unit="ratio" if looks_like_ratio else None,
                higher_is_better=True,
                metrics={"auto_score_key": score_key},
            )

    # No known key produced a usable number.
    return ScoreSummary(score=None, unit=None, higher_is_better=None, metrics={})


class RegistryScoreExtractor:
    """Hand out per-benchmark score extractors backed by the registry.

    The entries returned by ``get_benchmark_registry`` are indexed by their
    ``id`` once, at construction time. Benchmarks without a registry entry
    fall back to ``generic_score_extractor``.
    """

    def __init__(self, workspace_root: Path):
        """Index the benchmark registry for *workspace_root* by entry id."""
        entries_by_id = {}
        for registered in get_benchmark_registry(workspace_root):
            entries_by_id[registered.id] = registered
        self._registry_map = entries_by_id

    def for_benchmark(self, benchmark_id: str):
        """Return a callable mapping a result path to a ``ScoreSummary``.

        Args:
            benchmark_id: Identifier to look up among the registry entries.

        Returns:
            ``generic_score_extractor`` when *benchmark_id* is not registered;
            otherwise a closure that loads the result file and delegates to
            the matching entry's ``extract_score``.
        """
        try:
            entry = self._registry_map[benchmark_id]
        except KeyError:
            return generic_score_extractor

        def extractor(result_path: Path) -> ScoreSummary:
            # Let the registry entry interpret the payload, then copy its
            # fields into the orchestrator's own summary type.
            extracted: ScoreExtraction = entry.extract_score(
                load_benchmark_result_json(result_path)
            )
            return ScoreSummary(
                score=extracted.score,
                unit=extracted.unit,
                higher_is_better=extracted.higher_is_better,
                metrics=extracted.metrics,
            )

        return extractor