-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Expand file tree
/
Copy pathcode_agent_latest_contract.py
More file actions
94 lines (81 loc) · 2.63 KB
/
code_agent_latest_contract.py
File metadata and controls
94 lines (81 loc) · 2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
"""Shared contract for code-agent latest benchmark snapshots."""
from __future__ import annotations
import math
from typing import Any
# Agent identifier string used by this contract.
# NOTE(review): presumably matched against the snapshot's agent name by
# validators elsewhere — confirm against callers.
CODE_AGENT_LATEST_AGENT = "elizaos_vs_opencode"

# Field names grouped as "required provenance": result paths, command paths
# and trajectory directories, each listed for the target and the baseline.
CODE_AGENT_LATEST_REQUIRED_PROVENANCE_FIELDS: tuple[str, ...] = (
    "target_result_path",
    "baseline_result_path",
    "target_command_path",
    "baseline_command_path",
    "target_trajectory_dir",
    "baseline_trajectory_dir",
)

# Field names grouped as "required numeric": right/wrong/total counts, token
# and LLM-call statistics per side, followed by the target-vs-baseline deltas.
CODE_AGENT_LATEST_REQUIRED_NUMERIC_FIELDS: tuple[str, ...] = (
    # Outcome counts.
    "target_right",
    "target_wrong",
    "target_total",
    "baseline_right",
    "baseline_wrong",
    "baseline_total",
    # Target token / call statistics.
    "target_input_tokens",
    "target_output_tokens",
    "target_total_tokens",
    "target_cached_token_percent",
    "target_llm_call_count",
    # Baseline token / call statistics.
    "baseline_input_tokens",
    "baseline_output_tokens",
    "baseline_total_tokens",
    "baseline_cached_token_percent",
    "baseline_llm_call_count",
    # Deltas.
    "accuracy_delta",
    "input_token_delta",
    "output_token_delta",
    "total_token_delta",
    "llm_call_delta",
    "cached_token_percent_delta",
)

# Gate/readiness flag names grouped as "required true".
# NOTE(review): presumably each must be literally True in a valid snapshot —
# the enforcement is not visible in this module.
CODE_AGENT_LATEST_REQUIRED_TRUE_FIELDS: tuple[str, ...] = (
    "coverage_gate_ok",
    "benchmark_gate_ok",
    "required_stats_gate_ok",
    "efficiency_gate_ok",
    "quality_guardrail_gate_ok",
    "trajectory_review_gate_ok",
    "live_report_gate_ok",
    "report_gate_ok",
    "release_readiness_ok",
)

# Comparison statuses treated as acceptable by this contract.
CODE_AGENT_LATEST_ACCEPTABLE_COMPARISON_STATUSES: frozenset[str] = frozenset(
    ("superior", "comparable")
)
def expected_code_agent_comparison_status(payload: dict[str, Any]) -> str | None:
    """Derive the comparison status implied by the payload's accuracies.

    The target and baseline accuracies are resolved through
    ``code_agent_accuracy_for_status``. If either one cannot be resolved the
    result is ``None``; otherwise one of ``"weak"``, ``"inferior"``,
    ``"superior"`` or ``"comparable"`` is returned.
    """
    ours = code_agent_accuracy_for_status(payload, "target")
    theirs = code_agent_accuracy_for_status(payload, "baseline")
    if ours is None or theirs is None:
        return None

    # Accuracies closer than this are treated as a tie.
    tolerance = 1e-9

    if ours <= 0 and theirs <= 0:
        # Neither side got anything right, so the comparison is meaningless.
        verdict = "weak"
    elif ours + tolerance < theirs:
        verdict = "inferior"
    elif ours > theirs + tolerance:
        verdict = "superior"
    else:
        verdict = "comparable"
    return verdict
def code_agent_accuracy_for_status(
    payload: dict[str, Any],
    prefix: str,
) -> float | None:
    """Resolve the accuracy for one side of the comparison.

    An explicit ``<prefix>_accuracy`` entry wins when it is a finite number.
    Otherwise the accuracy is computed as ``<prefix>_right / <prefix>_total``,
    provided both are finite numbers and the total is positive.

    Args:
        payload: Snapshot mapping to read the fields from.
        prefix: Field-name prefix; the visible caller passes ``"target"`` or
            ``"baseline"``.

    Returns:
        The accuracy as a float, or ``None`` when it cannot be determined.
    """
    reported = payload.get(f"{prefix}_accuracy")
    if _is_finite_number(reported):
        return float(reported)

    numerator = payload.get(f"{prefix}_right")
    denominator = payload.get(f"{prefix}_total")
    if not (_is_finite_number(numerator) and _is_finite_number(denominator)):
        return None
    if float(denominator) <= 0:
        # A non-positive total cannot yield a meaningful ratio.
        return None
    return float(numerator) / float(denominator)
def _is_finite_number(value: Any) -> bool:
    """Return whether *value* is an int/float that converts to a finite float.

    Booleans are rejected even though ``bool`` is a subclass of ``int``.
    NaN and the infinities are rejected. Integers too large to be represented
    as a float are rejected as well, rather than letting ``float(value)``
    raise ``OverflowError``: callers convert accepted values with
    ``float(...)``, so such a value is not usable as a number here.

    Args:
        value: Arbitrary object to test.

    Returns:
        ``True`` when ``float(value)`` is safe and finite, else ``False``.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    try:
        return math.isfinite(float(value))
    except OverflowError:
        # int outside the float range, e.g. 10**400.
        return False