Skip to content

Commit df8ffe5

Browse files
authored
Merge pull request #59 from agentevals-dev/peterj/addevalstatus
add EvalStatus enum
2 parents d826b9a + 1602869 commit df8ffe5

4 files changed

Lines changed: 49 additions & 4 deletions

File tree

packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ def my_evaluator(input: EvalInput) -> EvalResult:
2424
from .types import (
2525
EvalInput,
2626
EvalResult,
27+
EvalStatus,
2728
IntermediateStepData,
2829
InvocationData,
2930
ToolCallData,
@@ -34,6 +35,7 @@ def my_evaluator(input: EvalInput) -> EvalResult:
3435
"evaluator",
3536
"EvalInput",
3637
"EvalResult",
38+
"EvalStatus",
3739
"IntermediateStepData",
3840
"InvocationData",
3941
"ToolCallData",

packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/types.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from __future__ import annotations
99

10+
from enum import Enum
1011
from typing import Any, Optional
1112

1213
from pydantic import BaseModel, Field
@@ -63,13 +64,21 @@ class EvalInput(BaseModel):
6364
expected_invocations: Optional[list[InvocationData]] = None
6465

6566

67+
class EvalStatus(str, Enum):
68+
"""Wire JSON uses the string values (stable protocol with agentevals CLI)."""
69+
70+
PASSED = "PASSED"
71+
FAILED = "FAILED"
72+
NOT_EVALUATED = "NOT_EVALUATED"
73+
74+
6675
class EvalResult(BaseModel):
6776
"""Output payload expected from a custom evaluator script/container on stdout."""
6877

6978
score: float = Field(ge=0.0, le=1.0)
70-
status: Optional[str] = Field(
79+
status: Optional[EvalStatus] = Field(
7180
default=None,
72-
description='One of "PASSED", "FAILED", "NOT_EVALUATED". Derived from score vs threshold if omitted.',
81+
description="One of EvalStatus.PASSED, EvalStatus.FAILED, EvalStatus.NOT_EVALUATED. Derived from score vs threshold if omitted.",
7382
)
7483
per_invocation_scores: list[Optional[float]] = Field(default_factory=list)
7584
details: Optional[dict[str, Any]] = None

src/agentevals/_protocol.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from __future__ import annotations
1616

17+
from enum import Enum
1718
from typing import Any, Optional
1819

1920
from pydantic import BaseModel, Field
@@ -62,13 +63,21 @@ class EvalInput(BaseModel):
6263
expected_invocations: Optional[list[InvocationData]] = None
6364

6465

66+
class EvalStatus(str, Enum):
67+
"""Allowed ``status`` values on the evaluator JSON wire format (matches evaluator-sdk)."""
68+
69+
PASSED = "PASSED"
70+
FAILED = "FAILED"
71+
NOT_EVALUATED = "NOT_EVALUATED"
72+
73+
6574
class EvalResult(BaseModel):
6675
"""Output payload expected from a custom evaluator on stdout."""
6776

6877
score: float = Field(ge=0.0, le=1.0)
69-
status: Optional[str] = Field(
78+
status: Optional[EvalStatus] = Field(
7079
default=None,
71-
description='One of "PASSED", "FAILED", "NOT_EVALUATED". Derived from score vs threshold if omitted.',
80+
description="Derived from score vs threshold if omitted.",
7281
)
7382
per_invocation_scores: list[Optional[float]] = Field(default_factory=list)
7483
details: Optional[dict[str, Any]] = None

tests/test_protocol.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""Tests for the custom evaluator JSON protocol models."""
2+
3+
import pytest
4+
from pydantic import ValidationError
5+
6+
from agentevals._protocol import EvalResult, EvalStatus
7+
8+
9+
def test_eval_result_accepts_valid_status_strings() -> None:
10+
raw = '{"score":1.0,"status":"PASSED","per_invocation_scores":[1.0]}'
11+
r = EvalResult.model_validate_json(raw)
12+
assert r.status == EvalStatus.PASSED
13+
assert r.score == 1.0
14+
15+
16+
def test_eval_result_rejects_invalid_status() -> None:
17+
raw = '{"score":1.0,"status":"MAYBE","per_invocation_scores":[]}'
18+
with pytest.raises(ValidationError):
19+
EvalResult.model_validate_json(raw)
20+
21+
22+
def test_eval_result_omitted_status_ok() -> None:
23+
raw = '{"score":0.5,"per_invocation_scores":[]}'
24+
r = EvalResult.model_validate_json(raw)
25+
assert r.status is None

0 commit comments

Comments
 (0)