pr feedback

peterj · peterj · commit cd6620b1db14 · 2026-03-19T12:11:44.000-07:00
Signed-off-by: Peter Jausovec <peter.jausovec@solo.io>
diff --git a/docs/custom-evaluators.md b/docs/custom-evaluators.md
@@ -52,9 +52,12 @@ def response_quality(input: EvalInput) -> EvalResult:
         score=sum(scores) / len(scores) if scores else 0.0,
         per_invocation_scores=scores,
     )
+
+if __name__ == "__main__":
+    response_quality.run()
 ```
 
-The `@evaluator` decorator handles all the stdin/stdout plumbing. Your function receives an `EvalInput` and returns an `EvalResult`.
+The `@evaluator` decorator marks your function as an evaluator. Call `.run()` to execute it as a stdin/stdout script. Your function receives an `EvalInput` and returns an `EvalResult`. The decorated function can still be called directly in tests.
 
 ### 3. Add it to your eval config
 
diff --git a/examples/custom_evaluators/response_quality.py b/examples/custom_evaluators/response_quality.py
@@ -55,3 +55,7 @@ def response_quality(input: EvalInput) -> EvalResult:
         per_invocation_scores=scores,
         details={"issues": issues} if issues else None,
     )
+
+
+if __name__ == "__main__":
+    response_quality.run()
diff --git a/examples/custom_evaluators/tool_call_checker.py b/examples/custom_evaluators/tool_call_checker.py
@@ -30,3 +30,7 @@ def tool_call_checker(input: EvalInput) -> EvalResult:
         score=overall,
         per_invocation_scores=scores,
     )
+
+
+if __name__ == "__main__":
+    tool_call_checker.run()
diff --git a/packages/evaluator-sdk-py/README.md b/packages/evaluator-sdk-py/README.md
@@ -26,9 +26,12 @@ def my_evaluator(input: EvalInput) -> EvalResult:
         score=sum(scores) / len(scores) if scores else 0.0,
         per_invocation_scores=scores,
     )
+
+if __name__ == "__main__":
+    my_evaluator.run()
 ```
 
-The `@evaluator` decorator turns your function into a runnable script -- just execute it with `python my_evaluator.py`. It reads JSON from stdin, calls your function, and writes the result to stdout.
+The `@evaluator` decorator marks your function as a runnable evaluator. Call `.run()` to execute it as a stdin/stdout script -- it reads JSON from stdin, calls your function, and writes the result to stdout. The decorated function can still be called directly in tests.
 
 ## Types
 
diff --git a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/__init__.py
@@ -13,6 +13,9 @@ def my_evaluator(input: EvalInput) -> EvalResult:
             if not inv.final_response:
                 score -= 0.5
         return EvalResult(score=max(0.0, score))
+
+    if __name__ == "__main__":
+        my_evaluator.run()
 """
 
 from .decorator import evaluator
diff --git a/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py b/packages/evaluator-sdk-py/src/agentevals_evaluator_sdk/decorator.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import inspect
 import sys
 import traceback
@@ -12,16 +13,10 @@
 
 
 def evaluator(fn: Callable[[EvalInput], EvalResult]) -> Callable[[EvalInput], EvalResult]:
-    """Decorator that turns an evaluator function into a runnable stdin/stdout script.
+    """Decorator that marks a function as a runnable evaluator.
 
-    When the decorated module is executed (``python my_evaluator.py``), it:
-    1. Reads JSON from stdin and parses it into an :class:`EvalInput`.
-    2. Calls the decorated function with the parsed input.
-    3. Serializes the returned :class:`EvalResult` to stdout as JSON.
-
-    The decorated function can be sync or async.
-
-    Example::
+    The decorated function can still be called normally (e.g. in tests).
+    To run it as a stdin/stdout script, call ``.run()``::
 
         from agentevals_evaluator_sdk import evaluator, EvalInput, EvalResult
 
@@ -32,9 +27,17 @@ def format_check(input: EvalInput) -> EvalResult:
                 if not inv.final_response:
                     score -= 0.5
             return EvalResult(score=max(0.0, score))
+
+        if __name__ == "__main__":
+            format_check.run()
     """
 
-    def _run() -> None:
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        return fn(*args, **kwargs)
+
+    def run() -> None:
+        """Read EvalInput from stdin, call the evaluator, write EvalResult to stdout."""
         raw = sys.stdin.read()
         if not raw.strip():
             _write_error("No input received on stdin")
@@ -65,10 +68,8 @@ def _run() -> None:
         sys.stdout.write("\n")
         sys.stdout.flush()
 
-    import atexit
-    atexit.register(_run)
-
-    return fn
+    wrapper.run = run
+    return wrapper
 
 
 def _write_error(msg: str) -> None:
diff --git a/src/agentevals/evaluator/templates.py b/src/agentevals/evaluator/templates.py
@@ -43,6 +43,10 @@ def ${name}(input: EvalInput) -> EvalResult:
         score=overall,
         per_invocation_scores=scores,
     )
+
+
+if __name__ == "__main__":
+    ${name}.run()
 ''')
 
 

Original file line number	Diff line number	Diff line change
`@@ -55,3 +55,7 @@ def response_quality(input: EvalInput) -> EvalResult:`
`55`	`55`	`per_invocation_scores=scores,`
`56`	`56`	`details={"issues": issues} if issues else None,`
`57`	`57`	`)`
	`58`	`+`
	`59`	`+`
	`60`	`+if __name__ == "__main__":`
	`61`	`+    response_quality.run()`
Original file line number	Diff line number	Diff line change
`@@ -30,3 +30,7 @@ def tool_call_checker(input: EvalInput) -> EvalResult:`
`30`	`30`	`score=overall,`
`31`	`31`	`per_invocation_scores=scores,`
`32`	`32`	`)`
	`33`	`+`
	`34`	`+`
	`35`	`+if __name__ == "__main__":`
	`36`	`+    tool_call_checker.run()`
Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,10 @@ def ${name}(input: EvalInput) -> EvalResult:`
`43`	`43`	`score=overall,`
`44`	`44`	`per_invocation_scores=scores,`
`45`	`45`	`)`
	`46`	`+`
	`47`	`+`
	`48`	`+if __name__ == "__main__":`
	`49`	`+    ${name}.run()`
`46`	`50`	`''')`
`47`	`51`
`48`	`52`