Every metric is a single class in a single file. It extends BaseMetric, implements measure(), and returns a Score. That's it.
from harness_evals.core.metric import BaseMetric
from harness_evals.core.score import Score
from harness_evals.core.eval_case import EvalCase
class MyMetric(BaseMetric):
def __init__(self, threshold: float = 1.0, **kwargs) -> None:
super().__init__(name="my_metric", threshold=threshold, **kwargs)
def measure(self, eval_case: EvalCase) -> Score:
value = ... # compute 0.0–1.0
return Score(
name=self.name,
value=value,
threshold=self.threshold,
)Score.passed is auto-computed from value >= threshold — never set it manually.
Every metric belongs to exactly one dimension (ADR-009):
| Dimension | When to use |
|---|---|
| `CORRECTNESS` | Metric compares output to expected answer or evaluates task completion |
| `GROUNDEDNESS` | Metric checks if output is supported by provided context/evidence |
| `SAFETY` | Metric detects policy violations (PII, toxicity, injection, unauthorized actions) |
| `TRAJECTORY` | Metric evaluates the path taken (tool selection, plan quality, step efficiency) |
| `PERFORMANCE` | Metric measures operational cost (latency, tokens, dollars, retries) |
If your metric doesn't clearly fit one dimension, it may be compound — consider splitting it.
| Category | When to Use | Base Class |
|---|---|---|
| `deterministic/` | Exact comparison, regex, numeric | `BaseMetric` |
| `structural/` | JSON/YAML diff, schema validation | `BaseMetric` |
| `operational/` | Latency, cost, tokens, retries | `BaseMetric` |
| `reliability/` | Multi-run consistency, robustness | `ReliabilityMetric` |
| `llm_judge/` | LLM scores against criteria | `BaseMetric` (takes `llm` param) |
| `rag/` | Faithfulness, relevancy, context | `BaseMetric` (takes `llm` and/or `embedding` param) |
| `similarity/` | Levenshtein, BLEU, embedding similarity | `BaseMetric` (optionally takes `embedding` param) |
| `safety/` | PII, toxicity, injection, hallucination | `BaseMetric` |
| `agent/` | Tool correctness, task completion | `BaseMetric` |
| `conversation/` | Multi-turn coherence, resolution | `BaseMetric` |
| `mcp/` | Tool selection, trace completeness | `BaseMetric` |
src/harness_evals/metrics/<category>/<metric_name>.py
For metrics that compare output vs expected:
class MyDeterministicMetric(BaseMetric):
def __init__(self, threshold: float = 1.0, **kwargs) -> None:
super().__init__(name="my_metric", threshold=threshold, **kwargs)
def measure(self, eval_case: EvalCase) -> Score:
actual = str(eval_case.output)
expected = str(eval_case.expected)
value = 1.0 if actual == expected else 0.0
return Score(
name=self.name,
value=value,
threshold=self.threshold,
)For metrics that read typed fields from EvalCase:
class MyOperationalMetric(BaseMetric):
def __init__(self, max_value: float = 100, threshold: float = 0.5, **kwargs) -> None:
super().__init__(name="my_metric", threshold=threshold, **kwargs)
self.max_value = max_value
def measure(self, eval_case: EvalCase) -> Score:
if eval_case.latency_ms is None:
return Score(
name=self.name,
value=0.0,
threshold=self.threshold,
reason="latency_ms not provided",
)
value = max(0.0, 1.0 - eval_case.latency_ms / self.max_value)
return Score(
name=self.name,
value=value,
threshold=self.threshold,
metadata={"latency_ms": eval_case.latency_ms, "max_value": self.max_value},
)For metrics that evaluate across multiple runs:
from harness_evals.core.metric import ReliabilityMetric
class MyReliabilityMetric(ReliabilityMetric):
def __init__(self, threshold: float = 0.8, k: int = 5, **kwargs) -> None:
super().__init__(name="my_metric", threshold=threshold, k=k, **kwargs)
def measure_runs(self, eval_case: EvalCase) -> Score:
runs = eval_case.runs or []
if len(runs) < 2:
return Score(
name=self.name,
value=0.0,
threshold=self.threshold,
reason=f"Need at least 2 runs, got {len(runs)}",
)
value = ... # compute consistency/variance across runs
return Score(
name=self.name,
value=value,
threshold=self.threshold,
metadata={"k": len(runs)},
)For metrics that use an LLM as a judge — override a_measure() and use _run_async for the sync wrapper:
from harness_evals._async_compat import _run_async
from harness_evals.llm.base import BaseLLM
class MyLLMMetric(BaseMetric):
def __init__(self, llm: BaseLLM, threshold: float = 0.7, **kwargs) -> None:
super().__init__(name="my_metric", threshold=threshold, **kwargs)
self.llm = llm
def measure(self, eval_case: EvalCase) -> Score:
return _run_async(self.a_measure(eval_case))
async def a_measure(self, eval_case: EvalCase) -> Score:
prompt = f"Rate the following response...\nInput: {eval_case.input}\nOutput: {eval_case.output}"
result = await self.llm.generate_json(prompt, schema={"score": "number"})
value = max(0.0, min(1.0, result.get("score", 0.0)))
return Score(
name=self.name,
value=value,
threshold=self.threshold,
)For metrics that use embedding similarity — take a BaseEmbedding parameter:
from harness_evals._async_compat import _run_async
from harness_evals.llm.embedding import BaseEmbedding, _cosine_similarity
class MyEmbeddingMetric(BaseMetric):
def __init__(self, embedding: BaseEmbedding, threshold: float = 0.8, **kwargs) -> None:
super().__init__(name="my_metric", threshold=threshold, **kwargs)
self.embedding = embedding
def measure(self, eval_case: EvalCase) -> Score:
return _run_async(self.a_measure(eval_case))
async def a_measure(self, eval_case: EvalCase) -> Score:
vectors = await self.embedding.embed([str(eval_case.output), str(eval_case.expected)])
value = max(0.0, min(1.0, _cosine_similarity(vectors[0], vectors[1])))
return Score(
name=self.name,
value=value,
threshold=self.threshold,
)Add to src/harness_evals/metrics/<category>/__init__.py:
from harness_evals.metrics.<category>.<metric_name> import MyMetric

Add to src/harness_evals/metrics/__init__.py:

from harness_evals.metrics.<category> import MyMetric

Create tests/metrics/test_<metric_name>.py:
import pytest
from harness_evals.core.eval_case import EvalCase
from harness_evals.metrics.<category> import MyMetric
@pytest.mark.unit
class TestMyMetric:
def test_perfect_score(self):
ec = EvalCase(input="q", output="expected", expected="expected")
score = MyMetric(threshold=0.8).measure(ec)
assert score.passed
assert score.value == 1.0
def test_failure(self):
ec = EvalCase(input="q", output="wrong", expected="expected")
score = MyMetric(threshold=0.8).measure(ec)
assert not score.passed
assert score.value < 0.8
def test_edge_case(self):
ec = EvalCase(input="", output="")
score = MyMetric().measure(ec)
assert isinstance(score.value, float)ruff check src/ tests/ # lint
ruff format --check src/ tests/ # format
pytest tests/ -v # test

- One metric per file — keeps PRs small and reviewable.
- One dimension per metric — every metric belongs to exactly one of: Correctness, Groundedness, Safety, Trajectory, Performance. See ADR-009.
- Score is always [0.0, 1.0] — normalize whatever you compute. Put raw values in `Score.metadata`.
- Never raise from `measure()` — return a failing Score with a `reason` instead. If you do raise, `evaluate()` catches it, but explicit is better.
- Handle missing data gracefully — operational metrics should check typed fields and return a failing Score with a clear reason if None.
- No global state — all configuration goes in `__init__()`. Metrics are reusable across eval cases.
- No cross-metric imports — metrics should not import from other metrics.
- Safety metrics are hard constraints — see ADR-003.
- Don't set `passed` manually — `Score` auto-computes `passed = value >= threshold` in `__post_init__`.