Skip to content

Commit 53f4df7

Browse files
wow-miley and claude committed
AMPR-185 #532: ampere-eval 3 — Meter & Tolerance grading interface
Add Reading, Tolerance, Meter (fun interface), MeterError (sealed), OutcomeMeter, CompositeMeter (WeightedMeter), and JudgeMeter (JudgeClient SAM + PlaybackRelay seam) under link.socket.ampere.eval.meter. Includes 26 commonTest tests covering all tasks (3.1–3.5). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 97060ca commit 53f4df7

8 files changed

Lines changed: 662 additions & 0 deletions

File tree

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package link.socket.ampere.eval.meter
2+
3+
import link.socket.ampere.eval.trace.Trace
4+
5+
/**
 * A [Meter] paired with a relative [weight] and a [required] flag.
 *
 * [weight] is this meter's share of the [CompositeMeter] weighted mean. It must be
 * finite and non-negative: a negative or non-finite weight would let the aggregate
 * leave the range spanned by the child scores, or make it silently collapse to `0.0`.
 *
 * If [required] is `true`:
 * - a `Result.failure` from this meter is returned by [CompositeMeter] immediately;
 * - a reading whose [Reading.passed] is `false` forces the composite reading to fail,
 *   while the remaining children are still measured and the score is still computed.
 *
 * @property meter the child meter to run.
 * @property weight relative weight of [meter]'s score in the weighted mean.
 * @property required whether a failure of [meter] must fail the composite.
 * @throws IllegalArgumentException if [weight] is negative, infinite or NaN.
 */
data class WeightedMeter(
    val meter: Meter,
    val weight: Double,
    val required: Boolean = false,
) {
    init {
        // Reject weights that cannot take part in a meaningful weighted mean.
        require(weight.isFinite() && weight >= 0.0) {
            "weight must be finite and non-negative, was $weight"
        }
    }
}
16+
17+
/**
 * Combines several child meters into one [Reading] whose score is their weighted mean.
 *
 * Rules applied by [measure]:
 * - Children are measured in list order.
 * - A required child returning `Result.failure` ends the measurement; that same result
 *   is returned to the caller.
 * - A non-required child returning `Result.failure` is left out of the mean.
 * - If no child produced a reading, the result is a [MeterError.NoReadings] failure.
 * - The score is `sum(weight * score) / sum(weight)` over the children that produced a
 *   reading, or `0.0` when that weight sum is not positive.
 * - The composite passes only if every required child's reading passed and the
 *   aggregate score satisfies [tolerance].
 */
class CompositeMeter(
    val meterId: String,
    private val tolerance: Tolerance,
    private val children: List<WeightedMeter>,
) : Meter {

    override suspend fun measure(trace: Trace): Result<Reading> {
        val scored = mutableListOf<Pair<WeightedMeter, Reading>>()

        children.forEach { child ->
            val outcome = child.meter.measure(trace)
            val reading = outcome.getOrNull()
            if (reading != null) {
                scored += child to reading
            } else if (child.required) {
                // A required child could not be measured at all: surface its failure as-is.
                return outcome
            }
            // Otherwise an optional child failed and is simply ignored.
        }

        if (scored.isEmpty()) {
            return Result.failure(MeterError.NoReadings(meterId))
        }

        var weightSum = 0.0
        var weightedScoreSum = 0.0
        for ((child, reading) in scored) {
            weightSum += child.weight
            weightedScoreSum += child.weight * reading.score
        }
        // Guard the division: an all-zero weight set yields a score of 0.0.
        val meanScore = if (weightSum > 0.0) weightedScoreSum / weightSum else 0.0

        val requiredAllPassed = scored.all { (child, reading) -> !child.required || reading.passed }
        val verdict = requiredAllPassed && tolerance.passes(meanScore)

        return Result.success(
            Reading(score = meanScore, passed = verdict, meterId = meterId),
        )
    }
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
package link.socket.ampere.eval.meter
2+
3+
import link.socket.ampere.agents.domain.routing.CognitiveRelay
4+
import link.socket.ampere.eval.trace.Trace
5+
6+
/**
 * A single LLM completion call (prompt in, text out) made on behalf of a [JudgeMeter].
 *
 * Declared as a `fun interface`, so an implementation can be supplied as a lambda —
 * for example a stub that always returns the same response in tests.
 */
fun interface JudgeClient {
    /**
     * Sends [prompt] for completion.
     *
     * @return the completion text on success, or a failed [Result] carrying the error.
     */
    suspend fun call(prompt: String): Result<String>
}
16+
17+
/**
 * LLM-grades a [Trace] against a [rubric], parsing a normalized score from the response.
 *
 * [measure] builds a prompt from the rubric and the trace's events, sends it through
 * [client], and extracts the score from the reply. For tests, inject a stubbed
 * [JudgeClient] lambda.
 *
 * [relay] is exposed as a public property for callers that wire routing; [measure]
 * itself does not read it.
 *
 * The judge response must contain `"Score: X.X"` (case-insensitive, `score` as a whole
 * word) where `X.X` is a value in `0.0..1.0`. Anything else is a
 * [MeterError.MalformedJudgeResponse].
 */
class JudgeMeter(
    val meterId: String,
    private val tolerance: Tolerance,
    private val rubric: String,
    val relay: CognitiveRelay,
    private val client: JudgeClient,
) : Meter {

    /**
     * Grades [trace] with the judge.
     *
     * @return a successful [Reading] carrying the parsed score, or a failure with
     * [MeterError.EmptyTrace] when the trace has no events,
     * [MeterError.MalformedJudgeResponse] when no valid score can be parsed, or the
     * error returned by [client].
     */
    override suspend fun measure(trace: Trace): Result<Reading> {
        if (trace.events.isEmpty()) {
            return Result.failure(MeterError.EmptyTrace(meterId))
        }
        // Client failures are passed through unchanged.
        val response = client.call(buildPrompt(trace)).getOrElse { return Result.failure(it) }
        val score = parseScore(response)
            ?: return Result.failure(MeterError.MalformedJudgeResponse(meterId, response))
        return Result.success(
            Reading(score = score, passed = tolerance.passes(score), meterId = meterId),
        )
    }

    /** Renders the rubric and one line per trace event into the judge prompt. */
    private fun buildPrompt(trace: Trace): String {
        val eventLines = trace.events.joinToString("\n") { " [${it.index}] ${it.type} at ${it.timestamp}" }
        // Continuation lines sit at column 0 on purpose: the raw string is not trimmed,
        // so any indentation here would become part of the prompt.
        return """Grade the trajectory below against the rubric.
Respond with exactly: "Score: X.X" (0.0 to 1.0) then your reasoning.

Rubric: $rubric

Trajectory (${trace.events.size} events):
$eventLines"""
    }

    companion object {
        // `\b` keeps the pattern from matching inside longer words such as "subscore:"
        // or "underscore:", which would otherwise be graded on the wrong number.
        private val SCORE_REGEX = Regex("""(?i)\bscore:\s*(\d+(?:\.\d+)?)""")

        /**
         * Extracts the first `Score: <number>` value from [response].
         *
         * @return the score, or `null` when there is no match or the matched number
         * lies outside `0.0..1.0`.
         */
        fun parseScore(response: String): Double? =
            SCORE_REGEX.find(response)
                ?.groupValues?.get(1)
                ?.toDoubleOrNull()
                ?.takeIf { it in 0.0..1.0 }
    }
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package link.socket.ampere.eval.meter
2+
3+
import kotlinx.serialization.Serializable
4+
import link.socket.ampere.eval.trace.Trace
5+
6+
/**
 * The outcome of one [Meter] measurement.
 *
 * @property score the numeric score assigned by the meter.
 * @property passed whether the meter considered the measurement a pass.
 * @property meterId identifier of the meter that produced this reading.
 * @property detail optional free-form string key/value annotations; empty by default.
 */
@Serializable
data class Reading(
    val score: Double,
    val passed: Boolean,
    val meterId: String,
    val detail: Map<String, String> = emptyMap(),
)
13+
14+
/**
 * Pass threshold for a score.
 *
 * @property minScore the lowest score that still counts as a pass (inclusive).
 */
data class Tolerance(val minScore: Double) {
    /** Returns `true` when [score] is greater than or equal to [minScore]. */
    fun passes(score: Double): Boolean = minScore <= score
}
17+
18+
/**
 * Grades a [Trace] and reports the outcome as a [Reading].
 *
 * Declared as a `fun interface`, so a meter can be written as a lambda.
 */
fun interface Meter {
    /**
     * Measures [trace].
     *
     * @return the [Reading] on success, or a failed [Result] when no reading could be produced.
     */
    suspend fun measure(trace: Trace): Result<Reading>
}
21+
22+
/**
 * Failures a meter can report through `Result.failure`.
 *
 * Each subclass keeps the values used to build its message as properties, so callers
 * can inspect them without parsing [message].
 */
sealed class MeterError(message: String, cause: Throwable? = null) : Exception(message, cause) {
    /** The trace handed to meter [meterId] contained no events. */
    class EmptyTrace(val meterId: String) : MeterError("[$meterId] trace has no events")

    /** None of the children of composite meter [meterId] produced a reading. */
    class NoReadings(val meterId: String) : MeterError("[$meterId] no child meters produced a reading")

    /** The judge [response] received by meter [meterId] did not contain a parsable score. */
    class MalformedJudgeResponse(val meterId: String, val response: String) :
        MeterError("[$meterId] judge response could not be parsed: «$response»")
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package link.socket.ampere.eval.meter
2+
3+
import link.socket.ampere.eval.trace.Trace
4+
import link.socket.ampere.eval.trace.TraceEvent
5+
6+
/**
 * Grades a [Trace] by applying [predicate] to its last event.
 *
 * - An empty trace yields a [MeterError.EmptyTrace] failure.
 * - A match scores `1.0` with no detail.
 * - A mismatch scores `0.0` and records the last event's type under `terminal_type`,
 *   plus `match = false`, in [Reading.detail].
 *
 * In both cases [tolerance] decides [Reading.passed] from the score.
 */
class OutcomeMeter(
    val meterId: String,
    private val tolerance: Tolerance,
    private val predicate: (TraceEvent) -> Boolean,
) : Meter {

    override suspend fun measure(trace: Trace): Result<Reading> {
        val lastEvent = trace.events.lastOrNull()
            ?: return Result.failure(MeterError.EmptyTrace(meterId))

        val reading = if (predicate(lastEvent)) {
            Reading(
                score = 1.0,
                passed = tolerance.passes(1.0),
                meterId = meterId,
                detail = emptyMap(),
            )
        } else {
            Reading(
                score = 0.0,
                passed = tolerance.passes(0.0),
                meterId = meterId,
                detail = mapOf("terminal_type" to lastEvent.type, "match" to "false"),
            )
        }
        return Result.success(reading)
    }
}
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
package link.socket.ampere.eval.meter
2+
3+
import kotlin.test.Test
4+
import kotlin.test.assertEquals
5+
import kotlin.test.assertFalse
6+
import kotlin.test.assertIs
7+
import kotlin.test.assertTrue
8+
import kotlinx.coroutines.test.runTest
9+
import link.socket.ampere.eval.trace.Trace
10+
11+
/**
 * AMPR-185 task 3.3 validation.
 *
 * Covers [CompositeMeter]: weighted-mean scoring, required-child pass/fail rules,
 * failure propagation, and the no-readings case.
 */
class CompositeMeterTest {

    private val alwaysPass = Tolerance(minScore = 0.0)

    // Aggregate scores come out of floating-point arithmetic, so compare with a
    // tolerance instead of relying on the roundings happening to land exactly.
    private val scoreEpsilon = 1e-9

    @Test
    fun `aggregate score equals the weighted mean of children`() = runTest {
        val composite = CompositeMeter(
            meterId = "composite",
            tolerance = alwaysPass,
            children = listOf(
                WeightedMeter(meter = fixedMeter("a", 0.8), weight = 1.0),
                WeightedMeter(meter = fixedMeter("b", 0.4), weight = 3.0),
            ),
        )
        val reading = composite.measure(anyTrace()).getOrThrow()

        // (1.0 * 0.8 + 3.0 * 0.4) / 4.0 = (0.8 + 1.2) / 4.0 = 0.5
        assertEquals(0.5, reading.score, scoreEpsilon)
    }

    @Test
    fun `passes when all required children pass`() = runTest {
        val composite = CompositeMeter(
            meterId = "composite",
            tolerance = Tolerance(0.5),
            children = listOf(
                WeightedMeter(meter = fixedMeter("a", 1.0), weight = 1.0, required = true),
                WeightedMeter(meter = fixedMeter("b", 1.0), weight = 1.0, required = true),
            ),
        )
        val reading = composite.measure(anyTrace()).getOrThrow()

        assertTrue(reading.passed)
    }

    @Test
    fun `one failing required child fails the composite`() = runTest {
        val composite = CompositeMeter(
            meterId = "composite",
            tolerance = Tolerance(0.0),
            children = listOf(
                WeightedMeter(meter = fixedMeter("a", 1.0), weight = 1.0, required = true),
                WeightedMeter(meter = fixedMeter("b", 0.0), weight = 1.0, required = true),
            ),
        )
        val reading = composite.measure(anyTrace()).getOrThrow()

        assertFalse(reading.passed)
    }

    @Test
    fun `required child failure propagates immediately as Result failure`() = runTest {
        val boom = MeterError.EmptyTrace("boom")
        val failingMeter = Meter { _ -> Result.failure(boom) }
        val composite = CompositeMeter(
            meterId = "composite",
            tolerance = alwaysPass,
            children = listOf(
                WeightedMeter(meter = failingMeter, weight = 1.0, required = true),
            ),
        )
        val result = composite.measure(anyTrace())

        assertTrue(result.isFailure)
        assertEquals(boom, result.exceptionOrNull())
    }

    @Test
    fun `non-required child failure is skipped`() = runTest {
        val failingMeter = Meter { _ -> Result.failure(MeterError.EmptyTrace("optional")) }
        val composite = CompositeMeter(
            meterId = "composite",
            tolerance = alwaysPass,
            children = listOf(
                WeightedMeter(meter = failingMeter, weight = 1.0, required = false),
                WeightedMeter(meter = fixedMeter("b", 0.6), weight = 1.0),
            ),
        )
        val reading = composite.measure(anyTrace()).getOrThrow()

        assertEquals(0.6, reading.score, scoreEpsilon)
    }

    @Test
    fun `all children failing returns NoReadings`() = runTest {
        val failingMeter = Meter { _ -> Result.failure(MeterError.EmptyTrace("x")) }
        val composite = CompositeMeter(
            meterId = "composite",
            tolerance = alwaysPass,
            children = listOf(WeightedMeter(meter = failingMeter, weight = 1.0, required = false)),
        )
        val result = composite.measure(anyTrace())

        assertTrue(result.isFailure)
        assertIs<MeterError.NoReadings>(result.exceptionOrNull())
    }

    // region — fixtures

    /** A minimal single-event trace; these tests never inspect its content. */
    private fun anyTrace() = Trace(
        id = "t",
        runId = "r",
        arcId = "a",
        createdAt = 0L,
        events = listOf(
            link.socket.ampere.eval.trace.TraceEvent(
                index = 0,
                timestamp = 1L,
                type = "Anything",
                payload = kotlinx.serialization.json.buildJsonObject {},
            ),
        ),
    )

    /** A meter that always succeeds with [score]; its reading passes when `score >= 0.5`. */
    private fun fixedMeter(id: String, score: Double): Meter {
        val reading = Reading(score = score, passed = score >= 0.5, meterId = id)
        return Meter { _ -> Result.success(reading) }
    }

    // endregion
}

0 commit comments

Comments
 (0)