Skip to content

Commit 200507c

Browse files
leifericf and claude committed
fix(bench): count partially-completed questions in multi-layer runs
When budget exhaustion interrupts a multi-layer benchmark (e.g. --layers raw,full --max-questions 1), questions that completed some layers but not all were excluded from results entirely, producing misleading output like "questions=0 full=0.00" even though a question scored correct on the full layer.

Now stages->results includes questions with at least one fully-scored layer, and aggregate-scores, when computing each per-layer mean, only counts the questions that have a score for that layer.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 057502d commit 200507c

2 files changed

Lines changed: 41 additions & 14 deletions

File tree

src/noumenon/benchmark.clj

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -533,7 +533,8 @@
533533
mean (fn [xs] (if (seq xs) (/ (reduce + xs) (count xs)) 0.0))
534534
layer-key (fn [layer] (keyword (str (name layer) "-score")))
535535
layer-mean (fn [layer rs]
536-
(mean (mapv #(score-value (get % (layer-key layer))) rs)))
536+
(let [scored (filterv #(contains? % (layer-key layer)) rs)]
537+
(mean (mapv #(score-value (get % (layer-key layer))) scored))))
537538
by-cat (group-by :category results)
538539
det-rs (filterv #(= :deterministic (:scoring %)) results)
539540
llm-rs (filterv #(not= :deterministic (:scoring %)) results)
@@ -548,9 +549,11 @@
548549
{:question-count (count results)
549550
:canonical canonical?
550551
:deterministic-count (count det-rs)
551-
:deterministic-mean (mean (mapv #(score-value (get % (layer-key primary-layer))) det-rs))
552+
:deterministic-mean (let [scored (filterv #(contains? % (layer-key primary-layer)) det-rs)]
553+
(mean (mapv #(score-value (get % (layer-key primary-layer))) scored)))
552554
:llm-judged-count (count llm-rs)
553-
:llm-judged-mean (mean (mapv #(score-value (get % (layer-key primary-layer))) llm-rs))
555+
:llm-judged-mean (let [scored (filterv #(contains? % (layer-key primary-layer)) llm-rs)]
556+
(mean (mapv #(score-value (get % (layer-key primary-layer))) scored)))
554557
:per-category (into {}
555558
(map (fn [[cat rs]]
556559
[cat (reduce (fn [m layer]
@@ -952,16 +955,32 @@
952955
:completed-at (java.util.Date.)})
953956
(throw e))))))))
954957

958+
(defn- layer-stage-keys
959+
"Return stage keys for a single [question, layer] pair under the given mode."
960+
[question layer mode]
961+
(let [qid (:id question)
962+
skip-judge? (:skip-judge mode)
963+
deterministic? (= :deterministic (:scoring question))]
964+
(cond-> [[qid layer :answer]]
965+
(or (not skip-judge?) deterministic?)
966+
(conj [qid layer :judge]))))
967+
955968
(defn stages->results
956-
"Convert stages map to result seq. Only includes questions with all expected stages complete.
969+
"Convert stages map to result seq. Includes questions with at least one layer
970+
fully scored (answer + judge complete). Layers that haven't finished are omitted
971+
from the result map rather than counted as wrong.
957972
Results include per-layer scores keyed as :<layer>-score, :<layer>-reasoning, :<layer>-answer."
958973
([questions stages] (stages->results questions stages nil))
959974
([questions stages mode]
960975
(let [layers (resolve-layers (or mode {}))]
961976
(for [q questions
962-
:let [qid (:id q)
963-
exp-keys (if mode (mode-stage-keys q mode) (stage-keys q layers))]
964-
:when (every? #(contains? stages %) exp-keys)]
977+
:let [qid (:id q)
978+
complete-layers (filterv
979+
(fn [layer]
980+
(every? #(contains? stages %)
981+
(layer-stage-keys q layer mode)))
982+
layers)]
983+
:when (seq complete-layers)]
965984
(reduce (fn [result layer]
966985
(let [judge-key [qid layer :judge]
967986
judge (get-in stages [judge-key :result])
@@ -977,7 +996,7 @@
977996
:category (:category q)
978997
:scoring (:scoring q)
979998
:query-name (:query-name q)}
980-
layers)))))
999+
complete-layers)))))
9811000

9821001
;; --- Rate limiting ---
9831002

test/noumenon/benchmark_test.clj

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -341,12 +341,20 @@
341341
(is (= "ra" (:raw-answer (first results))))
342342
(is (= :partial (:raw-score (first results))))))
343343

344-
(deftest stages->results-incomplete-question-excluded
345-
(let [qs [{:id :q01 :category :single-hop :query-name "test"}]
346-
stages {[:q01 :full :answer] {:status :ok :result "qa"}
347-
[:q01 :full :judge] {:status :ok :result {:score :correct :reasoning "good"}}}
348-
results (vec (bench/stages->results qs stages))]
349-
(is (= 0 (count results)))))
344+
(deftest stages->results-partial-layers-included
345+
(testing "Question with only :full complete is included with :full-score only"
346+
(let [qs [{:id :q01 :category :single-hop :query-name "test"}]
347+
stages {[:q01 :full :answer] {:status :ok :result "qa"}
348+
[:q01 :full :judge] {:status :ok :result {:score :correct :reasoning "good"}}}
349+
results (vec (bench/stages->results qs stages))]
350+
(is (= 1 (count results)))
351+
(is (= :correct (:full-score (first results))))
352+
(is (nil? (:raw-score (first results))))))
353+
(testing "Question with no complete layers is excluded"
354+
(let [qs [{:id :q01 :category :single-hop :query-name "test"}]
355+
stages {[:q01 :full :answer] {:status :ok :result "qa"}}
356+
results (vec (bench/stages->results qs stages))]
357+
(is (= 0 (count results))))))
350358

351359
(deftest stages->results-nil-judge-uses-wrong-when-judge-present
352360
(let [qs [{:id :q01 :category :single-hop :query-name "test"}]

0 commit comments

Comments
 (0)