|
533 | 533 | mean (fn [xs] (if (seq xs) (/ (reduce + xs) (count xs)) 0.0)) |
534 | 534 | layer-key (fn [layer] (keyword (str (name layer) "-score"))) |
535 | 535 | layer-mean (fn [layer rs] |
536 | | - (mean (mapv #(score-value (get % (layer-key layer))) rs))) |
| 536 | + (let [scored (filterv #(contains? % (layer-key layer)) rs)] |
| 537 | + (mean (mapv #(score-value (get % (layer-key layer))) scored)))) |
537 | 538 | by-cat (group-by :category results) |
538 | 539 | det-rs (filterv #(= :deterministic (:scoring %)) results) |
539 | 540 | llm-rs (filterv #(not= :deterministic (:scoring %)) results) |
|
548 | 549 | {:question-count (count results) |
549 | 550 | :canonical canonical? |
550 | 551 | :deterministic-count (count det-rs) |
551 | | - :deterministic-mean (mean (mapv #(score-value (get % (layer-key primary-layer))) det-rs)) |
| 552 | + :deterministic-mean (let [scored (filterv #(contains? % (layer-key primary-layer)) det-rs)] |
| 553 | + (mean (mapv #(score-value (get % (layer-key primary-layer))) scored))) |
552 | 554 | :llm-judged-count (count llm-rs) |
553 | | - :llm-judged-mean (mean (mapv #(score-value (get % (layer-key primary-layer))) llm-rs)) |
| 555 | + :llm-judged-mean (let [scored (filterv #(contains? % (layer-key primary-layer)) llm-rs)] |
| 556 | + (mean (mapv #(score-value (get % (layer-key primary-layer))) scored))) |
554 | 557 | :per-category (into {} |
555 | 558 | (map (fn [[cat rs]] |
556 | 559 | [cat (reduce (fn [m layer] |
|
952 | 955 | :completed-at (java.util.Date.)}) |
953 | 956 | (throw e)))))))) |
954 | 957 |
|
| 958 | +(defn- layer-stage-keys
| 959 | + "Return the stage keys expected for a single [question, layer] pair under the given mode: always [qid layer :answer], plus [qid layer :judge] unless (:skip-judge mode) is truthy and the question's :scoring is not :deterministic."
| 960 | + [question layer mode]
| 961 | + (let [qid (:id question)
| 962 | + skip-judge? (:skip-judge mode)
| 963 | + deterministic? (= :deterministic (:scoring question))]
| 964 | + (cond-> [[qid layer :answer]]
| 965 | + (or (not skip-judge?) deterministic?) ; :deterministic questions keep the judge key even when :skip-judge is set
| 966 | + (conj [qid layer :judge])))) |
| 967 | + |
955 | 968 | (defn stages->results |
956 | | - "Convert stages map to result seq. Only includes questions with all expected stages complete. |
| 969 | + "Convert stages map to result seq. Includes questions with at least one layer
| 970 | + whose expected stages are all present in the stages map (answer, plus judge unless judging is skipped for a non-deterministic question). Layers that haven't finished are omitted
| 971 | + from the result map rather than counted as wrong.
957 | 972 | Results include per-layer scores keyed as :<layer>-score, :<layer>-reasoning, :<layer>-answer." |
958 | 973 | ([questions stages] (stages->results questions stages nil)) |
959 | 974 | ([questions stages mode] |
960 | 975 | (let [layers (resolve-layers (or mode {}))] |
961 | 976 | (for [q questions |
962 | | - :let [qid (:id q) |
963 | | - exp-keys (if mode (mode-stage-keys q mode) (stage-keys q layers))] |
964 | | - :when (every? #(contains? stages %) exp-keys)] |
| 977 | + :let [qid (:id q) |
| 978 | + complete-layers (filterv |
| 979 | + (fn [layer] |
| 980 | + (every? #(contains? stages %) |
| 981 | + (layer-stage-keys q layer mode))) |
| 982 | + layers)] |
| 983 | + :when (seq complete-layers)] |
965 | 984 | (reduce (fn [result layer] |
966 | 985 | (let [judge-key [qid layer :judge] |
967 | 986 | judge (get-in stages [judge-key :result]) |
|
977 | 996 | :category (:category q) |
978 | 997 | :scoring (:scoring q) |
979 | 998 | :query-name (:query-name q)} |
980 | | - layers))))) |
| 999 | + complete-layers))))) |
981 | 1000 |
|
982 | 1001 | ;; --- Rate limiting --- |
983 | 1002 |
|
|
0 commit comments