-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathexport-from-db.ts
More file actions
96 lines (85 loc) · 2.42 KB
/
export-from-db.ts
File metadata and controls
96 lines (85 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import { Database } from 'bun:sqlite'
import fs from 'node:fs'
// Open the evals database read-only. This script only reads from it, and a
// read-only handle fails at open time when `evals.db` is missing instead of
// creating an empty database file that then fails on the first query.
const db = new Database('evals.db', { readonly: true })
/**
 * One aggregated row produced by the export queries in this file: the
 * averaged score for a single (model, category) pair.
 */
interface DBRow {
  /** `results.model`; one of the two GROUP BY keys. */
  model: string
  /** `results.label`; labels containing "(MCP)" select the MCP result set. */
  label: string
  /** `results.framework`. */
  framework: string
  /** `results.category`; the other GROUP BY key. */
  category: string
  /** `AVG(results.value)` over the grouped rows. */
  value: number
  /** `MAX(results.timestamp)` over the grouped rows. */
  updatedAt: string
}

/**
 * Alias of {@link DBRow}. It was previously a field-for-field copy; aliasing
 * keeps the two from drifting apart.
 *
 * NOTE(review): not referenced anywhere in this file — presumably names the
 * shape of the exported JSON entries; confirm before removing.
 */
type Score = DBRow
/** Which result set to read; the sets are told apart by "(MCP)" in `label`. */
type ResultSet = 'baseline' | 'mcp'

/**
 * Build the SQL that averages each model's latest-run scores per category.
 *
 * 1. The subquery finds the highest `run_id` per model within the chosen set.
 * 2. The outer query keeps only rows from that run and averages `value` for
 *    every (model, category) pair.
 *
 * The same label filter is applied to the run lookup and to the outer rows,
 * so a baseline export never picks up MCP rows and vice versa.
 *
 * `label` and `framework` are selected without being grouped. SQLite fills
 * such bare columns from the row that supplies the single MAX(), i.e. the
 * row with the newest timestamp in each group.
 *
 * @param set - `'mcp'` selects rows whose label contains "(MCP)";
 *   `'baseline'` selects rows whose label does not.
 * @returns SQL text suitable for `db.query`.
 */
function latestRunAveragesSql(set: ResultSet): string {
  // An operator cannot be a bound parameter. It is picked from two fixed
  // keywords, never from external input, so interpolating it is safe.
  const labelMatch = set === 'mcp' ? 'LIKE' : 'NOT LIKE'
  return `
    SELECT
      r.model,
      r.label,
      r.framework,
      r.category,
      AVG(r.value) as value,
      MAX(r.timestamp) as updatedAt
    FROM results r
    INNER JOIN (
      SELECT model, MAX(run_id) as latest_run
      FROM results
      WHERE label ${labelMatch} '%(MCP)%'
      GROUP BY model
    ) latest ON r.model = latest.model AND r.run_id = latest.latest_run
    WHERE r.label ${labelMatch} '%(MCP)%'
    GROUP BY r.model, r.category
  `
}

// Get BASELINE results: per-category averages from each model's latest
// non-MCP run.
const baselineQuery = db.query(latestRunAveragesSql('baseline'))
const baseline = baselineQuery.all() as DBRow[]
console.log(`Baseline scores: ${baseline.length}`)

// Get MCP results: per-category averages from each model's latest MCP run.
const mcpQuery = db.query(latestRunAveragesSql('mcp'))
const mcp = mcpQuery.all() as DBRow[]
console.log(`MCP scores: ${mcp.length}`)
// Denominator printed in the coverage lines below.
// NOTE(review): presumably the total number of eval categories — confirm and
// update if categories are added or removed.
const EXPECTED_CATEGORY_COUNT = 6

/**
 * Count how many score rows each label has.
 *
 * @param scores - Aggregated rows, one per (model, category) pair.
 * @returns Map from label to row count, in first-seen order.
 */
function countRowsByLabel(scores: readonly DBRow[]): Map<string, number> {
  const counts = new Map<string, number>()
  for (const { label } of scores) {
    counts.set(label, (counts.get(label) ?? 0) + 1)
  }
  return counts
}

/**
 * Print one ` <label>: <count>/6 categories` line per entry.
 *
 * @param counts - Map from label to number of category rows.
 */
function printCategoryCoverage(counts: ReadonlyMap<string, number>): void {
  for (const [label, count] of counts) {
    console.log(` ${label}: ${count}/${EXPECTED_CATEGORY_COUNT} categories`)
  }
}

// Show breakdown by model. Rows are keyed by their display label, so models
// sharing a label would be counted together.
console.log('\nBaseline per model:')
const baselineCounts = countRowsByLabel(baseline)
printCategoryCoverage(baselineCounts)

console.log('\nMCP per model:')
const mcpCounts = countRowsByLabel(mcp)
printCategoryCoverage(mcpCounts)
// Both result sets were fully read into memory by `.all()`, so the SQLite
// handle is no longer needed. Close it explicitly instead of relying on
// process exit.
db.close()

// Write each result set to its own JSON file, pretty-printed with a 2-space
// indent.
fs.writeFileSync('scores.json', JSON.stringify(baseline, null, 2))
fs.writeFileSync('scores-mcp.json', JSON.stringify(mcp, null, 2))
console.log('\nExported to scores.json and scores-mcp.json')