Skip to content

Commit 19fab5b

Browse files
committed
benchmark data from artificialanalysis & model picker improvements
1 parent 125ce71 commit 19fab5b

7 files changed

Lines changed: 441 additions & 7 deletions

File tree

.env.example

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ BETTER_AUTH_TRUSTED_ORIGINS=
1111

1212
NANOGPT_API_KEY=
1313

14+
# Artificial Analysis API key for model benchmarks (optional)
15+
# Get a free key at https://artificialanalysis.ai/
16+
ARTIFICIAL_ANALYSIS_API_KEY=
17+
1418
# Set to "true" to disable new account creation
1519
DISABLE_SIGNUPS=false
1620

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
- Configurable System Themes
3232
- Model Performance Tracking and Analytics.
3333
- Projects
34+
- Benchmark Data from artificialanalysis.ai API
3435

3536
## Setup (Docker)
3637

src/lib/cache/cached-query.svelte.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,9 @@ export const api = {
245245
betterAuth: {
246246
publicGetSession: { url: '/api/auth/session', method: 'GET' } as QueryConfig,
247247
},
248+
artificial_analysis: {
249+
benchmarks: { url: '/api/artificial-analysis/benchmarks', method: 'GET' } as QueryConfig,
250+
},
248251
} as const;
249252

250253
export function invalidateQuery(query: QueryConfig, queryArgs?: unknown): void {

src/lib/components/model-picker/model-info-panel.svelte

Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
11
<script lang="ts">
22
import type { NanoGPTModel } from '$lib/backend/models/nano-gpt';
3+
import type { AALLMModel, AAImageModel, AABenchmarkData } from '$lib/types/artificial-analysis';
34
import {
45
supportsVision,
56
supportsReasoning,
67
isImageOnlyModel,
78
supportsVideo,
89
} from '$lib/utils/model-capabilities';
910
import { cn } from '$lib/utils/utils';
11+
import { useCachedQuery, api } from '$lib/cache/cached-query.svelte';
1012
import EyeIcon from '~icons/lucide/eye';
1113
import BrainIcon from '~icons/lucide/brain';
1214
import ImageIcon from '~icons/lucide/image';
1315
import VideoIcon from '~icons/lucide/video';
1416
import XIcon from '~icons/lucide/x';
1517
import CheckIcon from '~icons/lucide/check';
18+
import TrendingUpIcon from '~icons/lucide/trending-up';
19+
import ZapIcon from '~icons/lucide/zap';
20+
import ExternalLinkIcon from '~icons/lucide/external-link';
1621
1722
type Props = {
1823
model: NanoGPTModel;
@@ -22,6 +27,212 @@
2227
2328
let { model, iconUrl, onClose }: Props = $props();
2429
30+
// Fetch benchmark data
31+
// TTL passed to the cached query: one hour (60 min * 60 s * 1000), presumably in milliseconds.
const BENCHMARK_CACHE_TTL_MS = 60 * 60 * 1000;

/**
 * Cached query for the Artificial Analysis benchmark endpoint
 * (`api.artificial_analysis.benchmarks`). No query arguments are sent.
 * The payload is expected to carry an `available` flag next to the benchmark data.
 */
const benchmarks = useCachedQuery<AABenchmarkData & { available: boolean }>(
	api.artificial_analysis.benchmarks,
	{},
	{ ttl: BENCHMARK_CACHE_TTL_MS }
);
36+
37+
// Normalize model name for matching (remove punctuation, lowercase)
38+
/**
 * Reduces a model name or slug to a bare comparison key: lowercase ASCII
 * letters and digits only, with every separator and punctuation mark removed.
 *
 * The input is first decomposed with Unicode NFKD so accented and
 * compatibility characters fold onto their ASCII base ("á" -> "a",
 * full-width "Ｇ" -> "g") instead of being deleted outright; the combining
 * marks left behind are then removed together with all other non-alphanumerics.
 *
 * @param str Raw model name, id or slug.
 * @returns The comparison key; may be empty when `str` has no Latin letters or digits.
 */
function normalizeForMatch(str: string): string {
	return str
		.normalize('NFKD')
		.toLowerCase()
		.replace(/[^a-z0-9]/g, '');
}
44+
45+
// Strip common suffixes that don't affect model identity
46+
/**
 * Removes trailing markers that do not change which model is meant:
 * an "original" suffix and an 8-digit date stamp (e.g. "-20250929"),
 * each optionally preceded by "-" or "_".
 *
 * Stripping repeats until the string stops changing, so the suffixes are
 * removed in whichever order they appear ("x-original-20250929" and
 * "x-20250929-original" both become "x"). The date pattern only matches a run
 * of exactly eight digits; the tail of a longer number is left untouched.
 *
 * @param str Model name or id segment.
 * @returns The input without those suffixes, trimmed of surrounding whitespace.
 */
function stripSuffixes(str: string): string {
	const originalSuffix = /[-_]?original$/i;
	// The lookbehind rejects digits directly before the 8-digit run.
	const dateSuffix = /[-_]?(?<!\d)\d{8}$/;

	let current = str;
	for (;;) {
		const next = current.replace(originalSuffix, '').replace(dateSuffix, '').trim();
		// Each pass can only shorten the string, so this loop terminates.
		if (next === current) return current;
		current = next;
	}
}
52+
53+
// Extract model name from ID (e.g., "zai-org/glm-4.7-original" -> "glm-4.7")
54+
/**
 * Returns the model part of a provider-qualified id: everything after the last
 * "/" (or the whole id when there is no "/"), passed through `stripSuffixes`.
 *
 * @param id Model id such as "zai-org/glm-4.7-original".
 * @returns The bare model name, e.g. "glm-4.7".
 */
function extractModelName(id: string): string {
	// lastIndexOf gives -1 when no "/" is present, so slice(0) keeps the full id.
	const lastSegment = id.slice(id.lastIndexOf('/') + 1);
	return stripSuffixes(lastSegment);
}
59+
60+
// Extract key tokens from a model name (keeping version numbers together)
61+
/**
 * Splits a model name into a set of lowercase alphanumeric tokens for fuzzy comparison.
 *
 * Dots between digits are removed so a dotted version stays a single token
 * ("4.7" -> "47", "1.2.3" -> "123"); this keeps version 1.2 from sharing a token
 * with 1.2.3. Every other run of non-alphanumeric characters acts as a separator.
 * Purely numeric tokens of eight or more digits (date stamps) are discarded.
 *
 * @param name Model name, id segment or slug.
 * @returns The distinct tokens; empty when `name` has no letters or digits.
 */
function extractKeyTokens(name: string): Set<string> {
	const collapsed = name
		.toLowerCase()
		// Remove each "." that sits between two digits; the lookahead leaves the next digit in place.
		.replace(/(\d)\.(?=\d)/g, '$1')
		.replace(/[^a-z0-9]+/g, ' ');

	const tokens = collapsed
		.split(' ')
		.filter((token) => token.length > 0 && !/^\d{8,}$/.test(token));

	return new Set(tokens);
}
71+
72+
// Check if two token sets have significant overlap
73+
/**
 * Reports whether two token sets share enough tokens to count as the same model.
 *
 * @param set1 Tokens of the first name.
 * @param set2 Tokens of the second name.
 * @param minMatches Number of shared tokens required. Defaults to 2 so that a
 *   single shared family token (e.g. "glm") is not enough to match "glm-4.5-air".
 * @returns `true` when at least `minMatches` tokens occur in both sets;
 *   always `false` when either set is empty.
 */
function tokensMatch(set1: Set<string>, set2: Set<string>, minMatches = 2): boolean {
	if (set1.size === 0 || set2.size === 0) return false;

	// Walk the smaller set and probe the larger one; stop once the threshold is reached.
	const [smaller, larger] = set1.size <= set2.size ? [set1, set2] : [set2, set1];
	let matches = 0;
	for (const token of smaller) {
		if (larger.has(token)) {
			matches++;
			if (matches >= minMatches) return true;
		}
	}
	return matches >= minMatches;
}
83+
84+
// Find matching LLM benchmark using scoring system (find BEST match, not first match)
85+
/**
 * Best-matching Artificial Analysis LLM entry for the current `model`, or `null`
 * when benchmark data is unavailable or nothing matches.
 *
 * Every candidate is scored and the highest score wins (first entry on ties):
 *  - 100: exact match on lowercase name, or on short/full id against the slug
 *  - 90: match after `normalizeForMatch` (empty keys never match)
 *  - 70 up to just under 90: token overlap, growing with the number of shared
 *    tokens but always below the exact tiers
 */
const llmBenchmark = $derived.by(() => {
	if (!benchmarks.data?.available || !benchmarks.data.llms) return null;
	const candidates = benchmarks.data.llms;

	const strippedName = stripSuffixes(model.name);
	const modelName = strippedName.toLowerCase();
	const modelIdFull = model.id.toLowerCase();
	const modelIdShort = extractModelName(model.id).toLowerCase();
	const normalizedName = normalizeForMatch(strippedName);
	const normalizedId = normalizeForMatch(modelIdShort);
	const modelNameTokens = extractKeyTokens(strippedName);
	const modelIdTokens = extractKeyTokens(modelIdShort);

	// Number of tokens from `ours` that also occur in `theirs`.
	const countShared = (ours: Set<string>, theirs: Set<string>): number => {
		let shared = 0;
		for (const token of ours) {
			if (theirs.has(token)) shared++;
		}
		return shared;
	};

	let bestMatch: AALLMModel | null = null;
	let bestScore = 0;

	for (const llm of candidates) {
		const aaName = llm.name.toLowerCase();
		const aaSlug = llm.slug.toLowerCase();
		let score = 0;

		if (modelName === aaName || modelIdShort === aaSlug || modelIdFull === aaSlug) {
			score = 100;
		} else if (
			// A key that normalizes to '' (e.g. a non-Latin name) carries no information,
			// so two empty keys must not count as a match.
			(normalizedName !== '' && normalizedName === normalizeForMatch(llm.name)) ||
			(normalizedId !== '' && normalizedId === normalizeForMatch(llm.slug))
		) {
			score = 90;
		} else {
			const aaNameTokens = extractKeyTokens(llm.name);
			const aaSlugTokens = extractKeyTokens(llm.slug);

			const sharedTokens = Math.max(
				countShared(modelNameTokens, aaNameTokens),
				countShared(modelIdTokens, aaSlugTokens)
			);
			const smallestSetSize = Math.min(
				modelNameTokens.size,
				modelIdTokens.size,
				aaNameTokens.size,
				aaSlugTokens.size
			);

			// Need at least two shared tokens, and at least as many as the smallest
			// of the four token sets contains.
			if (sharedTokens >= 2 && sharedTokens >= smallestSetSize) {
				// 2 tokens -> 70, 3 -> ~76.7, 4 -> 80, ...; approaches but never reaches 90,
				// so a fuzzy match cannot outrank a normalized or exact match.
				score = 50 + 40 * (1 - 1 / sharedTokens);
			}
		}

		if (score > bestScore) {
			bestScore = score;
			bestMatch = llm;
			// Nothing can beat an exact match; later entries could only tie.
			if (score === 100) break;
		}
	}

	return bestMatch;
});
187+
188+
// Find matching image model benchmark
189+
/**
 * Best-matching Artificial Analysis image-model entry for the current `model`,
 * or `null` when benchmark data is unavailable, the model is not image-only,
 * or nothing matches.
 *
 * Candidates are ranked by match quality and the strongest tier wins (first
 * entry on ties), so a later exact match beats an earlier fuzzy one:
 *  4. exact lowercase name or short-id/slug equality
 *  3. equality after `normalizeForMatch`
 *  2. one string contains the other (raw or normalized)
 *  1. at least two shared tokens between any name/id and name/slug pairing
 * Empty strings never take part in tiers 3 and 2, because `''` equals or is
 * contained in every other empty/any string and would match arbitrary entries.
 */
const imageBenchmark = $derived.by(() => {
	if (!benchmarks.data?.available || !benchmarks.data.imageModels || !isImageOnlyModel(model))
		return null;
	const candidates = benchmarks.data.imageModels;

	const modelName = model.name.toLowerCase();
	const modelIdShort = extractModelName(model.id).toLowerCase();
	const normalizedName = normalizeForMatch(model.name);
	const normalizedId = normalizeForMatch(modelIdShort);
	const modelNameTokens = extractKeyTokens(model.name);
	const modelIdTokens = extractKeyTokens(modelIdShort);

	// True when both strings are non-empty and one contains the other.
	const overlaps = (a: string, b: string): boolean =>
		a !== '' && b !== '' && (a.includes(b) || b.includes(a));

	// Match-quality tier of one candidate; 0 means no match.
	const matchTier = (img: AAImageModel): number => {
		const aaName = img.name.toLowerCase();
		const aaSlug = img.slug.toLowerCase();
		const normalizedAaName = normalizeForMatch(img.name);
		const normalizedAaSlug = normalizeForMatch(img.slug);

		if (modelName === aaName || modelIdShort === aaSlug) return 4;

		if (
			(normalizedName !== '' && normalizedName === normalizedAaName) ||
			(normalizedId !== '' && normalizedId === normalizedAaSlug)
		)
			return 3;

		if (
			overlaps(modelName, aaName) ||
			overlaps(modelIdShort, aaSlug) ||
			overlaps(normalizedName, normalizedAaName) ||
			overlaps(normalizedId, normalizedAaSlug)
		)
			return 2;

		// Token comparison tolerates different word orderings.
		const aaNameTokens = extractKeyTokens(img.name);
		const aaSlugTokens = extractKeyTokens(img.slug);
		if (
			tokensMatch(modelNameTokens, aaNameTokens) ||
			tokensMatch(modelIdTokens, aaSlugTokens) ||
			tokensMatch(modelNameTokens, aaSlugTokens) ||
			tokensMatch(modelIdTokens, aaNameTokens)
		)
			return 1;

		return 0;
	};

	let bestMatch: AAImageModel | null = null;
	let bestTier = 0;
	for (const img of candidates) {
		const tier = matchTier(img);
		if (tier > bestTier) {
			bestTier = tier;
			bestMatch = img;
			// An exact match cannot be improved on.
			if (tier === 4) break;
		}
	}

	return bestMatch;
});
233+
234+
// Truthy when either an LLM or an image benchmark entry was matched (holds the matched
// entry itself, not a boolean); the markup uses it to gate the whole Benchmarks section.
const hasBenchmarks = $derived(llmBenchmark || imageBenchmark);
235+
25236
function formatNumber(num: number | undefined): string {
26237
if (!num) return '-';
27238
if (num >= 1000000) return `${(num / 1000000).toFixed(1)}M`;
@@ -55,6 +266,11 @@
55266
.map((word) => word.charAt(0).toUpperCase() + word.slice(1))
56267
.join(' ');
57268
}
269+
270+
/**
 * Formats a benchmark score for display with one decimal place.
 *
 * @param score Score value; may be missing in the benchmark payload.
 * @returns The score as e.g. "67.8", or "-" when the score is `undefined`,
 *   `null`, `NaN` or infinite (so the UI never shows "NaN" or "Infinity").
 */
function formatBenchmarkScore(score: number | null | undefined): string {
	// `== null` covers both null and undefined; 0 is a valid score and is formatted.
	if (score == null || !Number.isFinite(score)) return '-';
	return score.toFixed(1);
}
58274
</script>
59275

60276
<div class="bg-popover border-border flex h-full w-[320px] flex-col overflow-hidden border-l">
@@ -88,6 +304,92 @@
88304
</div>
89305
{/if}
90306

307+
<!-- Benchmarks Section -->
308+
{#if hasBenchmarks}
309+
<div>
310+
<h4 class="text-muted-foreground mb-2 text-xs font-medium tracking-wide uppercase">
311+
<span class="inline-flex items-center gap-1.5">
312+
<TrendingUpIcon class="size-3" />
313+
Benchmarks
314+
</span>
315+
</h4>
316+
317+
{#if llmBenchmark}
318+
<div class="bg-muted/50 space-y-2 rounded-lg p-3">
319+
{#if llmBenchmark.evaluations?.artificial_analysis_intelligence_index}
320+
<div class="flex justify-between text-sm">
321+
<span class="text-muted-foreground">Intelligence</span>
322+
<span class="font-medium text-blue-400"
323+
>{formatBenchmarkScore(
324+
llmBenchmark.evaluations.artificial_analysis_intelligence_index
325+
)}</span
326+
>
327+
</div>
328+
{/if}
329+
{#if llmBenchmark.evaluations?.artificial_analysis_coding_index}
330+
<div class="flex justify-between text-sm">
331+
<span class="text-muted-foreground">Coding</span>
332+
<span class="font-medium text-green-400"
333+
>{formatBenchmarkScore(
334+
llmBenchmark.evaluations.artificial_analysis_coding_index
335+
)}</span
336+
>
337+
</div>
338+
{/if}
339+
{#if llmBenchmark.evaluations?.artificial_analysis_math_index}
340+
<div class="flex justify-between text-sm">
341+
<span class="text-muted-foreground">Math</span>
342+
<span class="font-medium text-purple-400"
343+
>{formatBenchmarkScore(
344+
llmBenchmark.evaluations.artificial_analysis_math_index
345+
)}</span
346+
>
347+
</div>
348+
{/if}
349+
{#if llmBenchmark.median_output_tokens_per_second}
350+
<div class="border-border flex justify-between border-t pt-2 text-sm">
351+
<span class="text-muted-foreground inline-flex items-center gap-1">
352+
<ZapIcon class="size-3" />
353+
Speed
354+
</span>
355+
<span class="font-medium text-yellow-400"
356+
>{llmBenchmark.median_output_tokens_per_second.toFixed(0)} tok/s</span
357+
>
358+
</div>
359+
{/if}
360+
</div>
361+
{/if}
362+
363+
{#if imageBenchmark}
364+
<div class="bg-muted/50 space-y-2 rounded-lg p-3">
365+
{#if imageBenchmark.elo}
366+
<div class="flex justify-between text-sm">
367+
<span class="text-muted-foreground">ELO Rating</span>
368+
<span class="font-medium text-blue-400">{imageBenchmark.elo}</span>
369+
</div>
370+
{/if}
371+
{#if imageBenchmark.rank}
372+
<div class="flex justify-between text-sm">
373+
<span class="text-muted-foreground">Rank</span>
374+
<span class="font-medium text-amber-400">#{imageBenchmark.rank}</span>
375+
</div>
376+
{/if}
377+
</div>
378+
{/if}
379+
380+
<!-- Attribution -->
381+
<a
382+
href="https://artificialanalysis.ai"
383+
target="_blank"
384+
rel="noopener noreferrer"
385+
class="text-muted-foreground hover:text-foreground mt-2 inline-flex items-center gap-1 text-xs transition-colors"
386+
>
387+
Data from Artificial Analysis
388+
<ExternalLinkIcon class="size-3" />
389+
</a>
390+
</div>
391+
{/if}
392+
91393
<!-- Features -->
92394
<div>
93395
<h4 class="text-muted-foreground mb-2 text-xs font-medium tracking-wide uppercase">

src/lib/components/model-picker/model-picker.svelte

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -370,12 +370,7 @@
370370
<FilterIcon class="text-muted-foreground size-4 opacity-50" />
371371
{/if}
372372
</label>
373-
<div
374-
class={cn(
375-
'max-h-[400px] min-h-0 flex-1 overflow-y-auto',
376-
isMobile.current ? 'p-2' : 'p-1'
377-
)}
378-
>
373+
<div class={cn('min-h-0 flex-1 overflow-y-auto', isMobile.current ? 'p-2' : 'p-1')}>
379374
<Command.List class="flex flex-col gap-0.5">
380375
{#each filteredModels as model (model.id)}
381376
{@const formatted = formatModelName(model.modelId)}
@@ -664,7 +659,7 @@
664659
class={cn(
665660
'flex flex-col overflow-hidden p-0 transition-all duration-200',
666661
infoModel ? 'w-[840px]' : 'w-[520px]',
667-
'max-h-[calc(100vh-120px)]',
662+
'h-[500px] max-h-[calc(100vh-120px)]',
668663
'data-[side=bottom]:translate-y-1 data-[side=top]:-translate-y-1'
669664
)}
670665
>

0 commit comments

Comments
 (0)