benchmark data from artificalanalysis & model picker improvements

0xGingi · 0xGingi · commit 19fab5bb561a · 2026-01-07T21:35:05.000Z
diff --git a/.env.example b/.env.example
@@ -11,6 +11,10 @@ BETTER_AUTH_TRUSTED_ORIGINS=
 
 NANOGPT_API_KEY=
 
+# Artificial Analysis API key for model benchmarks (optional)
+# Get a free key at https://artificialanalysis.ai/
+ARTIFICIAL_ANALYSIS_API_KEY=
+
 # Set to "true" to disable new account creation
 DISABLE_SIGNUPS=false
 
diff --git a/README.md b/README.md
@@ -31,6 +31,7 @@
 - Configurable System Themes
 - Model Performance Tracking and Analytics.
 - Projects
+- Benchmark Data from artificialanalysis.ai API
 
 ## Setup (Docker)
 
diff --git a/src/lib/cache/cached-query.svelte.ts b/src/lib/cache/cached-query.svelte.ts
@@ -245,6 +245,9 @@ export const api = {
 	betterAuth: {
 		publicGetSession: { url: '/api/auth/session', method: 'GET' } as QueryConfig,
 	},
+	artificial_analysis: {
+		benchmarks: { url: '/api/artificial-analysis/benchmarks', method: 'GET' } as QueryConfig,
+	},
 } as const;
 
 export function invalidateQuery(query: QueryConfig, queryArgs?: unknown): void {
diff --git a/src/lib/components/model-picker/model-info-panel.svelte b/src/lib/components/model-picker/model-info-panel.svelte
@@ -1,18 +1,23 @@
 <script lang="ts">
 	import type { NanoGPTModel } from '$lib/backend/models/nano-gpt';
+	import type { AALLMModel, AAImageModel, AABenchmarkData } from '$lib/types/artificial-analysis';
 	import {
 		supportsVision,
 		supportsReasoning,
 		isImageOnlyModel,
 		supportsVideo,
 	} from '$lib/utils/model-capabilities';
 	import { cn } from '$lib/utils/utils';
+	import { useCachedQuery, api } from '$lib/cache/cached-query.svelte';
 	import EyeIcon from '~icons/lucide/eye';
 	import BrainIcon from '~icons/lucide/brain';
 	import ImageIcon from '~icons/lucide/image';
 	import VideoIcon from '~icons/lucide/video';
 	import XIcon from '~icons/lucide/x';
 	import CheckIcon from '~icons/lucide/check';
+	import TrendingUpIcon from '~icons/lucide/trending-up';
+	import ZapIcon from '~icons/lucide/zap';
+	import ExternalLinkIcon from '~icons/lucide/external-link';
 
 	type Props = {
 		model: NanoGPTModel;
@@ -22,6 +27,212 @@
 
 	let { model, iconUrl, onClose }: Props = $props();
 
+	// Fetch benchmark data
+	const benchmarks = useCachedQuery<AABenchmarkData & { available: boolean }>(
+		api.artificial_analysis.benchmarks,
+		{},
+		{ ttl: 60 * 60 * 1000 } // 1 hour cache
+	);
+
+	// Normalize model name for matching (remove punctuation, lowercase)
+	function normalizeForMatch(str: string): string {
+		return str
+			.toLowerCase()
+			.replace(/[^a-z0-9]/g, '') // Remove all non-alphanumeric
+			.trim();
+	}
+
+	// Strip common suffixes that don't affect model identity
+	function stripSuffixes(str: string): string {
+		return str
+			.replace(/[-_]?original$/i, '')
+			.replace(/[-_]?\d{8}$/i, '') // Remove date suffixes like -20250929
+			.trim();
+	}
+
+	// Extract model name from ID (e.g., "zai-org/glm-4.7-original" -> "glm-4.7")
+	function extractModelName(id: string): string {
+		const parts = id.split('/');
+		const name = (parts.length > 1 ? parts[parts.length - 1] : id) ?? id;
+		return stripSuffixes(name);
+	}
+
+	// Extract key tokens from a model name (keeping version numbers together)
+	function extractKeyTokens(name: string): Set<string> {
+		// First, normalize separators but keep decimal points intact in version numbers
+		const normalized = name
+			.toLowerCase()
+			.replace(/(\d+)\.(\d+)/g, '$1$2') // "4.7" -> "47", "4.5" -> "45"
+			.replace(/[^a-z0-9]+/g, ' ');
+		const tokens = normalized.split(' ').filter((t) => t.length > 0);
+		// Keep all tokens, but filter out very long date-like patterns (8+ digit numbers)
+		return new Set(tokens.filter((t) => !/^\d{8,}$/.test(t)));
+	}
+
+	// Check if two token sets have significant overlap
+	function tokensMatch(set1: Set<string>, set2: Set<string>): boolean {
+		if (set1.size === 0 || set2.size === 0) return false;
+		let matches = 0;
+		for (const token of set1) {
+			if (set2.has(token)) matches++;
+		}
+		// Require at least 2 matching tokens to prevent false positives
+		// This prevents "glm" alone from matching "glm-4.5-air"
+		return matches >= 2;
+	}
+
+	// Find matching LLM benchmark using scoring system (find BEST match, not first match)
+	const llmBenchmark = $derived.by(() => {
+		if (!benchmarks.data?.available || !benchmarks.data.llms) return null;
+
+		const modelName = stripSuffixes(model.name).toLowerCase();
+		const modelIdFull = model.id.toLowerCase();
+		const modelIdShort = extractModelName(model.id).toLowerCase();
+		const normalizedName = normalizeForMatch(stripSuffixes(model.name));
+		const normalizedId = normalizeForMatch(modelIdShort);
+		const modelNameTokens = extractKeyTokens(stripSuffixes(model.name));
+		const modelIdTokens = extractKeyTokens(modelIdShort);
+
+		console.log('[AA Benchmark] Searching for model:', {
+			name: model.name,
+			id: model.id,
+			modelIdShort,
+			normalizedName,
+			normalizedId,
+			nameTokens: Array.from(modelNameTokens),
+			idTokens: Array.from(modelIdTokens),
+		});
+
+		let bestMatch: AALLMModel | null = null;
+		let bestScore = 0;
+		let bestMatchReason = '';
+
+		for (const llm of benchmarks.data.llms) {
+			const aaName = llm.name.toLowerCase();
+			const aaSlug = llm.slug.toLowerCase();
+			const normalizedAaName = normalizeForMatch(llm.name);
+			const normalizedAaSlug = normalizeForMatch(llm.slug);
+
+			let score = 0;
+			let reason = '';
+
+			// EXACT matches get highest score (100)
+			if (modelName === aaName) {
+				score = 100;
+				reason = 'EXACT NAME';
+			} else if (modelIdShort === aaSlug || modelIdFull === aaSlug) {
+				score = 100;
+				reason = 'EXACT SLUG';
+			}
+			// NORMALIZED EXACT matches get high score (90)
+			else if (normalizedName === normalizedAaName) {
+				score = 90;
+				reason = 'NORMALIZED EXACT NAME';
+			} else if (normalizedId === normalizedAaSlug) {
+				score = 90;
+				reason = 'NORMALIZED EXACT SLUG';
+			}
+			// Token-based matching with score based on number of matching tokens
+			else {
+				const aaNameTokens = extractKeyTokens(llm.name);
+				const aaSlugTokens = extractKeyTokens(llm.slug);
+
+				// Count matching tokens
+				let nameMatches = 0;
+				for (const token of modelNameTokens) {
+					if (aaNameTokens.has(token)) nameMatches++;
+				}
+				let slugMatches = 0;
+				for (const token of modelIdTokens) {
+					if (aaSlugTokens.has(token)) slugMatches++;
+				}
+
+				const maxTokenMatches = Math.max(nameMatches, slugMatches);
+				const minTokensNeeded = Math.min(
+					modelNameTokens.size,
+					modelIdTokens.size,
+					aaNameTokens.size,
+					aaSlugTokens.size
+				);
+
+				// Require ALL tokens to match for a valid score (prevents partial matches)
+				if (maxTokenMatches >= 2 && maxTokenMatches >= minTokensNeeded) {
+					// Score based on how many tokens matched
+					score = 50 + maxTokenMatches * 10;
+					reason = `TOKEN MATCH (${maxTokenMatches} tokens)`;
+				}
+			}
+
+			if (score > bestScore) {
+				bestScore = score;
+				bestMatch = llm;
+				bestMatchReason = reason;
+			}
+		}
+
+		if (bestMatch && bestScore >= 50) {
+			console.log('[AA Benchmark] Best match:', {
+				name: bestMatch.name,
+				slug: bestMatch.slug,
+				score: bestScore,
+				reason: bestMatchReason,
+				evaluations: bestMatch.evaluations,
+			});
+			return bestMatch;
+		}
+
+		console.log('[AA Benchmark] No match found for:', model.name);
+		return null;
+	});
+
+	// Find matching image model benchmark
+	const imageBenchmark = $derived.by(() => {
+		if (!benchmarks.data?.available || !benchmarks.data.imageModels || !isImageOnlyModel(model))
+			return null;
+
+		const modelName = model.name.toLowerCase();
+		const modelIdShort = extractModelName(model.id).toLowerCase();
+		const normalizedName = normalizeForMatch(model.name);
+		const normalizedId = normalizeForMatch(modelIdShort);
+		const modelNameTokens = extractKeyTokens(model.name);
+		const modelIdTokens = extractKeyTokens(modelIdShort);
+
+		return benchmarks.data.imageModels.find((img: AAImageModel) => {
+			const aaName = img.name.toLowerCase();
+			const aaSlug = img.slug.toLowerCase();
+			const normalizedAaName = normalizeForMatch(img.name);
+			const normalizedAaSlug = normalizeForMatch(img.slug);
+
+			// Exact matches
+			if (modelName === aaName || modelIdShort === aaSlug) return true;
+
+			// Normalized matches
+			if (normalizedName === normalizedAaName || normalizedId === normalizedAaSlug) return true;
+
+			// Partial matches
+			if (modelName.includes(aaName) || aaName.includes(modelName)) return true;
+			if (modelIdShort.includes(aaSlug) || aaSlug.includes(modelIdShort)) return true;
+
+			// Normalized partial matches
+			if (normalizedName.includes(normalizedAaName) || normalizedAaName.includes(normalizedName))
+				return true;
+			if (normalizedId.includes(normalizedAaSlug) || normalizedAaSlug.includes(normalizedId))
+				return true;
+
+			// Token-based matching (handles different word orderings)
+			const aaNameTokens = extractKeyTokens(img.name);
+			const aaSlugTokens = extractKeyTokens(img.slug);
+			if (tokensMatch(modelNameTokens, aaNameTokens)) return true;
+			if (tokensMatch(modelIdTokens, aaSlugTokens)) return true;
+			if (tokensMatch(modelNameTokens, aaSlugTokens)) return true;
+			if (tokensMatch(modelIdTokens, aaNameTokens)) return true;
+
+			return false;
+		});
+	});
+
+	const hasBenchmarks = $derived(llmBenchmark || imageBenchmark);
+
 	function formatNumber(num: number | undefined): string {
 		if (!num) return '-';
 		if (num >= 1000000) return `${(num / 1000000).toFixed(1)}M`;
@@ -55,6 +266,11 @@
 			.map((word) => word.charAt(0).toUpperCase() + word.slice(1))
 			.join(' ');
 	}
+
+	function formatBenchmarkScore(score: number | undefined): string {
+		if (score === undefined || score === null) return '-';
+		return score.toFixed(1);
+	}
 </script>
 
 <div class="bg-popover border-border flex h-full w-[320px] flex-col overflow-hidden border-l">
@@ -88,6 +304,92 @@
 			</div>
 		{/if}
 
+		<!-- Benchmarks Section -->
+		{#if hasBenchmarks}
+			<div>
+				<h4 class="text-muted-foreground mb-2 text-xs font-medium tracking-wide uppercase">
+					<span class="inline-flex items-center gap-1.5">
+						<TrendingUpIcon class="size-3" />
+						Benchmarks
+					</span>
+				</h4>
+
+				{#if llmBenchmark}
+					<div class="bg-muted/50 space-y-2 rounded-lg p-3">
+						{#if llmBenchmark.evaluations?.artificial_analysis_intelligence_index}
+							<div class="flex justify-between text-sm">
+								<span class="text-muted-foreground">Intelligence</span>
+								<span class="font-medium text-blue-400"
+									>{formatBenchmarkScore(
+										llmBenchmark.evaluations.artificial_analysis_intelligence_index
+									)}</span
+								>
+							</div>
+						{/if}
+						{#if llmBenchmark.evaluations?.artificial_analysis_coding_index}
+							<div class="flex justify-between text-sm">
+								<span class="text-muted-foreground">Coding</span>
+								<span class="font-medium text-green-400"
+									>{formatBenchmarkScore(
+										llmBenchmark.evaluations.artificial_analysis_coding_index
+									)}</span
+								>
+							</div>
+						{/if}
+						{#if llmBenchmark.evaluations?.artificial_analysis_math_index}
+							<div class="flex justify-between text-sm">
+								<span class="text-muted-foreground">Math</span>
+								<span class="font-medium text-purple-400"
+									>{formatBenchmarkScore(
+										llmBenchmark.evaluations.artificial_analysis_math_index
+									)}</span
+								>
+							</div>
+						{/if}
+						{#if llmBenchmark.median_output_tokens_per_second}
+							<div class="border-border flex justify-between border-t pt-2 text-sm">
+								<span class="text-muted-foreground inline-flex items-center gap-1">
+									<ZapIcon class="size-3" />
+									Speed
+								</span>
+								<span class="font-medium text-yellow-400"
+									>{llmBenchmark.median_output_tokens_per_second.toFixed(0)} tok/s</span
+								>
+							</div>
+						{/if}
+					</div>
+				{/if}
+
+				{#if imageBenchmark}
+					<div class="bg-muted/50 space-y-2 rounded-lg p-3">
+						{#if imageBenchmark.elo}
+							<div class="flex justify-between text-sm">
+								<span class="text-muted-foreground">ELO Rating</span>
+								<span class="font-medium text-blue-400">{imageBenchmark.elo}</span>
+							</div>
+						{/if}
+						{#if imageBenchmark.rank}
+							<div class="flex justify-between text-sm">
+								<span class="text-muted-foreground">Rank</span>
+								<span class="font-medium text-amber-400">#{imageBenchmark.rank}</span>
+							</div>
+						{/if}
+					</div>
+				{/if}
+
+				<!-- Attribution -->
+				<a
+					href="https://artificialanalysis.ai"
+					target="_blank"
+					rel="noopener noreferrer"
+					class="text-muted-foreground hover:text-foreground mt-2 inline-flex items-center gap-1 text-xs transition-colors"
+				>
+					Data from Artificial Analysis
+					<ExternalLinkIcon class="size-3" />
+				</a>
+			</div>
+		{/if}
+
 		<!-- Features -->
 		<div>
 			<h4 class="text-muted-foreground mb-2 text-xs font-medium tracking-wide uppercase">
diff --git a/src/lib/components/model-picker/model-picker.svelte b/src/lib/components/model-picker/model-picker.svelte
@@ -370,12 +370,7 @@
 						<FilterIcon class="text-muted-foreground size-4 opacity-50" />
 					{/if}
 				</label>
-				<div
-					class={cn(
-						'max-h-[400px] min-h-0 flex-1 overflow-y-auto',
-						isMobile.current ? 'p-2' : 'p-1'
-					)}
-				>
+				<div class={cn('min-h-0 flex-1 overflow-y-auto', isMobile.current ? 'p-2' : 'p-1')}>
 					<Command.List class="flex flex-col gap-0.5">
 						{#each filteredModels as model (model.id)}
 							{@const formatted = formatModelName(model.modelId)}
@@ -664,7 +659,7 @@
 			class={cn(
 				'flex flex-col overflow-hidden p-0 transition-all duration-200',
 				infoModel ? 'w-[840px]' : 'w-[520px]',
-				'max-h-[calc(100vh-120px)]',
+				'h-[500px] max-h-[calc(100vh-120px)]',
 				'data-[side=bottom]:translate-y-1 data-[side=top]:-translate-y-1'
 			)}
 		>
diff --git a/src/lib/types/artificial-analysis.ts b/src/lib/types/artificial-analysis.ts
diff --git a/src/routes/api/artificial-analysis/benchmarks/+server.ts b/src/routes/api/artificial-analysis/benchmarks/+server.ts