Skip to content
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions app/composables/useNavigation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -134,14 +134,14 @@ const footerLinks = [{
}, {
label: 'Explore',
children: [{
label: 'Modules',
to: 'https://nuxt.com/modules'
}, {
label: 'Templates',
to: 'https://nuxt.com/templates'
}, {
label: 'Showcase',
to: 'https://nuxt.com/showcase'
}, {
label: 'AI Evals',
to: '/evals'
}]
}, {
label: 'Enterprise',
Expand Down
319 changes: 319 additions & 0 deletions app/pages/evals.vue
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
<script setup lang="ts">
import { h, resolveComponent } from 'vue'
import type { TableColumn, TableRow } from '@nuxt/ui'
import { joinURL } from 'ufo'

const UButton = resolveComponent('UButton')
const UBadge = resolveComponent('UBadge')
const UAvatar = resolveComponent('UAvatar')

definePageMeta({
heroBackground: 'opacity-70 -z-10'
})

// Types
interface EvalResultItem {
evalPath: string
result: {
success: boolean
duration: number
evalPath: string
timestamp: string
}
}

interface Experiment {
name: string
timestamp: string
modelName: string
agentHarness: string
}

interface AgentResultsData {
metadata: {
exportedAt: string
experiments: Experiment[]
}
results: Record<string, EvalResultItem[]>
}

interface ModelRow {
model: string
agent: string
totalEvals: number
successRate: number
evals: EvalResultItem[]
}

const { url } = useSiteConfig()

const [{ data: page }, { data: rawData }] = await Promise.all([
useAsyncData('evals', () => queryCollection('evals').first()),
useAsyncData('agent-results', () => $fetch<AgentResultsData>(joinURL(url, '/agent-results.json')))
])

if (!page.value) {
throw createError({ statusCode: 404, statusMessage: 'Page not found', fatal: true })
}
if (!rawData.value) {
throw createError({ statusCode: 404, statusMessage: 'Data not found', fatal: true })
}

const title = page.value.title
const description = page.value.description

useSeoMeta({
titleTemplate: '%s',
title,
description,
ogDescription: description,
ogTitle: title
})
defineOgImageComponent('Docs', { title, description })

// Build agent map from experiments
const agentMap = computed(() => {
const map: Record<string, string> = {}
if (!rawData.value?.metadata?.experiments) return map
for (const exp of rawData.value.metadata.experiments) {
map[exp.modelName] = exp.agentHarness
}
return map
})

// Process results into table rows
const allResults = computed<ModelRow[]>(() => {
if (!rawData.value?.results) return []
const rows: ModelRow[] = []
for (const [modelName, evals] of Object.entries(rawData.value.results)) {
const successes = evals.filter(e => e.result.success).length
rows.push({
model: modelName,
agent: agentMap.value[modelName] || 'Unknown',
totalEvals: evals.length,
successRate: Math.round((successes / evals.length) * 100),
evals
})
}
return rows.sort((a, b) => b.successRate - a.successRate)
})

// Agent filter
const agents = computed(() => {
return [...new Set(allResults.value.map(r => r.agent))]
})
const selectedAgents = ref<string[]>([])

const filteredResults = computed(() => {
if (selectedAgents.value.length === 0) {
return allResults.value
}
return allResults.value.filter(r => selectedAgents.value.includes(r.agent))
})

// Format exported date
const formattedDate = computed(() => {
if (!rawData.value?.metadata?.exportedAt) return ''
const date = new Date(rawData.value.metadata.exportedAt)
return date.toLocaleDateString('en-US', { month: 'long', day: 'numeric', year: 'numeric' })
})

// Model icon mapping
function getModelAvatar(model: string): string | undefined {
const lower = model.toLowerCase()
if (lower.includes('claude')) return '/assets/agents/anthropic.avif'
if (lower.includes('gpt') || lower.includes('codex')) return '/assets/agents/openai.avif'
if (lower.includes('gemini')) return '/assets/agents/google.avif'
if (lower.includes('deepseek')) return '/assets/agents/deepseek.avif'
if (lower.includes('devstral')) return '/assets/agents/mistral.avif'
if (lower.includes('minimax')) return '/assets/agents/minimax.avif'
if (lower.includes('kat')) return '/assets/agents/kwaipilot.avif'
if (lower.includes('moonshot')) return '/assets/agents/moonshotai.avif'
if (lower.includes('grok')) return '/assets/agents/xai.avif'
return undefined
}

// Format duration from ms to seconds
function formatDuration(ms: number): string {
return `${(ms / 1000).toFixed(2)}s`
}

// Expanded rows state
const expanded = ref({})

// Toggle expand on row click
function onSelect(_e: Event, row: TableRow<ModelRow>) {
row.toggleExpanded()
}

// Table columns
const columns: TableColumn<ModelRow>[] = [
{
id: 'expand',
meta: {
class: {
th: 'w-0',
td: 'w-0'
}
},
cell: ({ row }) => h(UButton, {
'color': 'neutral',
'variant': 'ghost',
'icon': 'i-lucide-chevron-right',
'square': true,
'size': 'sm',
'aria-label': 'Expand',
'ui': {
leadingIcon: ['transition-transform', row.getIsExpanded() ? 'duration-200 rotate-90' : '']
},
'onClick': (e: Event) => {
e.stopPropagation()
row.toggleExpanded()
},
'class': 'group-hover:bg-elevated'
})
},
{
accessorKey: 'model',
header: 'Model',
cell: ({ row }) => h('div', { class: 'flex items-center gap-2' }, [
h(UAvatar, { src: getModelAvatar(row.original.model), size: 'xs', loading: 'lazy', class: 'border border-default' }),
h('span', {}, row.original.model)
])
},
{
accessorKey: 'agent',
header: 'Agent'
},
{
accessorKey: 'totalEvals',
header: 'Total Evals',
meta: {
class: {
th: 'text-center',
td: 'text-center'
}
}
},
{
accessorKey: 'successRate',
header: 'Success Rate',
meta: {
class: {
th: 'text-right',
td: 'text-right'
}
},
cell: ({ row }) => h('span', {}, `${row.original.successRate}%`)
}
]

// Expanded eval table columns
const evalColumns: TableColumn<EvalResultItem>[] = [
{
accessorKey: 'evalPath',
header: 'Evaluation'
},
{
id: 'score',
header: 'Score',
meta: {
class: {
th: 'text-center',
td: 'text-center'
}
},
cell: ({ row }) => h(UBadge, {
color: row.original.result.success ? 'success' : 'error',
variant: 'subtle'
}, () => row.original.result.success ? 'Pass' : 'Fail')
},
{
id: 'duration',
header: 'Duration',
meta: {
class: {
th: 'text-right',
td: 'text-right'
}
},
cell: ({ row }) => h('span', {}, formatDuration(row.original.result.duration))
}
]
</script>

<template>
<div v-if="page && rawData">
<UPageHero
:title="page.title"
:description="page.description"
:ui="{
title: 'text-4xl sm:text-5xl lg:text-6xl font-bold',
description: 'max-w-2xl mx-auto text-pretty',
links: 'items-center'
}"
>
<template #links>
<UButton
:to="page.githubUrl"
icon="i-simple-icons-github"
label="View on GitHub"
target="_blank"
color="neutral"
variant="ghost"
/>

<USeparator orientation="vertical" class="h-6" />

<span class="text-sm font-medium">Last run date: <span class="text-muted font-normal">{{ formattedDate }}</span></span>
</template>
</UPageHero>

<UPageBody class="mt-0">
<UContainer class="max-w-6xl">
<div class="flex items-center justify-between mb-4">
<h2 class="text-2xl font-bold">
Agent Performance Results
</h2>

<USelectMenu
v-model="selectedAgents"
:items="agents"
multiple
placeholder="All Agents"
color="neutral"
variant="subtle"
class="w-52 bg-elevated/50 hover:bg-elevated data-[state=open]:bg-elevated group"
:ui="{ trailingIcon: 'group-data-[state=open]:rotate-180 transition-transform duration-200' }"
/>
</div>

<UTable
v-model:expanded="expanded"
:data="filteredResults"
:columns="columns"
:ui="{
thead: '[&>tr]:bg-elevated/50 border-b border-default',
tr: 'py-2.5 peer peer-data-[expanded=true]:[&>td]:p-4! group',
td: 'py-2.5'
}"
class="flex-1 border border-default rounded-lg"
@select="onSelect"
>
<template #expanded="{ row }">
<UTable
:data="row.original.evals"
:columns="evalColumns"
:ui="{
thead: '[&>tr]:bg-elevated/50 border-b border-default',
tr: 'py-2.5',
td: 'py-2.5'
}"
class="flex-1 border border-default rounded-lg"
/>
</template>
</UTable>
</UContainer>
</UPageBody>
</div>
</template>
9 changes: 9 additions & 0 deletions content.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,15 @@ export default defineContentConfig({
links: z.array(Link)
}))
})
}),
evals: defineCollection({
type: 'data',
source: 'evals.yml',
schema: z.object({
title: z.string(),
description: z.string(),
githubUrl: z.string()
})
})
}
})
3 changes: 3 additions & 0 deletions content/evals.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
title: AI Agent Evaluations
description: "Performance results of AI coding agents on Nuxt code generation tasks, measuring success rate and execution time."
githubUrl: "https://github.com/vercel/nuxt-evals"
Loading