Skip to content

Commit dffaae4

Browse files
idemerge and claude committed
Release v2.6.0
feat: output scope selector for long-context presets
feat: input/output/total throughput metrics in workflow detail
feat: tooltips on all metric labels and parameter controls
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 310156e commit dffaae4

14 files changed

Lines changed: 503 additions & 99 deletions

CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/), and this project adheres to [Semantic Versioning](https://semver.org/).
66

7+
## [2.6.0] - 2026-04-23
8+
9+
### Added
10+
- Output Scope selector for long-context presets (16K/64K/150K/256K): controls how many documents the model reads, limiting output length (~500 tokens for 3 docs, unlimited for All docs)
11+
- Output Scope available in Benchmark, Workflow, and Playground pages with persistent selection via localStorage
12+
- Input/Output/Total throughput metrics (tokens/s) in Workflow Detail: calculated as concurrency × avg tokens per successful request / avg response time
13+
- Throughput columns (In T/s, Out T/s, Total T/s) in provider comparison tables
14+
- Throughput summary in workflow header and results summary bar
15+
- Tooltips on all metric labels, table column headers, and parameter controls across all pages (WorkflowResults, ResultsPanel, ConfigPanel, PlaygroundPage, HistoryDetailPage)
16+
17+
### Changed
18+
- Long-context 64K preset prompt suffix updated to support configurable output scope
19+
720
## [2.5.1] - 2026-04-19
821

922
### Added

backend/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "llm-benchmark-backend",
3-
"version": "2.5.0",
3+
"version": "2.6.0",
44
"description": "LLM API Radar - Backend",
55
"main": "dist/index.js",
66
"scripts": {

backend/src/services/workflowEngine.ts

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,10 +297,21 @@ async function generateSummary(workflow: BenchmarkWorkflow): Promise<WorkflowSum
297297
totalOutputTokens: 0,
298298
totalCost: 0,
299299
overallSuccessRate: 0,
300+
inputThroughput: 0,
301+
outputThroughput: 0,
302+
totalThroughput: 0,
300303
perTaskMetrics: [],
301304
};
302305
}
303306

307+
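// Calculate throughput in tokens/s: (concurrency × avgTokensPerSuccessfulRequest × 1000) / avgResponseTime
// NOTE(review): the × 1000 factor presumes avgResponseTime is in milliseconds — confirm against the benchmark summary.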
308+
const successCount = Math.round(result.summary.successRate * task.config.iterations) || 1;
309+
const avgRT = result.summary.avgResponseTime;
310+
const c = task.config.concurrency;
311+
const avgIn = (result.summary.totalInputTokens || 0) / successCount;
312+
const avgOut = (result.summary.totalOutputTokens || 0) / successCount;
313+
const avgTotal = result.summary.totalTokens / successCount;
314+
304315
const metric: TaskMetricPoint = {
305316
taskId: task.id,
306317
taskName: task.name,
@@ -314,6 +325,9 @@ async function generateSummary(workflow: BenchmarkWorkflow): Promise<WorkflowSum
314325
avgFirstTokenLatency: result.summary.avgFirstTokenLatency,
315326
avgTokensPerSecond: result.summary.avgTokensPerSecond,
316327
systemThroughput: result.summary.systemThroughput || 0,
328+
inputThroughput: avgRT > 0 ? Math.round((c * avgIn * 1000) / avgRT) : 0,
329+
outputThroughput: avgRT > 0 ? Math.round((c * avgOut * 1000) / avgRT) : 0,
330+
totalThroughput: avgRT > 0 ? Math.round((c * avgTotal * 1000) / avgRT) : 0,
317331
successRate: result.summary.successRate,
318332
estimatedCost: result.summary.estimatedCost,
319333
};
@@ -339,6 +353,9 @@ async function generateSummary(workflow: BenchmarkWorkflow): Promise<WorkflowSum
339353
summary.totalOutputTokens = metrics.reduce((a, m) => a + m.outputTokens, 0);
340354
summary.totalCost = Number(metrics.reduce((a, m) => a + m.estimatedCost, 0).toFixed(6));
341355
summary.overallSuccessRate = Number((metrics.reduce((a, m) => a + m.successRate, 0) / metrics.length).toFixed(4));
356+
summary.inputThroughput = Math.round(metrics.reduce((a, m) => a + m.inputThroughput, 0) / metrics.length);
357+
summary.outputThroughput = Math.round(metrics.reduce((a, m) => a + m.outputThroughput, 0) / metrics.length);
358+
summary.totalThroughput = Math.round(metrics.reduce((a, m) => a + m.totalThroughput, 0) / metrics.length);
342359
}
343360

344361
const startTime = workflow.startedAt ? new Date(workflow.startedAt).getTime() : Date.now();

backend/src/types.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,9 @@ export interface WorkflowProviderSummary {
201201
totalOutputTokens: number;
202202
totalCost: number;
203203
overallSuccessRate: number;
204+
inputThroughput: number;
205+
outputThroughput: number;
206+
totalThroughput: number;
204207
perTaskMetrics: TaskMetricPoint[];
205208
}
206209

@@ -217,6 +220,9 @@ export interface TaskMetricPoint {
217220
avgFirstTokenLatency: number;
218221
avgTokensPerSecond: number;
219222
systemThroughput: number;
223+
inputThroughput: number;
224+
outputThroughput: number;
225+
totalThroughput: number;
220226
successRate: number;
221227
estimatedCost: number;
222228
}

frontend/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "frontend",
33
"private": true,
4-
"version": "2.5.1",
4+
"version": "2.6.0",
55
"type": "module",
66
"scripts": {
77
"dev": "vite",

frontend/src/components/ConfigPanel.tsx

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,13 @@ import {
88
QUICK_ITERATIONS,
99
QUICK_WARMUP,
1010
QUICK_INTERVAL,
11+
OUTPUT_SCOPE_OPTIONS,
12+
applyOutputScope,
13+
getStoredOutputScope,
14+
storeOutputScope,
1115
} from '../constants';
1216
import { useProviders } from '../hooks/useProviders';
13-
import { Button, Input, InputNumber, Switch, Segmented } from '../antdImports';
17+
import { Button, Input, InputNumber, Switch, Segmented, Select, Tooltip } from '../antdImports';
1418
import { LoadingOutlined } from '@ant-design/icons';
1519
import { useTokenCount } from '../utils/tokenCount';
1620
import { loadHeavyPreset } from '../constants';
@@ -63,6 +67,8 @@ export function ConfigPanel({ onStart, isRunning, currentProviders: _currentProv
6367
const [warmupRuns, setWarmupRuns] = useState(0);
6468
const [requestInterval, setRequestInterval] = useState(0);
6569
const [randomizeInterval, setRandomizeInterval] = useState(false);
70+
const [isLongContext, setIsLongContext] = useState(false);
71+
const [outputScope, setOutputScope] = useState(getStoredOutputScope);
6672

6773
const toggleModel = (provider: ProviderConfigResponse, modelName: string) => {
6874
const key = `${provider.id}:${modelName}`;
@@ -235,11 +241,14 @@ export function ConfigPanel({ onStart, isRunning, currentProviders: _currentProv
235241
<button
236242
key={preset.label}
237243
onClick={async () => {
244+
const isLC = !!preset.multiDoc;
245+
setIsLongContext(isLC);
238246
if (preset.heavy) {
239247
const bucket = preset.tokens >= 200_000 ? '256k' : preset.tokens >= 100_000 ? '150k' : '64k';
240-
setPrompt(await loadHeavyPreset(bucket));
248+
const raw = await loadHeavyPreset(bucket);
249+
setPrompt(isLC ? applyOutputScope(raw, outputScope) : raw);
241250
} else {
242-
setPrompt(preset.prompt);
251+
setPrompt(isLC ? applyOutputScope(preset.prompt, outputScope) : preset.prompt);
243252
}
244253
}}
245254
className={`text-[11px] px-2.5 py-1.5 rounded-md border transition-all font-medium ${
@@ -252,6 +261,26 @@ export function ConfigPanel({ onStart, isRunning, currentProviders: _currentProv
252261
</button>
253262
))}
254263
</div>
264+
{isLongContext && (
265+
<div className="flex items-center gap-2">
266+
<Tooltip title="Controls how many documents the model should read and summarize. Fewer docs = shorter output (~500 tokens for 3 docs). Use this to limit output length while keeping the full prompt as input.">
267+
<label className="text-[11px] text-text-secondary font-medium whitespace-nowrap cursor-help">
268+
Output Scope
269+
</label>
270+
</Tooltip>
271+
<Select
272+
size="small"
273+
value={outputScope}
274+
onChange={(v) => {
275+
setOutputScope(v);
276+
storeOutputScope(v);
277+
setPrompt((prev) => applyOutputScope(prev, v));
278+
}}
279+
options={OUTPUT_SCOPE_OPTIONS}
280+
style={{ width: 160, fontSize: 11 }}
281+
/>
282+
</div>
283+
)}
255284
<Input.TextArea
256285
value={prompt}
257286
onChange={(e) => setPrompt(e.target.value)}
@@ -267,7 +296,9 @@ export function ConfigPanel({ onStart, isRunning, currentProviders: _currentProv
267296
<label className="section-title">Parameters</label>
268297
<div className="grid grid-cols-3 gap-3">
269298
<div className="space-y-2">
270-
<label className="text-[11px] text-text-secondary font-medium">Max Tokens</label>
299+
<Tooltip title="Maximum number of tokens the model can generate in its response">
300+
<label className="text-[11px] text-text-secondary font-medium cursor-help">Max Tokens</label>
301+
</Tooltip>
271302
<QuickButtons options={QUICK_MAX_TOKENS} value={maxTokens} onChange={setMaxTokens} color="accent-teal" />
272303
<InputNumber
273304
value={maxTokens}
@@ -280,7 +311,9 @@ export function ConfigPanel({ onStart, isRunning, currentProviders: _currentProv
280311
/>
281312
</div>
282313
<div className="space-y-2">
283-
<label className="text-[11px] text-text-secondary font-medium">Concurrency</label>
314+
<Tooltip title="Number of parallel requests sent simultaneously to the API">
315+
<label className="text-[11px] text-text-secondary font-medium cursor-help">Concurrency</label>
316+
</Tooltip>
284317
<QuickButtons
285318
options={QUICK_CONCURRENCY}
286319
value={concurrency}
@@ -298,7 +331,9 @@ export function ConfigPanel({ onStart, isRunning, currentProviders: _currentProv
298331
/>
299332
</div>
300333
<div className="space-y-2">
301-
<label className="text-[11px] text-text-secondary font-medium">Iterations</label>
334+
<Tooltip title="Total number of requests to send during the benchmark">
335+
<label className="text-[11px] text-text-secondary font-medium cursor-help">Iterations</label>
336+
</Tooltip>
302337
<QuickButtons options={QUICK_ITERATIONS} value={iterations} onChange={setIterations} color="accent-teal" />
303338
<InputNumber
304339
value={iterations}
@@ -338,7 +373,9 @@ export function ConfigPanel({ onStart, isRunning, currentProviders: _currentProv
338373
<label className="section-title">Advanced</label>
339374
<div className="grid grid-cols-2 gap-3">
340375
<div className="space-y-2">
341-
<label className="text-[11px] text-text-secondary font-medium">Warmup Runs</label>
376+
<Tooltip title="Requests to send before benchmarking starts, to warm up the API connection and caches">
377+
<label className="text-[11px] text-text-secondary font-medium cursor-help">Warmup Runs</label>
378+
</Tooltip>
342379
<QuickButtons
343380
options={QUICK_WARMUP}
344381
value={warmupRuns}
@@ -356,7 +393,9 @@ export function ConfigPanel({ onStart, isRunning, currentProviders: _currentProv
356393
/>
357394
</div>
358395
<div className="space-y-2">
359-
<label className="text-[11px] text-text-secondary font-medium">Interval (ms)</label>
396+
<Tooltip title="Delay between consecutive requests in milliseconds">
397+
<label className="text-[11px] text-text-secondary font-medium cursor-help">Interval (ms)</label>
398+
</Tooltip>
360399
<QuickButtons
361400
options={QUICK_INTERVAL}
362401
value={requestInterval}

frontend/src/components/HistoryDetailPage.tsx

Lines changed: 63 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,17 @@ export function HistoryDetailPage({ workflowId, onExport, onCancel, onBack }: Hi
169169
const outputTokens =
170170
workflow.summary.totalOutputTokens || summaries.reduce((a, s) => a + (s.totalOutputTokens || 0), 0);
171171
if (inputTokens === 0 && outputTokens === 0) return null;
172-
return { inputTokens, outputTokens };
172+
// Average throughput across providers
173+
const avgInputThroughput = summaries.length
174+
? Math.round(summaries.reduce((a, s) => a + (s.inputThroughput || 0), 0) / summaries.length)
175+
: 0;
176+
const avgOutputThroughput = summaries.length
177+
? Math.round(summaries.reduce((a, s) => a + (s.outputThroughput || 0), 0) / summaries.length)
178+
: 0;
179+
const avgTotalThroughput = summaries.length
180+
? Math.round(summaries.reduce((a, s) => a + (s.totalThroughput || 0), 0) / summaries.length)
181+
: 0;
182+
return { inputTokens, outputTokens, avgInputThroughput, avgOutputThroughput, avgTotalThroughput };
173183
})();
174184

175185
return (
@@ -241,26 +251,58 @@ export function HistoryDetailPage({ workflowId, onExport, onCancel, onBack }: Hi
241251
</div>
242252
{/* Token Stats */}
243253
{tokenStats && (
244-
<div className="flex items-center gap-3 mt-2 flex-wrap">
245-
<span className="text-[11px] text-text-secondary font-mono">
246-
Input Tokens: <span className="text-accent-blue">{tokenStats.inputTokens.toLocaleString()}</span>
247-
</span>
248-
<span className="text-[11px] text-text-secondary font-mono">
249-
Output Tokens: <span className="text-accent-teal">{tokenStats.outputTokens.toLocaleString()}</span>
250-
</span>
251-
<span className="text-[11px] text-text-secondary font-mono">
252-
Ratio (In:Out):{' '}
253-
<span className="text-accent-violet">
254-
{tokenStats.outputTokens > 0 && tokenStats.inputTokens > 0
255-
? (() => {
256-
const r = tokenStats.inputTokens / tokenStats.outputTokens;
257-
if (r >= 10) return `${Math.round(r)}:1`;
258-
return `${r.toFixed(2)}:1`;
259-
})()
260-
: '-'}
261-
</span>
262-
</span>
263-
</div>
254+
<>
255+
<div className="flex items-center gap-3 mt-2 flex-wrap">
256+
<Tooltip title="Total input tokens sent to the API across all requests">
257+
<span className="text-[11px] text-text-secondary font-mono cursor-help">
258+
Input Tokens:{' '}
259+
<span className="text-accent-blue">{tokenStats.inputTokens.toLocaleString()}</span>
260+
</span>
261+
</Tooltip>
262+
<Tooltip title="Total output tokens generated by the model across all requests">
263+
<span className="text-[11px] text-text-secondary font-mono cursor-help">
264+
Output Tokens:{' '}
265+
<span className="text-accent-teal">{tokenStats.outputTokens.toLocaleString()}</span>
266+
</span>
267+
</Tooltip>
268+
<Tooltip title="Ratio of input tokens to output tokens — higher means more reading, less generating">
269+
<span className="text-[11px] text-text-secondary font-mono cursor-help">
270+
Ratio (In:Out):{' '}
271+
<span className="text-accent-violet">
272+
{tokenStats.outputTokens > 0 && tokenStats.inputTokens > 0
273+
? (() => {
274+
const r = tokenStats.inputTokens / tokenStats.outputTokens;
275+
if (r >= 10) return `${Math.round(r)}:1`;
276+
return `${r.toFixed(2)}:1`;
277+
})()
278+
: '-'}
279+
</span>
280+
</span>
281+
</Tooltip>
282+
</div>
283+
{(tokenStats.avgInputThroughput > 0 || tokenStats.avgOutputThroughput > 0) && (
284+
<div className="flex items-center gap-3 mt-1 flex-wrap">
285+
<Tooltip title="Input throughput: concurrency × avg input tokens / avg response time">
286+
<span className="text-[11px] text-text-secondary font-mono cursor-help">
287+
Input T/s:{' '}
288+
<span className="text-accent-blue">{tokenStats.avgInputThroughput.toLocaleString()}</span>
289+
</span>
290+
</Tooltip>
291+
<Tooltip title="Output throughput: concurrency × avg output tokens / avg response time">
292+
<span className="text-[11px] text-text-secondary font-mono cursor-help">
293+
Output T/s:{' '}
294+
<span className="text-accent-teal">{tokenStats.avgOutputThroughput.toLocaleString()}</span>
295+
</span>
296+
</Tooltip>
297+
<Tooltip title="Total throughput: concurrency × avg total tokens / avg response time">
298+
<span className="text-[11px] text-text-secondary font-mono cursor-help">
299+
Total T/s:{' '}
300+
<span className="text-accent-violet">{tokenStats.avgTotalThroughput.toLocaleString()}</span>
301+
</span>
302+
</Tooltip>
303+
</div>
304+
)}
305+
</>
264306
)}
265307
</div>
266308
</div>

0 commit comments

Comments
 (0)