@@ -309,11 +309,11 @@ async function llmCall(messages, opts = {}) {
   // - Local llama-server: requires 'max_tokens', may not understand 'max_completion_tokens'
   const isCloudApi = !opts.vlm && (LLM_API_TYPE === 'openai' || LLM_BASE_URL.includes('openai.com') || LLM_BASE_URL.includes('api.anthropic'));

-  // No max_tokens for any API — the streaming loop's 2000-token hard cap is the safety net.
-  // Sending max_tokens to thinking models (Qwen3.5) starves actual output since
-  // reasoning_content counts against the limit.
+  // Inject max_tokens to override the server's default limit (typically 2048 for SwiftLM),
+  // which would otherwise prematurely truncate models with extensive reasoning/thinking output.
+  const maxTokensField = isCloudApi ? 'max_completion_tokens' : 'max_tokens';
+  const computedMaxTokens = opts.maxTokens || 4096; // Ensure sufficient headroom for tools/reasoning

-  // Lookup model-family-specific config (e.g. reasoning_effort for Mistral,
   // minTemperature for Nemotron/LFM2).
   // VLM calls skip the LLM family table — VLM models are always local llava-compatible.
   const modelFamily = opts.vlm ? {} : getModelFamily(model || LLM_MODEL);
@@ -341,6 +341,7 @@ async function llmCall(messages, opts = {}) {
     // which activates prefix buffering to strip hallucinated artifacts
     ...(opts.expectJSON && !isCloudApi && { response_format: { type: 'json_object' } }),
     ...(opts.tools && { tools: opts.tools }),
+    [maxTokensField]: computedMaxTokens,
     // Model-family-specific params (e.g. reasoning_effort:'none' for Mistral).
     // These are merged last so they take precedence over defaults.
     ...modelFamilyParams,
@@ -483,14 +484,14 @@ async function llmCall(messages, opts = {}) {
         }
       }
       // Hard cap: abort if token count far exceeds maxTokens
-      if (opts.maxTokens && tokenCount > opts.maxTokens * 2) {
-        log(` ⚠ Aborting: ${tokenCount} tokens exceeds ${opts.maxTokens}×2 safety limit`);
+      if (computedMaxTokens && tokenCount > computedMaxTokens * 2) {
+        log(` ⚠ Aborting: ${tokenCount} tokens exceeds ${computedMaxTokens}×2 safety limit`);
         controller.abort();
         break;
       }
-      // Global safety limit: no benchmark test should ever need >2000 tokens
-      if (tokenCount > 2000) {
-        log(` ⚠ Aborting: ${tokenCount} tokens exceeds global 2000-token safety limit`);
+      // Global safety limit: no benchmark test should ever need >8000 tokens
+      if (tokenCount > 8192) {
+        log(` ⚠ Aborting: ${tokenCount} tokens exceeds global 8192-token safety limit`);
         controller.abort();
         break;
       }
@@ -2636,7 +2637,7 @@ async function main() {
   if (TEST_MODE !== 'full') {
     const isVlmSuite = (name) => name.includes('VLM Scene') || name.includes('📸');
     const originalCount = suites.length;
-    if (TEST_MODE === 'llm') {
+    if (TEST_MODE !== 'vlm') {
       // Remove VLM image-analysis suites (VLM-to-Alert Triage stays — it's LLM-based text triage)
       for (let i = suites.length - 1; i >= 0; i--) {
         if (isVlmSuite(suites[i].name)) suites.splice(i, 1);