-
-
Notifications
You must be signed in to change notification settings - Fork 63
Expand file tree
/
Copy pathutils.js
More file actions
790 lines (722 loc) · 34.8 KB
/
utils.js
File metadata and controls
790 lines (722 loc) · 34.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
/**
* @file lib/utils.js
* @description Pure utility functions extracted from the main CLI for testability.
*
* 📖 This file was created to separate the "brain" of the app from the "body" (TUI, I/O, chalk).
* Every function here is a pure function — no side effects, no process.exit, no console output.
* This makes them trivial to unit test with `node:test` without mocking anything.
*
* 📖 The main CLI (bin/free-coding-models.js) imports everything from here.
* If you need to add new logic (calculations, data transforms, parsing),
* add it here so tests can cover it.
*
* 📖 Data flow:
* sources.js → MODELS array → main CLI creates result objects → these utils process them
*
* 📖 Result object shape (created by the main CLI, consumed by these functions):
* {
* idx: number, // 1-based index for display
* modelId: string, // e.g. "deepseek-ai/deepseek-v3.2"
* label: string, // e.g. "DeepSeek V3.2" (human-friendly name)
* tier: string, // e.g. "S+", "A", "B+" — from sources.js
* sweScore: string, // e.g. "49.2%", "73.1%" — SWE-bench Verified score
* status: string, // "pending" | "up" | "down" | "timeout"
* pings: Array<{ms: number, code: string}>, // full ping history since start
* httpCode: string|null // last HTTP status code (for detecting 429 rate limits)
* }
*
* @functions
* → getAvg(result) — Calculate average latency from successful pings only
* → getVerdict(result) — Determine model health verdict based on avg latency and stability
* → getUptime(result) — Calculate uptime percentage (successful / total pings)
* → getP95(result) — Calculate 95th percentile latency from successful pings
* → getJitter(result) — Calculate latency standard deviation (jitter)
* → getStabilityScore(result) — Composite 0–100 stability score (p95 + jitter + spikes + uptime)
* → sortResults(results, sortColumn, sortDirection) — Sort model results by any column
* → filterByTier(results, tierLetter) — Filter results by tier letter (S/A/B/C)
* → findBestModel(results) — Pick the best model by status → avg → stability → uptime priority
* → parseArgs(argv) — Parse CLI arguments into structured flags and values
*
* @exports getAvg, getVerdict, getUptime, getP95, getJitter, getStabilityScore
* @exports sortResults, filterByTier, findBestModel, parseArgs
* @exports scoreModelForTask, getTopRecommendations, formatCtxWindow, labelFromId
* @exports getVersionStatusInfo, formatResultsAsJSON
* @exports TIER_ORDER, VERDICT_ORDER, TIER_LETTER_MAP, TASK_TYPES, PRIORITY_TYPES, CONTEXT_BUDGETS
*
* @see bin/free-coding-models.js — main CLI that imports these utils
* @see sources.js — model definitions consumed by these functions
* @see test/test.js — unit tests that validate all these functions
*/
// ─── Constants ────────────────────────────────────────────────────────────────
// 📖 Tier sort order — defines the hierarchy from best to worst (8 tiers total).
// 📖 Used by sortResults to compare tiers numerically via indexOf.
// 📖 S+ (elite frontier coders) is index 0, C (lightweight edge) is index 7.
// 📖 This must stay in sync with the tiers defined in sources.js.
export const TIER_ORDER = ['S+', 'S', 'A+', 'A', 'A-', 'B+', 'B', 'C']
// 📖 Verdict strings in order from healthiest to unhealthiest.
// 📖 Used by sortResults when sorting by the "verdict" column.
// 📖 "Perfect" means < 400ms avg, "Pending" means no data yet.
// 📖 The order matters — it determines sort rank in the TUI table.
// 📖 Must contain every string getVerdict can return, or indexOf yields -1 (sorts first).
export const VERDICT_ORDER = ['Perfect', 'Normal', 'Slow', 'Spiky', 'Very Slow', 'Overloaded', 'Unstable', 'Not Active', 'Pending']
// 📖 Maps a CLI tier letter (--tier S/A/B/C) to the full tier strings it includes.
// 📖 Example: --tier A matches A+, A, and A- models (all "A-family" tiers).
// 📖 This avoids users needing to know the exact sub-tier names.
// 📖 Used by filterByTier() and the --tier CLI flag.
export const TIER_LETTER_MAP = {
  'S': ['S+', 'S'], // 📖 Frontier coders — top Aider polyglot scores
  'A': ['A+', 'A', 'A-'], // 📖 Excellent alternatives — strong at most coding tasks
  'B': ['B+', 'B'], // 📖 Solid performers — good for targeted programming
  'C': ['C'], // 📖 Lightweight/edge models — code completion on constrained infra
}
// ─── Core Logic Functions ────────────────────────────────────────────────────
// 📖 measurablePingCodes: HTTP codes that still give us a real round-trip latency sample.
// 📖 200 = normal success, 401 = no key / bad key but the provider endpoint is reachable.
const measurablePingCodes = new Set(['200', '401'])
// 📖 getAvg: Average latency (ms, rounded to an integer) over measurable pings only.
// 📖 A 401 still proves the endpoint answered in X ms, so it counts alongside 200.
// 📖 Timeouts and server failures are excluded so availability problems don't skew raw latency.
// 📖 Returns Infinity when no measurable pings exist — sorts "unknown" models to the bottom.
//
// 📖 Example:
//   pings = [{ms: 200, code: '200'}, {ms: 320, code: '401'}, {ms: 999, code: '500'}]
//   → getAvg returns 260 (only the measurable pings count: (200+320)/2)
export const getAvg = (r) => {
  const samples = []
  for (const ping of r.pings || []) {
    if (measurablePingCodes.has(ping.code)) samples.push(ping.ms)
  }
  if (samples.length === 0) return Infinity
  const total = samples.reduce((sum, ms) => sum + ms, 0)
  return Math.round(total / samples.length)
}
// 📖 getVerdict: Determine a human-readable health verdict for a model.
// 📖 This is the "Status" column label shown in the TUI table.
//
// 📖 Decision priority (first match wins):
//   1. HTTP 429 → "Overloaded" (rate limited by NVIDIA, not a latency issue)
//   2. Timeout/down BUT was previously up → "Unstable" (it worked before, now it doesn't)
//   3. Timeout/down and never worked → "Not Active" (model might be offline)
//   4. No successful pings yet → "Pending" (still waiting for first response)
//   5. Stability-aware speed tiers (avg + p95 penalty):
//      - Avg < 400ms + stable → "Perfect"
//      - Avg < 400ms but spiky p95 → "Spiky" (fast on average, but tail latency hurts)
//      - Avg < 1000ms → "Normal"
//      - Avg < 3000ms → "Slow"
//      - Avg < 5000ms → "Very Slow"
//      - Avg >= 5000ms → "Unstable"
//
// 📖 The "Spiky" verdict catches models that look fast on paper (low avg) but randomly
//    stall your IDE/agent with tail-latency spikes. A model with avg 250ms but p95 6000ms
//    gets downgraded from "Perfect" to "Spiky" — because consistency matters more than speed.
//
// 📖 The "wasUpBefore" check is key — it distinguishes between a model that's
//    temporarily flaky vs one that was never reachable in the first place.
export const getVerdict = (r) => {
  const avg = getAvg(r)
  const wasUpBefore = r.pings.length > 0 && r.pings.some(p => p.code === '200')
  if (r.httpCode === '429') return 'Overloaded'
  if ((r.status === 'timeout' || r.status === 'down') && wasUpBefore) return 'Unstable'
  if (r.status === 'timeout' || r.status === 'down') return 'Not Active'
  if (avg === Infinity) return 'Pending'
  // 📖 Stability-aware verdict: penalize models with good avg but terrible tail latency
  const measurablePings = (r.pings || []).filter(p => measurablePingCodes.has(p.code))
  const p95 = getP95(r)
  if (avg < 400) {
    // 📖 Only flag as "Spiky" when we have enough data (≥3 pings) to judge stability
    if (measurablePings.length >= 3 && p95 > 3000) return 'Spiky'
    return 'Perfect'
  }
  if (avg < 1000) {
    if (measurablePings.length >= 3 && p95 > 5000) return 'Spiky'
    return 'Normal'
  }
  if (avg < 3000) return 'Slow'
  if (avg < 5000) return 'Very Slow'
  // 📖 Fix: removed a redundant `if (avg < 10000) return 'Unstable'` that duplicated
  // 📖 this unconditional fallthrough — everything ≥ 5000ms is "Unstable".
  return 'Unstable'
}
// 📖 getUptime: Percentage of pings that returned HTTP 200, over ALL pings made.
// 📖 Returns 0 when no pings exist yet (avoids division by zero).
// 📖 Shown as the "Up%" column in the TUI — e.g. "85%" means 85% of pings got HTTP 200.
// 📖 Useful for spotting models that are technically "up" but flaky.
export const getUptime = (r) => {
  const total = r.pings.length
  if (total === 0) return 0
  let successes = 0
  for (const ping of r.pings) {
    if (ping.code === '200') successes += 1
  }
  return Math.round((successes / total) * 100)
}
// 📖 getP95: 95th-percentile latency over measurable pings (HTTP 200/401).
// 📖 Answers: "95% of requests are faster than this value."
// 📖 Low p95 = consistently fast; high p95 = tail-latency spikes.
// 📖 Returns Infinity when no measurable pings exist.
//
// 📖 Algorithm: sort latencies ascending, take the element at ceil(N * 0.95) - 1
// 📖 (clamped to index 0 for tiny sample counts).
// 📖 Example: [100, 200, 300, 400, 5000] → index ceil(5 * 0.95) - 1 = 4 → 5000ms
export const getP95 = (r) => {
  const latencies = (r.pings || [])
    .filter(p => measurablePingCodes.has(p.code))
    .map(p => p.ms)
  if (latencies.length === 0) return Infinity
  latencies.sort((x, y) => x - y)
  const idx = Math.max(0, Math.ceil(latencies.length * 0.95) - 1)
  return latencies[idx]
}
// 📖 getJitter: Latency standard deviation (σ, rounded) over measurable pings.
// 📖 Low jitter = predictable response times; high jitter = erratic, spiky latency.
// 📖 Returns 0 with fewer than 2 measurable pings (variance needs ≥2 points).
// 📖 Population σ (divide by N, not N-1) — we have ALL the data, not a sample.
export const getJitter = (r) => {
  const samples = (r.pings || []).filter(p => measurablePingCodes.has(p.code))
  if (samples.length < 2) return 0
  const n = samples.length
  const mean = samples.reduce((acc, p) => acc + p.ms, 0) / n
  let squaredError = 0
  for (const p of samples) {
    squaredError += (p.ms - mean) ** 2
  }
  return Math.round(Math.sqrt(squaredError / n))
}
// 📖 getStabilityScore: Composite 0–100 score that rewards consistency and reliability.
// 📖 Weighted blend of four signals, each normalized to 0–100 (higher = better):
//   - p95 latency (30%)       — penalizes tail-latency spikes (0 at ≥5000ms)
//   - Jitter / σ (30%)        — penalizes erratic response times (0 at ≥2000ms)
//   - Spike rate (20%)        — fraction of measurable pings above 3000ms
//   - Uptime / reliability (20%) — fraction of successful pings
// 📖 Returns -1 when there is no measurable data yet.
//
// 📖 Example:
//   Model A: avg 250ms, p95 6000ms (tons of spikes)   → score ~30
//   Model B: avg 400ms, p95 650ms (boringly consistent) → score ~85
//   In real usage, Model B FEELS faster because it doesn't randomly stall.
export const getStabilityScore = (r) => {
  const samples = (r.pings || []).filter(p => measurablePingCodes.has(p.code))
  if (samples.length === 0) return -1
  // 📖 Normalize each component into 0–100 (clamped)
  const p95Score = Math.max(0, Math.min(100, 100 * (1 - getP95(r) / 5000)))
  const jitterScore = Math.max(0, Math.min(100, 100 * (1 - getJitter(r) / 2000)))
  const spikeRate = samples.filter(p => p.ms > 3000).length / samples.length
  const spikeScore = Math.max(0, 100 * (1 - spikeRate))
  const reliabilityScore = getUptime(r)
  // 📖 Weighted composite: 30% p95, 30% jitter, 20% spikes, 20% reliability
  return Math.round(0.3 * p95Score + 0.3 * jitterScore + 0.2 * spikeScore + 0.2 * reliabilityScore)
}
// 📖 sortResults: Sort the results array by any column the user can click/press in the TUI.
// 📖 Returns a NEW array — never mutates the original (important for React-style re-renders).
//
// 📖 Supported columns in the sorter.
// 📖 Most map directly to visible TUI sort hotkeys; `tier` remains available internally
// 📖 while `Y` is used by the live UI for favorites display mode.
//   - 'rank' (R key) — original index from sources.js
//   - 'tier' (internal) — tier hierarchy (S+ first, C last)
//   - 'origin' (O key) — provider name (all NIM for now, future-proofed)
//   - 'model' (M key) — alphabetical by display label
//   - 'ping' (L key) — last ping latency (only successful ones count)
//   - 'avg' (A key) — average latency across all successful pings
//   - 'swe' (S key) — SWE-bench score (higher is better)
//   - 'ctx' (N key) — context window size (larger is better)
//   - 'condition' (H key) — health status (alphabetical)
//   - 'verdict' (V key) — verdict order (Perfect → Pending)
//   - 'uptime' (U key) — uptime percentage
//   - 'stability' (B key) — stability score (0–100, higher = more stable)
//   - 'usage' (internal) — quota usage percent (usagePercent field, 0–100)
//
// 📖 An unrecognized sortColumn leaves cmp at 0, so the stable sort preserves order.
// 📖 sortDirection 'asc' = ascending (smallest first), 'desc' = descending (largest first)
export const sortResults = (results, sortColumn, sortDirection) => {
  return [...results].sort((a, b) => {
    let cmp = 0
    switch (sortColumn) {
      case 'rank':
        cmp = a.idx - b.idx
        break
      case 'tier':
        // 📖 Compare by position in TIER_ORDER — lower index = better tier
        cmp = TIER_ORDER.indexOf(a.tier) - TIER_ORDER.indexOf(b.tier)
        break
      case 'origin':
        // 📖 Sort by providerKey (falling back to 'nvidia') for multi-provider support
        cmp = (a.providerKey ?? 'nvidia').localeCompare(b.providerKey ?? 'nvidia')
        break
      case 'model':
        cmp = a.label.localeCompare(b.label)
        break
      case 'ping': {
        // 📖 Sort by LAST ping only — gives a real-time "right now" snapshot
        // 📖 Failed last pings sort to the bottom (Infinity)
        const aLast = a.pings.length > 0 ? a.pings[a.pings.length - 1] : null
        const bLast = b.pings.length > 0 ? b.pings[b.pings.length - 1] : null
        const aPing = aLast?.code === '200' ? aLast.ms : Infinity
        const bPing = bLast?.code === '200' ? bLast.ms : Infinity
        cmp = aPing - bPing
        break
      }
      case 'avg':
        cmp = getAvg(a) - getAvg(b)
        break
      case 'swe': {
        // 📖 Sort by SWE-bench score — higher is better
        // 📖 Parse percentage strings like "49.2%", "73.1%" or use 0 for missing values
        const parseSwe = (score) => {
          if (!score || score === '—') return 0
          const num = parseFloat(score.replace('%', ''))
          return isNaN(num) ? 0 : num
        }
        cmp = parseSwe(a.sweScore) - parseSwe(b.sweScore)
        break
      }
      case 'ctx': {
        // 📖 Sort by context window size — larger is better
        // 📖 Parse strings like "128k", "32k", "1m" into numeric K tokens
        const parseCtx = (ctx) => {
          if (!ctx || ctx === '—') return 0
          const str = ctx.toLowerCase()
          // 📖 Handle millions (1m = 1000k)
          if (str.includes('m')) {
            const num = parseFloat(str.replace('m', ''))
            return num * 1000
          }
          // 📖 Handle thousands (128k)
          if (str.includes('k')) {
            const num = parseFloat(str.replace('k', ''))
            return num
          }
          return 0
        }
        cmp = parseCtx(a.ctx) - parseCtx(b.ctx)
        break
      }
      case 'condition':
        cmp = a.status.localeCompare(b.status)
        break
      case 'verdict': {
        // 📖 Sort by verdict order — "Perfect" first, "Pending" last
        const aVerdict = getVerdict(a)
        const bVerdict = getVerdict(b)
        cmp = VERDICT_ORDER.indexOf(aVerdict) - VERDICT_ORDER.indexOf(bVerdict)
        break
      }
      case 'uptime':
        cmp = getUptime(a) - getUptime(b)
        break
      case 'stability':
        // 📖 Sort by stability score — higher = more stable = better
        // 📖 Models with no data (-1) sort to the bottom
        cmp = getStabilityScore(a) - getStabilityScore(b)
        break
      case 'usage':
        // 📖 Sort by quota usage percent (usagePercent numeric field, 0–100)
        // 📖 Models with no usage data (undefined/null) are treated as 0 — stable tie-break
        // 📖 via JS stable sort preserving original order when values are equal
        cmp = (a.usagePercent ?? 0) - (b.usagePercent ?? 0)
        break
    }
    // 📖 Flip comparison for descending order
    return sortDirection === 'asc' ? cmp : -cmp
  })
}
// 📖 filterByTier: Keep only the results whose tier matches the given tier letter.
// 📖 TIER_LETTER_MAP expands the letter into its full tier family (e.g. 'A' → A+, A, A-).
// 📖 Returns null for an unknown letter — the caller decides how to handle it
// 📖 (the main CLI exits with an error message, tests can assert null).
//
// 📖 Example: filterByTier(results, 'A') → only models with tier A+, A, or A-
export function filterByTier(results, tierLetter) {
  const allowedTiers = TIER_LETTER_MAP[tierLetter.toUpperCase()]
  if (!allowedTiers) return null
  return results.filter(({ tier }) => allowedTiers.includes(tier))
}
// 📖 findBestModel: Pick the single best model from a results array (null when empty).
// 📖 Used by --fiable mode to output the most reliable model after 10s of analysis.
//
// 📖 Selection priority (quad-key sort):
//   1. Status: "up" models always beat non-up models
//   2. Average latency: faster average wins (lower is better)
//   3. Stability score: higher stability wins (more consistent = better)
//   4. Uptime %: higher uptime wins as final tiebreaker
export function findBestModel(results) {
  if (results.length === 0) return null
  const ranked = [...results].sort((a, b) => {
    // 📖 Key 1: currently-responding models always outrank non-responding ones
    const aUp = a.status === 'up'
    const bUp = b.status === 'up'
    if (aUp && !bUp) return -1
    if (!aUp && bUp) return 1
    // 📖 Key 2: lower average latency wins; the !== guard keeps Infinity ties
    // 📖 from producing a NaN comparator result
    const avgA = getAvg(a)
    const avgB = getAvg(b)
    if (avgA !== avgB) return avgA - avgB
    // 📖 Key 3: higher stability = more consistent = better
    const stabA = getStabilityScore(a)
    const stabB = getStabilityScore(b)
    if (stabA !== stabB) return stabB - stabA
    // 📖 Key 4: higher uptime wins the final tiebreak
    return getUptime(b) - getUptime(a)
  })
  return ranked[0]
}
// ─── CLI Argument Parsing ────────────────────────────────────────────────────
// 📖 parseArgs: Turn a raw process.argv array into a structured options object.
// 📖 Expects the full argv (including 'node' and the script path at indices 0-1);
// 📖 slices from index 2 so only user-provided arguments are parsed.
//
// 📖 Recognized argument shapes:
//   - API key: first positional arg that is neither a flag nor a flag's value
//   - Boolean flags: --best, --fiable, --opencode, --opencode-desktop, --openclaw,
//     --aider, --crush, --goose, --qwen, --openhands, --amp, --pi, --rovo, --gemini,
//     --no-telemetry, --json, --premium, --recommend, --desc, --asc,
//     --hide-unconfigured, --show-unconfigured, --help/-h (case-insensitive)
//   - Value flags: --tier <letter>, --sort <column>, --origin <name>, --ping-interval <n>
//
// 📖 Note: apiKey may be null here — the main CLI falls back to env vars and saved config.
export function parseArgs(argv) {
  const args = argv.slice(2)
  // 📖 Locate the value position of a "--flag <value>" pair, or -1 when absent.
  // 📖 A following token that itself starts with '--' is another flag, not a value.
  const findValueIdx = (flagName) => {
    const flagIdx = args.findIndex(a => a.toLowerCase() === flagName)
    const next = args[flagIdx + 1]
    return flagIdx !== -1 && next && !next.startsWith('--') ? flagIdx + 1 : -1
  }
  const tierValueIdx = findValueIdx('--tier')
  const sortValueIdx = findValueIdx('--sort')
  const originValueIdx = findValueIdx('--origin')
  const pingIntervalValueIdx = findValueIdx('--ping-interval')
  // 📖 Positions consumed as flag values must never be mistaken for the API key
  const skipIndices = new Set(
    [tierValueIdx, sortValueIdx, originValueIdx, pingIntervalValueIdx].filter(i => i !== -1)
  )
  const flags = []
  let apiKey = null
  args.forEach((arg, i) => {
    if (arg.startsWith('--') || arg === '-h') {
      flags.push(arg.toLowerCase())
    } else if (!skipIndices.has(i) && !apiKey) {
      apiKey = arg // 📖 First free-standing positional wins
    }
  })
  const has = (flag) => flags.includes(flag)
  const tierFilter = tierValueIdx !== -1 ? args[tierValueIdx].toUpperCase() : null
  const sortColumn = sortValueIdx !== -1 ? args[sortValueIdx].toLowerCase() : null
  const originFilter = originValueIdx !== -1 ? args[originValueIdx] : null
  const pingInterval = pingIntervalValueIdx !== -1 ? parseInt(args[pingIntervalValueIdx], 10) : null
  // 📖 --desc wins over --asc when both are present; neither → null (caller's default)
  const sortDirection = has('--desc') ? 'desc' : (has('--asc') ? 'asc' : null)
  return {
    apiKey,
    bestMode: has('--best'),
    fiableMode: has('--fiable'),
    openCodeMode: has('--opencode'),
    openCodeDesktopMode: has('--opencode-desktop'),
    openClawMode: has('--openclaw'),
    aiderMode: has('--aider'),
    crushMode: has('--crush'),
    gooseMode: has('--goose'),
    qwenMode: has('--qwen'),
    openHandsMode: has('--openhands'),
    ampMode: has('--amp'),
    piMode: has('--pi'),
    rovoMode: has('--rovo'),
    geminiMode: has('--gemini'),
    noTelemetry: has('--no-telemetry'),
    jsonMode: has('--json'),
    helpMode: has('--help') || has('-h'),
    tierFilter,
    sortColumn,
    sortDirection,
    originFilter,
    pingInterval,
    hideUnconfigured: has('--hide-unconfigured'),
    showUnconfigured: has('--show-unconfigured'),
    premiumMode: has('--premium'),
    // 📖 Profile system removed - API keys now persist permanently across all sessions
    recommendMode: has('--recommend'),
  }
}
// ─── Smart Recommend — Scoring Engine ─────────────────────────────────────────
// 📖 Task types for the Smart Recommend questionnaire.
// 📖 Each task type has different weight priorities — quick fixes favor speed,
//    deep refactors favor SWE score and context, code review needs balanced quality,
//    test generation needs high SWE score + medium context.
// 📖 Each entry's four weights sum to 1.0, keeping scores comparable across tasks.
export const TASK_TYPES = {
  quickfix: { label: 'Quick Fix', sweWeight: 0.2, speedWeight: 0.5, ctxWeight: 0.1, stabilityWeight: 0.2 },
  refactor: { label: 'Deep Refactor', sweWeight: 0.4, speedWeight: 0.1, ctxWeight: 0.3, stabilityWeight: 0.2 },
  review: { label: 'Code Review', sweWeight: 0.35, speedWeight: 0.2, ctxWeight: 0.25, stabilityWeight: 0.2 },
  testgen: { label: 'Test Generation', sweWeight: 0.35, speedWeight: 0.15, ctxWeight: 0.2, stabilityWeight: 0.3 },
}
// 📖 Priority presets — bias the scoring toward speed or quality.
// 📖 'speed' amplifies latency weighting, 'quality' amplifies SWE score weighting.
// 📖 'balanced' leaves the task's base weights untouched (both multipliers 1.0).
export const PRIORITY_TYPES = {
  speed: { label: 'Speed', speedMultiplier: 1.5, sweMultiplier: 0.7 },
  quality: { label: 'Quality', speedMultiplier: 0.7, sweMultiplier: 1.5 },
  balanced:{ label: 'Balanced', speedMultiplier: 1.0, sweMultiplier: 1.0 },
}
// 📖 Context budget categories — match against model's context window size.
// 📖 'small' (<4K tokens) can use any model. 'large' (>32K) strongly penalizes small-ctx models.
// 📖 minCtx/idealCtx are in K tokens, compared against parseCtxToK(result.ctx).
export const CONTEXT_BUDGETS = {
  small: { label: 'Small file (<4K)', minCtx: 0, idealCtx: 32 },
  medium: { label: 'Medium project (<32K)', minCtx: 32, idealCtx: 128 },
  large: { label: 'Large codebase (>32K)', minCtx: 128, idealCtx: 256 },
}
// 📖 parseCtxToK: Convert a context window string ("128k", "1m", "200k") into numeric K tokens.
// 📖 Millions are expressed in K too (1m → 1000). Missing/unknown formats → 0.
// 📖 Used by the scoring engine to compare against CONTEXT_BUDGETS thresholds.
function parseCtxToK(ctx) {
  if (!ctx || ctx === '—') return 0
  const normalized = ctx.toLowerCase()
  if (normalized.includes('m')) return parseFloat(normalized.replace('m', '')) * 1000
  return normalized.includes('k') ? parseFloat(normalized.replace('k', '')) : 0
}
// 📖 formatCtxWindow: Compact display string for a numeric context length.
// 📖 256000 → '256k', 1048576 → '1M'; invalid or non-positive input falls back to '128k'.
// 📖 Used by dynamic OpenRouter model discovery to convert API responses to display format.
export function formatCtxWindow(n) {
  const usable = typeof n === 'number' && n > 0
  if (!usable) return '128k'
  return n >= 1_000_000
    ? `${Math.round(n / 1_000_000)}M`
    : `${Math.round(n / 1000)}k`
}
// 📖 labelFromId: Build a human-readable label from an OpenRouter model ID.
// 📖 Strips a trailing ':free', keeps only the last path segment, then
// 📖 Title-Cases each dash/underscore-separated word.
// 📖 'qwen/qwen3-coder:free' → 'Qwen3 Coder'
export function labelFromId(id) {
  const base = id.replace(/:free$/, '')
  const slash = base.lastIndexOf('/')
  const name = slash === -1 ? base : base.slice(slash + 1)
  const words = name.split(/[-_]/)
  const capitalized = words.map(w => w.charAt(0).toUpperCase() + w.slice(1))
  return capitalized.join(' ')
}
// 📖 parseSweToNum: Convert a SWE-bench score string ("49.2%", "73.1%") into a 0–100 number.
// 📖 Missing ('—', null) or unparseable scores yield 0.
function parseSweToNum(sweScore) {
  if (!sweScore || sweScore === '—') return 0
  const value = Number.parseFloat(sweScore.replace('%', ''))
  return Number.isNaN(value) ? 0 : value
}
/**
 * 📖 scoreModelForTask: Score one model result for a task/priority/context combination.
 *
 * 📖 Weighted composite of 4 signals, each normalized to 0–100:
 *   - SWE quality: SWE-bench score scaled so 80% maps to 100
 *   - Speed: inverse of average latency (0ms → 100, ≥5000ms → 0)
 *   - Context fit: how well the model's window matches the user's budget
 *   - Stability: composite p95/jitter/uptime from getStabilityScore()
 *
 * 📖 Task weights shape the blend; the priority preset further multiplies the
 *    SWE and speed weights. Down/timeout models keep 20% of their score so they
 *    aren't completely excluded (they may come back up during analysis).
 *
 * @param {object} result — A model result object (from state.results)
 * @param {string} taskType — Key from TASK_TYPES ('quickfix'|'refactor'|'review'|'testgen')
 * @param {string} priority — Key from PRIORITY_TYPES ('speed'|'quality'|'balanced')
 * @param {string} contextBudget — Key from CONTEXT_BUDGETS ('small'|'medium'|'large')
 * @returns {number} Score between 0 and 100 (higher = better recommendation)
 */
export function scoreModelForTask(result, taskType, priority, contextBudget) {
  const task = TASK_TYPES[taskType]
  const prio = PRIORITY_TYPES[priority]
  const budget = CONTEXT_BUDGETS[contextBudget]
  // 📖 Unknown keys → score 0 (defensive: callers pass user-selected strings)
  if (!task || !prio || !budget) return 0
  // 📖 SWE quality signal — normalize so an 80% SWE-bench score maps to 100
  const sweScore = Math.min(100, parseSweToNum(result.sweScore) * (100 / 80))
  // 📖 Speed signal — inverse latency, clamped; no data means no speed credit
  const avg = getAvg(result)
  const speedScore = avg === Infinity
    ? 0
    : Math.max(0, Math.min(100, 100 * (1 - avg / 5000)))
  // 📖 Context fit signal:
  //    full score at/above idealCtx, linear ramp between minCtx and idealCtx,
  //    zero below minCtx (window too small for the job)
  const modelCtx = parseCtxToK(result.ctx)
  let ctxScore = 0
  if (modelCtx >= budget.idealCtx) {
    ctxScore = 100
  } else if (modelCtx >= budget.minCtx) {
    const span = budget.idealCtx - budget.minCtx
    ctxScore = span === 0 ? 100 : Math.round(100 * (modelCtx - budget.minCtx) / span)
  }
  // 📖 Stability signal — -1 sentinel (no data) is treated as 0
  const stability = getStabilityScore(result)
  const stabScore = stability === -1 ? 0 : stability
  // 📖 Weighted combination: task weights × priority multipliers,
  //    then normalized by total effective weight to stay within 0–100
  const rawScore =
    (sweScore * task.sweWeight * prio.sweMultiplier) +
    (speedScore * task.speedWeight * prio.speedMultiplier) +
    (ctxScore * task.ctxWeight) +
    (stabScore * task.stabilityWeight)
  const totalWeight =
    (task.sweWeight * prio.sweMultiplier) +
    (task.speedWeight * prio.speedMultiplier) +
    task.ctxWeight +
    task.stabilityWeight
  let score = totalWeight > 0 ? rawScore / totalWeight : 0
  // 📖 Harsh penalty (×0.2) for models that are currently down or timing out
  if (result.status === 'down' || result.status === 'timeout') {
    score *= 0.2
  }
  return Math.round(Math.min(100, Math.max(0, score)))
}
/**
 * 📖 getTopRecommendations: Score every visible model and return the strongest N picks.
 *
 * 📖 Hidden models are excluded, each remaining one is scored with scoreModelForTask,
 * 📖 results are ordered best-first, and the first topN are returned.
 * 📖 Each item carries the original result plus its computed score for display.
 *
 * @param {Array} results — Full state.results array
 * @param {string} taskType — Key from TASK_TYPES
 * @param {string} priority — Key from PRIORITY_TYPES
 * @param {string} contextBudget — Key from CONTEXT_BUDGETS
 * @param {number} [topN=3] — How many recommendations to return
 * @returns {Array<{result: object, score: number}>} Top N scored models, descending by score
 */
export function getTopRecommendations(results, taskType, priority, contextBudget, topN = 3) {
  const visible = results.filter(r => !r.hidden)
  const ranked = visible
    .map(result => ({ result, score: scoreModelForTask(result, taskType, priority, contextBudget) }))
    .sort((x, y) => y.score - x.score)
  return ranked.slice(0, topN)
}
/**
 * 📖 getVersionStatusInfo turns startup + manual update-check state into a compact,
 * 📖 render-friendly footer descriptor for the main table.
 *
 * 📖 Priority:
 * 📖   1. Manual Settings check found an update (`available` state + a latest version)
 * 📖   2. Startup auto-check already found a newer npm version
 * 📖   3. Otherwise stay quiet
 *
 * 📖 `versionAlertsEnabled` lets the CLI suppress npm-specific warnings in dev checkouts,
 * 📖 where telling contributors to run a global npm update would be bogus.
 *
 * @param {'idle'|'checking'|'available'|'up-to-date'|'error'|'installing'} updateState
 * @param {string|null} latestVersion
 * @param {string|null} [startupLatestVersion=null]
 * @param {boolean} [versionAlertsEnabled=true]
 * @returns {{ isOutdated: boolean, latestVersion: string|null }}
 */
export function getVersionStatusInfo(updateState, latestVersion, startupLatestVersion = null, versionAlertsEnabled = true) {
  if (versionAlertsEnabled) {
    // 📖 Manual check result wins over the startup snapshot
    const manualHit = updateState === 'available'
      && typeof latestVersion === 'string'
      && latestVersion.trim()
    if (manualHit) {
      return { isOutdated: true, latestVersion: latestVersion.trim() }
    }
    if (typeof startupLatestVersion === 'string' && startupLatestVersion.trim()) {
      return { isOutdated: true, latestVersion: startupLatestVersion.trim() }
    }
  }
  return { isOutdated: false, latestVersion: null }
}
/**
 * 📖 formatResultsAsJSON converts model results to clean JSON output for scripting/automation.
 *
 * 📖 Backs the --json flag. The output is designed to be:
 *   - Easy to parse with jq, grep, awk, or any JSON library
 *   - Human-readable for debugging
 *   - Stable (field names won't change between versions)
 *
 * 📖 Each row exposes: rank, modelId, label, provider, tier, sweScore, context,
 *    latestPing, avgPing, p95, jitter, stability, uptime, verdict, status, httpCode.
 * 📖 Non-finite numerics (NaN/Infinity) become null for cleaner JSON; missing
 *    strings become null via `|| null`.
 *
 * @param {Array} results — Model result objects from the TUI
 * @param {string} sortBy — Current sort column (NOTE(review): currently unused here —
 *                          results appear to arrive pre-sorted by the caller; confirm)
 * @param {number} limit — Maximum number of results to return (0 = all)
 * @returns {string} JSON string of formatted results (2-space indented)
 */
export function formatResultsAsJSON(results, sortBy = 'avg', limit = 0) {
  // 📖 limit 0 → undefined → slice keeps everything; slicing first avoids
  //    formatting rows that would be discarded (same output, less work)
  const cap = limit || undefined
  const numOrNull = (value) => (Number.isFinite(value) ? value : null)
  const rows = results.slice(0, cap).map((r, idx) => ({
    rank: r.idx || idx + 1,
    modelId: r.modelId || null,
    label: r.label || null,
    provider: r.providerKey || null,
    tier: r.tier || null,
    sweScore: r.sweScore || null,
    context: r.ctx || null,
    latestPing: (r.pings && r.pings.length > 0) ? r.pings[r.pings.length - 1].ms : null,
    avgPing: numOrNull(r.avg),
    p95: numOrNull(r.p95),
    jitter: numOrNull(r.jitter),
    stability: numOrNull(r.stability),
    uptime: numOrNull(r.uptime),
    verdict: r.verdict || null,
    status: r.status || null,
    httpCode: r.httpCode || null,
  }))
  return JSON.stringify(rows, null, 2)
}