From 13997cb70924575e43f1b58a0a525a6893c63edd Mon Sep 17 00:00:00 2001
From: Jack Amadeo <jackamadeo@squareup.com>
Date: Wed, 18 Mar 2026 09:54:13 -0400
Subject: [PATCH 1/8] Port provider smoke tests to TypeScript (vitest)

---
 .../tests/integration/test_providers.test.ts  | 110 +++++++
 .../test_providers_code_exec.test.ts          |  80 +++++
 .../tests/integration/test_providers_lib.ts   | 292 ++++++++++++++++++
 ui/desktop/vitest.config.ts                   |  15 +-
 4 files changed, 491 insertions(+), 6 deletions(-)
 create mode 100644 ui/desktop/tests/integration/test_providers.test.ts
 create mode 100644 ui/desktop/tests/integration/test_providers_code_exec.test.ts
 create mode 100644 ui/desktop/tests/integration/test_providers_lib.ts
diff --git a/ui/desktop/tests/integration/test_providers.test.ts b/ui/desktop/tests/integration/test_providers.test.ts
new file mode 100644
index 000000000000..c7fbb77b343f
--- /dev/null
+++ b/ui/desktop/tests/integration/test_providers.test.ts
@@ -0,0 +1,110 @@
+/**
+ * Provider smoke tests — normal mode (direct tool calls).
+ *
+ * Ported from scripts/test_providers.sh.  Each available provider/model pair
+ * gets its own test that spawns `goose run` with the developer builtin, asks
+ * the model to read files via the shell tool, and validates the output.
+ */
+
+import { test, expect, beforeAll } from 'vitest';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+import {
+  buildGoose,
+  discoverTestCases,
+  runGoose,
+  isAgenticProvider,
+  isAllowedFailure,
+  type TestCase,
+} from './test_providers_lib';
+
+const BUILTINS = 'developer';
+const TEST_CONTENT = 'test-content-abc123';
+
+let gooseBin: string;
+let testFile: string;
+
+beforeAll(() => {
+  gooseBin = buildGoose();
+
+  const targetDir = path.resolve(process.cwd(), '..', '..', 'target');
+  fs.mkdirSync(targetDir, { recursive: true });
+  testFile = path.join(targetDir, 'test-content.txt');
+  fs.writeFileSync(testFile, TEST_CONTENT + '\n');
+});
+
+const allCases = discoverTestCases();
+const available = allCases.filter((tc) => tc.available && !isAllowedFailure(tc.provider, tc.model));
+const flaky = allCases.filter((tc) => tc.available && isAllowedFailure(tc.provider, tc.model));
+const skipped = allCases.filter((tc) => !tc.available);
+
+async function runNormalTest(tc: TestCase): Promise<void> {
+  const testdir = fs.mkdtempSync(path.join(os.tmpdir(), 'goose-test-'));
+
+  try {
+    let prompt: string;
+    let tokenA: string | undefined;
+    let tokenB: string | undefined;
+
+    if (isAgenticProvider(tc.provider)) {
+      fs.copyFileSync(testFile, path.join(testdir, 'test-content.txt'));
+      prompt = 'read ./test-content.txt and output its contents exactly';
+    } else {
+      tokenA = `smoke-alpha-${Math.floor(Math.random() * 32768)}`;
+      tokenB = `smoke-bravo-${Math.floor(Math.random() * 32768)}`;
+      fs.writeFileSync(path.join(testdir, 'part-a.txt'), tokenA + '\n');
+      fs.writeFileSync(path.join(testdir, 'part-b.txt'), tokenB + '\n');
+      prompt =
+        'Use the shell tool to cat ./part-a.txt and ./part-b.txt, then reply with ONLY the contents of both files, one per line, nothing else.';
+    }
+
+    const output = await runGoose(gooseBin, testdir, prompt, BUILTINS, {
+      GOOSE_PROVIDER: tc.provider,
+      GOOSE_MODEL: tc.model,
+    });
+
+    if (isAgenticProvider(tc.provider)) {
+      expect(
+        output.toLowerCase(),
+        `Expected model output to contain "${TEST_CONTENT}"\n\nFull output:\n${output}`
+      ).toContain(TEST_CONTENT.toLowerCase());
+    } else {
+      const shellToolPattern = /(shell \| developer)|(▸.*shell)/;
+      expect(
+        shellToolPattern.test(output),
+        `Expected model to use shell tool\n\nFull output:\n${output}`
+      ).toBe(true);
+      expect(
+        output,
+        `Expected output to contain token from part-a.txt (${tokenA})\n\nFull output:\n${output}`
+      ).toContain(tokenA);
+      expect(
+        output,
+        `Expected output to contain token from part-b.txt (${tokenB})\n\nFull output:\n${output}`
+      ).toContain(tokenB);
+    }
+  } finally {
+    fs.rmSync(testdir, { recursive: true, force: true });
+  }
+}
+
+if (available.length > 0) {
+  test.each(available)('$provider / $model', async (tc) => {
+    await runNormalTest(tc);
+  });
+}
+
+if (flaky.length > 0) {
+  test.each(flaky)('$provider / $model (flaky — allowed to fail)', async (tc) => {
+    try {
+      await runNormalTest(tc);
+    } catch (err) {
+      console.warn(`Flaky test ${tc.provider}/${tc.model} failed (allowed): ${err}`);
+    }
+  });
+}
+
+if (skipped.length > 0) {
+  test.skip.each(skipped)('$provider / $model — $skippedReason', () => {});
+}
diff --git a/ui/desktop/tests/integration/test_providers_code_exec.test.ts b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
new file mode 100644
index 000000000000..a3d17d8f0bf6
--- /dev/null
+++ b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
@@ -0,0 +1,80 @@
+/**
+ * Provider smoke tests — code execution mode (JS batching).
+ *
+ * Ported from scripts/test_providers_code_exec.sh.  Each available
+ * (non-agentic) provider/model pair gets its own test that spawns `goose run`
+ * with the memory + code_execution builtins and validates that the
+ * code_execution tool was invoked.
+ */
+
+import { test, expect, beforeAll } from 'vitest';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+import {
+  buildGoose,
+  discoverTestCases,
+  runGoose,
+  isAllowedFailure,
+  type TestCase,
+} from './test_providers_lib';
+
+const BUILTINS = 'memory,code_execution';
+
+let gooseBin: string;
+
+beforeAll(() => {
+  gooseBin = buildGoose();
+});
+
+const allCases = discoverTestCases({ skipAgentic: true });
+const available = allCases.filter((tc) => tc.available && !isAllowedFailure(tc.provider, tc.model));
+const flaky = allCases.filter((tc) => tc.available && isAllowedFailure(tc.provider, tc.model));
+const skipped = allCases.filter((tc) => !tc.available);
+
+async function runCodeExecTest(tc: TestCase): Promise<void> {
+  const testdir = fs.mkdtempSync(path.join(os.tmpdir(), 'goose-codeexec-'));
+
+  try {
+    const prompt =
+      "Store a memory with category 'test' and data 'hello world', then retrieve all memories from category 'test'.";
+
+    const output = await runGoose(gooseBin, testdir, prompt, BUILTINS, {
+      GOOSE_PROVIDER: tc.provider,
+      GOOSE_MODEL: tc.model,
+    });
+
+    // Matches: "execute | code_execution", "get_function_details | code_execution",
+    //           "tool call | execute", "tool calls | execute" (old format)
+    //           "▸ execute N tool call" (new format with tool_graph)
+    const codeExecPattern =
+      /(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)|(▸.*execute.*tool call)/;
+
+    expect(
+      codeExecPattern.test(output),
+      `Expected code_execution tool to be called\n\nFull output:\n${output}`
+    ).toBe(true);
+  } finally {
+    fs.rmSync(testdir, { recursive: true, force: true });
+  }
+}
+
+if (available.length > 0) {
+  test.each(available)('$provider / $model', async (tc) => {
+    await runCodeExecTest(tc);
+  });
+}
+
+if (flaky.length > 0) {
+  test.each(flaky)('$provider / $model (flaky — allowed to fail)', async (tc) => {
+    try {
+      await runCodeExecTest(tc);
+    } catch (err) {
+      console.warn(`Flaky test ${tc.provider}/${tc.model} failed (allowed): ${err}`);
+    }
+  });
+}
+
+if (skipped.length > 0) {
+  test.skip.each(skipped)('$provider / $model — $skippedReason', () => {});
+}
diff --git a/ui/desktop/tests/integration/test_providers_lib.ts b/ui/desktop/tests/integration/test_providers_lib.ts
new file mode 100644
index 000000000000..3fe5e46441d5
--- /dev/null
+++ b/ui/desktop/tests/integration/test_providers_lib.ts
@@ -0,0 +1,292 @@
+/**
+ * Shared library for provider smoke tests.
+ *
+ * Ported from scripts/test_providers_lib.sh — keeps the same provider config,
+ * allowed-failure list, agentic-provider list, and environment detection.
+ */
+
+import { execSync, spawn, type ChildProcess } from 'node:child_process';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+
+// ---------------------------------------------------------------------------
+// Provider configuration
+// ---------------------------------------------------------------------------
+
+const PROVIDER_CONFIG_RAW = `
+openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b
+xai -> grok-3
+openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5
+anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-5-20251101
+google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview
+tetrate -> claude-sonnet-4-20250514
+databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o
+azure_openai -> \${AZURE_OPENAI_DEPLOYMENT_NAME}
+aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0
+gcp_vertex_ai -> gemini-2.5-pro
+snowflake -> claude-sonnet-4-5
+venice -> llama-3.3-70b
+litellm -> gpt-4o-mini
+sagemaker_tgi -> sagemaker-tgi-endpoint
+github_copilot -> gpt-4.1
+chatgpt_codex -> gpt-5.1-codex
+claude-code -> default
+codex -> gpt-5.2-codex
+gemini-cli -> gemini-2.5-pro
+cursor-agent -> auto
+ollama -> qwen3
+`;
+
+const ALLOWED_FAILURES = new Set([
+  'google:gemini-2.5-flash',
+  'google:gemini-3-pro-preview',
+  'openrouter:nvidia/nemotron-3-nano-30b-a3b',
+  'openrouter:qwen/qwen3-coder:exacto',
+  'openai:gpt-3.5-turbo',
+]);
+
+const AGENTIC_PROVIDERS = new Set(['claude-code', 'codex', 'gemini-cli', 'cursor-agent']);
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+// Loads KEY=VALUE lines from ./.env (under cwd) into process.env; set variables are kept as-is.
+function loadDotenv(): void {
+  const envPath = path.resolve(process.cwd(), '.env');
+  if (!fs.existsSync(envPath)) return;
+  for (const line of fs.readFileSync(envPath, 'utf-8').split('\n')) {
+    const trimmed = line.trim();
+    if (!trimmed || trimmed.startsWith('#')) continue;
+    const eqIdx = trimmed.indexOf('=');
+    if (eqIdx <= 0) continue; // no '=' on the line, or nothing before it to use as a key
+    const key = trimmed.slice(0, eqIdx).trim();
+    // Strip one pair of matching surrounding quotes: KEY="v" and KEY='v' both yield v.
+    const rawValue = trimmed.slice(eqIdx + 1).trim();
+    const value = rawValue.replace(/^(["'])(.*)\1$/, '$2');
+    if (!(key in process.env)) process.env[key] = value;
+  }
+}
+
+function hasEnv(name: string): boolean {
+  return !!process.env[name];
+}
+
+function hasCmd(name: string): boolean {
+  try {
+    execSync(`command -v ${name}`, { stdio: 'ignore' });
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+function hasFile(p: string): boolean {
+  return fs.existsSync(p);
+}
+
+export function isAgenticProvider(provider: string): boolean {
+  return AGENTIC_PROVIDERS.has(provider);
+}
+
+function isProviderAvailable(provider: string): boolean {
+  switch (provider) {
+    case 'openrouter':
+      return hasEnv('OPENROUTER_API_KEY');
+    case 'xai':
+      return hasEnv('XAI_API_KEY');
+    case 'openai':
+      return hasEnv('OPENAI_API_KEY');
+    case 'anthropic':
+      return hasEnv('ANTHROPIC_API_KEY');
+    case 'google':
+      return hasEnv('GOOGLE_API_KEY');
+    case 'tetrate':
+      return hasEnv('TETRATE_API_KEY');
+    case 'databricks':
+      return hasEnv('DATABRICKS_HOST') && hasEnv('DATABRICKS_TOKEN');
+    case 'azure_openai':
+      return hasEnv('AZURE_OPENAI_ENDPOINT') && hasEnv('AZURE_OPENAI_DEPLOYMENT_NAME');
+    case 'aws_bedrock':
+      return hasEnv('AWS_REGION') && (hasEnv('AWS_PROFILE') || hasEnv('AWS_ACCESS_KEY_ID'));
+    case 'gcp_vertex_ai':
+      return hasEnv('GCP_PROJECT_ID');
+    case 'snowflake':
+      return hasEnv('SNOWFLAKE_HOST') && hasEnv('SNOWFLAKE_TOKEN');
+    case 'venice':
+      return hasEnv('VENICE_API_KEY');
+    case 'litellm':
+      return hasEnv('LITELLM_API_KEY');
+    case 'sagemaker_tgi':
+      return hasEnv('SAGEMAKER_ENDPOINT_NAME') && hasEnv('AWS_REGION');
+    case 'github_copilot':
+      return (
+        hasEnv('GITHUB_COPILOT_TOKEN') ||
+        hasFile(path.join(os.homedir(), '.config/goose/github_copilot_token.json'))
+      );
+    case 'chatgpt_codex':
+      return (
+        hasEnv('CHATGPT_CODEX_TOKEN') ||
+        hasFile(path.join(os.homedir(), '.config/goose/chatgpt_codex_token.json'))
+      );
+    case 'ollama':
+      return hasEnv('OLLAMA_HOST') || hasCmd('ollama');
+    case 'claude-code':
+      return hasCmd('claude');
+    case 'codex':
+      return hasCmd('codex');
+    case 'gemini-cli':
+      return hasCmd('gemini');
+    case 'cursor-agent':
+      return hasCmd('cursor-agent');
+    default:
+      return true;
+  }
+}
+
+export function isAllowedFailure(provider: string, model: string): boolean {
+  return ALLOWED_FAILURES.has(`${provider}:${model}`);
+}
+
+function shouldSkipProvider(provider: string): boolean {
+  const skip = process.env.SKIP_PROVIDERS;
+  if (!skip) return false;
+  return skip
+    .split(',')
+    .map((s) => s.trim())
+    .includes(provider);
+}
+
+// ---------------------------------------------------------------------------
+// Parse provider config
+// ---------------------------------------------------------------------------
+
+interface ProviderLine {
+  provider: string;
+  modelsStr: string;
+}
+
+function parseProviderConfig(): ProviderLine[] {
+  const lines: ProviderLine[] = [];
+  for (const raw of PROVIDER_CONFIG_RAW.split('\n')) {
+    const line = raw.trim();
+    if (!line || line.startsWith('#')) continue;
+    const arrowIdx = line.indexOf(' -> ');
+    if (arrowIdx === -1) continue;
+    const provider = line.slice(0, arrowIdx).trim();
+    let modelsStr = line.slice(arrowIdx + 4).trim();
+    modelsStr = modelsStr.replace(/\$\{(\w+)\}/g, (_, name) => process.env[name] ?? '');
+    lines.push({ provider, modelsStr });
+  }
+  return lines;
+}
+
+// ---------------------------------------------------------------------------
+// Build goose binary
+// ---------------------------------------------------------------------------
+
+export function buildGoose(): string {
+  if (!process.env.SKIP_BUILD) {
+    console.error('Building goose...');
+    execSync('cargo build --bin goose', { stdio: 'inherit' });
+    console.error('');
+  } else {
+    console.error('Skipping build (SKIP_BUILD is set)...');
+    console.error('');
+  }
+  return path.resolve(process.cwd(), '..', '..', 'target/debug/goose');
+}
+
+// ---------------------------------------------------------------------------
+// Test case discovery
+// ---------------------------------------------------------------------------
+
+export interface TestCase {
+  provider: string;
+  model: string;
+  available: boolean;
+  skippedReason?: string;
+}
+
+export function discoverTestCases(options?: { skipAgentic?: boolean }): TestCase[] {
+  loadDotenv();
+  const skipAgentic = options?.skipAgentic ?? false;
+  const providerLines = parseProviderConfig();
+
+  const testCases: TestCase[] = [];
+
+  for (const { provider, modelsStr } of providerLines) {
+    const available = isProviderAvailable(provider);
+    const models = modelsStr.split('|');
+
+    for (const model of models) {
+      if (!available) {
+        testCases.push({
+          provider,
+          model,
+          available: false,
+          skippedReason: 'prerequisites not met',
+        });
+      } else if (shouldSkipProvider(provider)) {
+        testCases.push({
+          provider,
+          model,
+          available: false,
+          skippedReason: 'SKIP_PROVIDERS',
+        });
+      } else if (skipAgentic && isAgenticProvider(provider)) {
+        testCases.push({
+          provider,
+          model,
+          available: false,
+          skippedReason: 'agentic provider skipped in this mode',
+        });
+      } else {
+        testCases.push({ provider, model, available: true });
+      }
+    }
+  }
+
+  return testCases;
+}
+
+// ---------------------------------------------------------------------------
+// Utility: run goose binary and capture output
+// ---------------------------------------------------------------------------
+
+// Runs `goose run --text <prompt> --with-builtin <builtins>` in `cwd`, with `env` layered over
+// process.env. Resolves with stdout and stderr interleaved as one string once the process
+// closes; the exit code is ignored. A spawn 'error' resolves with "spawn error: <message>".
+export function runGoose(
+  gooseBin: string,
+  cwd: string,
+  prompt: string,
+  builtins: string,
+  env: Record<string, string>
+): Promise<string> {
+  return new Promise((resolve) => {
+    const child: ChildProcess = spawn(
+      gooseBin,
+      ['run', '--text', prompt, '--with-builtin', builtins],
+      {
+        cwd,
+        env: { ...process.env, ...env },
+        stdio: ['ignore', 'pipe', 'pipe'],
+      }
+    );
+
+    let output = '';
+    for (const stream of [child.stdout, child.stderr]) {
+      // Decode at the stream level so a multi-byte character (e.g. "▸") that is split across
+      // two chunks stays intact; converting each Buffer chunk on its own would corrupt it.
+      stream?.setEncoding('utf8');
+      stream?.on('data', (chunk: string) => {
+        output += chunk;
+      });
+    }
+
+    child.on('close', () => resolve(output));
+    child.on('error', (err) => resolve(`spawn error: ${err.message}`));
+  });
+}
diff --git a/ui/desktop/vitest.config.ts b/ui/desktop/vitest.config.ts
index 7a2965c12f80..7a09ffc3c508 100644
--- a/ui/desktop/vitest.config.ts
+++ b/ui/desktop/vitest.config.ts
@@ -1,7 +1,7 @@
 /// <reference types="vitest" />
-import { defineConfig } from 'vitest/config'
-import react from '@vitejs/plugin-react'
-import { resolve } from 'node:path'
+import { defineConfig } from 'vitest/config';
+import react from '@vitejs/plugin-react';
+import { resolve } from 'node:path';
 
 const cfg = {
   plugins: [react()],
@@ -15,8 +15,11 @@ const cfg = {
     environment: 'jsdom',
     setupFiles: ['./src/test/setup.ts'],
     css: true,
-    include: ['src/**/*.{test,spec}.{js,jsx,ts,tsx}'],
+    include: [
+      'src/**/*.{test,spec}.{js,jsx,ts,tsx}',
+      'tests/integration/**/*.{test,spec}.{js,jsx,ts,tsx}',
+    ],
   },
-} satisfies Record<string, any>
+} satisfies Record<string, any>;
 
-export default defineConfig(cfg as any)
+export default defineConfig(cfg as any);

From 7727a0a40b0e479c3bf1c4c8288f04839f997576 Mon Sep 17 00:00:00 2001
From: Jack Amadeo <jackamadeo@squareup.com>
Date: Tue, 31 Mar 2026 18:30:46 -0400
Subject: [PATCH 2/8] Switch CI and release checklist to the TypeScript provider tests

---
 .github/workflows/pr-smoke-test.yml |  19 ++-
 RELEASE_CHECKLIST.md                |   2 +-
 scripts/test_providers.sh           |  71 --------
 scripts/test_providers_code_exec.sh |  45 -----
 scripts/test_providers_lib.sh       | 244 ----------------------------
 5 files changed, 14 insertions(+), 367 deletions(-)
 delete mode 100755 scripts/test_providers.sh
 delete mode 100755 scripts/test_providers_code_exec.sh
 delete mode 100755 scripts/test_providers_lib.sh

diff --git a/.github/workflows/pr-smoke-test.yml b/.github/workflows/pr-smoke-test.yml
index cdc0414ace61..e930b1788b7f 100644
--- a/.github/workflows/pr-smoke-test.yml
+++ b/.github/workflows/pr-smoke-test.yml
@@ -110,7 +110,11 @@ jobs:
       - name: Install agentic providers
         run: npm install -g @anthropic-ai/claude-code @openai/codex @google/gemini-cli @zed-industries/claude-agent-acp @zed-industries/codex-acp
 
-      - name: Run Smoke Tests with Provider Script
+      - name: Install Node.js Dependencies
+        run: source ../../bin/activate-hermit && pnpm install --frozen-lockfile
+        working-directory: ui/desktop
+
+      - name: Run Smoke Tests (Normal Mode)
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -127,12 +131,10 @@ jobs:
           SKIP_BUILD: 1
           SKIP_PROVIDERS: ${{ vars.SKIP_PROVIDERS || '' }}
         run: |
-          # Ensure the HOME directory structure exists
           mkdir -p $HOME/.local/share/goose/sessions
           mkdir -p $HOME/.config/goose
-
-          # Run the provider test script (binary already built and downloaded)
-          bash scripts/test_providers.sh
+          source ../../bin/activate-hermit && pnpm run test:integration -- tests/integration/test_providers.test.ts
+        working-directory: ui/desktop
 
       - name: Set up Python
         uses: actions/setup-python@v5
@@ -188,6 +190,10 @@ jobs:
       - name: Make Binary Executable
         run: chmod +x target/debug/goose
 
+      - name: Install Node.js Dependencies
+        run: source ../../bin/activate-hermit && pnpm install --frozen-lockfile
+        working-directory: ui/desktop
+
       - name: Run Provider Tests (Code Execution Mode)
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -205,7 +211,8 @@ jobs:
         run: |
           mkdir -p $HOME/.local/share/goose/sessions
           mkdir -p $HOME/.config/goose
-          bash scripts/test_providers_code_exec.sh
+          source ../../bin/activate-hermit && pnpm run test:integration -- tests/integration/test_providers_code_exec.test.ts
+        working-directory: ui/desktop
 
   compaction-tests:
     name: Compaction Tests
diff --git a/RELEASE_CHECKLIST.md b/RELEASE_CHECKLIST.md
index 9a10dc6b5773..fdc0268790ce 100644
--- a/RELEASE_CHECKLIST.md
+++ b/RELEASE_CHECKLIST.md
@@ -17,7 +17,7 @@ Make a copy of this document for each version and check off as steps are verifie
 
 ### Provider Testing
 
-- [ ] Run `./scripts/test_providers.sh` locally from the release branch and verify all providers/models work
+- [ ] Run `cd ui/desktop && pnpm run test:integration -- tests/integration/test_providers.test.ts` locally from the release branch and verify all providers/models work
 - [ ] Launch goose, click reset providers, choose databricks and a model
 
 ### Starting Conversations
diff --git a/scripts/test_providers.sh b/scripts/test_providers.sh
deleted file mode 100755
index b6c28b8d445e..000000000000
--- a/scripts/test_providers.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/bin/bash
-
-LIB_DIR="$(cd "$(dirname "$0")" && pwd)"
-source "$LIB_DIR/test_providers_lib.sh"
-
-echo "Mode: normal (direct tool calls)"
-echo ""
-
-GOOSE_BIN=$(build_goose)
-BUILTINS="developer"
-
-mkdir -p target
-TEST_CONTENT="test-content-abc123"
-TEST_FILE="./target/test-content.txt"
-echo "$TEST_CONTENT" > "$TEST_FILE"
-
-run_test() {
-  local provider="$1" model="$2" result_file="$3" output_file="$4"
-  local testdir=$(mktemp -d)
-
-  local prompt
-  if is_agentic_provider "$provider"; then
-    cp "$TEST_FILE" "$testdir/test-content.txt"
-    prompt="read ./test-content.txt and output its contents exactly"
-  else
-    # Write two files with unique random tokens. Validation checks that the shell
-    # tool was used and that both tokens appear in the output, proving the model
-    # actually read the files (random tokens can't be guessed or hallucinated).
-    local token_a="smoke-alpha-$RANDOM"
-    local token_b="smoke-bravo-$RANDOM"
-    echo "$token_a" > "$testdir/part-a.txt"
-    echo "$token_b" > "$testdir/part-b.txt"
-    # Store tokens so validation can check them
-    echo "$token_a" > "$testdir/.token_a"
-    echo "$token_b" > "$testdir/.token_b"
-    prompt="Use the shell tool to cat ./part-a.txt and ./part-b.txt, then reply with ONLY the contents of both files, one per line, nothing else."
-  fi
-
-  (
-    export GOOSE_PROVIDER="$provider"
-    export GOOSE_MODEL="$model"
-    cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1
-  ) > "$output_file" 2>&1
-
-  if is_agentic_provider "$provider"; then
-    if grep -qi "$TEST_CONTENT" "$output_file"; then
-      echo "success|test content found by model" > "$result_file"
-    else
-      echo "failure|test content not found by model" > "$result_file"
-    fi
-  else
-    local token_a token_b
-    token_a=$(cat "$testdir/.token_a")
-    token_b=$(cat "$testdir/.token_b")
-    if ! grep -qE "(shell \| developer)|(▸.*shell)" "$output_file"; then
-      echo "failure|model did not use shell tool" > "$result_file"
-    elif ! grep -q "$token_a" "$output_file"; then
-      echo "failure|model did not return contents of part-a.txt ($token_a)" > "$result_file"
-    elif ! grep -q "$token_b" "$output_file"; then
-      echo "failure|model did not return contents of part-b.txt ($token_b)" > "$result_file"
-    else
-      echo "success|model read and returned both file contents" > "$result_file"
-    fi
-  fi
-
-  rm -rf "$testdir"
-}
-
-build_test_cases
-run_test_cases run_test
-report_results
diff --git a/scripts/test_providers_code_exec.sh b/scripts/test_providers_code_exec.sh
deleted file mode 100755
index c9d720d202a0..000000000000
--- a/scripts/test_providers_code_exec.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-# Provider smoke tests - code execution mode (JS batching)
-
-LIB_DIR="$(cd "$(dirname "$0")" && pwd)"
-source "$LIB_DIR/test_providers_lib.sh"
-
-echo "Mode: code_execution (JS batching)"
-echo ""
-
-# --- Setup ---
-
-GOOSE_BIN=$(build_goose)
-BUILTINS="memory,code_execution"
-
-# --- Test case ---
-
-run_test() {
-  local provider="$1" model="$2" result_file="$3" output_file="$4"
-  local testdir=$(mktemp -d)
-
-  local prompt="Store a memory with category 'test' and data 'hello world', then retrieve all memories from category 'test'."
-
-  # Run goose
-  (
-    export GOOSE_PROVIDER="$provider"
-    export GOOSE_MODEL="$model"
-    cd "$testdir" && "$GOOSE_BIN" run --text "$prompt" --with-builtin "$BUILTINS" 2>&1
-  ) > "$output_file" 2>&1
-
-  # Matches: "execute_typescript | code_execution", "get_function_details | code_execution",
-  #           "tool call | execute", "tool calls | execute" (old format)
-  #           "▸ execute N tool call" (new format with tool_graph)
-  #           "▸ execute_typescript" (plain tool name in output)
-  if grep -qE "(execute_typescript \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)|(▸.*execute.*tool call)|(▸ execute_typescript)" "$output_file"; then
-    echo "success|code_execution tool called" > "$result_file"
-  else
-    echo "failure|no code_execution tool calls found" > "$result_file"
-  fi
-
-  rm -rf "$testdir"
-}
-
-build_test_cases --skip-agentic
-run_test_cases run_test
-report_results
diff --git a/scripts/test_providers_lib.sh b/scripts/test_providers_lib.sh
deleted file mode 100755
index 0ef52f12d11c..000000000000
--- a/scripts/test_providers_lib.sh
+++ /dev/null
@@ -1,244 +0,0 @@
-#!/bin/bash
-
-PROVIDER_CONFIG="
-openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b
-xai -> grok-3
-openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5
-anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-5-20251101
-google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview
-tetrate -> claude-sonnet-4-20250514
-databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o
-azure_openai -> ${AZURE_OPENAI_DEPLOYMENT_NAME}
-aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0
-gcp_vertex_ai -> gemini-2.5-pro
-snowflake -> claude-sonnet-4-5
-venice -> llama-3.3-70b
-litellm -> gpt-4o-mini
-sagemaker_tgi -> sagemaker-tgi-endpoint
-github_copilot -> gpt-4.1
-chatgpt_codex -> gpt-5.1-codex
-claude-code -> default
-codex -> gpt-5.2-codex
-gemini-cli -> gemini-2.5-pro
-cursor-agent -> auto
-ollama -> qwen3
-"
-
-# Flaky models allowed to fail without blocking PRs.
-ALLOWED_FAILURES=(
-  "google:gemini-2.5-flash"
-  "google:gemini-3-pro-preview"
-  "openrouter:nvidia/nemotron-3-nano-30b-a3b"
-  "openrouter:qwen/qwen3-coder:exacto"
-  "openai:gpt-3.5-turbo"
-)
-
-AGENTIC_PROVIDERS=("claude-code" "codex" "gemini-cli" "cursor-agent")
-
-if [ -f .env ]; then
-  export $(grep -v '^#' .env | xargs)
-fi
-
-build_goose() {
-  if [ -z "$SKIP_BUILD" ]; then
-    echo "Building goose..." >&2
-    cargo build --bin goose >&2
-    echo "" >&2
-  else
-    echo "Skipping build (SKIP_BUILD is set)..." >&2
-    echo "" >&2
-  fi
-
-  echo "$(pwd)/target/debug/goose"
-}
-
-has_env() { [ -n "${!1}" ]; }
-has_cmd() { command -v "$1" &>/dev/null; }
-has_file() { [ -f "$1" ]; }
-
-is_provider_available() {
-  case "$1" in
-    openrouter)      has_env OPENROUTER_API_KEY ;;
-    xai)             has_env XAI_API_KEY ;;
-    openai)          has_env OPENAI_API_KEY ;;
-    anthropic)       has_env ANTHROPIC_API_KEY ;;
-    google)          has_env GOOGLE_API_KEY ;;
-    tetrate)         has_env TETRATE_API_KEY ;;
-    databricks)      has_env DATABRICKS_HOST && has_env DATABRICKS_TOKEN ;;
-    azure_openai)    has_env AZURE_OPENAI_ENDPOINT && has_env AZURE_OPENAI_DEPLOYMENT_NAME ;;
-    aws_bedrock)     has_env AWS_REGION && { has_env AWS_PROFILE || has_env AWS_ACCESS_KEY_ID; } ;;
-    gcp_vertex_ai)   has_env GCP_PROJECT_ID ;;
-    snowflake)       has_env SNOWFLAKE_HOST && has_env SNOWFLAKE_TOKEN ;;
-    venice)          has_env VENICE_API_KEY ;;
-    litellm)         has_env LITELLM_API_KEY ;;
-    sagemaker_tgi)   has_env SAGEMAKER_ENDPOINT_NAME && has_env AWS_REGION ;;
-    github_copilot)  has_env GITHUB_COPILOT_TOKEN || has_file "$HOME/.config/goose/github_copilot_token.json" ;;
-    chatgpt_codex)   has_env CHATGPT_CODEX_TOKEN || has_file "$HOME/.config/goose/chatgpt_codex_token.json" ;;
-    ollama)          has_env OLLAMA_HOST || has_cmd ollama ;;
-    claude-code)     has_cmd claude ;;
-    codex)           has_cmd codex ;;
-    gemini-cli)      has_cmd gemini ;;
-    cursor-agent)    has_cmd cursor-agent ;;
-    *)               return 0 ;;
-  esac
-}
-
-is_allowed_failure() {
-  local key="${1}:${2}"
-  for allowed in "${ALLOWED_FAILURES[@]}"; do
-    [ "$allowed" = "$key" ] && return 0
-  done
-  return 1
-}
-
-should_skip_provider() {
-  [ -z "$SKIP_PROVIDERS" ] && return 1
-  IFS=',' read -ra SKIP_LIST <<< "$SKIP_PROVIDERS"
-  for skip in "${SKIP_LIST[@]}"; do
-    skip=$(echo "$skip" | xargs)
-    [ "$skip" = "$1" ] && return 0
-  done
-  return 1
-}
-
-is_agentic_provider() {
-  for agentic in "${AGENTIC_PROVIDERS[@]}"; do
-    [ "$agentic" = "$1" ] && return 0
-  done
-  return 1
-}
-
-# build_test_cases [--skip-agentic]
-build_test_cases() {
-  local skip_agentic=false
-  [ "$1" = "--skip-agentic" ] && skip_agentic=true
-
-  local providers=()
-  while IFS= read -r line; do
-    [[ "$line" =~ ^#.*$ || -z "$line" ]] && continue
-    local provider="${line%% -> *}"
-    if is_provider_available "$provider"; then
-      providers+=("$line")
-      echo "✓ Including $provider"
-    else
-      echo "⚠️  Skipping $provider (prerequisites not met)"
-    fi
-  done <<< "$PROVIDER_CONFIG"
-  echo ""
-
-  TEST_CASES=()
-  local job_index=0
-  for provider_config in "${providers[@]}"; do
-    local provider="${provider_config%% -> *}"
-    local models_str="${provider_config#* -> }"
-
-    if should_skip_provider "$provider"; then
-      echo "⊘ Skipping provider: ${provider} (SKIP_PROVIDERS)"
-      continue
-    fi
-
-    if [ "$skip_agentic" = true ] && is_agentic_provider "$provider"; then
-      echo "⊘ Skipping agentic provider: ${provider}"
-      continue
-    fi
-
-    IFS='|' read -ra models <<< "$models_str"
-    for model in "${models[@]}"; do
-      TEST_CASES+=("$provider|$model|$job_index")
-      ((job_index++))
-    done
-  done
-}
-
-# run_test_cases <test_fn>
-run_test_cases() {
-  local test_fn="$1"
-
-  RESULTS_DIR=$(mktemp -d)
-  trap 'if [ -n "${RESULTS_DIR:-}" ]; then rm -rf -- "$RESULTS_DIR"; fi; if [ -n "${CLEANUP_DIR:-}" ]; then rm -rf -- "$CLEANUP_DIR"; fi' EXIT
-  MAX_PARALLEL=${MAX_PARALLEL:-$(nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 8)}
-  echo "Running ${#TEST_CASES[@]} tests (max $MAX_PARALLEL parallel)"
-  echo ""
-
-  local running=0
-  for ((i=0; i<${#TEST_CASES[@]}; i++)); do
-    IFS='|' read -r provider model idx <<< "${TEST_CASES[$i]}"
-
-    if [ $i -eq 0 ]; then
-      # First test runs sequentially to catch early failures
-      "$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx"
-    else
-      "$test_fn" "$provider" "$model" "$RESULTS_DIR/result_$idx" "$RESULTS_DIR/output_$idx" &
-      ((running++))
-      if [ $running -ge $MAX_PARALLEL ]; then
-        wait -n 2>/dev/null || wait
-        ((running--))
-      fi
-    fi
-  done
-  wait
-}
-
-report_results() {
-  echo ""
-  echo "=== Test Results ==="
-  echo ""
-
-  RESULTS=()
-  HARD_FAILURES=()
-
-  for job in "${TEST_CASES[@]}"; do
-    IFS='|' read -r provider model idx <<< "$job"
-
-    echo "Provider: $provider"
-    echo "Model: $model"
-    echo ""
-    cat "$RESULTS_DIR/output_$idx"
-    echo ""
-
-    local result_line=""
-    [ -f "$RESULTS_DIR/result_$idx" ] && result_line=$(cat "$RESULTS_DIR/result_$idx")
-    local status="${result_line%%|*}"
-    local msg="${result_line#*|}"
-
-    if [ "$status" = "success" ]; then
-      echo "✓ SUCCESS: $msg"
-      RESULTS+=("✓ ${provider}: ${model}")
-    else
-      if is_allowed_failure "$provider" "$model"; then
-        echo "⚠ FLAKY: $msg"
-        RESULTS+=("⚠ ${provider}: ${model} (flaky)")
-      else
-        echo "✗ FAILED: $msg"
-        RESULTS+=("✗ ${provider}: ${model}")
-        HARD_FAILURES+=("${provider}: ${model}")
-      fi
-    fi
-    echo "---"
-  done
-
-  echo ""
-  echo "=== Test Summary ==="
-  for result in "${RESULTS[@]}"; do
-    echo "$result"
-  done
-
-  if [ ${#HARD_FAILURES[@]} -gt 0 ]; then
-    echo ""
-    echo "Hard failures (${#HARD_FAILURES[@]}):"
-    for failure in "${HARD_FAILURES[@]}"; do
-      echo "  - $failure"
-    done
-    echo ""
-    echo "Some tests failed!"
-    exit 1
-  else
-    if echo "${RESULTS[@]}" | grep -q "⚠"; then
-      echo ""
-      echo "All required tests passed! (some flaky tests failed but are allowed)"
-    else
-      echo ""
-      echo "All tests passed!"
-    fi
-  fi
-}

From 3d5bea8b429dd6274555a4ca6883bc4c644bb0d8 Mon Sep 17 00:00:00 2001
From: Jack Amadeo <jackamadeo@squareup.com>
Date: Wed, 1 Apr 2026 08:23:07 -0400
Subject: [PATCH 3/8] Match execute_typescript in the code-exec smoke test regex

---
 .../tests/integration/test_providers_code_exec.test.ts       | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ui/desktop/tests/integration/test_providers_code_exec.test.ts b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
index a3d17d8f0bf6..53467c33741c 100644
--- a/ui/desktop/tests/integration/test_providers_code_exec.test.ts
+++ b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
@@ -44,11 +44,12 @@ async function runCodeExecTest(tc: TestCase): Promise<void> {
       GOOSE_MODEL: tc.model,
     });
 
-    // Matches: "execute | code_execution", "get_function_details | code_execution",
+    // Matches: "execute_typescript | code_execution", "get_function_details | code_execution",
     //           "tool call | execute", "tool calls | execute" (old format)
     //           "▸ execute N tool call" (new format with tool_graph)
+    //           "▸ execute_typescript" (plain tool name in output)
     const codeExecPattern =
-      /(execute \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)|(▸.*execute.*tool call)/;
+      /(execute_typescript \| code_execution)|(get_function_details \| code_execution)|(tool calls? \| execute)|(▸.*execute.*tool call)|(▸ execute_typescript)/;
 
     expect(
       codeExecPattern.test(output),

From 4d1f328a8800d3b0b1bcaf5c6916535dfdd5cc22 Mon Sep 17 00:00:00 2001
From: Jack Amadeo <jackamadeo@squareup.com>
Date: Wed, 1 Apr 2026 09:29:45 -0400
Subject: [PATCH 4/8] fix targets

---
 .github/workflows/pr-smoke-test.yml | 3 ++-
 ui/desktop/vitest.config.ts         | 5 +----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/pr-smoke-test.yml b/.github/workflows/pr-smoke-test.yml
index e930b1788b7f..e30ec8f1042f 100644
--- a/.github/workflows/pr-smoke-test.yml
+++ b/.github/workflows/pr-smoke-test.yml
@@ -284,7 +284,8 @@ jobs:
           GOOSE_PROVIDER: anthropic
           GOOSE_MODEL: claude-sonnet-4-5-20250929
           SHELL: /bin/bash
+          SKIP_BUILD: 1
         run: |
             echo 'export PATH=/some/fake/path:$PATH' >> $HOME/.bash_profile
-            source ../../bin/activate-hermit && pnpm run test:integration:debug
+            source ../../bin/activate-hermit && pnpm run test:integration -- tests/integration/goosed.test.ts
         working-directory: ui/desktop
diff --git a/ui/desktop/vitest.config.ts b/ui/desktop/vitest.config.ts
index 7a09ffc3c508..f745b9244dcc 100644
--- a/ui/desktop/vitest.config.ts
+++ b/ui/desktop/vitest.config.ts
@@ -15,10 +15,7 @@ const cfg = {
     environment: 'jsdom',
     setupFiles: ['./src/test/setup.ts'],
     css: true,
-    include: [
-      'src/**/*.{test,spec}.{js,jsx,ts,tsx}',
-      'tests/integration/**/*.{test,spec}.{js,jsx,ts,tsx}',
-    ],
+    include: ['src/**/*.{test,spec}.{js,jsx,ts,tsx}'],
   },
 } satisfies Record<string, any>;
 

From 3db9eb5fb5a8585af4b466524c1f4937154dc1c5 Mon Sep 17 00:00:00 2001
From: Jack Amadeo <jackamadeo@squareup.com>
Date: Wed, 1 Apr 2026 10:02:31 -0400
Subject: [PATCH 5/8] provider types

---
 .../tests/integration/test_providers.test.ts  |  17 +-
 .../test_providers_code_exec.test.ts          |  12 +-
 .../tests/integration/test_providers_lib.ts   | 329 ++++++++++--------
 3 files changed, 194 insertions(+), 164 deletions(-)

diff --git a/ui/desktop/tests/integration/test_providers.test.ts b/ui/desktop/tests/integration/test_providers.test.ts
index c7fbb77b343f..fccff70ad81b 100644
--- a/ui/desktop/tests/integration/test_providers.test.ts
+++ b/ui/desktop/tests/integration/test_providers.test.ts
@@ -10,14 +10,7 @@ import { test, expect, beforeAll } from 'vitest';
 import fs from 'node:fs';
 import os from 'node:os';
 import path from 'node:path';
-import {
-  buildGoose,
-  discoverTestCases,
-  runGoose,
-  isAgenticProvider,
-  isAllowedFailure,
-  type TestCase,
-} from './test_providers_lib';
+import { buildGoose, discoverTestCases, runGoose, type TestCase } from './test_providers_lib';
 
 const BUILTINS = 'developer';
 const TEST_CONTENT = 'test-content-abc123';
@@ -35,8 +28,8 @@ beforeAll(() => {
 });
 
 const allCases = discoverTestCases();
-const available = allCases.filter((tc) => tc.available && !isAllowedFailure(tc.provider, tc.model));
-const flaky = allCases.filter((tc) => tc.available && isAllowedFailure(tc.provider, tc.model));
+const available = allCases.filter((tc) => tc.available && !tc.flaky);
+const flaky = allCases.filter((tc) => tc.available && tc.flaky);
 const skipped = allCases.filter((tc) => !tc.available);
 
 async function runNormalTest(tc: TestCase): Promise<void> {
@@ -47,7 +40,7 @@ async function runNormalTest(tc: TestCase): Promise<void> {
     let tokenA: string | undefined;
     let tokenB: string | undefined;
 
-    if (isAgenticProvider(tc.provider)) {
+    if (tc.agentic) {
       fs.copyFileSync(testFile, path.join(testdir, 'test-content.txt'));
       prompt = 'read ./test-content.txt and output its contents exactly';
     } else {
@@ -64,7 +57,7 @@ async function runNormalTest(tc: TestCase): Promise<void> {
       GOOSE_MODEL: tc.model,
     });
 
-    if (isAgenticProvider(tc.provider)) {
+    if (tc.agentic) {
       expect(
         output.toLowerCase(),
         `Expected model output to contain "${TEST_CONTENT}"\n\nFull output:\n${output}`
diff --git a/ui/desktop/tests/integration/test_providers_code_exec.test.ts b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
index 53467c33741c..d1234b39b793 100644
--- a/ui/desktop/tests/integration/test_providers_code_exec.test.ts
+++ b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
@@ -11,13 +11,7 @@ import { test, expect, beforeAll } from 'vitest';
 import fs from 'node:fs';
 import os from 'node:os';
 import path from 'node:path';
-import {
-  buildGoose,
-  discoverTestCases,
-  runGoose,
-  isAllowedFailure,
-  type TestCase,
-} from './test_providers_lib';
+import { buildGoose, discoverTestCases, runGoose, type TestCase } from './test_providers_lib';
 
 const BUILTINS = 'memory,code_execution';
 
@@ -28,8 +22,8 @@ beforeAll(() => {
 });
 
 const allCases = discoverTestCases({ skipAgentic: true });
-const available = allCases.filter((tc) => tc.available && !isAllowedFailure(tc.provider, tc.model));
-const flaky = allCases.filter((tc) => tc.available && isAllowedFailure(tc.provider, tc.model));
+const available = allCases.filter((tc) => tc.available && !tc.flaky);
+const flaky = allCases.filter((tc) => tc.available && tc.flaky);
 const skipped = allCases.filter((tc) => !tc.available);
 
 async function runCodeExecTest(tc: TestCase): Promise<void> {
diff --git a/ui/desktop/tests/integration/test_providers_lib.ts b/ui/desktop/tests/integration/test_providers_lib.ts
index 3fe5e46441d5..b4b790d109eb 100644
--- a/ui/desktop/tests/integration/test_providers_lib.ts
+++ b/ui/desktop/tests/integration/test_providers_lib.ts
@@ -14,59 +14,21 @@ import path from 'node:path';
 // Provider configuration
 // ---------------------------------------------------------------------------
 
-const PROVIDER_CONFIG_RAW = `
-openrouter -> google/gemini-2.5-pro|anthropic/claude-sonnet-4.5|qwen/qwen3-coder:exacto|z-ai/glm-4.6:exacto|nvidia/nemotron-3-nano-30b-a3b
-xai -> grok-3
-openai -> gpt-4o|gpt-4o-mini|gpt-3.5-turbo|gpt-5
-anthropic -> claude-sonnet-4-5-20250929|claude-opus-4-5-20251101
-google -> gemini-2.5-pro|gemini-2.5-flash|gemini-3-pro-preview|gemini-3-flash-preview
-tetrate -> claude-sonnet-4-20250514
-databricks -> databricks-claude-sonnet-4|gemini-2-5-flash|gpt-4o
-azure_openai -> \${AZURE_OPENAI_DEPLOYMENT_NAME}
-aws_bedrock -> us.anthropic.claude-sonnet-4-5-20250929-v1:0
-gcp_vertex_ai -> gemini-2.5-pro
-snowflake -> claude-sonnet-4-5
-venice -> llama-3.3-70b
-litellm -> gpt-4o-mini
-sagemaker_tgi -> sagemaker-tgi-endpoint
-github_copilot -> gpt-4.1
-chatgpt_codex -> gpt-5.1-codex
-claude-code -> default
-codex -> gpt-5.2-codex
-gemini-cli -> gemini-2.5-pro
-cursor-agent -> auto
-ollama -> qwen3
-`;
+type ModelEntry = string | { name: string; flaky: true };
 
-const ALLOWED_FAILURES = new Set([
-  'google:gemini-2.5-flash',
-  'google:gemini-3-pro-preview',
-  'openrouter:nvidia/nemotron-3-nano-30b-a3b',
-  'openrouter:qwen/qwen3-coder:exacto',
-  'openai:gpt-3.5-turbo',
-]);
-
-const AGENTIC_PROVIDERS = new Set(['claude-code', 'codex', 'gemini-cli', 'cursor-agent']);
+interface ProviderConfig {
+  provider: string;
+  models: ModelEntry[];
+  agentic?: boolean;
+  available: () => boolean;
+}
 
-// ---------------------------------------------------------------------------
-// Helpers
-// ---------------------------------------------------------------------------
+function modelName(entry: ModelEntry): string {
+  return typeof entry === 'string' ? entry : entry.name;
+}
 
-function loadDotenv(): void {
-  const envPath = path.resolve(process.cwd(), '.env');
-  if (!fs.existsSync(envPath)) return;
-  const lines = fs.readFileSync(envPath, 'utf-8').split('\n');
-  for (const line of lines) {
-    const trimmed = line.trim();
-    if (!trimmed || trimmed.startsWith('#')) continue;
-    const eqIdx = trimmed.indexOf('=');
-    if (eqIdx === -1) continue;
-    const key = trimmed.slice(0, eqIdx);
-    const value = trimmed.slice(eqIdx + 1);
-    if (!(key in process.env)) {
-      process.env[key] = value;
-    }
-  }
+function modelFlaky(entry: ModelEntry): boolean {
+  return typeof entry !== 'string' && entry.flaky;
 }
 
 function hasEnv(name: string): boolean {
@@ -86,67 +48,155 @@ function hasFile(p: string): boolean {
   return fs.existsSync(p);
 }
 
-export function isAgenticProvider(provider: string): boolean {
-  return AGENTIC_PROVIDERS.has(provider);
-}
-
-function isProviderAvailable(provider: string): boolean {
-  switch (provider) {
-    case 'openrouter':
-      return hasEnv('OPENROUTER_API_KEY');
-    case 'xai':
-      return hasEnv('XAI_API_KEY');
-    case 'openai':
-      return hasEnv('OPENAI_API_KEY');
-    case 'anthropic':
-      return hasEnv('ANTHROPIC_API_KEY');
-    case 'google':
-      return hasEnv('GOOGLE_API_KEY');
-    case 'tetrate':
-      return hasEnv('TETRATE_API_KEY');
-    case 'databricks':
-      return hasEnv('DATABRICKS_HOST') && hasEnv('DATABRICKS_TOKEN');
-    case 'azure_openai':
-      return hasEnv('AZURE_OPENAI_ENDPOINT') && hasEnv('AZURE_OPENAI_DEPLOYMENT_NAME');
-    case 'aws_bedrock':
-      return hasEnv('AWS_REGION') && (hasEnv('AWS_PROFILE') || hasEnv('AWS_ACCESS_KEY_ID'));
-    case 'gcp_vertex_ai':
-      return hasEnv('GCP_PROJECT_ID');
-    case 'snowflake':
-      return hasEnv('SNOWFLAKE_HOST') && hasEnv('SNOWFLAKE_TOKEN');
-    case 'venice':
-      return hasEnv('VENICE_API_KEY');
-    case 'litellm':
-      return hasEnv('LITELLM_API_KEY');
-    case 'sagemaker_tgi':
-      return hasEnv('SAGEMAKER_ENDPOINT_NAME') && hasEnv('AWS_REGION');
-    case 'github_copilot':
-      return (
+function getProviders(): ProviderConfig[] {
+  return [
+    {
+      provider: 'openrouter',
+      models: [
+        'google/gemini-2.5-pro',
+        'anthropic/claude-sonnet-4.5',
+        { name: 'qwen/qwen3-coder:exacto', flaky: true },
+        'z-ai/glm-4.6:exacto',
+        { name: 'nvidia/nemotron-3-nano-30b-a3b', flaky: true },
+      ],
+      available: () => hasEnv('OPENROUTER_API_KEY'),
+    },
+    {
+      provider: 'xai',
+      models: ['grok-3'],
+      available: () => hasEnv('XAI_API_KEY'),
+    },
+    {
+      provider: 'openai',
+      models: ['gpt-4o', 'gpt-4o-mini', { name: 'gpt-3.5-turbo', flaky: true }, 'gpt-5'],
+      available: () => hasEnv('OPENAI_API_KEY'),
+    },
+    {
+      provider: 'anthropic',
+      models: ['claude-sonnet-4-5-20250929', 'claude-opus-4-5-20251101'],
+      available: () => hasEnv('ANTHROPIC_API_KEY'),
+    },
+    {
+      provider: 'google',
+      models: [
+        'gemini-2.5-pro',
+        { name: 'gemini-2.5-flash', flaky: true },
+        { name: 'gemini-3-pro-preview', flaky: true },
+        'gemini-3-flash-preview',
+      ],
+      available: () => hasEnv('GOOGLE_API_KEY'),
+    },
+    {
+      provider: 'tetrate',
+      models: ['claude-sonnet-4-20250514'],
+      available: () => hasEnv('TETRATE_API_KEY'),
+    },
+    {
+      provider: 'databricks',
+      models: ['databricks-claude-sonnet-4', 'gemini-2-5-flash', 'gpt-4o'],
+      available: () => hasEnv('DATABRICKS_HOST') && hasEnv('DATABRICKS_TOKEN'),
+    },
+    {
+      provider: 'azure_openai',
+      models: [process.env.AZURE_OPENAI_DEPLOYMENT_NAME ?? ''],
+      available: () => hasEnv('AZURE_OPENAI_ENDPOINT') && hasEnv('AZURE_OPENAI_DEPLOYMENT_NAME'),
+    },
+    {
+      provider: 'aws_bedrock',
+      models: ['us.anthropic.claude-sonnet-4-5-20250929-v1:0'],
+      available: () =>
+        hasEnv('AWS_REGION') && (hasEnv('AWS_PROFILE') || hasEnv('AWS_ACCESS_KEY_ID')),
+    },
+    {
+      provider: 'gcp_vertex_ai',
+      models: ['gemini-2.5-pro'],
+      available: () => hasEnv('GCP_PROJECT_ID'),
+    },
+    {
+      provider: 'snowflake',
+      models: ['claude-sonnet-4-5'],
+      available: () => hasEnv('SNOWFLAKE_HOST') && hasEnv('SNOWFLAKE_TOKEN'),
+    },
+    {
+      provider: 'venice',
+      models: ['llama-3.3-70b'],
+      available: () => hasEnv('VENICE_API_KEY'),
+    },
+    {
+      provider: 'litellm',
+      models: ['gpt-4o-mini'],
+      available: () => hasEnv('LITELLM_API_KEY'),
+    },
+    {
+      provider: 'sagemaker_tgi',
+      models: ['sagemaker-tgi-endpoint'],
+      available: () => hasEnv('SAGEMAKER_ENDPOINT_NAME') && hasEnv('AWS_REGION'),
+    },
+    {
+      provider: 'github_copilot',
+      models: ['gpt-4.1'],
+      available: () =>
         hasEnv('GITHUB_COPILOT_TOKEN') ||
-        hasFile(path.join(os.homedir(), '.config/goose/github_copilot_token.json'))
-      );
-    case 'chatgpt_codex':
-      return (
+        hasFile(path.join(os.homedir(), '.config/goose/github_copilot_token.json')),
+    },
+    {
+      provider: 'chatgpt_codex',
+      models: ['gpt-5.1-codex'],
+      available: () =>
         hasEnv('CHATGPT_CODEX_TOKEN') ||
-        hasFile(path.join(os.homedir(), '.config/goose/chatgpt_codex_token.json'))
-      );
-    case 'ollama':
-      return hasEnv('OLLAMA_HOST') || hasCmd('ollama');
-    case 'claude-code':
-      return hasCmd('claude');
-    case 'codex':
-      return hasCmd('codex');
-    case 'gemini-cli':
-      return hasCmd('gemini');
-    case 'cursor-agent':
-      return hasCmd('cursor-agent');
-    default:
-      return true;
-  }
+        hasFile(path.join(os.homedir(), '.config/goose/chatgpt_codex_token.json')),
+    },
+    {
+      provider: 'claude-code',
+      models: ['default'],
+      agentic: true,
+      available: () => hasCmd('claude'),
+    },
+    {
+      provider: 'codex',
+      models: ['gpt-5.2-codex'],
+      agentic: true,
+      available: () => hasCmd('codex'),
+    },
+    {
+      provider: 'gemini-cli',
+      models: ['gemini-2.5-pro'],
+      agentic: true,
+      available: () => hasCmd('gemini'),
+    },
+    {
+      provider: 'cursor-agent',
+      models: ['auto'],
+      agentic: true,
+      available: () => hasCmd('cursor-agent'),
+    },
+    {
+      provider: 'ollama',
+      models: ['qwen3'],
+      available: () => hasEnv('OLLAMA_HOST') || hasCmd('ollama'),
+    },
+  ];
 }
 
-export function isAllowedFailure(provider: string, model: string): boolean {
-  return ALLOWED_FAILURES.has(`${provider}:${model}`);
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function loadDotenv(): void {
+  const envPath = path.resolve(process.cwd(), '.env');
+  if (!fs.existsSync(envPath)) return;
+  const lines = fs.readFileSync(envPath, 'utf-8').split('\n');
+  for (const line of lines) {
+    const trimmed = line.trim();
+    if (!trimmed || trimmed.startsWith('#')) continue;
+    const eqIdx = trimmed.indexOf('=');
+    if (eqIdx === -1) continue;
+    const key = trimmed.slice(0, eqIdx);
+    const value = trimmed.slice(eqIdx + 1);
+    if (!(key in process.env)) {
+      process.env[key] = value;
+    }
+  }
 }
 
 function shouldSkipProvider(provider: string): boolean {
@@ -158,30 +208,6 @@ function shouldSkipProvider(provider: string): boolean {
     .includes(provider);
 }
 
-// ---------------------------------------------------------------------------
-// Parse provider config
-// ---------------------------------------------------------------------------
-
-interface ProviderLine {
-  provider: string;
-  modelsStr: string;
-}
-
-function parseProviderConfig(): ProviderLine[] {
-  const lines: ProviderLine[] = [];
-  for (const raw of PROVIDER_CONFIG_RAW.split('\n')) {
-    const line = raw.trim();
-    if (!line || line.startsWith('#')) continue;
-    const arrowIdx = line.indexOf(' -> ');
-    if (arrowIdx === -1) continue;
-    const provider = line.slice(0, arrowIdx).trim();
-    let modelsStr = line.slice(arrowIdx + 4).trim();
-    modelsStr = modelsStr.replace(/\$\{(\w+)\}/g, (_, name) => process.env[name] ?? '');
-    lines.push({ provider, modelsStr });
-  }
-  return lines;
-}
-
 // ---------------------------------------------------------------------------
 // Build goose binary
 // ---------------------------------------------------------------------------
@@ -206,44 +232,61 @@ export interface TestCase {
   provider: string;
   model: string;
   available: boolean;
+  flaky: boolean;
+  agentic: boolean;
   skippedReason?: string;
 }
 
 export function discoverTestCases(options?: { skipAgentic?: boolean }): TestCase[] {
   loadDotenv();
   const skipAgentic = options?.skipAgentic ?? false;
-  const providerLines = parseProviderConfig();
+  const providers = getProviders();
 
   const testCases: TestCase[] = [];
 
-  for (const { provider, modelsStr } of providerLines) {
-    const available = isProviderAvailable(provider);
-    const models = modelsStr.split('|');
+  for (const pc of providers) {
+    const providerAvailable = pc.available();
+    const agentic = pc.agentic ?? false;
+
+    for (const entry of pc.models) {
+      const model = modelName(entry);
+      const flaky = modelFlaky(entry);
 
-    for (const model of models) {
-      if (!available) {
+      if (!providerAvailable) {
         testCases.push({
-          provider,
+          provider: pc.provider,
           model,
           available: false,
+          flaky,
+          agentic,
           skippedReason: 'prerequisites not met',
         });
-      } else if (shouldSkipProvider(provider)) {
+      } else if (shouldSkipProvider(pc.provider)) {
         testCases.push({
-          provider,
+          provider: pc.provider,
           model,
           available: false,
+          flaky,
+          agentic,
           skippedReason: 'SKIP_PROVIDERS',
         });
-      } else if (skipAgentic && isAgenticProvider(provider)) {
+      } else if (skipAgentic && agentic) {
         testCases.push({
-          provider,
+          provider: pc.provider,
           model,
           available: false,
+          flaky,
+          agentic,
           skippedReason: 'agentic provider skipped in this mode',
         });
       } else {
-        testCases.push({ provider, model, available: true });
+        testCases.push({
+          provider: pc.provider,
+          model,
+          available: true,
+          flaky,
+          agentic,
+        });
       }
     }
   }

From 532468d825b18503a2978ac92342cfe06fba7d86 Mon Sep 17 00:00:00 2001
From: Jack Amadeo <jackamadeo@squareup.com>
Date: Wed, 1 Apr 2026 11:27:00 -0400
Subject: [PATCH 6/8] split

---
 .../tests/integration/test_providers.test.ts  | 119 ++++++++----------
 .../test_providers_code_exec.test.ts          |  55 +++-----
 .../tests/integration/test_providers_lib.ts   |  54 ++++++++
 3 files changed, 120 insertions(+), 108 deletions(-)

diff --git a/ui/desktop/tests/integration/test_providers.test.ts b/ui/desktop/tests/integration/test_providers.test.ts
index fccff70ad81b..44feb2a7c00f 100644
--- a/ui/desktop/tests/integration/test_providers.test.ts
+++ b/ui/desktop/tests/integration/test_providers.test.ts
@@ -1,16 +1,16 @@
 /**
  * Provider smoke tests — normal mode (direct tool calls).
  *
- * Ported from scripts/test_providers.sh.  Each available provider/model pair
- * gets its own test that spawns `goose run` with the developer builtin, asks
- * the model to read files via the shell tool, and validates the output.
+ * Each available provider/model pair gets its own test that spawns `goose run`
+ * with the developer builtin, asks the model to read files via the shell tool,
+ * and validates the output.
  */
 
-import { test, expect, beforeAll } from 'vitest';
+import { expect, beforeAll } from 'vitest';
 import fs from 'node:fs';
 import os from 'node:os';
 import path from 'node:path';
-import { buildGoose, discoverTestCases, runGoose, type TestCase } from './test_providers_lib';
+import { buildGoose, discoverTestCases, runGoose, providerTest } from './test_providers_lib';
 
 const BUILTINS = 'developer';
 const TEST_CONTENT = 'test-content-abc123';
@@ -27,77 +27,60 @@ beforeAll(() => {
   fs.writeFileSync(testFile, TEST_CONTENT + '\n');
 });
 
-const allCases = discoverTestCases();
-const available = allCases.filter((tc) => tc.available && !tc.flaky);
-const flaky = allCases.filter((tc) => tc.available && tc.flaky);
-const skipped = allCases.filter((tc) => !tc.available);
+const { testAgentic, testNonAgentic } = providerTest(discoverTestCases());
 
-async function runNormalTest(tc: TestCase): Promise<void> {
+testNonAgentic('reads files via shell tool', async (tc) => {
   const testdir = fs.mkdtempSync(path.join(os.tmpdir(), 'goose-test-'));
-
   try {
-    let prompt: string;
-    let tokenA: string | undefined;
-    let tokenB: string | undefined;
-
-    if (tc.agentic) {
-      fs.copyFileSync(testFile, path.join(testdir, 'test-content.txt'));
-      prompt = 'read ./test-content.txt and output its contents exactly';
-    } else {
-      tokenA = `smoke-alpha-${Math.floor(Math.random() * 32768)}`;
-      tokenB = `smoke-bravo-${Math.floor(Math.random() * 32768)}`;
-      fs.writeFileSync(path.join(testdir, 'part-a.txt'), tokenA + '\n');
-      fs.writeFileSync(path.join(testdir, 'part-b.txt'), tokenB + '\n');
-      prompt =
-        'Use the shell tool to cat ./part-a.txt and ./part-b.txt, then reply with ONLY the contents of both files, one per line, nothing else.';
-    }
+    const tokenA = `smoke-alpha-${Math.floor(Math.random() * 32768)}`;
+    const tokenB = `smoke-bravo-${Math.floor(Math.random() * 32768)}`;
+    fs.writeFileSync(path.join(testdir, 'part-a.txt'), tokenA + '\n');
+    fs.writeFileSync(path.join(testdir, 'part-b.txt'), tokenB + '\n');
 
-    const output = await runGoose(gooseBin, testdir, prompt, BUILTINS, {
-      GOOSE_PROVIDER: tc.provider,
-      GOOSE_MODEL: tc.model,
-    });
+    const output = await runGoose(
+      gooseBin,
+      testdir,
+      'Use the shell tool to cat ./part-a.txt and ./part-b.txt, then reply with ONLY the contents of both files, one per line, nothing else.',
+      BUILTINS,
+      { GOOSE_PROVIDER: tc.provider, GOOSE_MODEL: tc.model }
+    );
 
-    if (tc.agentic) {
-      expect(
-        output.toLowerCase(),
-        `Expected model output to contain "${TEST_CONTENT}"\n\nFull output:\n${output}`
-      ).toContain(TEST_CONTENT.toLowerCase());
-    } else {
-      const shellToolPattern = /(shell \| developer)|(▸.*shell)/;
-      expect(
-        shellToolPattern.test(output),
-        `Expected model to use shell tool\n\nFull output:\n${output}`
-      ).toBe(true);
-      expect(
-        output,
-        `Expected output to contain token from part-a.txt (${tokenA})\n\nFull output:\n${output}`
-      ).toContain(tokenA);
-      expect(
-        output,
-        `Expected output to contain token from part-b.txt (${tokenB})\n\nFull output:\n${output}`
-      ).toContain(tokenB);
-    }
+    const shellToolPattern = /(shell \| developer)|(▸.*shell)/;
+    expect(
+      shellToolPattern.test(output),
+      `Expected model to use shell tool\n\nFull output:\n${output}`
+    ).toBe(true);
+    expect(
+      output,
+      `Expected output to contain token from part-a.txt (${tokenA})\n\nFull output:\n${output}`
+    ).toContain(tokenA);
+    expect(
+      output,
+      `Expected output to contain token from part-b.txt (${tokenB})\n\nFull output:\n${output}`
+    ).toContain(tokenB);
   } finally {
     fs.rmSync(testdir, { recursive: true, force: true });
   }
-}
+});
 
-if (available.length > 0) {
-  test.each(available)('$provider / $model', async (tc) => {
-    await runNormalTest(tc);
-  });
-}
+testAgentic('reads file contents', async (tc) => {
+  const testdir = fs.mkdtempSync(path.join(os.tmpdir(), 'goose-test-'));
+  try {
+    fs.copyFileSync(testFile, path.join(testdir, 'test-content.txt'));
 
-if (flaky.length > 0) {
-  test.each(flaky)('$provider / $model (flaky — allowed to fail)', async (tc) => {
-    try {
-      await runNormalTest(tc);
-    } catch (err) {
-      console.warn(`Flaky test ${tc.provider}/${tc.model} failed (allowed): ${err}`);
-    }
-  });
-}
+    const output = await runGoose(
+      gooseBin,
+      testdir,
+      'read ./test-content.txt and output its contents exactly',
+      BUILTINS,
+      { GOOSE_PROVIDER: tc.provider, GOOSE_MODEL: tc.model }
+    );
 
-if (skipped.length > 0) {
-  test.skip.each(skipped)('$provider / $model — $skippedReason', () => {});
-}
+    expect(
+      output.toLowerCase(),
+      `Expected model output to contain "${TEST_CONTENT}"\n\nFull output:\n${output}`
+    ).toContain(TEST_CONTENT.toLowerCase());
+  } finally {
+    fs.rmSync(testdir, { recursive: true, force: true });
+  }
+});
diff --git a/ui/desktop/tests/integration/test_providers_code_exec.test.ts b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
index d1234b39b793..d166c126cdc1 100644
--- a/ui/desktop/tests/integration/test_providers_code_exec.test.ts
+++ b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
@@ -1,17 +1,16 @@
 /**
  * Provider smoke tests — code execution mode (JS batching).
  *
- * Ported from scripts/test_providers_code_exec.sh.  Each available
- * (non-agentic) provider/model pair gets its own test that spawns `goose run`
- * with the memory + code_execution builtins and validates that the
- * code_execution tool was invoked.
+ * Each available (non-agentic) provider/model pair gets its own test that
+ * spawns `goose run` with the memory + code_execution builtins and validates
+ * that the code_execution tool was invoked.
  */
 
-import { test, expect, beforeAll } from 'vitest';
+import { expect, beforeAll } from 'vitest';
 import fs from 'node:fs';
 import os from 'node:os';
 import path from 'node:path';
-import { buildGoose, discoverTestCases, runGoose, type TestCase } from './test_providers_lib';
+import { buildGoose, discoverTestCases, runGoose, providerTest } from './test_providers_lib';
 
 const BUILTINS = 'memory,code_execution';
 
@@ -21,22 +20,18 @@ beforeAll(() => {
   gooseBin = buildGoose();
 });
 
-const allCases = discoverTestCases({ skipAgentic: true });
-const available = allCases.filter((tc) => tc.available && !tc.flaky);
-const flaky = allCases.filter((tc) => tc.available && tc.flaky);
-const skipped = allCases.filter((tc) => !tc.available);
+const { testAll } = providerTest(discoverTestCases({ skipAgentic: true }));
 
-async function runCodeExecTest(tc: TestCase): Promise<void> {
+testAll('invokes code_execution tool', async (tc) => {
   const testdir = fs.mkdtempSync(path.join(os.tmpdir(), 'goose-codeexec-'));
-
   try {
-    const prompt =
-      "Store a memory with category 'test' and data 'hello world', then retrieve all memories from category 'test'.";
-
-    const output = await runGoose(gooseBin, testdir, prompt, BUILTINS, {
-      GOOSE_PROVIDER: tc.provider,
-      GOOSE_MODEL: tc.model,
-    });
+    const output = await runGoose(
+      gooseBin,
+      testdir,
+      "Store a memory with category 'test' and data 'hello world', then retrieve all memories from category 'test'.",
+      BUILTINS,
+      { GOOSE_PROVIDER: tc.provider, GOOSE_MODEL: tc.model }
+    );
 
     // Matches: "execute_typescript | code_execution", "get_function_details | code_execution",
     //           "tool call | execute", "tool calls | execute" (old format)
@@ -52,24 +47,4 @@ async function runCodeExecTest(tc: TestCase): Promise<void> {
   } finally {
     fs.rmSync(testdir, { recursive: true, force: true });
   }
-}
-
-if (available.length > 0) {
-  test.each(available)('$provider / $model', async (tc) => {
-    await runCodeExecTest(tc);
-  });
-}
-
-if (flaky.length > 0) {
-  test.each(flaky)('$provider / $model (flaky — allowed to fail)', async (tc) => {
-    try {
-      await runCodeExecTest(tc);
-    } catch (err) {
-      console.warn(`Flaky test ${tc.provider}/${tc.model} failed (allowed): ${err}`);
-    }
-  });
-}
-
-if (skipped.length > 0) {
-  test.skip.each(skipped)('$provider / $model — $skippedReason', () => {});
-}
+});
diff --git a/ui/desktop/tests/integration/test_providers_lib.ts b/ui/desktop/tests/integration/test_providers_lib.ts
index b4b790d109eb..fe5bd418edc7 100644
--- a/ui/desktop/tests/integration/test_providers_lib.ts
+++ b/ui/desktop/tests/integration/test_providers_lib.ts
@@ -5,6 +5,7 @@
  * allowed-failure list, agentic-provider list, and environment detection.
  */
 
+import { test } from 'vitest';
 import { execSync, spawn, type ChildProcess } from 'node:child_process';
 import fs from 'node:fs';
 import os from 'node:os';
@@ -294,6 +295,59 @@ export function discoverTestCases(options?: { skipAgentic?: boolean }): TestCase
   return testCases;
 }
 
+// ---------------------------------------------------------------------------
+// Test registration helpers
+// ---------------------------------------------------------------------------
+
+type ProviderTestFn = (tc: TestCase) => Promise<void>;
+
+function registerTests(label: string, cases: TestCase[], fn: ProviderTestFn): void {
+  const available = cases.filter((tc) => tc.available && !tc.flaky);
+  const flaky = cases.filter((tc) => tc.available && tc.flaky);
+  const skipped = cases.filter((tc) => !tc.available);
+
+  if (available.length > 0) {
+    test.each(available)(`${label} — $provider / $model`, async (tc) => {
+      await fn(tc);
+    });
+  }
+
+  if (flaky.length > 0) {
+    test.each(flaky)(`${label} — $provider / $model (flaky)`, async (tc) => {
+      try {
+        await fn(tc);
+      } catch (err) {
+        console.warn(`Flaky test ${tc.provider}/${tc.model} failed (allowed): ${err}`);
+      }
+    });
+  }
+
+  if (skipped.length > 0) {
+    test.skip.each(skipped)(`${label} — $provider / $model — $skippedReason`, () => {});
+  }
+}
+
+/**
+ * Build test-registrar functions bound to a set of discovered test cases.
+ *
+ * Usage:
+ *   const { testAll, testAgentic, testNonAgentic } = providerTest(cases);
+ *
+ *   testAll('reads a file', async (tc) => { ... });
+ *   testAgentic('delegates work', async (tc) => { ... });
+ *   testNonAgentic('uses shell tool', async (tc) => { ... });
+ */
+export function providerTest(cases: TestCase[]) {
+  const agentic = cases.filter((tc) => tc.agentic);
+  const nonAgentic = cases.filter((tc) => !tc.agentic);
+
+  return {
+    testAll: (label: string, fn: ProviderTestFn) => registerTests(label, cases, fn),
+    testAgentic: (label: string, fn: ProviderTestFn) => registerTests(label, agentic, fn),
+    testNonAgentic: (label: string, fn: ProviderTestFn) => registerTests(label, nonAgentic, fn),
+  };
+}
+
 // ---------------------------------------------------------------------------
 // Utility: run goose binary and capture output
 // ---------------------------------------------------------------------------

From 33aee214e2d42c8bbf99a2ed0ce01f2eabf7fcea Mon Sep 17 00:00:00 2001
From: Douwe Osinga <douwe@squareup.com>
Date: Wed, 1 Apr 2026 20:31:05 -0400
Subject: [PATCH 7/8] fix: resolve .env from repo root, strip quotes, extend
 flaky test timeout

- loadDotenv() now resolves .env from the repository root via __dirname
  instead of process.cwd(), matching the old shell script behavior when
  run from ui/desktop
- Strip surrounding quotes from dotenv values so KEY="value" works
- Give flaky tests a 120s timeout so the try/catch handler runs before
  vitest kills the test

Signed-off-by: Douwe Osinga <douwe@squareup.com>
---
 ui/desktop/tests/integration/test_providers.test.ts |  2 +-
 .../integration/test_providers_code_exec.test.ts    |  2 +-
 ui/desktop/tests/integration/test_providers_lib.ts  | 13 +++++++++++--
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/ui/desktop/tests/integration/test_providers.test.ts b/ui/desktop/tests/integration/test_providers.test.ts
index fccff70ad81b..1819f7dd3893 100644
--- a/ui/desktop/tests/integration/test_providers.test.ts
+++ b/ui/desktop/tests/integration/test_providers.test.ts
@@ -95,7 +95,7 @@ if (flaky.length > 0) {
     } catch (err) {
       console.warn(`Flaky test ${tc.provider}/${tc.model} failed (allowed): ${err}`);
     }
-  });
+  }, 120_000);
 }
 
 if (skipped.length > 0) {
diff --git a/ui/desktop/tests/integration/test_providers_code_exec.test.ts b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
index d1234b39b793..85ace41b2b78 100644
--- a/ui/desktop/tests/integration/test_providers_code_exec.test.ts
+++ b/ui/desktop/tests/integration/test_providers_code_exec.test.ts
@@ -67,7 +67,7 @@ if (flaky.length > 0) {
     } catch (err) {
       console.warn(`Flaky test ${tc.provider}/${tc.model} failed (allowed): ${err}`);
     }
-  });
+  }, 120_000);
 }
 
 if (skipped.length > 0) {
diff --git a/ui/desktop/tests/integration/test_providers_lib.ts b/ui/desktop/tests/integration/test_providers_lib.ts
index b4b790d109eb..6945843abb80 100644
--- a/ui/desktop/tests/integration/test_providers_lib.ts
+++ b/ui/desktop/tests/integration/test_providers_lib.ts
@@ -182,8 +182,17 @@ function getProviders(): ProviderConfig[] {
 // Helpers
 // ---------------------------------------------------------------------------
 
+function stripQuotes(s: string): string {
+  // Only one matching pair of outer quotes (both " or both ') is removed.
+  const open = s.charAt(0);
+  const wrapped = s.length >= 2 && (open === '"' || open === "'") && s.endsWith(open);
+  return wrapped ? s.slice(1, -1) : s;
+}
+
 function loadDotenv(): void {
-  const envPath = path.resolve(process.cwd(), '.env');
+  // Resolve .env from the repository root (four levels up from ui/desktop/tests/integration).
+  const repoRoot = path.resolve(__dirname, '..', '..', '..', '..');
+  const envPath = path.join(repoRoot, '.env');
   if (!fs.existsSync(envPath)) return;
   const lines = fs.readFileSync(envPath, 'utf-8').split('\n');
   for (const line of lines) {
@@ -192,7 +201,7 @@ function loadDotenv(): void {
     const eqIdx = trimmed.indexOf('=');
     if (eqIdx === -1) continue;
     const key = trimmed.slice(0, eqIdx);
-    const value = trimmed.slice(eqIdx + 1);
+    const value = stripQuotes(trimmed.slice(eqIdx + 1));
     if (!(key in process.env)) {
       process.env[key] = value;
     }

From 4abfea7b98f18b6125af7ed35d12fc26ed9a46f5 Mon Sep 17 00:00:00 2001
From: Jack Amadeo <jackamadeo@squareup.com>
Date: Fri, 24 Apr 2026 13:12:04 -0400
Subject: [PATCH 8/8] update test targets

---
 .github/workflows/pr-smoke-test.yml | 6 +++---
 RELEASE_CHECKLIST.md                | 2 +-
 ui/desktop/package.json             | 3 +++
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pr-smoke-test.yml b/.github/workflows/pr-smoke-test.yml
index 0f251e614837..f49d983dae5e 100644
--- a/.github/workflows/pr-smoke-test.yml
+++ b/.github/workflows/pr-smoke-test.yml
@@ -133,7 +133,7 @@ jobs:
         run: |
           mkdir -p $HOME/.local/share/goose/sessions
           mkdir -p $HOME/.config/goose
-          source ../../bin/activate-hermit && pnpm run test:integration -- tests/integration/test_providers.test.ts
+          source ../../bin/activate-hermit && pnpm run test:integration:providers
         working-directory: ui/desktop
 
       - name: Set up Python
@@ -211,7 +211,7 @@ jobs:
         run: |
           mkdir -p $HOME/.local/share/goose/sessions
           mkdir -p $HOME/.config/goose
-          source ../../bin/activate-hermit && pnpm run test:integration -- tests/integration/test_providers_code_exec.test.ts
+          source ../../bin/activate-hermit && pnpm run test:integration:providers-code-exec
         working-directory: ui/desktop
 
   compaction-tests:
@@ -287,5 +287,5 @@ jobs:
           SKIP_BUILD: 1
         run: |
             echo 'export PATH=/some/fake/path:$PATH' >> $HOME/.bash_profile
-            source ../../bin/activate-hermit && pnpm run test:integration -- tests/integration/goosed.test.ts
+            source ../../bin/activate-hermit && pnpm run test:integration:goosed
         working-directory: ui/desktop
diff --git a/RELEASE_CHECKLIST.md b/RELEASE_CHECKLIST.md
index fdc0268790ce..e031d00c0c4d 100644
--- a/RELEASE_CHECKLIST.md
+++ b/RELEASE_CHECKLIST.md
@@ -17,7 +17,7 @@ Make a copy of this document for each version and check off as steps are verifie
 
 ### Provider Testing
 
-- [ ] Run `cd ui/desktop && pnpm run test:integration -- tests/integration/test_providers.test.ts` locally from the release branch and verify all providers/models work
+- [ ] Run `cd ui/desktop && pnpm run test:integration:providers` locally from the release branch and verify all providers/models work
 - [ ] Launch goose, click reset providers, choose databricks and a model
 
 ### Starting Conversations
diff --git a/ui/desktop/package.json b/ui/desktop/package.json
index 643698f97b3e..0988f399e79f 100644
--- a/ui/desktop/package.json
+++ b/ui/desktop/package.json
@@ -35,6 +35,9 @@
     "test:ui": "vitest --ui",
     "test:coverage": "vitest run --coverage",
     "test:integration": "vitest run --config vitest.integration.config.ts",
+    "test:integration:goosed": "vitest run --config vitest.integration.config.ts tests/integration/goosed.test.ts",
+    "test:integration:providers": "vitest run --config vitest.integration.config.ts tests/integration/test_providers.test.ts",
+    "test:integration:providers-code-exec": "vitest run --config vitest.integration.config.ts tests/integration/test_providers_code_exec.test.ts",
     "test:integration:watch": "vitest --config vitest.integration.config.ts",
     "test:integration:debug": "DEBUG=1 vitest run --config vitest.integration.config.ts",
     "i18n:extract": "formatjs extract 'src/**/*.{ts,tsx}' --out-file src/i18n/messages/en.json --flatten && pnpm run i18n:compile",