Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
542f3b1
Add initial testing code
latekvo Dec 10, 2025
35aac32
better typing, result guards
latekvo Dec 10, 2025
d7ecdf5
add allowed tools check
latekvo Dec 10, 2025
3e29cf1
add basic error reporting, cleanup
latekvo Dec 10, 2025
66aa875
register command for running tests
latekvo Dec 10, 2025
acc153d
more metadata in status reports
latekvo Dec 10, 2025
7f0734c
Merge branch 'main' into @latekvo/create-ai-tests
latekvo Dec 10, 2025
51d77e4
stylistic fix
latekvo Dec 10, 2025
557ccf0
Merge branch '@latekvo/create-ai-tests' of https://github.com/softwar…
latekvo Dec 10, 2025
9b8f3d8
initial file-reading impl (syncing work, branch switch)
latekvo Dec 10, 2025
4da6725
fix workaround for not awaiting completion
latekvo Dec 10, 2025
a36f7ee
system-agnostic random path
latekvo Dec 10, 2025
4b6b795
add undo before new chat
latekvo Dec 10, 2025
898d595
ensure agent mode is used
latekvo Dec 10, 2025
e3eed04
remove all popups requiring human input, add todos
latekvo Dec 11, 2025
f68ddad
Merge remote-tracking branch 'origin/main' into @latekvo/create-ai-tests
latekvo Dec 11, 2025
2b7df4d
sync
latekvo Dec 12, 2025
e559400
prevent more popups
latekvo Dec 12, 2025
317d18a
pretty result printing
latekvo Dec 12, 2025
b661880
add git restore on each run
latekvo Dec 15, 2025
df1c643
Merge remote-tracking branch 'origin/main' into @latekvo/create-ai-tests
latekvo Dec 15, 2025
106dfb7
cleanup
latekvo Dec 15, 2025
e3b278c
replace sleep with known util
latekvo Dec 15, 2025
f0b8865
add initial termination implementation
latekvo Dec 16, 2025
204d571
hook up the state manager
latekvo Dec 18, 2025
c8021e0
remove resolved todo
latekvo Dec 18, 2025
b9889e2
fix: use correct status update command
latekvo Dec 18, 2025
54595ba
fix invalid command name
latekvo Dec 18, 2025
45a66a9
fix type errors, fix typecasting ide as state manager
latekvo Dec 18, 2025
104548e
use partial state instead
latekvo Dec 18, 2025
4bdb9f6
fix naming
latekvo Dec 18, 2025
066f377
await test state setting
latekvo Dec 19, 2025
3ceb9e0
minor comment change
latekvo Dec 19, 2025
1f25e8a
prevent multiple launches of the tool tests
latekvo Dec 19, 2025
5608e80
add docstrings for the command executors
latekvo Dec 19, 2025
9ea5386
await global context setting
latekvo Dec 19, 2025
c924ef8
Merge remote-tracking branch 'origin/main' into @latekvo/create-ai-tests
latekvo Dec 30, 2025
a24bfab
add more test cases
latekvo Jan 7, 2026
d606541
simplify timeout and termination code, remove unwanted test case
latekvo Jan 7, 2026
cac02bd
improve output formatting, data
latekvo Jan 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions packages/vscode-extension/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,18 @@
"command": "RNIDE.removeLicense",
"title": "Remove license",
"category": "Radon IDE"
},
{
"enablement": "!RNIDE.MCPToolTestsRunning",
"command": "RNIDE.testChatToolUsage",
"title": "Test AI tool usage",
"category": "Radon IDE"
},
{
"enablement": "RNIDE.MCPToolTestsRunning",
"command": "RNIDE.terminateChatToolTest",
"title": "Terminate MCP tool tests",
"category": "Radon IDE"
}
],
"keybindings": [
Expand Down
338 changes: 338 additions & 0 deletions packages/vscode-extension/src/ai/tests/aiChatTester.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,338 @@
import { randomBytes } from "crypto";
import { readFileSync } from "fs";
import { mkdtemp } from "fs/promises";
import { tmpdir } from "os";
import path from "path";
import { window, commands, Uri, workspace, StatusBarAlignment, ThemeColor } from "vscode";
import { Logger } from "../../Logger";
import { exec } from "../../utilities/subprocess";
import { Platform } from "../../utilities/platform";
import { IDE } from "../../project/ide";

/**
 * Name of the git executable handed to `exec`, chosen per platform via `Platform.select`.
 */
export const GIT_PATH = Platform.select({
  linux: "git",
  macos: "git",
  windows: "git.exe",
});

/** Shape of the JSON file read back after running `workbench.action.chat.export` (only the fields used here). */
interface ChatData {
requests: Request[];
}

/** A single request of the exported chat, together with the response parts recorded for it. */
interface Request {
response: Response[];
}

/** One part of a response: either a serialized tool invocation or anything else. */
type Response = ToolCallResponse | UnknownResponse;

/** Any response part that is not a tool invocation; use `isToolCallResponse` to tell the two apart. */
interface UnknownResponse {
// `Exclude<string, "literal">` resolves to `string` (does not work)
kind: unknown;
}

/** Tool identifiers that a test case may list as acceptable. */
type AllowedToolId =
| "query_documentation"
| "view_screenshot"
| "view_component_tree"
| "view_application_logs"
| "reload_application";

/** Response part describing a tool invocation, identified by its `kind` tag. */
interface ToolCallResponse {
kind: "toolInvocationSerialized";
// NOTE(review): the exported file may contain tool ids outside of `AllowedToolId` - confirm.
toolId: AllowedToolId;
}

/** A prompt sent to the chat and the tools the chat is allowed to call in response. */
interface ChatTestCase {
prompt: string;
allowedToolIds: AllowedToolId[];
}

/** Outcome recorded for a test case; `cause` is `null` on success and holds the failure reason otherwise. */
interface ChatTestResult {
prompt: string;
success: boolean;
cause: string | null;
}

/**
 * Type guard telling tool invocations apart from every other response part.
 *
 * A dedicated guard is required because `UnknownResponse.kind` cannot be typed as
 * "any string except the literal" (`Exclude<string, "literal">` collapses to `string`),
 * so the union cannot be narrowed by a plain comparison at the call site.
 */
function isToolCallResponse(response: Response): response is ToolCallResponse {
  const { kind } = response;
  return kind === "toolInvocationSerialized";
}

// Compact `[prompt, allowed tools]` table; expanded into `ChatTestCase` objects below.
const testCaseTable: [string, AllowedToolId[]][] = [
  // Documentation lookups
  ["How to use Shared Element Transitions in Reanimated 4?", ["query_documentation"]],
  ["How to use SETs in Reanimated?", ["query_documentation"]],
  ["Implement an example interaction with a local LLM in my app.", ["query_documentation"]],
  ["Add LLM chat to my app.", ["query_documentation"]],

  // Visual problems with a specific element
  ["My button in the center of the screen is malformed.", ["view_component_tree", "view_screenshot"]],
  ["The orange button is ugly. Fix it.", ["view_component_tree", "view_screenshot"]],

  // Reloading the application
  ["Restart the app.", ["reload_application"]],
  ["The app is frozen. Can you reset it?", ["reload_application"]],

  // Logs
  ["Why did the app just crash?", ["view_application_logs"]],
  ["Are there any errors in the logs?", ["view_application_logs"]],
  [
    "Debug the error thrown when I clicked the login button.",
    ["view_application_logs", "view_component_tree"],
  ],

  // Screenshots
  ["Does the layout look broken to you?", ["view_screenshot"]],
  ["I think the text is being cut off on the right side.", ["view_screenshot"]],
  ["Verify if the dark mode colors are applied correctly.", ["view_screenshot"]],
  ["Take a look at the current screen.", ["view_screenshot"]],

  // Component tree
  ["What is the hierarchy of the current screen?", ["view_component_tree"]],
  ["Show me the props passed to the Header component.", ["view_component_tree"]],
  ["Is the 'Submit' button currently inside a SafeAreaView?", ["view_component_tree"]],
  ["Find the component ID for the bottom navigation bar.", ["view_component_tree"]],

  // Prompts where several tools are acceptable
  [
    "Why is the banner not showing up?",
    ["view_component_tree", "view_application_logs", "view_screenshot"],
  ],
  ["Inspect the padding on the user profile card.", ["view_component_tree", "view_screenshot"]],
];

/** Prompts sent to the chat, each paired with the tools the chat is allowed to call for it. */
const testCases: ChatTestCase[] = testCaseTable.map(([prompt, allowedToolIds]) => ({
  prompt,
  allowedToolIds,
}));

/**
 * Resets the chat and the workspace before the next prompt is sent:
 * finalizes pending chat edits, then reverts the working tree with `git restore .`.
 */
async function clearEdits() {
  // The order matters and every command is awaited before the next one starts:
  // 1. cancel the previous response - prevents pop-ups on `workbench.action.chat.newChat`,
  // 2. focus the chat input - REQUIRED for `chatEditing.acceptAllFiles`,
  // 3. accept all edits - rejecting them would require user confirmation, accepting does not.
  const preparationCommands = [
    "workbench.action.chat.cancel",
    "workbench.panel.chat.view.copilot.focus",
    "chatEditing.acceptAllFiles",
  ];

  for (const command of preparationCommands) {
    await commands.executeCommand(command);
  }

  const workspaceRoot = workspace.workspaceFolders?.[0].uri;

  // Without an open workspace folder there is nothing to revert
  // (not expected to happen when the tests are run in a test setup).
  if (workspaceRoot) {
    // Revert all changes via git directly - `commands.executeCommand` CANNOT be used here,
    // as the equivalent command requires user confirmation.
    await exec(GIT_PATH, ["-C", workspaceRoot.fsPath, "restore", "."]);
  }
}

/**
 * Publishes the test status as the `RNIDE.MCPToolTestsRunning` context key via the `setContext` command.
 */
async function setGlobalTestsRunning(areTestsRunning: boolean) {
  const contextKey = "RNIDE.MCPToolTestsRunning";
  await commands.executeCommand("setContext", contextKey, areTestsRunning);
}

/**
 * Waits until the tests are terminated or until `testTimeout` elapses, whichever comes first.
 *
 * Termination is detected through a state update carrying `areMCPTestsRunning === false`.
 * Whichever event settles the promise also clears the pending timer and disposes the state
 * listener, so nothing is left running after the promise resolves.
 *
 * @param ideInstance IDE instance whose state changes are observed.
 * @param testTimeout Maximum time to wait, in milliseconds.
 * @returns A promise that resolves (never rejects) on termination or timeout.
 */
function awaitTestTerminationOrTimeout(ideInstance: IDE, testTimeout: number): Promise<void> {
  return new Promise((resolve) => {
    let timer: ReturnType<typeof setTimeout> | undefined;
    let subscription: { dispose(): unknown } | undefined;

    const settle = () => {
      // Release both resources regardless of which event got here first.
      clearTimeout(timer);
      subscription?.dispose();
      resolve();
    };

    subscription = ideInstance.onStateChanged((partialState) => {
      // Partial updates which do not touch the flag leave it `undefined` - only an explicit `false` counts.
      if (partialState.areMCPTestsRunning === false) {
        settle();
      }
    });

    timer = setTimeout(settle, testTimeout);
  });
}

/**
 * Records whether the MCP tool tests are running, in both places that track it:
 * - the `RNIDE.MCPToolTestsRunning` context key, referenced by the `enablement` clauses
 *   of the test commands,
 * - the `areMCPTestsRunning` field of the IDE state, which `awaitTestTerminationOrTimeout`
 *   observes in order to detect termination (it reacts to the value `false`).
 *
 * @param areTestsRunning `true` when a test run starts, `false` when it ends or is terminated.
 * @param ideInstance IDE instance whose state is updated.
 */
async function setTestStatus(areTestsRunning: boolean, ideInstance: IDE) {
  await setGlobalTestsRunning(areTestsRunning);
  await ideInstance.updateState({
    // Must mirror the context key: storing the negated value made termination write `true`,
    // which the termination listener never reacts to.
    areMCPTestsRunning: areTestsRunning,
  });
}

/**
 * Returns the existing IDE instance.
 *
 * @throws Error when `IDE.getInstanceIfExists()` yields no instance. Throwing (rather than
 * returning nothing) keeps the callers free of null checks and makes the failure visible.
 */
function getIdeInstance() {
  const ideInstance = IDE.getInstanceIfExists();

  if (!ideInstance) {
    throw new Error("Cannot run MCP tool tests: no IDE instance exists.");
  }

  return ideInstance;
}

/**
 * Command handler for `RNIDE.terminateChatToolTest`.
 * Marks the MCP tool tests, started through the `RNIDE.testChatToolUsage` command, as no longer running.
 */
export async function terminateChatToolTest() {
  await setTestStatus(false, getIdeInstance());
}

/**
 * Runs `clearEdits` and waits for it to finish, logging (instead of propagating) any failure.
 * Resetting is best-effort: a failed `git restore` should not abort the whole test run.
 */
async function clearEditsSafely(): Promise<void> {
  try {
    await clearEdits();
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    Logger.log(`AI tool tests: failed to reset chat edits: ${reason}`);
  }
}

/**
 * Reads the chat exported to `filepath` and checks its tool calls against the test case.
 * The case passes only if at least one tool was called and every called tool is allowed.
 */
function evaluateChatExport(testCase: ChatTestCase, filepath: string): ChatTestResult {
  const result = (cause: string | null): ChatTestResult => ({
    prompt: testCase.prompt,
    success: cause === null,
    cause,
  });

  let chatData: ChatData;
  try {
    chatData = JSON.parse(readFileSync(filepath).toString()) as ChatData;
  } catch {
    return result("Internal error: `workbench.action.chat.export` did not work.");
  }

  // The cast above is unchecked - make sure the parts accessed below actually exist.
  const requests: unknown = chatData?.requests;
  if (!Array.isArray(requests)) {
    return result("Internal error: `workbench.action.chat.export` did not work.");
  }

  if (requests.length === 0) {
    return result("Internal error: `workbench.action.chat.openagent` did not work.");
  }

  if (requests.length > 1) {
    return result("Internal error: `workbench.action.chat.newChat` did not work.");
  }

  const exportedResponses: unknown = chatData.requests[0]?.response;
  const responses: Response[] = Array.isArray(exportedResponses) ? exportedResponses : [];
  const toolCalls = responses.filter(isToolCallResponse);

  if (toolCalls.length === 0) {
    return result("No tools were called.");
  }

  const otherCalledTools = toolCalls
    .map((toolCall) => toolCall.toolId)
    .filter((toolId) => !testCase.allowedToolIds.includes(toolId));

  if (otherCalledTools.length === 0) {
    return result(null);
  }

  const expected = `Expected: ${testCase.allowedToolIds.join(" | ")}`;
  const received = `Received: ${otherCalledTools.join(", ")}`;
  return result(`${expected}. ${received}`);
}

/**
 * Builds the summary that is logged after a run: one line per test case, followed by the totals.
 */
function formatTestReport(results: readonly ChatTestResult[]): string {
  const lines = results
    .map(({ success, prompt, cause }) => {
      const status = success ? " OK " : "FAIL";
      const error = cause !== null ? ` | Error: ${cause}` : "";
      return `${status} | ${prompt}${error}`;
    })
    .join("\n");

  const totalCount = results.length;
  const correctCount = results.filter((result) => result.success).length;
  // A run terminated before the first test case finished has no results - avoid printing `NaN`.
  const correctPercent = totalCount === 0 ? "0.0" : ((correctCount / totalCount) * 100).toFixed(1);

  return `\n=== AI TEST RESULTS ===\n${lines}\n# TOTAL CORRECT: ${correctCount}/${totalCount} (${correctPercent}%)`;
}

/**
 * Executor for `RNIDE.testChatToolUsage` VSCode command.
 * Temporarily takes control over the AI chat tab, testing its responses to various prompts.
 * Running this command may interfere with other VSCode functionalities as well.
 *
 * For every test case, the prompt is sent to a fresh chat, the chat is exported to a temporary
 * JSON file after a fixed wait, and the tool calls found in the export are compared against the
 * tools allowed for that prompt. A summary is logged once the run finishes, is terminated or fails.
 *
 * @throws Error when no IDE instance exists, or when one of the executed commands fails.
 */
export async function testChatToolUsage(): Promise<void> {
  const ideInstance = getIdeInstance();
  const runStatus: ChatTestResult[] = [];

  await setTestStatus(true, ideInstance);

  // - `showInformationMessage` cannot be programmatically dismissed
  // - `showQuickPick` is a list-selection - does not look right
  // - `createStatusBarItem` looks good, and can be dismissed both programmatically and by the user
  const statusBar = window.createStatusBarItem(StatusBarAlignment.Left, 0);
  statusBar.command = "RNIDE.terminateChatToolTest";
  statusBar.text = "$(debug-stop) MCP tests running — Terminate";
  statusBar.tooltip = "Click to terminate running E2E tests";
  statusBar.color = new ThemeColor("statusBar.foreground");
  statusBar.backgroundColor = new ThemeColor("statusBarItem.errorBackground");
  statusBar.show();

  // Termination must stop the whole run, not only cut the current wait short.
  const runState = { terminated: false };
  const terminationListener = ideInstance.onStateChanged((partialState) => {
    if (partialState.areMCPTestsRunning === false) {
      runState.terminated = true;
    }
  });

  try {
    const dir = await mkdtemp(path.join(tmpdir(), "radon-chat-exports-"));

    for (const testCase of testCases) {
      if (runState.terminated) {
        break;
      }

      // Has to complete before the new chat is opened, otherwise both race for the chat panel.
      await clearEditsSafely();

      await commands.executeCommand("workbench.action.chat.newChat");
      await commands.executeCommand("workbench.action.chat.openagent", testCase.prompt);

      // Fixed wait for the response to be generated; ends early on termination.
      await awaitTestTerminationOrTimeout(ideInstance, 10_000);

      if (runState.terminated) {
        // The response was most likely cut short - do not evaluate it.
        break;
      }

      // `path.join` keeps the export inside of the temporary directory,
      // `Uri.file` is the correct way of turning a filesystem path into a URI.
      const filepath = path.join(dir, `${randomBytes(8).toString("hex")}.json`);

      await commands.executeCommand("workbench.action.chat.export", Uri.file(filepath));

      runStatus.push(evaluateChatExport(testCase, filepath));
    }
  } finally {
    // Always restore the UI and the test status, even if a command above has thrown.
    terminationListener.dispose();
    statusBar.hide();
    statusBar.dispose();

    Logger.log(formatTestReport(runStatus));

    await setTestStatus(false, ideInstance);
    await clearEditsSafely();
  }
}
2 changes: 2 additions & 0 deletions packages/vscode-extension/src/common/State.ts
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ export type State = {
projectState: ProjectStore;
telemetry: TelemetryState;
workspaceConfiguration: WorkspaceConfiguration;
areMCPTestsRunning: boolean; // FIXME: Move
};

export type StateListener = (state: RecursivePartial<State>) => void;
Expand Down Expand Up @@ -589,6 +590,7 @@ const initialDeviceSessionStore: DeviceSessionStore = {
};

export const initialState: State = {
areMCPTestsRunning: false, // FIXME: Move
applicationRoots: [],
devicesState: {
devicesByType: {
Expand Down
10 changes: 10 additions & 0 deletions packages/vscode-extension/src/extension.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import { AdminRestrictedFunctionalityError, PaywalledFunctionalityError } from "
import { registerRadonAI } from "./ai/mcp/RadonMcpController";
import { MaestroCodeLensProvider } from "./providers/MaestroCodeLensProvider";
import { removeLicense } from "./utilities/license";
import { terminateChatToolTest, testChatToolUsage } from "./ai/tests/aiChatTester";
import { getTelemetryReporter } from "./utilities/telemetry";
import { getEditorType } from "./utilities/editorType";

Expand Down Expand Up @@ -325,6 +326,15 @@ export async function activate(context: ExtensionContext) {
context.subscriptions.push(
commands.registerCommand("RNIDE.removeLicense", removeLicenseWithConfirmation)
);

context.subscriptions.push(
commands.registerCommand("RNIDE.testChatToolUsage", testChatToolUsage)
);

context.subscriptions.push(
commands.registerCommand("RNIDE.terminateChatToolTest", terminateChatToolTest)
);

// Debug adapter used by custom launch configuration, we register it in case someone tries to run the IDE configuration
// The current workflow is that people shouldn't run it, but since it is listed under launch options it might happen
// When it does happen, we open the IDE panel and restart the app.
Expand Down
Loading