// Source: azure-sdk-for-js/sdk/ai/ai-projects/samples-dev/agents/tools/agentComputerUse.ts at 49e9afd5e6d2204cf8561614b0c70480b7c347e6 (Azure/azure-sdk-for-js, GitHub)

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

/**
 * This sample demonstrates how to use Computer Use Agent (CUA) functionality
 * with the Azure AI Projects client. It simulates browser automation by
 * creating an agent that can interact with computer interfaces through
 * simulated actions and screenshots.
 *
 * The sample creates a Computer Use Agent that performs a web search simulation,
 * demonstrating how to handle computer actions like typing, clicking, and
 * taking screenshots in a controlled environment.
 *
 * @summary This sample demonstrates how to create a Computer Use Agent that can interact
 * with computer interfaces through simulated actions and screenshots.
 *
 * @azsdk-weight 100
 */

import { DefaultAzureCredential } from "@azure/identity";
import { AIProjectClient } from "@azure/ai-projects";
import "dotenv/config";
import {
  SearchState,
  loadScreenshotAssets,
  handleComputerActionAndTakeScreenshot,
  printFinalOutput,
} from "./computerUseUtil.ts";

// Connection settings come from environment variables.
// `||` (not `??`) is deliberate: an unset OR empty variable falls back to the placeholder.
const endpointFromEnv = process.env["FOUNDRY_PROJECT_ENDPOINT"];
const projectEndpoint = endpointFromEnv || "<project endpoint>";

const deploymentFromEnv = process.env["COMPUTER_USE_MODEL_DEPLOYMENT_NAME"];
const deploymentName = deploymentFromEnv || "<model deployment name>";

/**
 * Runs the Computer Use Agent sample end to end.
 *
 * Steps:
 *  1. Creates the project client and loads the screenshot assets.
 *  2. Creates a "ComputerUseAgent" agent version with the `computer_use_preview` tool.
 *  3. Sends the initial request (task text + first screenshot) and then, for up to
 *     `maxIterations` rounds, answers each `computer_call` with the screenshot returned by
 *     `handleComputerActionAndTakeScreenshot`.
 *  4. Deletes the agent version, even if the session failed part-way through.
 *
 * @returns A promise that resolves once the agent version has been deleted.
 *          It rejects if any service call (including the cleanup call) fails.
 */
export async function main(): Promise<void> {
  // Initialize state machine
  let currentState = SearchState.INITIAL;

  // Create AI Project client
  const project = new AIProjectClient(projectEndpoint, new DefaultAzureCredential());
  const openAIClient = project.getOpenAIClient();

  // Load screenshot assets
  const screenshots = await loadScreenshotAssets(openAIClient);
  console.log("Successfully loaded screenshot assets");

  console.log("Creating Computer Use Agent...");
  const agent = await project.agents.createVersion("ComputerUseAgent", {
    kind: "prompt" as const,
    model: deploymentName,
    instructions: `
You are a computer automation assistant.

Be direct and efficient. When you reach the search results page, read and describe the actual search result titles and descriptions you can see.
    `.trim(),
    tools: [
      {
        type: "computer_use_preview",
        display_width: 1026,
        display_height: 769,
        environment: "windows" as const,
      },
    ],
  });
  console.log(`Agent created (id: ${agent.id}, name: ${agent.name}, version: ${agent.version})`);

  // Everything after agent creation runs inside try/finally so that the agent version is
  // deleted even when a request or the action handler throws.
  try {
    // Initial request with screenshot - start with Bing search page
    console.log(
      "Starting computer automation session (initial screenshot: cua_browser_search.png)...",
    );
    let response = await openAIClient.responses.create(
      {
        input: [
          {
            role: "user" as const,
            content: [
              {
                type: "input_text",
                text: "I need you to help me search for 'OpenAI news'. Please type 'OpenAI news' and submit the search. Once you see search results, the task is complete.",
              },
              {
                type: "input_image",
                file_id: screenshots.browser_search.fileId,
                detail: "high",
              },
            ],
          },
        ],
        truncation: "auto",
      },
      {
        body: { agent_reference: { name: agent.name, type: "agent_reference" } },
      },
    );

    console.log(`Initial response received (ID: ${response.id})`);

    // Main interaction loop with deterministic completion
    const maxIterations = 10; // Allow enough iterations for completion
    let iteration = 0;
    // Tracks whether the loop ended on its own. Comparing `iteration` with `maxIterations`
    // afterwards is not enough: a run that finishes on the last allowed iteration would be
    // misreported as having hit the limit.
    let finished = false;

    while (iteration < maxIterations) {
      iteration++;
      console.log(`\n--- Iteration ${iteration} ---`);

      // Check for computer calls in the response
      const computerCalls = response.output.filter((item) => item.type === "computer_call");

      if (computerCalls.length === 0) {
        // No further actions requested: the response holds the final answer.
        printFinalOutput({
          output: response.output,
          status: response.status ?? "",
        });
        finished = true;
        break;
      }

      // Process the first computer call
      const computerCall = computerCalls[0];
      if (!computerCall.action || !computerCall.call_id) {
        // Without an action and call id there is nothing to answer, and `response` will not
        // change unless a new request is sent. Retrying would only repeat this branch until
        // the iteration budget is exhausted, so stop here.
        console.log("Incomplete computer call received; stopping.");
        finished = true;
        break;
      }
      const action = computerCall.action;
      const callId: string = computerCall.call_id;

      console.log(`Processing computer call (ID: ${callId})`);

      // Handle the action and get the screenshot info
      const [screenshotInfo, updatedState] = handleComputerActionAndTakeScreenshot(
        action,
        currentState,
        screenshots,
      );
      currentState = updatedState;

      console.log(`Sending action result back to agent (using ${screenshotInfo.filename})...`);
      // Regular response with just the screenshot
      response = await openAIClient.responses.create(
        {
          previous_response_id: response.id,
          input: [
            {
              call_id: callId,
              type: "computer_call_output",
              output: {
                type: "computer_screenshot",
                file_id: screenshotInfo.fileId,
              },
            },
          ],
          truncation: "auto",
        },
        {
          body: { agent_reference: { name: agent.name, type: "agent_reference" } },
        },
      );

      console.log(`Follow-up response received (ID: ${response.id})`);
    }

    if (!finished) {
      console.log(`\nReached maximum iterations (${maxIterations}). Stopping.`);
    }
  } finally {
    // Clean up resources
    console.log("\nCleaning up...");
    await project.agents.deleteVersion(agent.name, agent.version);
    console.log("Agent deleted");
  }

  console.log("\nComputer Use Agent sample completed!");
}

// Entry point: run the sample and report any rejection instead of leaving it unhandled.
main().catch((error: unknown) => {
  console.error("The sample encountered an error:", error);
});