Enable using different LLMs for extract/query (#87)

anerli · web-flow · commit 12058503d1d6 · 2025-07-12T16:02:57.000-07:00
diff --git a/docs/advanced/roles.mdx b/docs/advanced/roles.mdx
@@ -0,0 +1,72 @@
+---
+title: LLM Roles
+description: "Designate different LLMs for different responsibilities"
+icon: venetian-mask
+---
+
+You can customize the Magnitude agent to use different LLMs for each of the three primary operations: `act`, `extract`, and `query`.
+
+By default, when a single LLM is provided, all responsibilities are handled by that LLM. However, by assigning different LLMs to specific roles you may be able to reduce cost and improve speed.
+
+Example:
+```typescript
+import { startBrowserAgent } from 'magnitude-core';
+import z from 'zod';
+
+async function main() {
+    const agent = await startBrowserAgent({
+        url: 'https://magnitasks.com/tasks',
+        narrate: true,
+        llm: [
+            {
+                provider: 'claude-code',
+                options: {
+                    model: 'claude-sonnet-4-20250514'
+                }
+            },
+            {
+                roles: ['extract'],
+                provider: 'google-ai',
+                options: {
+                    model: 'gemini-2.5-flash-lite-preview-06-17'
+                }
+            },
+            {
+                roles: ['query'],
+                provider: 'google-ai',
+                options: {
+                    // Balance intelligent querying and cheap tokens
+                    model: 'gemini-2.5-flash'
+                }
+            }
+        ]
+    });
+    
+    const tasks = await agent.extract(
+        'Extract all tasks in To Do column',
+        z.array(z.object({ title: z.string(), desc: z.string() }))
+    ); // ^ this will use gemini-2.5-flash-lite-preview-06-17
+
+    await agent.act('Move each to in progress', { data: tasks });
+    // ^ this will use Claude
+
+    const numTodosMoved = await agent.query(
+        'How many todos were moved?',
+        z.number()
+    ); // ^ this will use gemini-2.5-flash
+
+    console.log(numTodosMoved);
+
+    await agent.stop();
+}
+
+main();
+```
+
+One great use case for this is to reduce the cost of extracting data. While `act` requires an intelligent and [visually grounded model](/core-concepts/compatible-llms), `extract` and `query` do not require grounded models, and can often work fine with less intelligent models.
+
+General recommendations:
+- `act`: MUST use an [intelligent, visually grounded model](/core-concepts/compatible-llms)
+- `extract`: Can use a fast and cheap model, like `gemini-2.5-flash` or even `gemini-2.5-flash-lite`
+- `query`: Can use any model that's reasonably intelligent but fast, depending on the complexity of the queries you plan to ask. `gemini-2.5-flash` might be a good option.
+
diff --git a/docs/core-concepts/query.mdx b/docs/core-concepts/query.mdx
@@ -0,0 +1,7 @@
+---
+title: Asking Questions
+description: "Query the agent about things that happened"
+icon: message-circle
+---
+
+You can use `agent.query` in order to ask the agent about the actions it just took or anything that happened in the last call to `agent.act`.
diff --git a/docs/docs.json b/docs/docs.json
@@ -81,7 +81,8 @@
                     {
                         "group": "Advanced",
                         "pages": [
-                            "advanced/memory"
+                            "advanced/memory",
+                            "advanced/roles"
                         ]
                     },
                     {
diff --git a/packages/magnitude-core/src/agent/browserAgent.ts b/packages/magnitude-core/src/agent/browserAgent.ts
@@ -152,7 +152,7 @@ export class BrowserAgent extends Agent {
         const markdown = serializeToMarkdown(result, markdownOptions);
 
         const screenshot = await this.require(BrowserConnector).getHarness().screenshot();
-        const data = await this.model.extract(instructions, schema, screenshot, markdown);
+        const data = await this.models.extract(instructions, schema, screenshot, markdown);
 
         this.browserAgentEvents.emit('extractDone', instructions, data);
 
diff --git a/packages/magnitude-core/src/agent/index.ts b/packages/magnitude-core/src/agent/index.ts
@@ -17,10 +17,11 @@ import { telemetrifyAgent } from '@/telemetry/events';
 import { isClaude } from '@/ai/util';
 import { retryOnError } from '@/common';
 import { renderContentParts } from '@/memory/rendering';
+import { MultiModelHarness } from '@/ai/multiModelHarness';
 
 
 export interface AgentOptions {
-    llm?: LLMClient;
+    llm?: LLMClient | LLMClient[];
     connectors?: AgentConnector[];
     actions?: ActionDefinition<any>[]; // any additional actions not provided by connectors
     prompt?: string | null; // additional agent-level system prompt instructions
@@ -58,7 +59,9 @@ export class Agent {
 
     private memoryOptions: AgentMemoryOptions;
 
-    public readonly model: ModelHarness;
+    public readonly models: MultiModelHarness;
+
+    //public readonly model: ModelHarness;
     //public readonly micro: GroundingService;
     //public readonly events: EventEmitter<AgentEvents>;
 
@@ -90,15 +93,27 @@ export class Agent {
         // TODO: maybe error instead, or automatically differentiate them?
         //this.options.actions = Array.from(new Map(aggregatedActions.map(actDef => [actDef.name, actDef])).values());
 
-        const doPromptCaching = ('promptCaching' in this.options.llm.options) ? this.options.llm.options.promptCaching : isClaude(this.options.llm) && (this.options.llm.provider === 'anthropic' || this.options.llm.provider === 'claude-code');
-        //console.log('doPromptCaching?', doPromptCaching)
-        
-        //promptCaching: doPromptCaching
-        //if ('promptCaching' in this.options.llm.options) this.options.llm.options.promptCaching = doPromptCaching;
-        // needs testing
-        if (this.options.llm.provider === 'anthropic' || this.options.llm.provider === 'claude-code') this.options.llm.options.promptCaching = doPromptCaching;
-        this.model = new ModelHarness({ llm: this.options.llm });
-        this.model.events.on('tokensUsed', (usage) => this.events.emit('tokensUsed', usage), this);
+        const llms = Array.isArray(this.options.llm) ? this.options.llm : [this.options.llm];
+
+        let doPromptCaching = false;
+        for (const client of llms ) {
+            // If any LLM is prompt-caching compatible, turn on prompt caching overall for memory etc.
+            if (isClaude(client) && (client.provider === 'anthropic' || client.provider === 'claude-code')) {
+                // Prompt-caching compatible client
+
+                if ('promptCaching' in client.options && client.options.promptCaching !== undefined) {
+                    doPromptCaching = client.options.promptCaching;
+                } else {
+                    // Default to true if not specified, and override on client config to true
+                    doPromptCaching = true;
+                    client.options.promptCaching = true;
+                }
+            }
+        }
+
+        //this.model = new ModelHarness({ llm: this.options.llm });
+        this.models = new MultiModelHarness(llms);
+        this.models.events.on('tokensUsed', (usage) => this.events.emit('tokensUsed', usage), this);
         this.doneActing = false;
 
         this.memoryOptions = {
@@ -130,7 +145,7 @@ export class Agent {
         if (this.options.telemetry) telemetrifyAgent(this);
 
         //console.log('setting up model')
-        await this.model.setup();
+        await this.models.setup();
         //console.log('done setting up model')
 
         logger.info("Agent: Starting connectors...");
@@ -329,7 +344,7 @@ export class Agent {
                 const memoryContext = await this._buildContext(memory);
                 await retryOnError(
                     async () => {
-                        ({ reasoning, actions } = await this.model.createPartialRecipe(
+                        ({ reasoning, actions } = await this.models.partialAct(
                             memoryContext,
                             description,
                             dataContentParts,
@@ -395,7 +410,7 @@ export class Agent {
         // Record observations in case no act() was used beforehand
         await this._recordConnectorObservations(this.latestTaskMemory);
         const memoryContext = await this._buildContext(this.memory);//this.memory.buildContext(this.connectors);
-        return await this.model.query(memoryContext, query, schema);
+        return await this.models.query(memoryContext, query, schema);
     }
 
     async queueDone() {
diff --git a/packages/magnitude-core/src/agent/narrator.ts b/packages/magnitude-core/src/agent/narrator.ts
@@ -22,15 +22,15 @@ export function narrateAgent(agent: Agent) {
     });
 
     agent.events.on('start', () => {
-        console.log(bold(blueBright(`▶ [start] agent started with ${agent.model.describeModel()}`)));
+        console.log(bold(blueBright(`▶ [start] agent started with ${agent.models.describe()}`)));
     });
 
     agent.events.on('stop', () => {
         console.log(bold(blueBright(`■ [stop] agent stopped`)));
 
         console.log(`  Total usage: ` + bold`${totalInputTokens + totalCachedWriteInputTokens + totalCachedReadInputTokens}` + ` input tokens` + (totalCachedWriteInputTokens > 0 || totalCachedReadInputTokens > 0 ? ` (${totalCachedWriteInputTokens} cache write, ${totalCachedReadInputTokens} cache read)` : '') + ` / ` + bold`${totalOutputTokens}` + ` output tokens`);
         if (totalInputTokenCost > 0 || totalOutputTokenCost > 0) {
-            if (agent.model.describeModel().startsWith('claude-code')) {
+            if (agent.models.numUniqueModels === 1 && agent.models.describe().startsWith('claude-code')) {
                 console.log(`  Cost: ` + cyanBright`None - using Claude Pro or Max subscription`)
             } else {
                 console.log(`  Cost: $${(totalInputTokenCost + totalOutputTokenCost).toFixed(3)}`);
diff --git a/packages/magnitude-core/src/ai/modelHarness.ts b/packages/magnitude-core/src/ai/modelHarness.ts
@@ -77,7 +77,7 @@ export class ModelHarness {
         return `${this.options.llm.provider}:${'model' in this.options.llm.options ? this.options.llm.options.model : 'unknown'}`;
     }
 
-    reportUsage(): void {
+    private _reportUsage(): void {
         // console.log('this.collector.last', this.collector.last)
         // if (this.collector.last) console.log("calls:", this.collector.last.calls)//console.log("Response: ", this.collector.last.calls[-1].httpResponse);
         //console.log('last call:', this.collector.last?.calls.at(-1)?.httpResponse?.body.json());
@@ -116,7 +116,8 @@ export class ModelHarness {
         const knownCostMap: Record<string, { inputTokens: number, outputTokens: number, cacheWriteInputTokens?: number, cacheReadInputTokens?: number }> = {
             // TODO: track cached savings on Gemini
             'gemini-2.5-pro': { inputTokens: 1.25, outputTokens: 10.0 },
-            'gemini-2.5-flash': { inputTokens: 0.15, outputTokens: 0.60 },
+            'gemini-2.5-flash': { inputTokens: 0.30, outputTokens: 2.50 },
+            'gemini-2.5-flash-lite': { inputTokens: 0.10, outputTokens: 0.40 },
             'claude-3.5-sonnet': { inputTokens: 3.00, outputTokens: 15.00, cacheWriteInputTokens: 3.75, cacheReadInputTokens: 0.30 },
             'claude-3.7-sonnet': { inputTokens: 3.00, outputTokens: 15.00, cacheWriteInputTokens: 3.75, cacheReadInputTokens: 0.30 },
             'claude-sonnet-4': { inputTokens: 3.00, outputTokens: 15.00, cacheWriteInputTokens: 3.75, cacheReadInputTokens: 0.30 },
@@ -172,7 +173,7 @@ export class ModelHarness {
         this.prevTotalOutputTokens = outputTokens;
     }
 
-    async createPartialRecipe<T>(
+    async partialAct<T>(
         context: AgentContext, // Changed to ModularMemoryContext
         task: string,
         data: MultiMediaContentPart[],
@@ -195,7 +196,7 @@ export class ModelHarness {
         this.logger.trace(`createPartialRecipe took ${Date.now()-start}ms`);
         // BAML does not carry over action type to @@dynamic of PartialRecipe, so forced cast necssary
         //return response as unknown as { actions: z.infer<ActionDefinition<T>['schema']>[] };//, finished: boolean };
-        this.reportUsage();
+        this._reportUsage();
         return {
             reasoning: response.reasoning,//(response.observations ? response.observations + " " : "") + response.meta_reasoning + " " + response.reasoning,
             actions: response.actions// as z.infer<ActionDefinition<T>['schema']>[]
@@ -225,7 +226,7 @@ export class ModelHarness {
             this.options.llm.provider === 'claude-code',
             { tb }
         );
-        this.reportUsage();
+        this._reportUsage();
 
         if (schema instanceof z.ZodObject) {
             return resp;
@@ -254,7 +255,7 @@ export class ModelHarness {
             this.options.llm.provider === 'claude-code',
             { tb }
         );
-        this.reportUsage();
+        this._reportUsage();
         
         if (schema instanceof z.ZodObject) {
             return resp;
diff --git a/packages/magnitude-core/src/ai/multiModelHarness.ts b/packages/magnitude-core/src/ai/multiModelHarness.ts
@@ -0,0 +1,73 @@
+import { MultiMediaContentPart } from "@/memory/rendering";
+import { ModelHarness, ModelHarnessEvents } from "./modelHarness";
+import { allBrowserAgentRoles, BrowserAgentRole, LLMClient } from "./types";
+import { ActionDefinition } from "@/actions";
+import { AgentContext } from "./baml_client";
+import { Action } from "@/actions/types";
+import { Image } from '@/memory/image';
+import EventEmitter from "eventemitter3";
+import z from "zod";
+
+
+/**
+ * Delegates model responsibilities (`act`, `extract`, `query`) to different LLMs
+ * and consolidates their token usage events on a single emitter.
+ *
+ * A client without `roles` is assigned to every role. A client that lists
+ * `roles` is assigned only to the roles it names, overriding broader clients.
+ */
+export class MultiModelHarness {
+    // Role -> harness lookup. Several roles may reference the same harness, and a
+    // role stays unset when no configured client covers it.
+    private readonly roles: Partial<Record<BrowserAgentRole, ModelHarness>> = {};
+    // One harness per configured client, ordered from least to most specific.
+    private readonly uniqueModels: ModelHarness[] = [];
+
+    public readonly events: EventEmitter<ModelHarnessEvents> = new EventEmitter();
+
+    /**
+     * Builds one harness per client and assigns it to the roles it should serve.
+     *
+     * @param clients LLM clients to delegate to; `roles` on a client restricts which roles it serves.
+     */
+    constructor(clients: LLMClient[]) {
+        // Sort by specificity (from least specific to most specific). Clients without
+        // `roles` count as broadest, so narrower clients are assigned later and win.
+        const sortedClients = clients.toSorted((a, b) => (b.roles ? b.roles.length : 9999) - (a.roles ? a.roles.length : 9999));
+        for (const client of sortedClients) {
+            const harness = new ModelHarness({ llm: client });
+            this.uniqueModels.push(harness);
+
+            // No explicit roles means the client handles every role.
+            for (const role of client.roles ?? allBrowserAgentRoles) {
+                this.roles[role] = harness;
+            }
+
+            // Forward token usage events upward
+            harness.events.on('tokensUsed', (usage) => { this.events.emit('tokensUsed', usage) }, this);
+        }
+    }
+
+    /** Runs `setup()` on every harness and resolves once all of them have finished. */
+    async setup() {
+        await Promise.all(this.uniqueModels.map(model => model.setup()));
+    }
+
+    /**
+     * Describes the first harness after sorting, i.e. the least specific model.
+     *
+     * @throws Error when no LLM clients were configured.
+     */
+    describe(): string {
+        const leastSpecific = this.uniqueModels[0];
+        if (!leastSpecific) {
+            throw new Error('MultiModelHarness has no LLMs configured');
+        }
+        // for now - describe least specific model
+        return leastSpecific.describeModel();
+    }
+
+    /**
+     * Looks up the harness assigned to a role.
+     *
+     * The lookup is lazy so that a configuration covering only some roles keeps
+     * working as long as the uncovered roles are never used.
+     *
+     * @throws Error when no configured client covers the role.
+     */
+    private harnessFor(role: BrowserAgentRole): ModelHarness {
+        const harness = this.roles[role];
+        if (!harness) {
+            throw new Error(`No LLM is configured for the '${role}' role. Provide an LLM without 'roles' to act as the default, or one whose 'roles' include '${role}'.`);
+        }
+        return harness;
+    }
+    
+    // TODO: generalize responsibility delegation
+    /** Forwards to `partialAct` on the harness assigned to the `act` role. */
+    async partialAct<T>(
+        context: AgentContext,
+        task: string,
+        data: MultiMediaContentPart[],
+        actionVocabulary: ActionDefinition<T>[]
+    ): Promise<{ reasoning: string, actions: Action[] }> {
+        return await this.harnessFor('act').partialAct(context, task, data, actionVocabulary);
+    }
+
+    /** Forwards to `extract` on the harness assigned to the `extract` role. */
+    async extract<T extends z.Schema>(instructions: string, schema: T, screenshot: Image, domContent: string): Promise<z.infer<T>> {
+        return await this.harnessFor('extract').extract(instructions, schema, screenshot, domContent);
+    }
+
+    /** Forwards to `query` on the harness assigned to the `query` role. */
+    async query<T extends z.Schema>(context: AgentContext, query: string, schema: T): Promise<z.infer<T>> {
+        return await this.harnessFor('query').query(context, query, schema);
+    }
+
+    /** Number of harnesses created, one per configured client. */
+    get numUniqueModels() {
+        return this.uniqueModels.length;
+    }
+}
diff --git a/packages/magnitude-core/src/ai/types.ts b/packages/magnitude-core/src/ai/types.ts
@@ -3,8 +3,12 @@
 //     confidence: number
 // }
 
+// The agent operations that can each be delegated to a different LLM.
+export type BrowserAgentRole = 'act' | 'extract' | 'query';
+// Every role. No `as const` here: a const assertion yields a readonly tuple,
+// which is not assignable to the mutable array type declared for this export.
+export const allBrowserAgentRoles: BrowserAgentRole[] = ['act', 'extract', 'query'];
+
 // Approximately mirrors https://docs.boundaryml.com/ref/llm-client-providers
-export type LLMClient = AnthropicClient | ClaudeCodeClient | BedrockClient | GoogleAIClient | GoogleVertexClient | OpenAIClient | OpenAIGenericClient | AzureOpenAIClient;
+// Any supported provider client, optionally tagged with the agent roles it should serve.
+// When `roles` is omitted, MultiModelHarness assigns the client to every role.
+export type LLMClient = (AnthropicClient | ClaudeCodeClient | BedrockClient | GoogleAIClient | GoogleVertexClient | OpenAIClient | OpenAIGenericClient | AzureOpenAIClient) &
+    { roles?: BrowserAgentRole[] };
 export type GroundingClient = MoondreamClient;
 
 export interface AnthropicClient {
diff --git a/packages/magnitude-core/src/ai/util.ts b/packages/magnitude-core/src/ai/util.ts
@@ -238,28 +238,33 @@ export function buildDefaultBrowserAgentOptions(
     //const { llm: envLlm, grounding: envGrounding } = tryDeriveUIGroundedClients();
     const envLlm = tryDeriveUIGroundedClient();
     
-    let llm: LLMClient | null = agentOptions.llm ?? envLlm;
+    //let llm: LLMClient | null = agentOptions.llm ?? envLlm;
+    let llms: LLMClient[] = agentOptions.llm ? (Array.isArray(agentOptions.llm) ? agentOptions.llm : [agentOptions.llm]) : (envLlm ? [envLlm] : []);
     const grounding = browserOptions.grounding;//(llm && isGroundedLlm(llm)) ? null : (browserOptions.grounding ?? envGrounding);
     
-    if (!llm) {
+    if (llms.length == 0) {
         throw new Error("No LLM configured or available from environment. Set environment variable ANTHROPIC_API_KEY and try again. See https://docs.magnitude.run/customizing/llm-configuration for details");
     }
     // else if (!isGroundedLlm(llm) && !grounding) {
     //     throw new Error("Ungrounded LLM is configured without Moondream. Either use Anthropic (set ANTHROPIC_API_KEY) or provide a MOONDREAM_API_KEY");
     // }
 
     // Set reasonable temp if not provided
-    let llmOptions: LLMClient['options'] = { temperature: DEFAULT_BROWSER_AGENT_TEMP, ...(llm?.options ?? {}) };
-    llm = {...llm, options: llmOptions as any }
-
     let virtualScreenDimensions = null;
-    if (isClaude(llm)) {
-        // Claude grounding only really works on 1024x768 screenshots
-        virtualScreenDimensions = { width: 1024, height: 768 };
+    for (const llm of llms) {
+        let llmOptions: LLMClient['options'] = { temperature: DEFAULT_BROWSER_AGENT_TEMP, ...(llm?.options ?? {}) };
+        //let modifiedLlm = {...llm, options: llmOptions as any }
+        llm.options = llmOptions;
+
+        if (isClaude(llm)) {
+            // Claude grounding only really works on 1024x768 screenshots
+            // if any model is claude, use virtual screen dimensions
+            virtualScreenDimensions = { width: 1024, height: 768 };
+        }
     }
 
     return {
-        agentOptions: {...agentOptions, llm: llm },
+        agentOptions: {...agentOptions, llm: llms },
         browserOptions: {...browserOptions, grounding: grounding ?? undefined, virtualScreenDimensions: virtualScreenDimensions ?? undefined }
     };
 }
diff --git a/packages/magnitude-core/src/connectors/browserConnector.ts b/packages/magnitude-core/src/connectors/browserConnector.ts

Original file line number	Diff line number	Diff line change
`@@ -81,7 +81,8 @@`
`81`	`81`	`{`
`82`	`82`	`"group": "Advanced",`
`83`	`83`	`"pages": [`
`84`		`- "advanced/memory"`
	`84`	`+ "advanced/memory",`
	`85`	`+ "advanced/roles"`
`85`	`86`	`]`
`86`	`87`	`},`
`87`	`88`	`{`