Skip to content

Commit ebbf4dc

Browse files
committed
feat(agent-tars): reuse browser between gui-agent and dom-tree-powered browser agent
1 parent 2ac6738 commit ebbf4dc

File tree

5 files changed

+127
-62
lines changed

5 files changed

+127
-62
lines changed

packages/agent-infra/mcp-servers/browser/src/server.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ interface GlobalConfig {
5050
launchOptions?: LaunchOptions;
5151
remoteOptions?: RemoteBrowserOptions;
5252
logger?: Partial<Logger>;
53+
externalBrowser?: LocalBrowser;
5354
}
5455

5556
// Global state
@@ -129,7 +130,13 @@ async function setInitialBrowser(
129130
globalPage = _page;
130131
}
131132

132-
// priority 2: create new browser and page
133+
// priority 2: use external browser from config if available
134+
if (!globalBrowser && globalConfig.externalBrowser) {
135+
globalBrowser = await globalConfig.externalBrowser.getBrowser();
136+
logger.info('Using external browser instance');
137+
}
138+
139+
// priority 3: create new browser and page
133140
if (!globalBrowser) {
134141
const browser = globalConfig.remoteOptions
135142
? new RemoteBrowser(globalConfig.remoteOptions)

packages/multimodal/agent-tars/examples/default.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
*/
55

66
import { join } from 'path';
7-
import { AgentTARS, AgentTARSOptions } from '../src';
7+
import { AgentTARS, AgentTARSOptions, LogLevel } from '../src';
88
import { TEST_MODEL_PROVIDERS } from '@multimodal/agent/_config';
99

1010
export const DEFUALT_OPTIONS: AgentTARSOptions = {
@@ -35,6 +35,7 @@ export const DEFUALT_OPTIONS: AgentTARSOptions = {
3535
experimental: {
3636
dumpMessageHistory: true,
3737
},
38+
logLevel: LogLevel.DEBUG,
3839
};
3940

4041
export async function runAgentTARS(query: string) {

packages/multimodal/agent-tars/src/agent-tars.ts

Lines changed: 74 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import { DEFAULT_SYSTEM_PROMPT } from './shared';
2525
import { InMemoryTransport } from '@modelcontextprotocol/sdk/inMemory.js';
2626
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
2727
import { GUIAgent } from './gui-agent';
28+
import { LocalBrowser } from '@agent-infra/browser';
2829

2930
/**
3031
* An Agent TARS that uses in-memory MCP tool calls
@@ -36,6 +37,7 @@ export class AgentTARS extends MCPAgent {
3637
private mcpServers: BuiltInMCPServers = {};
3738
private inMemoryMCPClients: Partial<Record<BuiltInMCPServerName, Client>> = {};
3839
private guiAgent?: GUIAgent;
40+
private sharedBrowser?: LocalBrowser;
3941

4042
// Message history storage for experimental dump feature
4143
private traces: Array<{
@@ -91,15 +93,6 @@ export class AgentTARS extends MCPAgent {
9193
command: 'npx',
9294
args: ['-y', '@agent-infra/mcp-server-browser'],
9395
},
94-
// Only include browser server if GUI agent is not enabled
95-
// ...(tarsOptions.browser?.controlSolution !== 'gui-agent'
96-
// ? {
97-
// browser: {
98-
// command: 'npx',
99-
// args: ['-y', '@agent-infra/mcp-server-browser'],
100-
// },
101-
// }
102-
// : {}),
10396
filesystem: {
10497
command: 'npx',
10598
args: ['-y', '@agent-infra/mcp-server-filesystem', workingDirectory],
@@ -137,31 +130,69 @@ export class AgentTARS extends MCPAgent {
137130
async initialize(): Promise<void> {
138131
this.logger.info('Initializing AgentTARS ...');
139132

140-
const initPromises: Promise<void>[] = [
133+
try {
134+
// First initialize shared browser instance
135+
await this.initializeSharedBrowser();
136+
137+
const initPromises: Promise<void>[] = [
138+
/**
139+
* Base mcp-agent's initialization process.
140+
*/
141+
super.initialize(),
142+
];
143+
141144
/**
142-
* Base mcp-agent's initialization process.
145+
* Initialize GUI Agent if enabled
143146
*/
144-
super.initialize(),
145-
];
146-
147-
/**
148-
* Initialize GUI Agent if enabled
149-
*/
150-
if (this.tarsOptions.browser?.controlSolution === 'gui-agent') {
151-
await this.initializeGUIAgent();
152-
}
147+
if (this.tarsOptions.browser?.controlSolution === 'gui-agent') {
148+
await this.initializeGUIAgent();
149+
}
150+
151+
/**
152+
* In-process MCP initialization.
153+
*/
154+
if (this.tarsOptions.mcpImpl === 'in-memory') {
155+
initPromises.push(this.initializeInMemoryMCPForBuiltInMCPServers());
156+
}
153157

154-
/**
155-
* In-process MCP initialization.
156-
*/
157-
if (this.tarsOptions.mcpImpl === 'in-memory') {
158-
initPromises.push(this.initializeInMemoryMCPForBuiltInMCPServers());
158+
await Promise.all(initPromises);
159+
this.logger.info('✅ AgentTARS initialization complete');
160+
// Log all registered tools in a beautiful format
161+
this.logRegisteredTools();
162+
} catch (error) {
163+
this.logger.error('❌ Failed to initialize AgentTARS:', error);
164+
await this.cleanup();
165+
throw error;
159166
}
167+
}
168+
169+
/**
170+
* Initialize shared browser instance
171+
*/
172+
/**
173+
* Initialize shared browser instance
174+
*/
175+
private async initializeSharedBrowser(): Promise<void> {
176+
try {
177+
this.logger.info('🌐 Initializing shared browser instance...');
178+
179+
this.sharedBrowser = new LocalBrowser({
180+
logger: this.logger.spawn('SharedBrowser'),
181+
});
182+
183+
// Configure browser based on options
184+
const launchOptions = {
185+
headless: this.tarsOptions.browser?.headless,
186+
};
187+
188+
// Launch the browser
189+
await this.sharedBrowser.launch(launchOptions);
160190

161-
await Promise.all(initPromises);
162-
this.logger.info('✅ AgentTARS initialization complete');
163-
// Log all registered tools in a beautiful format
164-
this.logRegisteredTools();
191+
this.logger.success('✅ Shared browser instance initialized with initial page');
192+
} catch (error) {
193+
this.logger.error(`❌ Failed to initialize shared browser: ${error}`);
194+
throw error;
195+
}
165196
}
166197

167198
/**
@@ -226,10 +257,11 @@ export class AgentTARS extends MCPAgent {
226257
try {
227258
this.logger.info('🖥️ Initializing GUI Agent for visual browser control');
228259

229-
// Create GUI Agent instance
260+
// Create GUI Agent instance with shared browser
230261
this.guiAgent = new GUIAgent({
231-
logger: this.logger.spawn('GUIAgent'),
262+
logger: this.logger,
232263
headless: this.tarsOptions.browser?.headless,
264+
externalBrowser: this.sharedBrowser, // Pass the shared browser instance
233265
});
234266

235267
// Initialize the browser
@@ -256,10 +288,6 @@ export class AgentTARS extends MCPAgent {
256288
const moduleImports = [
257289
this.dynamicImport('@agent-infra/mcp-server-search'),
258290
this.dynamicImport('@agent-infra/mcp-server-browser'),
259-
// Only import browser module if GUI agent is not enabled
260-
// ...(this.tarsOptions.browser?.controlSolution !== 'gui-agent'
261-
// ? [this.dynamicImport('@agent-infra/mcp-server-browser')]
262-
// : [Promise.resolve({ default: { createServer: () => undefined } })]),
263291
this.dynamicImport('@agent-infra/mcp-server-filesystem'),
264292
this.dynamicImport('@agent-infra/mcp-server-commands'),
265293
];
@@ -280,20 +308,11 @@ export class AgentTARS extends MCPAgent {
280308
baseUrl: this.tarsOptions.search!.baseUrl,
281309
}),
282310
browser: browserModule.default.createServer({
311+
externalBrowser: this.sharedBrowser, // Pass the shared browser instance
283312
launchOptions: {
284313
headless: this.tarsOptions.browser?.headless,
285314
},
286315
}),
287-
// Only create browser server if GUI agent is not enabled
288-
// ...(this.tarsOptions.browser?.controlSolution !== 'gui-agent'
289-
// ? {
290-
// browser: browserModule.default.createServer({
291-
// launchOptions: {
292-
// headless: this.tarsOptions.browser?.headless,
293-
// },
294-
// }),
295-
// }
296-
// : {}),
297316
filesystem: filesystemModule.default.createServer({
298317
allowedDirectories: [this.workingDirectory],
299318
}),
@@ -462,13 +481,23 @@ export class AgentTARS extends MCPAgent {
462481
}
463482
}
464483

484+
// Finally close the shared browser instance
485+
if (this.sharedBrowser) {
486+
cleanupPromises.push(
487+
this.sharedBrowser.close().catch((error) => {
488+
this.logger.warn(`⚠️ Error while closing shared browser: ${error}`);
489+
}),
490+
);
491+
}
492+
465493
// Wait for all cleanup operations to complete
466494
await Promise.allSettled(cleanupPromises);
467495

468496
// Clear references
469497
this.inMemoryMCPClients = {};
470498
this.mcpServers = {};
471499
this.guiAgent = undefined;
500+
this.sharedBrowser = undefined;
472501

473502
this.logger.info('✅ Cleanup complete');
474503
}

packages/multimodal/agent-tars/src/gui-agent.ts

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@ export interface ActionInputs {
2727
end_coords?: Coords;
2828
}
2929

30+
function sleep(time: number) {
31+
return new Promise(function (resolve) {
32+
setTimeout(resolve, time);
33+
});
34+
}
35+
3036
/**
3137
* Parsed prediction from GUI agent
3238
*/
@@ -49,6 +55,8 @@ export interface GUIAgentOptions {
4955
headless?: boolean;
5056
/** Scaling factors for coordinates */
5157
factors?: [number, number];
58+
/** External browser instance to use (optional) */
59+
externalBrowser?: LocalBrowser;
5260
}
5361

5462
/**
@@ -62,6 +70,7 @@ export class GUIAgent {
6270
private guiAgentTool: ToolDefinition;
6371
private logger: ConsoleLogger;
6472
private factors: [number, number];
73+
private externalBrowserInstance: boolean;
6574

6675
/**
6776
* Creates a new GUI Agent
@@ -70,11 +79,14 @@ export class GUIAgent {
7079
constructor(private options: GUIAgentOptions) {
7180
this.logger = options.logger;
7281
this.factors = options.factors || [1000, 1000];
82+
this.externalBrowserInstance = !!options.externalBrowser;
7383

74-
// Initialize browser
75-
this.browser = new LocalBrowser({
76-
logger: this.logger,
77-
});
84+
// Initialize browser - use external browser if provided, otherwise create new one
85+
this.browser =
86+
options.externalBrowser ||
87+
new LocalBrowser({
88+
logger: this.logger,
89+
});
7890

7991
// Initialize browser operator
8092
this.browserOperator = new BrowserOperator({
@@ -145,6 +157,8 @@ finished(content='xxx') # Use escape characters \\', \", and \\n in content part
145157
screenHeight: this.screenHeight || 1080,
146158
});
147159

160+
await sleep(500);
161+
148162
return { action, status: 'success', result };
149163
} catch (error) {
150164
this.logger.error(
@@ -164,13 +178,19 @@ finished(content='xxx') # Use escape characters \\', \", and \\n in content part
164178
* Initialize the GUI Agent and launch the browser
165179
*/
166180
async initialize(): Promise<void> {
167-
await this.browser.launch({
168-
headless: this.options.headless,
169-
});
170-
const openingPage = await this.browser.createPage();
171-
await openingPage.goto('about:blank', {
172-
waitUntil: 'networkidle2',
173-
});
181+
// Only launch browser if it wasn't provided externally
182+
if (!this.externalBrowserInstance) {
183+
await this.browser.launch({
184+
headless: this.options.headless,
185+
});
186+
187+
// Create new page only when using internal browser
188+
const openingPage = await this.browser.createPage();
189+
await openingPage.goto('about:blank', {
190+
waitUntil: 'networkidle2',
191+
});
192+
}
193+
// Skip page creation when using external browser since it should already have a page
174194

175195
this.logger.info('GUI Agent browser initialized');
176196
}
@@ -189,6 +209,8 @@ finished(content='xxx') # Use escape characters \\', \", and \\n in content part
189209
* - Sends the screenshot to the event stream
190210
*/
191211
async onEachAgentLoopStart(eventStream: any): Promise<void> {
212+
console.log('Agent Loop Start');
213+
192214
// Record screenshot start time
193215
const startTime = performance.now();
194216

@@ -207,7 +229,8 @@ finished(content='xxx') # Use escape characters \\', \", and \\n in content part
207229
const sizeInBytes = Math.ceil((base64Data.length * 3) / 4);
208230
const sizeInKB = (sizeInBytes / 1024).toFixed(2);
209231

210-
this.logger.debug('Screenshot info:', {
232+
// FIXME: use this.logger.debug here instead of console.log
233+
console.log('Screenshot info:', {
211234
width: this.screenWidth,
212235
height: this.screenHeight,
213236
size: `${sizeInKB} KB`,
@@ -242,8 +265,13 @@ finished(content='xxx') # Use escape characters \\', \", and \\n in content part
242265
*/
243266
async cleanup(): Promise<void> {
244267
try {
245-
await this.browser.close();
246-
this.logger.info('Browser closed successfully');
268+
// Only close browser if it wasn't provided externally
269+
if (!this.externalBrowserInstance) {
270+
await this.browser.close();
271+
this.logger.info('Browser closed successfully');
272+
} else {
273+
this.logger.info('Skipping browser close - using external browser instance');
274+
}
247275
} catch (error) {
248276
this.logger.error(`Error closing browser: ${error}`);
249277
}

packages/multimodal/agent-tars/src/types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,4 +218,4 @@ export type AgentTARSOptions = Partial<MCPAgentOptions> & {
218218
* Experimental features configuration
219219
*/
220220
experimental?: AgentTARSExperimentalOptions;
221-
};
221+
};

0 commit comments

Comments
 (0)