Skip to content

Commit d0878b8

Browse files
authored
feat(mcp-browser): vision mode add browser_vision_click and fullPage (#637)
1 parent a85ef50 commit d0878b8

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

49 files changed

+306
-36
lines changed

.changeset/pre.json

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@
2727
"@multimodal/agent": "0.1.1-beta.3",
2828
"@multimodal/agent-interface": "0.1.1-beta.3",
2929
"@multimodal/agent-snapshot": "0.1.1-beta.3",
30-
"@agent-tars/core": "0.0.1-beta.4",
31-
"@agent-tars/cli": "0.0.1-beta.4",
32-
"@agent-tars/server": "0.0.1-beta.4",
33-
"@agent-tars/test-workspace": "0.0.1-beta.4",
34-
"@agent-tars/web-ui": "0.0.1-beta.4",
30+
"@agent-tars/core": "0.0.1-beta.5",
31+
"@agent-tars/cli": "0.0.1-beta.5",
32+
"@agent-tars/server": "0.0.1-beta.5",
33+
"@agent-tars/test-workspace": "0.0.1-beta.5",
34+
"@agent-tars/web-ui": "0.0.1-beta.5",
3535
"@multimodal/codeact-agent": "0.1.1-beta.3",
3636
"@multimodal/config-loader": "0.1.1-beta.3",
3737
"@multimodal/deep-research-agent": "0.1.1-beta.3",
@@ -56,6 +56,7 @@
5656
"metal-flies-ring",
5757
"serious-spiders-wash",
5858
"sour-rocks-sip",
59-
"spicy-kids-allow"
59+
"spicy-kids-allow",
60+
"twenty-snakes-refuse"
6061
]
6162
}

.changeset/twenty-snakes-refuse.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@agent-infra/mcp-server-browser': patch
3+
---
4+
5+
feat: support vision mode

packages/agent-infra/mcp-client/CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# @agent-infra/mcp-client
22

3+
## 1.1.6-beta.3
4+
5+
### Patch Changes
6+
7+
- @agent-infra/mcp-shared@1.1.6-beta.3
8+
39
## 1.1.6-beta.2
410

511
### Patch Changes

packages/agent-infra/mcp-client/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@agent-infra/mcp-client",
3-
"version": "1.1.6-beta.2",
3+
"version": "1.1.6-beta.3",
44
"description": "An MCP Client to run servers for Electron apps, support same-process approaching",
55
"types": "./dist/index.d.ts",
66
"main": "./dist/index.js",

packages/agent-infra/mcp-servers/browser/CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# @agent-infra/mcp-server-browser
22

3+
## 1.1.6-beta.3
4+
5+
### Patch Changes
6+
7+
- feat: support vision mode
8+
39
## 1.1.6-beta.2
410

511
### Patch Changes
@@ -10,6 +16,10 @@
1016

1117
## 1.1.6-beta.0
1218

19+
### Patch Changes
20+
21+
- feat: add fullPage
22+
1323
## 1.1.5
1424

1525
### Patch Changes

packages/agent-infra/mcp-servers/browser/package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@agent-infra/mcp-server-browser",
3-
"version": "1.1.6-beta.2",
3+
"version": "1.1.6-beta.3",
44
"description": "MCP server for browser use access",
55
"license": "MIT",
66
"homepage": "https://github.com/bytedance/UI-TARS-desktop",
@@ -23,10 +23,12 @@
2323
"scripts": {
2424
"build": "shx rm -rf dist && rslib build && shx chmod +x dist/*.{js,cjs}",
2525
"dev": "npx -y @modelcontextprotocol/inspector tsx src/index.ts",
26+
"dev:vision": "npx -y @modelcontextprotocol/inspector tsx src/index.ts --vision",
2627
"prepare": "npm run build",
2728
"watch": "rslib build --watch"
2829
},
2930
"dependencies": {
31+
"@ui-tars/action-parser": "workspace:*",
3032
"mcp-http-server": "workspace:*",
3133
"@modelcontextprotocol/sdk": "^1.11.2"
3234
},

packages/agent-infra/mcp-servers/browser/src/index.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,10 @@ program
9494
'--viewport-size <size>',
9595
'specify browser viewport size in pixels, for example "1280, 720"',
9696
)
97-
// .option(
98-
// '--vision',
99-
// 'Run server that uses screenshots (Aria snapshots are used by default)',
100-
// )
97+
.option(
98+
'--vision',
99+
'Run server that uses screenshots (Aria snapshots are used by default)',
100+
)
101101
.action(async (options) => {
102102
try {
103103
console.log('[mcp-server-browser] options', options);
@@ -110,6 +110,7 @@ program
110110
cdpEndpoint: options.cdpEndpoint,
111111
},
112112
}),
113+
vision: options.vision,
113114
launchOptions: {
114115
headless: options.headless,
115116
executablePath: options.executablePath,

packages/agent-infra/mcp-servers/browser/src/server.ts

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ import merge from 'lodash.merge';
4343
import { parseProxyUrl } from './utils.js';
4444
import { ElementHandle, KeyInput } from 'puppeteer-core';
4545
import { keyInputValues } from './constants.js';
46+
import { getVisionTools, visionToolsMap } from './tools/vision.js';
47+
import { ToolContext } from './typings.js';
4648

4749
const consoleLogs: string[] = [];
4850

@@ -72,6 +74,11 @@ interface GlobalConfig {
7274
* @defaultValue true
7375
*/
7476
enableAdBlocker?: boolean;
77+
/**
78+
* Whether to add vision tools
79+
* @defaultValue false
80+
*/
81+
vision?: boolean;
7582
}
7683

7784
// Global state
@@ -81,6 +88,7 @@ let globalConfig: GlobalConfig = {
8188
},
8289
contextOptions: {},
8390
enableAdBlocker: true,
91+
vision: false,
8492
};
8593

8694
let globalBrowser: LocalBrowser['browser'] | undefined;
@@ -273,6 +281,10 @@ export const toolsMap = {
273281
.number()
274282
.optional()
275283
.describe('Height in pixels (default: viewport height)'),
284+
fullPage: z
285+
.boolean()
286+
.optional()
287+
.describe('Full page screenshot (default: false)'),
276288
highlight: z
277289
.boolean()
278290
.optional()
@@ -285,7 +297,8 @@ export const toolsMap = {
285297
},
286298
browser_click: {
287299
name: 'browser_click',
288-
description: 'Click an element on the page',
300+
description:
301+
'Click an element on the page, before using the tool, use `browser_get_clickable_elements` to get the index of the element, but not call `browser_get_clickable_elements` multiple times',
289302
inputSchema: z.object({
290303
// selector: z
291304
// .string()
@@ -432,9 +445,11 @@ export const toolsMap = {
432445
},
433446
};
434447

435-
type ToolNames = keyof typeof toolsMap;
448+
type ToolNames = keyof typeof toolsMap | keyof typeof visionToolsMap;
436449
type ToolInputMap = {
437-
[K in ToolNames]: (typeof toolsMap)[K] extends { inputSchema: infer S }
450+
[K in ToolNames]: (typeof toolsMap & typeof visionToolsMap)[K] extends {
451+
inputSchema: infer S;
452+
}
438453
? S extends z.ZodType<any, any, any>
439454
? z.infer<S>
440455
: unknown
@@ -498,9 +513,13 @@ const handleToolCall = async ({
498513
};
499514
}
500515

516+
const ctx: ToolContext = { page, browser, logger };
517+
501518
const handlers: {
502519
[K in ToolNames]: (args: ToolInputMap[K]) => Promise<CallToolResult>;
503520
} = {
521+
// vision tools
522+
...getVisionTools(ctx),
504523
browser_go_back: async (args) => {
505524
try {
506525
await Promise.all([waitForPageAndFramesLoad(page), page.goBack()]);
@@ -658,7 +677,10 @@ const handleToolCall = async ({
658677
// if screenshot is still undefined, take a screenshot of the whole page
659678
screenshot =
660679
screenshot ||
661-
(await page.screenshot({ encoding: 'base64', fullPage: false }));
680+
(await page.screenshot({
681+
encoding: 'base64',
682+
fullPage: args.fullPage ?? false,
683+
}));
662684

663685
// if screenshot is still undefined, return an error
664686
if (!screenshot) {
@@ -702,6 +724,7 @@ const handleToolCall = async ({
702724

703725
try {
704726
const { clickableElements } = (await buildDomTree(page)) || {};
727+
await removeHighlights(page);
705728
if (clickableElements) {
706729
return {
707730
content: [
@@ -1295,8 +1318,13 @@ function createServer(config: GlobalConfig = {}): McpServer {
12951318
version: process.env.VERSION || '0.0.1',
12961319
});
12971320

1321+
const mergedToolsMap = {
1322+
...toolsMap,
1323+
...(config.vision ? visionToolsMap : {}),
1324+
};
1325+
12981326
// === Tools ===
1299-
Object.entries(toolsMap).forEach(([name, tool]) => {
1327+
Object.entries(mergedToolsMap).forEach(([name, tool]) => {
13001328
// @ts-ignore
13011329
if (tool?.inputSchema) {
13021330
server.tool(
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import {
2+
CallToolResult,
3+
TextContent,
4+
} from '@modelcontextprotocol/sdk/types.js';
5+
import { z } from 'zod';
6+
import { ToolContext } from '../typings.js';
7+
8+
type ToolNames = keyof typeof visionToolsMap;
9+
type ToolInputMap = {
10+
[K in ToolNames]: (typeof visionToolsMap)[K] extends {
11+
inputSchema: infer S;
12+
}
13+
? S extends z.ZodType<any, any, any>
14+
? z.infer<S>
15+
: unknown
16+
: unknown;
17+
};
18+
19+
/**
 * Declarations (description and optional zod input schema) of the
 * vision-mode tools. The keys are the tool names; the server merges this map
 * into its tool list only when the `vision` option is enabled.
 */
export const visionToolsMap = {
  // No inputSchema: the capture tool takes no arguments.
  browser_vision_screen_capture: {
    description: 'Take a screenshot of the current page for vision mode',
  },
  browser_vision_screen_click: {
    description:
      'Click left mouse button on the page with vision and snapshot, before calling this tool, you should call `browser_vision_screen_capture` first only once, fallback to `browser_click` if failed',
    inputSchema: z.object({
      factors: z
        .array(z.number())
        // The handler treats this as a [width_scale, height_scale] pair, so
        // reject arrays of any other length at validation time instead of
        // letting a missing factor reach the coordinate computation.
        .length(2)
        .optional()
        .describe('Vision Model scaling factors, [width_scale, height_scale]'),
      x: z.number().describe('X coordinate'),
      y: z.number().describe('Y coordinate'),
    }),
  },
};
36+
37+
/**
 * Builds the handlers for the vision-mode tools declared in `visionToolsMap`.
 *
 * @param ctx - Tool context; only `page` and `logger` are read here.
 * @returns An object with one async handler per vision tool name, each
 *   resolving to a `CallToolResult`.
 */
export const getVisionTools = (ctx: ToolContext) => {
  const { page, logger } = ctx;

  const visionTools: {
    [K in ToolNames]: (args: ToolInputMap[K]) => Promise<CallToolResult>;
  } = {
    // Returns a short text note with the viewport size plus a base64 JPEG of
    // the current viewport (not the full page).
    browser_vision_screen_capture: async () => {
      const viewport = page.viewport();

      const screenshot = await page.screenshot({
        type: 'jpeg' as const,
        quality: 50,
        fullPage: false,
        omitBackground: false,
        encoding: 'base64',
      });

      return {
        content: [
          {
            type: 'text',
            text: `Screenshot taken at ${viewport?.width}x${viewport?.height}`,
          } as TextContent,
          {
            type: 'image' as const,
            data: screenshot,
            mimeType: 'image/jpeg',
          },
        ],
      };
    },
    // Clicks at (x, y). When `factors` is supplied, the point is first passed
    // through the UI-TARS action parser together with the viewport size and
    // the parser's `start_coords` (when present) replace the raw values.
    browser_vision_screen_click: async (args) => {
      try {
        let x = args.x;
        let y = args.y;

        if (args.factors) {
          const viewport = page.viewport();
          // Without a real viewport size the parser would be given a 0x0
          // screen context, which cannot yield a meaningful click target, so
          // fail explicitly instead of clicking at a bogus position.
          if (!viewport || viewport.width <= 0 || viewport.height <= 0) {
            throw new Error(
              'viewport size is unavailable, cannot scale vision coordinates',
            );
          }

          // Loaded lazily so the parser is only required when scaling is used.
          const { actionParser } = await import('@ui-tars/action-parser');

          const { parsed } = actionParser({
            prediction: `Action: click(start_box='(${args.x},${args.y})')`,
            factor: args.factors as [number, number],
            screenContext: {
              width: viewport.width,
              height: viewport.height,
            },
          });

          const { start_coords } = parsed?.[0]?.action_inputs ?? {};
          logger.info('[vision] start_coords', start_coords);

          // Fall back to the caller's raw coordinates when the parser did not
          // produce a value for an axis.
          x = start_coords?.[0] ?? x;
          y = start_coords?.[1] ?? y;
        }

        // Guard against NaN/Infinity (e.g. from a zero or missing factor)
        // before driving the mouse.
        if (!Number.isFinite(x) || !Number.isFinite(y)) {
          throw new Error(`invalid click coordinates (${x}, ${y})`);
        }

        await page.mouse.move(x, y);
        await page.mouse.down();
        await page.mouse.up();

        return {
          content: [
            {
              type: 'text',
              text: `Vision click at ${args.x}, ${args.y}`,
            },
          ],
          isError: false,
        };
      } catch (error: unknown) {
        // Surface the failure reason to the caller instead of discarding it.
        const reason = error instanceof Error ? error.message : String(error);
        return {
          content: [
            { type: 'text', text: `Error clicking on the page: ${reason}` },
          ],
          isError: true,
        };
      }
    },
  };
  return visionTools;
};
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import { Logger } from '@agent-infra/logger';
2+
import { Browser, Page } from 'puppeteer-core';
3+
4+
/**
 * Objects shared with tool handler factories (for example the vision tools),
 * bundled so they can be passed around as a single argument.
 */
export type ToolContext = {
  /** Logger used by tool handlers for diagnostic output. */
  logger: Logger;
  /** Puppeteer browser instance. */
  browser: Browser;
  /** Puppeteer page the tools operate on. */
  page: Page;
};

packages/agent-infra/mcp-servers/commands/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
## 0.4.0 2024-12-10 - add logging
22

3+
## 1.1.6-beta.3
4+
35
## 1.1.6-beta.2
46

57
### Patch Changes

packages/agent-infra/mcp-servers/commands/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@agent-infra/mcp-server-commands",
3-
"version": "1.1.6-beta.2",
3+
"version": "1.1.6-beta.3",
44
"description": "An MCP server to run arbitrary commands",
55
"type": "module",
66
"main": "./dist/server.cjs",

packages/agent-infra/mcp-servers/filesystem/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# @agent-infra/mcp-server-filesystem
22

3+
## 1.1.6-beta.3
4+
35
## 1.1.6-beta.2
46

57
### Patch Changes

packages/agent-infra/mcp-servers/filesystem/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@agent-infra/mcp-server-filesystem",
3-
"version": "1.1.6-beta.2",
3+
"version": "1.1.6-beta.3",
44
"description": "MCP server for filesystem access",
55
"license": "MIT",
66
"type": "module",

packages/agent-infra/mcp-servers/search/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# @agent-infra/mcp-server-search
22

3+
## 1.1.6-beta.3
4+
35
## 1.1.6-beta.2
46

57
### Patch Changes

0 commit comments

Comments
 (0)