Skip to content

Commit f375a2e

Browse files
committed
feat: refine multimodal agent architecture
1. Support model providers 2. Support `volcengine` provider 3. Better interface design.
1 parent f157006 commit f375a2e

32 files changed

+1019
-578
lines changed

.editorconfig

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ indent_style = space
66
indent_size = 2
77
end_of_line = lf
88
insert_final_newline = true
9-
trim_trailing_whitespace = true
9+
trim_trailing_whitespace = true
10+
max_line_length= 100

packages/multimodal/agent/examples/weather.ts renamed to packages/multimodal/agent/examples/agent/basic.ts

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,8 @@
22
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
5-
import {
6-
Agent,
7-
Tool,
8-
Model,
9-
OpenAI,
10-
OpenAIToolCallProvider,
11-
InstructionToolCallProvider,
12-
} from '../src';
13-
import { z } from 'zod';
14-
import { getModel } from './model';
15-
16-
const model = getModel('qwen3:1.7b');
17-
// Create tool call provider explicitly
18-
const toolCallProvider = new OpenAIToolCallProvider();
5+
import { Agent, Tool, z } from '../../src';
6+
import { TEST_MODEL_PROVIDERS } from './model';
197

208
const locationTool = new Tool({
219
id: 'getCurrentLocation',
@@ -46,7 +34,6 @@ const weatherTool = new Tool({
4634
});
4735

4836
const agent = new Agent({
49-
model,
5037
name: 'Agent TARS',
5138
tools: [locationTool, weatherTool],
5239
instructions: `
@@ -55,8 +42,10 @@ const agent = new Agent({
5542
1. DO NOT make any fake informations
5643
2. "finish_reason" should always be "tool_calls"
5744
`,
58-
toolCallProvider, // Pass the provider explicitly
5945
maxIterations: 10,
46+
model: {
47+
providers: TEST_MODEL_PROVIDERS,
48+
},
6049
});
6150

6251
async function main() {
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
import { ModelProvider } from '../../src';
6+
7+
export const TEST_MODEL_PROVIDERS: ModelProvider[] = [
8+
{
9+
name: 'volcengine',
10+
apiKey: process.env.MM_TEST_API_KEY,
11+
models: [
12+
{
13+
id: 'ep-20250510145437-5sxhs',
14+
label: 'doubao-1.5-thinking-vision-pro',
15+
},
16+
],
17+
},
18+
{
19+
name: 'lm-studio',
20+
models: [
21+
{
22+
id: 'qwen2.5-coder-3b-instruct',
23+
},
24+
{
25+
id: 'qwen2.5-7b-instruct-1m',
26+
},
27+
],
28+
},
29+
{
30+
name: 'ollama',
31+
models: [
32+
{
33+
id: 'qwen3:1.7b',
34+
},
35+
],
36+
},
37+
{
38+
name: 'openai',
39+
baseURL: process.env.OPENAI_API_BASE_URL,
40+
models: [
41+
{
42+
id: 'gpt-4o-2024-11-20',
43+
},
44+
],
45+
},
46+
{
47+
name: 'azure-openai',
48+
baseURL: process.env.AWS_CLAUDE_API_BASE_URL,
49+
models: [
50+
{
51+
id: 'aws_sdk_claude37_sonnet',
52+
},
53+
],
54+
},
55+
];

packages/multimodal/agent/examples/github-reviewer-agent.ts renamed to packages/multimodal/agent/examples/agent/github-reviewer-agent.ts

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,11 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
import { join } from 'path';
6-
import { MCPAgent } from '../src';
7-
import { getModel } from './model';
8-
9-
const model = getModel('gpt-4o-2024-11-20');
6+
import { MCPAgent } from '../../src';
7+
import { TEST_MODEL_PROVIDERS } from './config';
108

119
async function main() {
1210
const agent = new MCPAgent({
13-
model,
1411
instructions:
1512
'You are GitHub Reviewer, a specialized assistant designed to help with code review tasks. ' +
1613
'You excel at analyzing pull requests, identifying potential bugs, security issues, and suggesting code improvements. ' +
@@ -38,18 +35,17 @@ async function main() {
3835
'Take screenshots of specific code sections when they help illustrate complex issues or changes. ' +
3936
'Your reviews should be thorough yet easy to understand, with code examples making your feedback concrete and actionable. ' +
4037
"Remember that including actual code snippets makes your reports more vivid and helps the developer understand exactly what you're referring to.",
38+
model: {
39+
providers: TEST_MODEL_PROVIDERS,
40+
},
4141
mcpServers: {
4242
playwright: {
4343
command: 'npx',
4444
args: ['@playwright/mcp@latest'],
4545
},
4646
filesystem: {
4747
command: 'npx',
48-
args: [
49-
'-y',
50-
'@modelcontextprotocol/server-filesystem',
51-
join(__dirname, 'result'),
52-
],
48+
args: ['-y', '@modelcontextprotocol/server-filesystem', join(__dirname, 'filesystem')],
5349
},
5450
},
5551
});
@@ -63,9 +59,7 @@ async function main() {
6359
}
6460

6561
// GitHub PR review related queries
66-
const queries = [
67-
'Review https://github.com/bytedance/UI-TARS-desktop/pull/534',
68-
];
62+
const queries = ['Review https://github.com/bytedance/UI-TARS-desktop/pull/534'];
6963

7064
for (const query of queries) {
7165
console.log('\n==================================================');

packages/multimodal/agent/examples/mcp-agent.ts renamed to packages/multimodal/agent/examples/agent/mcp-agent.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,11 @@
22
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
5-
import { MCPAgent } from '../src';
6-
import { getModel } from './model';
7-
8-
const model = getModel('qwen3:1.7b');
5+
import { MCPAgent } from '../../src';
6+
import { TEST_MODEL_PROVIDERS } from './config';
97

108
async function main() {
119
const agent = new MCPAgent({
12-
model,
1310
instructions:
1411
'You are Agent TARS, a helpful assistant that can use the tools available to help users with their questions.',
1512
mcpServers: {
@@ -18,6 +15,9 @@ async function main() {
1815
args: ['@playwright/mcp@latest'],
1916
},
2017
},
18+
model: {
19+
providers: TEST_MODEL_PROVIDERS,
20+
},
2121
});
2222

2323
try {

packages/multimodal/agent/examples/browser.html

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ <h3>Output:</h3>
6464
z,
6565
Model,
6666
AzureOpenAI,
67-
InstructionToolCallProvider
67+
PromptToolCallEngine
6868
} from '../dist/index.mjs';
6969

7070
const originalConsoleLog = console.log;
@@ -138,14 +138,14 @@ <h3>Output:</h3>
138138
});
139139

140140
// Create an instruction-based tool call provider for Claude models
141-
const toolCallProvider = new InstructionToolCallProvider();
141+
const ToolCallEngine = new PromptToolCallEngine();
142142

143143
const agent = new Agent({
144144
model,
145145
name: 'Agent TARS',
146146
instructions: `You are Agent TARS, a general AI agent that is good at understanding user needs and helping people solve problems.`,
147147
tools: [locationTool, weatherTool],
148-
toolCallProvider, // Pass the provider explicitly
148+
ToolCallEngine, // Pass the provider explicitly
149149
maxIterations: 3,
150150
});
151151

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"choices": [
3+
{
4+
"finish_reason": "stop",
5+
"index": 0,
6+
"logprobs": null,
7+
"message": {
8+
"content": "你好呀~有什么我可以帮忙的地方吗?",
9+
"reasoning_content": "用户现在打招呼说“你好”,需要友好回应。所以回复“你好呀~有什么我可以帮忙的地方吗?”这样亲切自然。",
10+
"role": "assistant"
11+
}
12+
}
13+
],
14+
"created": 1746900426,
15+
"id": "021746900424215384cfaa31ec6325bb0f7a8325e5c6fc5bc9319",
16+
"model": "doubao-1-5-thinking-vision-pro-250428",
17+
"service_tier": "default",
18+
"object": "chat.completion",
19+
"usage": {
20+
"completion_tokens": 48,
21+
"prompt_tokens": 60,
22+
"total_tokens": 108,
23+
"prompt_tokens_details": {
24+
"cached_tokens": 0
25+
},
26+
"completion_tokens_details": {
27+
"reasoning_tokens": 35
28+
}
29+
}
30+
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/* eslint-disable @typescript-eslint/no-non-null-asserted-optional-chain */
2+
/* eslint-disable no-unsafe-optional-chaining */
3+
import { OpenAI } from 'openai';
4+
import { TokenJS } from 'token.js';
5+
6+
export const tokenjs = new TokenJS({
7+
apiKey: process.env.MM_TEST_API_KEY,
8+
baseURL: process.env.MM_TEST_BASE_URL,
9+
});
10+
11+
tokenjs.extendModelList('openai', 'ep-20250510145437-5sxhs', {
12+
streaming: true,
13+
json: true,
14+
toolCalls: true,
15+
images: true,
16+
});
17+
18+
// FIXME: remove as, we need to fix the type issue of token.js
19+
const client = tokenjs as unknown as OpenAI;
20+
21+
async function main() {
22+
console.time('> TTFT');
23+
24+
// @ts-expect-error
25+
const completion = await client.chat.completions.create({
26+
provider: 'openai',
27+
stream: true as const,
28+
model: process.env.MM_TEST_MODEL!,
29+
messages: [
30+
{
31+
role: 'user',
32+
content: 'Hello!',
33+
},
34+
],
35+
});
36+
37+
console.timeEnd('> TTFT');
38+
39+
console.time('> Streaming Duration');
40+
41+
let reachReasoning = false;
42+
let reachContent = false;
43+
for await (const part of completion) {
44+
// @ts-expect-error
45+
const { content, reasoning_content } = part.choices[0]?.delta!;
46+
if (reasoning_content) {
47+
if (!reachReasoning) {
48+
reachReasoning = true;
49+
console.log('> Reasoning Content:');
50+
}
51+
process.stdout.write(reasoning_content);
52+
}
53+
if (content) {
54+
if (!reachContent) {
55+
reachContent = true;
56+
console.log('\n> Content:');
57+
}
58+
process.stdout.write(content);
59+
}
60+
}
61+
62+
console.log('');
63+
console.timeEnd('> Streaming Duration');
64+
}
65+
66+
main();
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/* eslint-disable @typescript-eslint/no-non-null-asserted-optional-chain */
2+
/* eslint-disable no-unsafe-optional-chaining */
3+
import { OpenAI } from 'openai';
4+
import { TokenJS } from 'token.js';
5+
6+
export const tokenjs = new TokenJS({
7+
apiKey: process.env.MM_TEST_API_KEY,
8+
baseURL: process.env.MM_TEST_BASE_URL,
9+
});
10+
11+
tokenjs.extendModelList('openai', 'ep-20250510145437-5sxhs', {
12+
streaming: true,
13+
json: true,
14+
toolCalls: true,
15+
images: true,
16+
});
17+
18+
// FIXME: remove as, we need to fix the type issue of token.js
19+
const client = tokenjs as unknown as OpenAI;
20+
21+
async function main() {
22+
console.time('> TTFT');
23+
console.log(process.env.MM_TEST_MODEL!);
24+
25+
const completion = await client.chat.completions.create({
26+
// @ts-expect-error
27+
provider: 'openai',
28+
model: process.env.MM_TEST_MODEL!,
29+
messages: [
30+
{
31+
role: 'system',
32+
content:
33+
'\n You are a tool call agent, you MUST SELECT a TOOL to handle user\'s request.\n \n 1. DO NOT make any fake informations\n 2. "finish_reason" should always be "tool_calls"\n \n\nCurrent time: 5/11/2025, 3:41:25 PM',
34+
},
35+
{
36+
role: 'user',
37+
content: "How's the weather today?",
38+
},
39+
],
40+
tools: [
41+
{
42+
type: 'function',
43+
function: {
44+
name: 'getCurrentLocation',
45+
description: "Get user's current location",
46+
parameters: {
47+
type: 'object',
48+
properties: {},
49+
},
50+
},
51+
},
52+
{
53+
type: 'function',
54+
function: {
55+
name: 'getWeather',
56+
description: 'Get weather information for a specified location',
57+
parameters: {
58+
type: 'object',
59+
properties: {
60+
location: {
61+
type: 'string',
62+
description: 'Location name, such as city name',
63+
},
64+
},
65+
required: ['location'],
66+
},
67+
},
68+
},
69+
],
70+
temperature: 0.7,
71+
});
72+
73+
console.timeEnd('> TTFT');
74+
75+
console.time('> Request Duration');
76+
77+
console.log(JSON.stringify(completion));
78+
79+
console.log('');
80+
console.timeEnd('> Request Duration');
81+
}
82+
83+
main();

0 commit comments

Comments
 (0)