Skip to content

Commit f72ef34

Browse files
committed
feat: add action parser support for M8's output
The M8 model's output: <bbox>637 964 637 964</bbox>
1 parent 98a7481 commit f72ef34

File tree

3 files changed

+97
-5
lines changed

3 files changed

+97
-5
lines changed

packages/ui-tars/action-parser/src/actionParser.ts

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,19 +203,24 @@ function parseAction(actionStr: string) {
203203
const kwargs = {};
204204

205205
if (argsStr.trim()) {
206-
// Split on commas that aren't inside quotes or parentheses
207-
const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
206+
const argPairs =
207+
argsStr.match(/([^,']|'[^']*'|'<bbox>.*?<\/bbox>')+/g) || [];
208208

209209
for (const pair of argPairs) {
210210
const [key, ...valueParts] = pair.split('=');
211211
if (!key) continue;
212212

213-
// Join value parts back together in case there were = signs in the value
214-
const value = valueParts
213+
let value = valueParts
215214
.join('=')
216215
.trim()
217216
.replace(/^['"]|['"]$/g, ''); // Remove surrounding quotes
218217

218+
// Handle the bbox format
219+
if (value.includes('<bbox>')) {
220+
value = value.replace(/<bbox>|<\/bbox>/g, '').replace(/\s+/g, ',');
221+
value = `(${value})`;
222+
}
223+
219224
//@ts-ignore
220225
kwargs[key.trim()] = value;
221226
}

packages/ui-tars/action-parser/test/actionParser.test.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,93 @@ Action: click(start_box='(100,200)')
140140
});
141141
});
142142

143+
// M8 mode tests
144+
describe('M8 mode', () => {
145+
it('should correctly parse M8 format input', () => {
146+
const input = `Thought: 我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。
147+
Action: click(start_box='<bbox>637 964 637 964</bbox>')`;
148+
149+
const result = parseActionVlm(input, [1000, 1000], 'bc');
150+
151+
expect(result).toEqual([
152+
{
153+
reflection: null,
154+
thought:
155+
'我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。',
156+
action_type: 'click',
157+
action_inputs: {
158+
start_box: '[0.637,0.964,0.637,0.964]',
159+
},
160+
},
161+
]);
162+
});
163+
164+
it('should correctly parse M8 format input', () => {
165+
const input = `Thought: 我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。
166+
Action: click(start_box='<bbox>637 964 637 964</bbox>')`;
167+
168+
const result = parseActionVlm(input, [1000, 1000], 'bc', {
169+
width: 2560,
170+
height: 1440,
171+
});
172+
173+
expect(result).toEqual([
174+
{
175+
reflection: null,
176+
thought:
177+
'我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。',
178+
action_type: 'click',
179+
action_inputs: {
180+
start_box: '[0.637,0.964,0.637,0.964]',
181+
start_coords: [1630.72, 1388.16],
182+
},
183+
},
184+
]);
185+
});
186+
187+
it('should correctly parse M8 format input', () => {
188+
const input = `Thought: 我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。
189+
Action: click(start_box='[637,964,637,964]')`;
190+
191+
const result = parseActionVlm(input, [1000, 1000], 'bc');
192+
193+
expect(result).toEqual([
194+
{
195+
reflection: null,
196+
thought:
197+
'我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。',
198+
action_type: 'click',
199+
action_inputs: {
200+
start_box: '[0.637,0.964,0.637,0.964]',
201+
},
202+
},
203+
]);
204+
});
205+
206+
it('should correctly parse M8 format input', () => {
207+
const input = `Thought: 我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。
208+
Action: click(start_box='[637,964,637,964]')`;
209+
210+
const result = parseActionVlm(input, [1000, 1000], 'bc', {
211+
width: 2560,
212+
height: 1440,
213+
});
214+
215+
expect(result).toEqual([
216+
{
217+
reflection: null,
218+
thought:
219+
'我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。',
220+
action_type: 'click',
221+
action_inputs: {
222+
start_box: '[0.637,0.964,0.637,0.964]',
223+
start_coords: [1630.72, 1388.16],
224+
},
225+
},
226+
]);
227+
});
228+
});
229+
143230
describe('Box coordinates normalization', () => {
144231
it('should correctly normalize box with four coordinates', () => {
145232
const input = `Thought: I need to click on this element

packages/ui-tars/action-parser/vitest.config.mts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export default defineProject({
1414
root: './',
1515
test: {
1616
globals: true,
17-
setupFiles: [resolve(__dirname, '../../scripts/vitest-setup.ts')],
17+
setupFiles: [resolve(__dirname, '../../../scripts/vitest-setup.ts')],
1818
environment: 'node',
1919
includeSource: [resolve(__dirname, '.')],
2020
},

0 commit comments

Comments
 (0)