Skip to content

Commit 1b9712e

Browse files
ZhaoHeh and ycjcl868 authored
feat(action-parser): add action parser supporting for new format (#234)
Co-authored-by: ycjcl868 <chaolinjin@gmail.com>
1 parent 941363d commit 1b9712e

File tree

4 files changed

+144
-3
lines changed

4 files changed

+144
-3
lines changed

packages/ui-tars/action-parser/src/actionParser.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,12 +210,17 @@ function parseAction(actionStr: string) {
210210
const [key, ...valueParts] = pair.split('=');
211211
if (!key) continue;
212212

213-
// Join value parts back together in case there were = signs in the value
214-
const value = valueParts
213+
let value = valueParts
215214
.join('=')
216215
.trim()
217216
.replace(/^['"]|['"]$/g, ''); // Remove surrounding quotes
218217

218+
// Process output with bbox
219+
if (value.includes('<bbox>')) {
220+
value = value.replace(/<bbox>|<\/bbox>/g, '').replace(/\s+/g, ',');
221+
value = `(${value})`;
222+
}
223+
219224
//@ts-ignore
220225
kwargs[key.trim()] = value;
221226
}

packages/ui-tars/action-parser/test/actionParser.test.ts

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,142 @@ Action: click(start_box='(100,200)')
140140
});
141141
});
142142

143+
// bc mode but new format output tests
144+
describe('bc mode but hallucination', () => {
145+
it('should correctly parse new format input', () => {
146+
const input = `Thought: 我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。
147+
Action: click(start_box='<bbox>637 964 637 964</bbox>')`;
148+
149+
const result = parseActionVlm(input, [1000, 1000], 'bc');
150+
151+
expect(result).toEqual([
152+
{
153+
reflection: null,
154+
thought:
155+
'我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。',
156+
action_type: 'click',
157+
action_inputs: {
158+
start_box: '[0.637,0.964,0.637,0.964]',
159+
},
160+
},
161+
]);
162+
});
163+
164+
it('should correctly parse new format input', () => {
165+
const input = `Thought: 我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。
166+
Action: click(start_box='<bbox>637 964 637 964</bbox>')`;
167+
168+
const result = parseActionVlm(input, [1000, 1000], 'bc', {
169+
width: 2560,
170+
height: 1440,
171+
});
172+
173+
expect(result).toEqual([
174+
{
175+
reflection: null,
176+
thought:
177+
'我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。',
178+
action_type: 'click',
179+
action_inputs: {
180+
start_box: '[0.637,0.964,0.637,0.964]',
181+
start_coords: [1630.72, 1388.16],
182+
},
183+
},
184+
]);
185+
});
186+
187+
it('should correctly parse input with Reflection and Action_Summary', () => {
188+
const input = `Reflection: This is a reflection
189+
Action_Summary: This is a summary
190+
Action: type(text='Hello', start_box='<bbox>300 400</bbox>')`;
191+
192+
const result = parseActionVlm(input);
193+
194+
expect(result).toEqual([
195+
{
196+
reflection: 'This is a reflection',
197+
thought: 'This is a summary',
198+
action_type: 'type',
199+
action_inputs: {
200+
text: 'Hello',
201+
start_box: '[0.3,0.4,0.3,0.4]',
202+
},
203+
},
204+
]);
205+
});
206+
207+
it('should handle multiple actions', () => {
208+
const input = `Thought: Perform multiple actions
209+
Action: click(start_box='<bbox>100 200</bbox>')
210+
211+
type(text='Hello', start_box='<bbox>300 400</bbox>')`;
212+
213+
const result = parseActionVlm(input);
214+
215+
expect(result).toEqual([
216+
{
217+
thought: 'Perform multiple actions',
218+
reflection: null,
219+
action_type: 'click',
220+
action_inputs: {
221+
start_box: '[0.1,0.2,0.1,0.2]',
222+
},
223+
},
224+
{
225+
thought: 'Perform multiple actions',
226+
reflection: null,
227+
action_type: 'type',
228+
action_inputs: {
229+
text: 'Hello',
230+
start_box: '[0.3,0.4,0.3,0.4]',
231+
},
232+
},
233+
]);
234+
});
235+
236+
it('should correctly parse new format input', () => {
237+
const input = `Thought: 我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。
238+
Action: click(start_box='[637,964,637,964]')`;
239+
240+
const result = parseActionVlm(input, [1000, 1000], 'bc');
241+
242+
expect(result).toEqual([
243+
{
244+
reflection: null,
245+
thought:
246+
'我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。',
247+
action_type: 'click',
248+
action_inputs: {
249+
start_box: '[0.637,0.964,0.637,0.964]',
250+
},
251+
},
252+
]);
253+
});
254+
255+
it('should correctly parse new format input', () => {
256+
const input = `Thought: 我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。
257+
Action: click(start_box='[637,964,637,964]')`;
258+
259+
const result = parseActionVlm(input, [1000, 1000], 'bc', {
260+
width: 2560,
261+
height: 1440,
262+
});
263+
264+
expect(result).toEqual([
265+
{
266+
reflection: null,
267+
thought:
268+
'我看到当前屏幕显示的是一个电子表格软件和一个聊天窗口,而任务要求我需要在浏览器中搜索北京明天天气。我需要先点击任务栏上的浏览器图标来启动浏览器。',
269+
action_type: 'click',
270+
action_inputs: {
271+
start_box: '[0.637,0.964,0.637,0.964]',
272+
start_coords: [1630.72, 1388.16],
273+
},
274+
},
275+
]);
276+
});
277+
});
278+
143279
describe('Box coordinates normalization', () => {
144280
it('should correctly normalize box with four coordinates', () => {
145281
const input = `Thought: I need to click on this element

packages/ui-tars/action-parser/vitest.config.mts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export default defineProject({
1414
root: './',
1515
test: {
1616
globals: true,
17-
setupFiles: [resolve(__dirname, '../../scripts/vitest-setup.ts')],
17+
setupFiles: [resolve(__dirname, '../../../scripts/vitest-setup.ts')],
1818
environment: 'node',
1919
includeSource: [resolve(__dirname, '.')],
2020
},

0 commit comments

Comments
 (0)