Skip to content

Commit 336a951

Browse files
authored
feat(share): add screen recording video sharing (#77)
* chore(ci): add test cases
* feat(share): add screen recording video sharing
* fix(share): black screen on the first frame
* fix(share): export as video bug
* chore(version): 0.0.4-beta.1
1 parent d0bd957 commit 336a951

File tree

13 files changed

+579
-85
lines changed

13 files changed

+579
-85
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "ui-tars-desktop",
3-
"version": "0.0.3",
3+
"version": "0.0.4-beta.1",
44
"private": true,
55
"packageManager": "pnpm@9.10.0",
66
"description": "A GUI Agent application based on UI-TARS(Vision-Language Model) that allows you to control your computer using natural language.",

packages/action-parser/src/actionParser.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ export function actionParser(params: { prediction: string; factor: number }): {
1616
};
1717
}
1818

19-
function parseActionVlm(
19+
export function parseActionVlm(
2020
text: string,
2121
factor = 1000,
2222
mode: 'bc' | 'o1' = 'bc',
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
/*
2+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
import { describe, it, expect } from 'vitest';
6+
import { parseActionVlm } from '../src/actionParser';
7+
8+
describe('parseActionVlm', () => {
9+
// BC mode tests
10+
describe('BC mode', () => {
11+
it('should correctly parse input with Thought', () => {
12+
const input = `Thought: I need to click this button
13+
Action: click(start_box='(100,200)')`;
14+
15+
const result = parseActionVlm(input);
16+
17+
expect(result).toEqual([
18+
{
19+
reflection: null,
20+
thought: 'I need to click this button',
21+
action_type: 'click',
22+
action_inputs: {
23+
start_box: '[0.1,0.2,0.1,0.2]',
24+
},
25+
},
26+
]);
27+
});
28+
29+
it('should correctly parse input with Reflection and Action_Summary', () => {
30+
const input = `Reflection: This is a reflection
31+
Action_Summary: This is a summary
32+
Action: type(text='Hello', start_box='(300,400)')`;
33+
34+
const result = parseActionVlm(input);
35+
36+
expect(result).toEqual([
37+
{
38+
reflection: 'This is a reflection',
39+
thought: 'This is a summary',
40+
action_type: 'type',
41+
action_inputs: {
42+
text: 'Hello',
43+
start_box: '[0.3,0.4,0.3,0.4]',
44+
},
45+
},
46+
]);
47+
});
48+
49+
it('should handle multiple actions', () => {
50+
const input = `Thought: Perform multiple actions
51+
Action: click(start_box='(100,200)')
52+
53+
type(text='Hello', start_box='(300,400)')`;
54+
55+
const result = parseActionVlm(input);
56+
57+
expect(result).toEqual([
58+
{
59+
thought: 'Perform multiple actions',
60+
reflection: null,
61+
action_type: 'click',
62+
action_inputs: {
63+
start_box: '[0.1,0.2,0.1,0.2]',
64+
},
65+
},
66+
{
67+
thought: 'Perform multiple actions',
68+
reflection: null,
69+
action_type: 'type',
70+
action_inputs: {
71+
text: 'Hello',
72+
start_box: '[0.3,0.4,0.3,0.4]',
73+
},
74+
},
75+
]);
76+
});
77+
});
78+
79+
// O1 mode tests
80+
describe('O1 mode', () => {
81+
it('should correctly parse O1 format input', () => {
82+
const input = `<Thought>I need to perform this action</Thought>
83+
Action_Summary: Click and type text
84+
Action: click(start_box='(100,200)')
85+
</Output>`;
86+
87+
const result = parseActionVlm(input, 1000, 'o1');
88+
89+
expect(result).toEqual([
90+
{
91+
reflection: null,
92+
thought:
93+
'I need to perform this action\n<Action_Summary>\nClick and type text',
94+
action_type: 'click',
95+
action_inputs: {
96+
start_box: '[0.1,0.2,0.1,0.2]',
97+
},
98+
},
99+
]);
100+
});
101+
102+
it('should handle complex O1 format input', () => {
103+
const input = `<Thought>Complex operation</Thought>
104+
Action_Summary: Multiple sequential actions
105+
Action: click(start_box='(100,200)')
106+
</Output>`;
107+
108+
const result = parseActionVlm(input, 1000, 'o1');
109+
110+
expect(result).toEqual([
111+
{
112+
reflection: null,
113+
thought:
114+
'Complex operation\n<Action_Summary>\nMultiple sequential actions',
115+
action_type: 'click',
116+
action_inputs: {
117+
start_box: '[0.1,0.2,0.1,0.2]',
118+
},
119+
},
120+
]);
121+
});
122+
});
123+
124+
// Edge cases
125+
describe('Edge cases', () => {
126+
it('should handle input without Action keyword', () => {
127+
const input = 'click(start_box="(100,200)")';
128+
const result = parseActionVlm(input);
129+
130+
expect(result).toEqual([
131+
{
132+
action_inputs: {
133+
start_box: '[0.1]',
134+
},
135+
action_type: 'click',
136+
reflection: null,
137+
thought: '',
138+
},
139+
]);
140+
});
141+
142+
it('should handle empty action input', () => {
143+
const input = 'Thought: Empty action\nAction:';
144+
const result = parseActionVlm(input);
145+
146+
expect(result).toEqual([
147+
{
148+
action_inputs: {},
149+
action_type: '',
150+
reflection: null,
151+
thought: 'Empty action',
152+
},
153+
]);
154+
});
155+
});
156+
});

packages/action-parser/src/index.test.ts renamed to packages/action-parser/test/index.test.ts

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
// @prettier
66
import { describe, expect, it } from 'vitest';
77

8-
import { actionParser } from './index';
8+
import { actionParser } from '../';
99

1010
describe('actionParser', () => {
1111
it('should return parsed action', () => {
@@ -140,6 +140,29 @@ describe('actionParser', () => {
140140
});
141141
});
142142

143+
it('should return Reflection', () => {
144+
const result = actionParser({
145+
prediction:
146+
"Reflection: 在桌面上我看到了Chrome浏览器的图标,根据任务要求需要打开Chrome浏览器,应该双击该图标来启动浏览器。\nAction_Summary: 在桌面上找到Chrome浏览器图标的位置,通过双击操作来打开浏览器。\nAction: left_double(start_box='21, 246, 21, 246')",
147+
factor: 1000,
148+
});
149+
150+
expect(result).toEqual({
151+
parsed: [
152+
{
153+
thought:
154+
'在桌面上找到Chrome浏览器图标的位置,通过双击操作来打开浏览器。',
155+
reflection:
156+
'在桌面上我看到了Chrome浏览器的图标,根据任务要求需要打开Chrome浏览器,应该双击该图标来启动浏览器。',
157+
action_type: 'left_double',
158+
action_inputs: {
159+
start_box: '[0.021,0.246,0.021,0.246]',
160+
},
161+
},
162+
],
163+
});
164+
});
165+
143166
it('should return parsed action with newline', () => {
144167
const result = actionParser({
145168
// prettier-ignore

src/main/main.ts

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,14 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
import { electronApp, optimizer } from '@electron-toolkit/utils';
6-
import { app, globalShortcut, ipcMain } from 'electron';
6+
import {
7+
app,
8+
desktopCapturer,
9+
globalShortcut,
10+
ipcMain,
11+
screen,
12+
session,
13+
} from 'electron';
714
import squirrelStartup from 'electron-squirrel-startup';
815
import ElectronStore from 'electron-store';
916
import { updateElectronApp, UpdateSourceType } from 'update-electron-app';
@@ -123,6 +130,20 @@ const initializeApp = async () => {
123130
// eslint-disable-next-line
124131
new AppUpdater();
125132

133+
session.defaultSession.setDisplayMediaRequestHandler(
134+
(_request, callback) => {
135+
desktopCapturer.getSources({ types: ['screen'] }).then((sources) => {
136+
// Grant access to the first screen found.
137+
callback({ video: sources[0], audio: 'loopback' });
138+
});
139+
// The `useSystemPicker: true` option passed below uses the system picker if available.
140+
// Note: this is currently experimental. If the system picker
141+
// is available, it will be used and the media request handler
142+
// will not be invoked.
143+
},
144+
{ useSystemPicker: true },
145+
);
146+
126147
logger.info('mainZustandBridge');
127148

128149
const { unsubscribe } = mainZustandBridge(
@@ -152,6 +173,14 @@ const registerIPCHandlers = () => {
152173
ipcMain.handle('utio:shareReport', async (_, params) => {
153174
await UTIOService.getInstance().shareReport(params);
154175
});
176+
177+
ipcMain.handle('get-screen-size', () => {
178+
const primaryDisplay = screen.getPrimaryDisplay();
179+
return {
180+
screenWidth: primaryDisplay.size.width,
181+
screenHeight: primaryDisplay.size.height,
182+
};
183+
});
155184
};
156185

157186
/**

src/main/store/ScreenMarker.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ class ScreenMarker {
5959
});
6060

6161
this.screenWaterFlow.blur();
62-
this.screenWaterFlow.setContentProtection(false); // show for vlm model
62+
this.screenWaterFlow.setContentProtection(false);
6363
this.screenWaterFlow.setIgnoreMouseEvents(true);
6464

6565
this.screenWaterFlow.loadURL(`data:text/html;charset=UTF-8,
@@ -159,6 +159,7 @@ class ScreenMarker {
159159
},
160160
});
161161

162+
this.pauseButton.blur();
162163
this.pauseButton.setContentProtection(true); // not show for vlm model
163164
this.pauseButton.setPosition(Math.floor(screenWidth / 2 - 50), 0);
164165

@@ -292,7 +293,7 @@ class ScreenMarker {
292293

293294
showTextWithMarker(text: string, x: number, y: number) {
294295
logger.info('[showTextWithMarker] text', text, 'x', x, 'y', y);
295-
// 如果存在之前的窗口,先关闭它
296+
// close previous overlay if exists
296297
this.closeOverlay();
297298

298299
this.currentOverlay = new BrowserWindow({
@@ -317,6 +318,7 @@ class ScreenMarker {
317318
this.currentOverlay.setAlwaysOnTop(true, 'screen-saver');
318319
}
319320

321+
this.currentOverlay.blur();
320322
this.currentOverlay.setContentProtection(false); // show for vlm model
321323
this.currentOverlay.setIgnoreMouseEvents(true);
322324

src/main/window/index.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ export function createMainWindow() {
3131

3232
mainWindow = createWindow({
3333
routerPath: '/',
34-
width: 450,
35-
height: 600,
34+
width: 430,
35+
height: 580,
3636
alwaysOnTop: false,
3737
});
3838

@@ -145,8 +145,9 @@ export async function hideWindowBlock<T>(
145145
let originalBounds: Electron.Rectangle | undefined;
146146

147147
try {
148-
mainWindow?.setContentProtection(true);
148+
mainWindow?.setContentProtection(false);
149149
mainWindow?.setAlwaysOnTop(true);
150+
mainWindow?.blur();
150151
try {
151152
if (mainWindow) {
152153
originalBounds = mainWindow.getBounds();
@@ -156,7 +157,6 @@ export async function hideWindowBlock<T>(
156157
} catch (e) {
157158
logger.error(e);
158159
}
159-
mainWindow?.blur();
160160

161161
const result = await Promise.resolve(operation());
162162
return result;

src/preload/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ export type Channels = 'ipc-example';
1313

1414
const electronHandler = {
1515
ipcRenderer: {
16+
invoke: (channel: string, ...args: unknown[]) =>
17+
ipcRenderer.invoke(channel, ...args),
1618
sendMessage(channel: Channels, ...args: unknown[]) {
1719
ipcRenderer.send(channel, ...args);
1820
},

0 commit comments

Comments
 (0)