Skip to content

Commit 75de1eb

Browse files
ZhaoHehskychxulivzycjcl868
authored
feat: UI-tars-1.5 (#502)
Co-authored-by: skychx <skychx@hotmail.com> Co-authored-by: ZhaoHeh <zhaoqinghao@bytedance.com> Co-authored-by: ULIVZ <chenhaoli@bytedance.com> Co-authored-by: Charles <jinxin001@bytedance.com>
1 parent 0ff5eb8 commit 75de1eb

File tree

151 files changed

+10068
-2111
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

151 files changed

+10068
-2111
lines changed

README.md

+7-7
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ UI-TARS Desktop is a GUI Agent application based on [UI-TARS (Vision-Language Mo
1919

2020
<p align="center">
2121
&nbsp&nbsp 📑 <a href="https://arxiv.org/abs/2501.12326">Paper</a> &nbsp&nbsp
22-
| 🤗 <a href="https://huggingface.co/bytedance-research/UI-TARS-7B-DPO">Hugging Face Models</a>&nbsp&nbsp
22+
| 🤗 <a href="https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B">Hugging Face Models</a>&nbsp&nbsp
2323
| &nbsp&nbsp🫨 <a href="https://discord.gg/pTXwYVjfcs">Discord</a>&nbsp&nbsp
24-
| &nbsp&nbsp🤖 <a href="https://www.modelscope.cn/models/bytedance-research/UI-TARS-7B-DPO">ModelScope</a>&nbsp&nbsp
24+
| &nbsp&nbsp🤖 <a href="https://www.modelscope.cn/collections/UI-TARS-bccb56fa1ef640">ModelScope</a>&nbsp&nbsp
2525
<br>
2626
🖥️ Desktop Application &nbsp&nbsp
2727
| &nbsp&nbsp 👓 <a href="https://github.com/web-infra-dev/midscene">Midscene (use in browser)</a>
@@ -31,13 +31,13 @@ UI-TARS Desktop is a GUI Agent application based on [UI-TARS (Vision-Language Mo
3131

3232
| Instruction | Video |
3333
| :---: | :---: |
34-
| Get the current weather in SF using the web browser | <video src="https://github.com/user-attachments/assets/5235418c-ac61-4895-831d-68c1c749fc87" height="300" /> |
35-
| Send a twitter with the content "hello world" | <video src="https://github.com/user-attachments/assets/737ccc11-9124-4464-b4be-3514cbced85c" height="300" /> |
34+
| Please help me open the autosave feature of VS Code and delay AutoSave operations for 500 milliseconds in the VS Code setting. | <video src="https://github.com/user-attachments/assets/e0914ce9-ad33-494b-bdec-0c25c1b01a27" height="300" /> |
35+
| Could you help me check the latest open issue of the UI-TARS-Desktop project on GitHub? | <video src="https://github.com/user-attachments/assets/3d159f54-d24a-4268-96c0-e149607e9199" height="300" /> |
3636

3737

3838
## News
3939

40-
- **\[2025-04-16\]** - We shared the latest progress of the **UI-TARS-1.5** model in our blog ([seed-tars.com/1.5](https://seed-tars.com/1.5)), which excels in playing games and performing GUI tasks, and we open-sourced the [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B), checkout the showcase here: https://seed-tars.com/showcase
40+
- **\[2025-04-17\]** - 🎉 We're thrilled to announce the release of the new UI-TARS Desktop application v0.1.0, featuring a redesigned Agent UI. The application enhances the computer-use experience, introduces new browser operation features, and supports [the advanced UI-TARS-1.5 model](https://seed-tars.com/1.5) for improved performance and precise control.
4141
- **\[2025-02-20\]** - 📦 Introduced [UI TARS SDK](./docs/sdk.md), a powerful cross-platform toolkit for building GUI automation agents.
4242
- **\[2025-01-23\]** - 🚀 We updated the **[Cloud Deployment](./docs/deployment.md#cloud-deployment)** section in the 中文版: [GUI模型部署教程](https://bytedance.sg.larkoffice.com/docx/TCcudYwyIox5vyxiSDLlgIsTgWf#U94rdCxzBoJMLex38NPlHL21gNb) with new information related to the ModelScope platform. You can now use the ModelScope platform for deployment.
4343

@@ -47,7 +47,7 @@ UI-TARS Desktop is a GUI Agent application based on [UI-TARS (Vision-Language Mo
4747
- 🤖 Natural language control powered by Vision-Language Model
4848
- 🖥️ Screenshot and visual recognition support
4949
- 🎯 Precise mouse and keyboard control
50-
- 💻 Cross-platform support (Windows/MacOS)
50+
- 💻 Cross-platform support (Windows/MacOS/Browser)
5151
- 🔄 Real-time feedback and status display
5252
- 🔐 Private and secure - fully local processing
5353

@@ -57,7 +57,7 @@ See [Quick Start](./docs/quick-start.md).
5757

5858
## Deployment
5959

60-
See [Deployment](./docs/deployment.md).
60+
See [Deployment](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md).
6161

6262
## Contributing
6363

apps/agent-tars/tsconfig.base.json

+2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
"baseUrl": ".",
44
"moduleResolution": "bundler",
55
"paths": {
6+
"react": ["./src/renderer/node_modules/@types/react"],
7+
"react-dom": ["./src/renderer/node_modules/@types/react-dom"],
68
"@shared/*": ["./src/shared/*"],
79
"@main/*": ["./src/main/*"],
810
"@renderer/*": ["./src/renderer/src/*"],

apps/ui-tars/electron.vite.config.ts

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import tsconfigPaths from 'vite-tsconfig-paths';
1010

1111
import pkg from './package.json';
1212
import { getExternalPkgs } from './scripts/getExternalPkgs';
13+
import tailwindcss from '@tailwindcss/vite';
1314

1415
export default defineConfig({
1516
main: {
@@ -66,7 +67,7 @@ export default defineConfig({
6667
},
6768
},
6869
},
69-
plugins: [react(), tsconfigPaths()],
70+
plugins: [react(), tsconfigPaths(), tailwindcss()],
7071
define: {
7172
APP_VERSION: JSON.stringify(pkg.version),
7273
},
-14.2 KB
Loading

apps/ui-tars/images/mac_app.png

10.8 KB
Loading

apps/ui-tars/images/mac_broken.png

-9.48 KB
Loading

apps/ui-tars/images/mac_install.png

-36.3 KB
Loading
-15.9 KB
Loading
Loading
Loading
83.2 KB
Loading
64.3 KB
Loading
74.7 KB
Loading
Loading
Loading
Loading
Loading
Loading
128 KB
Loading
Loading
Loading
Loading
Loading
Loading

apps/ui-tars/images/setting.png

48 KB
Loading
-147 Bytes
Loading
230 KB
Loading
87.6 KB
Loading

apps/ui-tars/images/utio-flow.png

-208 KB
Loading
-14.1 KB
Loading

apps/ui-tars/package.json

+25-22
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "ui-tars-desktop",
3-
"version": "0.0.9",
3+
"version": "0.1.0-preview",
44
"private": true,
55
"main": "./dist/main/main.js",
66
"packageManager": "pnpm@9.10.0",
@@ -69,65 +69,68 @@
6969
"sharp": "0.33.3"
7070
},
7171
"devDependencies": {
72+
"@common/electron-build": "workspace:*",
7273
"@computer-use/mac-screen-capture-permissions": "^1.0.2",
7374
"@computer-use/nut-js": "^4.2.0",
74-
"@ui-tars/operator-nut-js": "workspace:*",
75-
"@electron-toolkit/utils": "^3.0.0",
76-
"@common/electron-build": "workspace:*",
77-
"@ui-tars/sdk": "workspace:*",
78-
"@ui-tars/action-parser": "workspace:*",
79-
"@ui-tars/shared": "workspace:*",
80-
"@ui-tars/utio": "workspace:*",
81-
"@ui-tars/electron-ipc": "workspace:*",
82-
"dotenv": "^16.4.7",
83-
"electron-debug": "^3.2.0",
84-
"electron-devtools-installer": "^3.2.0",
85-
"electron-log": "5.2.4",
86-
"electron-squirrel-startup": "^1.0.1",
87-
"electron-store": "^10.0.0",
88-
"js-yaml": "^4.1.0",
89-
"ms": "^2.1.3",
90-
"electron-updater": "^6.3.9",
91-
"zustand": "^5.0.0",
92-
"@electron-forge/plugin-base": "^7.7.0",
93-
"@electron-forge/shared-types": "^7.7.0",
9475
"@electron-forge/cli": "^7.7.0",
9576
"@electron-forge/maker-dmg": "^7.7.0",
9677
"@electron-forge/maker-pkg": "^7.7.0",
9778
"@electron-forge/maker-squirrel": "^7.7.0",
9879
"@electron-forge/maker-zip": "^7.7.0",
9980
"@electron-forge/plugin-auto-unpack-natives": "^7.7.0",
81+
"@electron-forge/plugin-base": "^7.7.0",
10082
"@electron-forge/plugin-fuses": "^7.7.0",
10183
"@electron-forge/plugin-vite": "^7.7.0",
10284
"@electron-forge/publisher-github": "^7.7.0",
85+
"@electron-forge/shared-types": "^7.7.0",
10386
"@electron-toolkit/eslint-config-prettier": "^2.0.0",
10487
"@electron-toolkit/eslint-config-ts": "^2.0.0",
10588
"@electron-toolkit/preload": "^3.0.1",
10689
"@electron-toolkit/tsconfig": "^1.0.1",
90+
"@electron-toolkit/utils": "^3.0.0",
10791
"@electron/asar": "^3.2.18",
10892
"@electron/fuses": "^1.8.0",
10993
"@trivago/prettier-plugin-sort-imports": "^5.2.1",
11094
"@types/node": "^20.14.8",
11195
"@typescript-eslint/eslint-plugin": "^5.0.0",
11296
"@typescript-eslint/parser": "^5.0.0",
97+
"@ui-tars/action-parser": "workspace:*",
98+
"@ui-tars/electron-ipc": "workspace:*",
99+
"@ui-tars/operator-nut-js": "workspace:*",
100+
"@ui-tars/operator-browser": "workspace:*",
101+
"@ui-tars/sdk": "workspace:*",
102+
"@ui-tars/shared": "workspace:*",
103+
"@ui-tars/utio": "workspace:*",
113104
"@vitejs/plugin-react": "^4.3.4",
114105
"@vitest/coverage-istanbul": "^3.0.3",
115106
"cross-env": "^7.0.3",
107+
"dotenv": "^16.4.7",
116108
"electron": "34.1.1",
109+
"electron-debug": "^3.2.0",
110+
"electron-devtools-installer": "^3.2.0",
111+
"electron-log": "5.2.4",
117112
"electron-packager-languages": "0.5.0",
113+
"electron-squirrel-startup": "^1.0.1",
114+
"electron-store": "^10.0.0",
118115
"electron-vite": "^3.0.0",
119116
"eslint": "^8.57.0",
120117
"eslint-plugin-import": "^2.25.0",
121118
"eslint-plugin-react": "^7.34.3",
119+
"js-yaml": "^4.1.0",
120+
"ms": "^2.1.3",
122121
"prettier": "^3.3.2",
123122
"rimraf": "^6.0.1",
124123
"sass-embedded": "^1.83.1",
125124
"ts-node": "^10.9.2",
126125
"tsx": "^4.19.2",
127126
"typescript": "^5.7.2",
127+
"update-electron-app": "^3.1.0",
128128
"vite": "^6.1.0",
129129
"vite-tsconfig-paths": "^5.1.4",
130-
"vitest": "^3.0.8"
130+
"vitest": "^3.0.8",
131+
"zustand": "^5.0.0",
132+
"@tailwindcss/vite": "4.1.3",
133+
"tailwindcss": "4.1.3"
131134
},
132135
"engines": {
133136
"node": ">=20.x"

apps/ui-tars/src/main/agent/prompts.ts

+62
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,65 @@ ${NutJSElectronOperator.MANUAL.ACTION_SPACES.join('\n')}
2323
2424
## User Instruction
2525
`;
26+
27+
export const getSystemPromptV1_5 = (
28+
language: 'zh' | 'en',
29+
useCase: 'normal' | 'poki',
30+
) => `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
31+
32+
## Output Format
33+
\`\`\`
34+
Thought: ...
35+
Action: ...
36+
\`\`\`
37+
38+
## Action Space
39+
40+
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
41+
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
42+
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
43+
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
44+
hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
45+
type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
46+
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') # Show more information on the \`direction\` side.
47+
wait() # Sleep for 5s and take a screenshot to check for any changes.
48+
finished()
49+
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
50+
51+
52+
## Note
53+
- Use ${language === 'zh' ? 'Chinese' : 'English'} in \`Thought\` part.
54+
- ${useCase === 'normal' ? 'Generate a well-defined and practical strategy in the `Thought` section, summarizing your next move and its objective.' : 'Compose a step-by-step approach in the `Thought` part, specifying your next action and its focus.'}
55+
56+
## User Instruction
57+
`;
58+
59+
export const getSystemPromptPoki = `
60+
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
61+
62+
## Output Format
63+
\`\`\`
64+
Thought: ...
65+
Action: ...
66+
\`\`\`
67+
68+
## Action Space
69+
70+
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
71+
left_double(start_box='<|box_start|>(x1,y1)<|box_end|>')
72+
right_single(start_box='<|box_start|>(x1,y1)<|box_end|>')
73+
drag(start_box='<|box_start|>(x1,y1)<|box_end|>', end_box='<|box_start|>(x3,y3)<|box_end|>')
74+
hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
75+
type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
76+
scroll(start_box='<|box_start|>(x1,y1)<|box_end|>', direction='down or up or right or left') # Show more information on the \`direction\` side.
77+
wait() # Sleep for 5s and take a screenshot to check for any changes.
78+
finished()
79+
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
80+
81+
82+
## Note
83+
- Use Chinese in \`Thought\` part.
84+
- Compose a step-by-step approach in the \`Thought\` part, specifying your next action and its focus.
85+
86+
## User Instruction
87+
`;

apps/ui-tars/src/main/ipcRoutes/agent.ts

+47
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,38 @@ import { runAgent } from '@main/services/runAgent';
99
import { showWindow } from '@main/window/index';
1010

1111
import { closeScreenMarker } from '@main/window/ScreenMarker';
12+
import { GUIAgent } from '@ui-tars/sdk';
13+
import { Operator } from '@ui-tars/sdk/core';
1214

1315
const t = initIpc.create();
1416

17+
export class GUIAgentManager {
18+
private static instance: GUIAgentManager;
19+
private currentAgent: GUIAgent<Operator> | null = null;
20+
21+
// eslint-disable-next-line @typescript-eslint/no-empty-function
22+
private constructor() {}
23+
24+
public static getInstance(): GUIAgentManager {
25+
if (!GUIAgentManager.instance) {
26+
GUIAgentManager.instance = new GUIAgentManager();
27+
}
28+
return GUIAgentManager.instance;
29+
}
30+
31+
public setAgent(agent: GUIAgent<Operator>) {
32+
this.currentAgent = agent;
33+
}
34+
35+
public getAgent(): GUIAgent<Operator> | null {
36+
return this.currentAgent;
37+
}
38+
39+
public clearAgent() {
40+
this.currentAgent = null;
41+
}
42+
}
43+
1544
export const agentRoute = t.router({
1645
runAgent: t.procedure.input<void>().handle(async () => {
1746
const { thinking } = store.getState();
@@ -29,13 +58,31 @@ export const agentRoute = t.router({
2958

3059
store.setState({ thinking: false });
3160
}),
61+
pauseRun: t.procedure.input<void>().handle(async () => {
62+
const guiAgent = GUIAgentManager.getInstance().getAgent();
63+
if (guiAgent instanceof GUIAgent) {
64+
guiAgent.pause();
65+
store.setState({ thinking: false });
66+
}
67+
}),
68+
resumeRun: t.procedure.input<void>().handle(async () => {
69+
const guiAgent = GUIAgentManager.getInstance().getAgent();
70+
if (guiAgent instanceof GUIAgent) {
71+
guiAgent.resume();
72+
store.setState({ thinking: false });
73+
}
74+
}),
3275
stopRun: t.procedure.input<void>().handle(async () => {
3376
const { abortController } = store.getState();
3477
store.setState({ status: StatusEnum.END, thinking: false });
3578

3679
showWindow();
3780

3881
abortController?.abort();
82+
const guiAgent = GUIAgentManager.getInstance().getAgent();
83+
if (guiAgent instanceof GUIAgent) {
84+
guiAgent.stop();
85+
}
3986

4087
closeScreenMarker();
4188
}),

apps/ui-tars/src/main/main.ts

+11-10
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import { SettingStore } from './store/setting';
3333
import { createTray } from './tray';
3434
import { registerSettingsHandlers } from './services/settings';
3535
import { sanitizeState } from './utils/sanitizeState';
36+
import { windowManager } from './services/windowManager';
3637

3738
const { isProd } = env;
3839

@@ -185,18 +186,17 @@ const registerIPCHandlers = (
185186
return sanitizeState(state);
186187
});
187188

189+
// Register the already-existing windows during initialization
190+
wrappers.forEach((wrapper) => {
191+
if (wrapper instanceof BrowserWindow) {
192+
windowManager.registerWindow(wrapper);
193+
}
194+
});
195+
188196
// only send state to the wrappers that are not destroyed
189197
ipcMain.on('subscribe', (state: unknown) => {
190-
for (const wrapper of wrappers) {
191-
const webContents = wrapper?.webContents;
192-
if (webContents?.isDestroyed()) {
193-
break;
194-
}
195-
webContents?.send(
196-
'subscribe',
197-
sanitizeState(state as Record<string, unknown>),
198-
);
199-
}
198+
const sanitizedState = sanitizeState(state as Record<string, unknown>);
199+
windowManager.broadcast('subscribe', sanitizedState);
200200
});
201201

202202
const unsubscribe = store.subscribe((state: unknown) =>
@@ -243,4 +243,5 @@ app
243243

244244
logger.info('app.whenReady end');
245245
})
246+
246247
.catch(console.log);

0 commit comments

Comments
 (0)