Skip to content

Commit 714a5c9

Browse files
committed
fix: scale factor
1 parent 77ea381 commit 714a5c9

File tree

12 files changed

+77
-64
lines changed

12 files changed

+77
-64
lines changed

packages/operators/nut-js/test/execute.test.ts

Lines changed: 7 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ vi.mock('@ui-tars/sdk/core', async (importOriginal) => {
2929
factor: 1000,
3030
}),
3131
Operator: actual.Operator,
32+
parseBoxToScreenCoords: actual.parseBoxToScreenCoords,
3233
};
3334
});
3435

@@ -85,6 +86,7 @@ describe('execute', () => {
8586
},
8687
screenWidth: 1920,
8788
screenHeight: 1080,
89+
scaleFactor: 1,
8890
};
8991

9092
await nutJSOperator.execute(executeParams);
@@ -96,30 +98,6 @@ describe('execute', () => {
9698
expect(mouse.click).toHaveBeenCalledWith(Button.LEFT);
9799
});
98100

99-
it('Click on the search bar at the top of the screen with scaleFactor', async () => {
100-
const nutJSOperator = new NutJSOperator({ scaleFactor: 1.5 });
101-
const executeParams: ExecuteParams = {
102-
prediction: {
103-
reflection: '',
104-
thought: 'Click on the search bar at the top of the screen\n',
105-
action_type: 'click',
106-
action_inputs: {
107-
start_box: '[0.072,0.646,0.072,0.646]',
108-
},
109-
},
110-
screenWidth: 1920,
111-
screenHeight: 1080,
112-
};
113-
114-
await nutJSOperator.execute(executeParams);
115-
116-
expect(mouse.move).toHaveBeenCalledWith(
117-
straightTo(new Point(207.36, 1046.52)),
118-
);
119-
120-
expect(mouse.click).toHaveBeenCalledWith(Button.LEFT);
121-
});
122-
123101
it('type doubao.com\n', async () => {
124102
const nutJSOperator = new NutJSOperator();
125103
const executeParams: ExecuteParams = {
@@ -133,6 +111,7 @@ describe('execute', () => {
133111
},
134112
screenWidth: 1920,
135113
screenHeight: 1080,
114+
scaleFactor: 1,
136115
};
137116

138117
await nutJSOperator.execute(executeParams);
@@ -154,6 +133,7 @@ describe('execute', () => {
154133
},
155134
screenWidth: 1920,
156135
screenHeight: 1080,
136+
scaleFactor: 1,
157137
};
158138

159139
await nutJSOperator.execute(executeParams);
@@ -175,6 +155,7 @@ describe('execute', () => {
175155
},
176156
screenWidth: 1920,
177157
screenHeight: 1080,
158+
scaleFactor: 1,
178159
};
179160

180161
await nutJSOperator.execute(executeParams);
@@ -199,6 +180,7 @@ describe('execute', () => {
199180
},
200181
screenWidth: 1920,
201182
screenHeight: 1080,
183+
scaleFactor: 1,
202184
};
203185

204186
await nutJSOperator.execute(executeParams);
@@ -224,6 +206,7 @@ describe('execute', () => {
224206
},
225207
screenWidth: 1920,
226208
screenHeight: 1080,
209+
scaleFactor: 1,
227210
};
228211

229212
await nutJSOperator.execute(executeParams);

packages/sdk/src/GUIAgent.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,7 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent<
127127
width: snapshot.width,
128128
height: snapshot.height,
129129
},
130+
scaleFactor: snapshot.scaleFactor,
130131
},
131132
timing: {
132133
start,
@@ -202,6 +203,7 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent<
202203
width: snapshot.width,
203204
height: snapshot.height,
204205
},
206+
scaleFactor: snapshot.scaleFactor,
205207
},
206208
predictionParsed: parsedPredictions,
207209
});

packages/shared/src/types/data.ts

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,13 @@ export interface Conversation extends Message {
1616
screenshotBase64?: string;
1717
screenshotContext?: {
1818
size: {
19+
/** physical device width */
1920
width: number;
21+
/** physical device height */
2022
height: number;
2123
};
24+
/** screenshot scale factor (DPR, device pixel ratio) */
25+
scaleFactor?: number;
2226
};
2327
predictionParsed?: PredictionParsed[];
2428
}

src/main/agent/operator.ts

Lines changed: 6 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -6,24 +6,23 @@ import { Key, keyboard } from '@computer-use/nut-js';
66
import { type ScreenshotOutput, type ExecuteParams } from '@ui-tars/sdk/core';
77
import { NutJSOperator } from '@ui-tars/operator-nut-js';
88
import { clipboard } from 'electron';
9-
import { desktopCapturer, screen } from 'electron';
9+
import { desktopCapturer } from 'electron';
1010

1111
import * as env from '@main/env';
1212
import { logger } from '@main/logger';
1313
import { sleep } from '@ui-tars/shared/utils';
14+
import { getScreenSize } from '@main/utils/screen';
1415

1516
export class NutJSElectronOperator extends NutJSOperator {
1617
public async screenshot(): Promise<ScreenshotOutput> {
17-
const primaryDisplay = screen.getPrimaryDisplay();
18-
19-
const logicalSize = primaryDisplay.size; // Logical = Physical / scaleX
18+
const { physicalSize, logicalSize, scaleFactor } = getScreenSize(); // Logical = Physical / scaleX
2019

2120
logger.info(
2221
'[screenshot] [primaryDisplay]',
23-
'size:',
24-
primaryDisplay.size,
22+
'logicalSize:',
23+
logicalSize,
2524
'scaleFactor:',
26-
primaryDisplay.scaleFactor,
25+
scaleFactor,
2726
);
2827

2928
const sources = await desktopCapturer.getSources({
@@ -36,14 +35,6 @@ export class NutJSElectronOperator extends NutJSOperator {
3635
const primarySource = sources[0];
3736
const screenshot = primarySource.thumbnail;
3837

39-
// Mac retina display scaleFactor = 1
40-
const scaleFactor = env.isMacOS ? 1 : primaryDisplay.scaleFactor;
41-
42-
const physicalSize = {
43-
width: Math.round(logicalSize.width * scaleFactor),
44-
height: Math.round(logicalSize.height * scaleFactor),
45-
};
46-
4738
const resized = screenshot.resize({
4839
width: physicalSize.width,
4940
height: physicalSize.height,

src/main/ipcRoutes/screen.ts

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,17 +2,19 @@
22
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
5-
import { screen } from 'electron';
65
import { initIpc } from '@ui-tars/electron-ipc/main';
6+
import { getScreenSize } from '@main/utils/screen';
77

88
const t = initIpc.create();
99

1010
export const screenRoute = t.router({
1111
getScreenSize: t.procedure.input<void>().handle(async () => {
12-
const primaryDisplay = screen.getPrimaryDisplay();
12+
const primaryDisplay = getScreenSize();
13+
1314
return {
14-
screenWidth: primaryDisplay.size.width,
15-
screenHeight: primaryDisplay.size.height,
15+
screenWidth: primaryDisplay.physicalSize.width,
16+
screenHeight: primaryDisplay.physicalSize.height,
17+
scaleFactor: primaryDisplay.scaleFactor,
1618
};
1719
}),
1820
});

src/main/services/runAgent.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ export const runAgent = async (
5555
predictionParsed
5656
) {
5757
const screenshotBase64WithElementMarker = await markClickPosition({
58-
...screenshotContext.size,
58+
screenshotContext,
5959
base64: lastConv?.screenshotBase64,
6060
parsed: predictionParsed,
6161
}).catch((e) => {
@@ -95,7 +95,7 @@ export const runAgent = async (
9595
screenshotContext?.size &&
9696
!abortController?.signal?.aborted
9797
) {
98-
showPredictionMarker(predictionParsed, screenshotContext.size);
98+
showPredictionMarker(predictionParsed, screenshotContext);
9999
}
100100

101101
setState({

src/main/shared/setOfMarks.test.ts

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,8 +64,11 @@ const testMakeScreenMarker = () => {
6464
const { overlays } = setOfMarksOverlays({
6565
predictions: [action],
6666
screenshotContext: {
67-
width: 2560,
68-
height: 1440,
67+
size: {
68+
width: 2560,
69+
height: 1440,
70+
},
71+
scaleFactor: 1,
6972
},
7073
xPos,
7174
yPos,

src/main/shared/setOfMarks.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,14 +30,14 @@ export const setOfMarksOverlays = ({
3030
yPos,
3131
}: {
3232
predictions: PredictionParsed[];
33-
screenshotContext: NonNullable<Conversation['screenshotContext']>['size'];
33+
screenshotContext: NonNullable<Conversation['screenshotContext']>;
3434
xPos?: number;
3535
yPos?: number;
3636
}): {
3737
overlays: Overlay[];
3838
} => {
3939
const overlays: Overlay[] = [];
40-
const { width, height } = screenshotContext;
40+
const { width, height } = screenshotContext?.size || {};
4141

4242
for (const prediction of predictions) {
4343
let boxWidth: number;

src/main/utils/image.ts

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4,16 +4,15 @@
44
*/
55
import sharp from 'sharp';
66

7-
import { type PredictionParsed } from '@ui-tars/shared/types';
7+
import { Conversation, type PredictionParsed } from '@ui-tars/shared/types';
88

99
import { logger } from '@main/logger';
1010
import { setOfMarksOverlays } from '@main/shared/setOfMarks';
1111

1212
// TODO: use jimp to mark click position
1313
export async function markClickPosition(data: {
1414
base64: string;
15-
width: number;
16-
height: number;
15+
screenshotContext: NonNullable<Conversation['screenshotContext']>;
1716
parsed: PredictionParsed[];
1817
}): Promise<string> {
1918
if (!data?.parsed?.length) {
@@ -23,10 +22,7 @@ export async function markClickPosition(data: {
2322
const imageBuffer = Buffer.from(data.base64, 'base64');
2423
const { overlays = [] } = setOfMarksOverlays({
2524
predictions: data.parsed,
26-
screenshotContext: {
27-
width: data.width,
28-
height: data.height,
29-
},
25+
screenshotContext: data.screenshotContext,
3026
});
3127
const imageOverlays: sharp.OverlayOptions[] = overlays
3228
.map((o) => {

src/main/utils/screen.ts

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,26 @@
1+
/*
2+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
import { screen } from 'electron';
6+
7+
import * as env from '@main/env';
8+
9+
export const getScreenSize = () => {
10+
const primaryDisplay = screen.getPrimaryDisplay();
11+
12+
const logicalSize = primaryDisplay.size; // Logical = Physical / scaleX
13+
// On macOS (Retina displays included) scaleFactor is forced to 1, so physicalSize is computed directly from the logical size
14+
const scaleFactor = env.isMacOS ? 1 : primaryDisplay.scaleFactor;
15+
16+
const physicalSize = {
17+
width: Math.round(logicalSize.width * scaleFactor),
18+
height: Math.round(logicalSize.height * scaleFactor),
19+
};
20+
21+
return {
22+
physicalSize,
23+
logicalSize,
24+
scaleFactor,
25+
};
26+
};

src/main/window/ScreenMarker.ts

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -199,14 +199,15 @@ class ScreenMarker {
199199
// show Screen Marker in screen for prediction
200200
showPredictionMarker(
201201
predictions: PredictionParsed[],
202-
screenshotContext: NonNullable<Conversation['screenshotContext']>['size'],
202+
screenshotContext: NonNullable<Conversation['screenshotContext']>,
203203
) {
204204
const { overlays } = setOfMarksOverlays({
205205
predictions,
206206
screenshotContext,
207207
xPos: this.lastShowPredictionMarkerPos?.xPos,
208208
yPos: this.lastShowPredictionMarkerPos?.yPos,
209209
});
210+
const { scaleFactor = 1 } = screenshotContext;
210211

211212
// loop predictions
212213
for (let i = 0; i < overlays.length; i++) {
@@ -230,8 +231,9 @@ class ScreenMarker {
230231
webPreferences: { nodeIntegration: true, contextIsolation: false },
231232
...(overlay.xPos &&
232233
overlay.yPos && {
233-
x: overlay.xPos + overlay.offsetX,
234-
y: overlay.yPos + overlay.offsetY,
234+
// Logical Resolution
235+
x: (overlay.xPos + overlay.offsetX) * scaleFactor,
236+
y: (overlay.yPos + overlay.offsetY) * scaleFactor,
235237
}),
236238
});
237239

@@ -309,7 +311,7 @@ export const closeScreenMarker = () => {
309311

310312
export const showPredictionMarker = (
311313
predictions: PredictionParsed[],
312-
screenshotContext: NonNullable<Conversation['screenshotContext']>['size'],
314+
screenshotContext: NonNullable<Conversation['screenshotContext']>,
313315
) => {
314316
ScreenMarker.getInstance().showPredictionMarker(
315317
predictions,

src/renderer/src/hooks/useScreenRecord.ts

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ export const useScreenRecord = (
2222

2323
const drawSetOfMarkOverlays = (
2424
ctx: CanvasRenderingContext2D,
25-
screenshotContext: NonNullable<Conversation['screenshotContext']>['size'],
25+
screenshotContext: NonNullable<Conversation['screenshotContext']>,
2626
) => {
2727
const messages = getState().messages;
2828
// console.log('[messages]', messages);
@@ -64,7 +64,8 @@ export const useScreenRecord = (
6464
try {
6565
recordedChunksRef.current = [];
6666

67-
const { screenWidth, screenHeight } = await api.getScreenSize();
67+
const { screenWidth, screenHeight, scaleFactor } =
68+
await api.getScreenSize();
6869

6970
const stream = await navigator.mediaDevices.getDisplayMedia({
7071
video: {
@@ -131,8 +132,11 @@ export const useScreenRecord = (
131132

132133
// draw set of mark overlays
133134
drawSetOfMarkOverlays(ctx, {
134-
width: screenWidth,
135-
height: screenHeight,
135+
size: {
136+
width: screenWidth,
137+
height: screenHeight,
138+
},
139+
scaleFactor,
136140
});
137141
}
138142
animationFrameId = requestAnimationFrame(drawFrame);

0 commit comments

Comments
 (0)