Skip to content

Commit 7a78e4c

Browse files
committed
refactor(sdk): screenshot only return base64 and scaleFactor
1 parent ec731b4 commit 7a78e4c

File tree

9 files changed: 28 additions (+28) and 56 deletions (-56).

packages/operators/browserbase/package.json

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,7 @@
3636
"dependencies": {
3737
"@ui-tars/shared": "workspace:*",
3838
"@browserbasehq/stagehand": "^1.13.0",
39-
"big.js": "^6.2.2",
40-
"jimp": "1.6.0"
39+
"big.js": "^6.2.2"
4140
},
4241
"peerDependencies": {
4342
"@ui-tars/sdk": "workspace:*"

packages/operators/browserbase/src/index.ts

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@ import {
99
type ExecuteOutput,
1010
StatusEnum,
1111
} from '@ui-tars/sdk/core';
12-
import { Jimp } from 'jimp';
1312
import { Stagehand, ConstructorParams } from '@browserbasehq/stagehand';
1413

1514
export class BrowserbaseOperator extends Operator {
@@ -35,7 +34,7 @@ export class BrowserbaseOperator extends Operator {
3534
User Instruction: Who is the top GitHub contributor to Stagehand by Browserbase?
3635
1. Thought: The best starting point to find the top GitHub contributor to the Stagehand project by Browserbase is the project's GitHub repository itself. This will provide direct access to the contributors' list, where you can see who has made the most contributions.\nAction: GOTO(url='https://github.com/browserbase/stagehand')
3736
2. Thought: The 'Insights' tab on a GitHub repository page provides detailed information about the project's contributors, including the number of contributions each has made. By accessing this tab, we can find the top contributor.\nAction: ACT(description='Click on the 'Insights' tab.')
38-
3. The 'Contributors' section under the 'Insights' tab will show a list of contributors along with the number of contributions each has made. This is the most direct way to identify the top contributor to the project.\nAction: ACT(description='Click on the \'Contributors\' link in the left sidebar.')
37+
3. The 'Contributors' section under the 'Insights' tab will show a list of contributors along with the number of contributions each has made. This is the most direct way to identify the top contributor to the project.\nAction: ACT(description='Click on the 'Contributors' link in the left sidebar.')
3938
4. The screenshot shows the contributors to the Stagehand project, with 'kamath' listed as the top contributor with 69 commits. This information is directly visible in the 'Contributors' section under the 'Insights' tab.\nAction: CLOSE()
4039
4140
\`\`\``,
@@ -60,13 +59,8 @@ User Instruction: Who is the top GitHub contributor to Stagehand by Browserbase?
6059
const cdpSession = await page.context().newCDPSession(page);
6160
const { data: base64 } = await cdpSession.send('Page.captureScreenshot');
6261

63-
const buffer = Buffer.from(base64, 'base64');
64-
const image = await Jimp.fromBuffer(buffer);
65-
6662
return {
6763
base64,
68-
width: image.width,
69-
height: image.height,
7064
scaleFactor: 1,
7165
};
7266
}

packages/operators/nut-js/src/index.ts

Lines changed: 4 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -73,35 +73,27 @@ export class NutJSOperator extends Operator {
7373
const width = screenWithScale.width / screenWithScale.pixelDensity.scaleX;
7474
const height = screenWithScale.height / screenWithScale.pixelDensity.scaleY;
7575

76-
const realScreenImage = await screenWithScaleImage
76+
const physicalScreenImage = await screenWithScaleImage
7777
.resize({
7878
w: width,
7979
h: height,
8080
})
8181
.getBuffer('image/png', { quality: 75 });
8282

8383
const output = {
84-
base64: realScreenImage.toString('base64'),
85-
width,
86-
height,
84+
base64: physicalScreenImage.toString('base64'),
8785
scaleFactor,
8886
};
8987

9088
logger?.info(
91-
`[NutjsOperator] screenshot: ${output.width}x${output.height}, scaleFactor: ${scaleFactor}`,
89+
`[NutjsOperator] screenshot: ${width}x${height}, scaleFactor: ${scaleFactor}`,
9290
);
9391
return output;
9492
}
9593

9694
async execute(params: ExecuteParams): Promise<ExecuteOutput> {
9795
const { logger } = useContext();
98-
const {
99-
parsedPrediction,
100-
screenWidth,
101-
screenHeight,
102-
scaleFactor,
103-
factors,
104-
} = params;
96+
const { parsedPrediction, screenWidth, screenHeight, scaleFactor } = params;
10597

10698
const { action_type, action_inputs } = parsedPrediction;
10799
const startBoxStr = action_inputs?.start_box || '';
@@ -111,7 +103,6 @@ export class NutJSOperator extends Operator {
111103
boxStr: startBoxStr,
112104
screenWidth,
113105
screenHeight,
114-
factors,
115106
});
116107

117108
logger.info(`[NutjsOperator Position]: (${startX}, ${startY})`);

packages/sdk/README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -225,10 +225,10 @@ When implementing a custom operator, you need to implement two core methods: `sc
225225
"jimp": "^1.6.0"
226226
},
227227
"peerDependencies": {
228-
"@ui-tars/sdk": "latest"
228+
"@ui-tars/sdk": "^1.2.0-beta.17"
229229
},
230230
"devDependencies": {
231-
"@ui-tars/sdk": "latest",
231+
"@ui-tars/sdk": "^1.2.0-beta.17",
232232
"tsup": "^8.3.5",
233233
"typescript": "^5.7.2",
234234
"vitest": "^3.0.2"

packages/sdk/src/GUIAgent.ts

Lines changed: 17 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -7,17 +7,13 @@ import { GUIAgentData, StatusEnum, ShareVersion } from '@ui-tars/shared/types';
77
import { IMAGE_PLACEHOLDER, MAX_LOOP_COUNT } from '@ui-tars/shared/constants';
88
import { sleep } from '@ui-tars/shared/utils';
99
import asyncRetry from 'async-retry';
10+
import { Jimp } from 'jimp';
1011

1112
import { setContext } from './context/useContext';
1213
import { Operator, GUIAgentConfig } from './types';
1314
import { UITarsModel } from './Model';
1415
import { BaseGUIAgent } from './base';
15-
import {
16-
getSummary,
17-
parseBoxToScreenCoords,
18-
processVlmParams,
19-
toVlmModelFormat,
20-
} from './utils';
16+
import { getSummary, processVlmParams, toVlmModelFormat } from './utils';
2117
import {
2218
INTERNAL_ACTION_SPACES_ENUM,
2319
MAX_SNAPSHOT_ERR_CNT,
@@ -123,11 +119,15 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent<
123119
onRetry: retry?.screenshot?.onRetry,
124120
});
125121

126-
const isValidImage = !!(
127-
snapshot?.base64 &&
128-
snapshot?.width &&
129-
snapshot?.height
130-
);
122+
const { width, height } =
123+
(await Jimp.read(
124+
Buffer.from(
125+
snapshot.base64.replace(/^data:image\/\w+;base64,/, ''),
126+
'base64',
127+
),
128+
)) || {};
129+
130+
const isValidImage = !!(snapshot?.base64 && width && height);
131131

132132
if (!isValidImage) {
133133
loopCnt -= 1;
@@ -145,8 +145,8 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent<
145145
screenshotBase64: snapshot.base64,
146146
screenshotContext: {
147147
size: {
148-
width: snapshot.width,
149-
height: snapshot.height,
148+
width,
149+
height,
150150
},
151151
scaleFactor: snapshot.scaleFactor,
152152
},
@@ -224,8 +224,8 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent<
224224
},
225225
screenshotContext: {
226226
size: {
227-
width: snapshot.width,
228-
height: snapshot.height,
227+
width,
228+
height,
229229
},
230230
scaleFactor: snapshot.scaleFactor,
231231
},
@@ -271,8 +271,8 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent<
271271
operator.execute({
272272
prediction,
273273
parsedPrediction,
274-
screenWidth: snapshot.width,
275-
screenHeight: snapshot.height,
274+
screenWidth: width,
275+
screenHeight: height,
276276
scaleFactor: snapshot.scaleFactor,
277277
factors: this.model.factors,
278278
}),

packages/sdk/src/utils.ts

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -24,12 +24,10 @@ export const parseBoxToScreenCoords = ({
2424
boxStr,
2525
screenWidth,
2626
screenHeight,
27-
factors = DEFUALT_FACTORS,
2827
}: {
2928
boxStr: string;
3029
screenWidth: number;
3130
screenHeight: number;
32-
factors?: Factors;
3331
}) => {
3432
if (!boxStr) {
3533
return { x: null, y: null };
@@ -41,11 +39,10 @@ export const parseBoxToScreenCoords = ({
4139
.map((num) => parseFloat(num.trim()));
4240

4341
const [x1, y1, x2 = x1, y2 = y1] = coords;
44-
const [widthFactor, heightFactor] = factors;
4542

4643
return {
47-
x: Math.round(((x1 + x2) / 2) * screenWidth * widthFactor) / widthFactor,
48-
y: Math.round(((y1 + y2) / 2) * screenHeight * heightFactor) / heightFactor,
44+
x: Math.round(((x1 + x2) / 2) * screenWidth * 1000) / 1000,
45+
y: Math.round(((y1 + y2) / 2) * screenHeight * 1000) / 1000,
4946
};
5047
};
5148

packages/shared/src/types/agent.ts

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -26,11 +26,7 @@ export interface VlmResponse {
2626
export interface ScreenshotResult {
2727
/** screenshot base64, `keep screenshot size as physical resolution` */
2828
base64: string;
29-
/** real screenshot pixel width, `Physical Resolution` */
30-
width: number;
31-
/** real screenshot pixel height, `Physical Resolution` */
32-
height: number;
33-
/** screenshot scale factor(DPR) */
29+
/** screenshot scale factor(DPR), physical_resolution = logical_resolution * scaleFactor */
3430
scaleFactor: number;
3531
}
3632

pnpm-lock.yaml

Lines changed: 0 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/main/agent/operator.ts

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -79,8 +79,6 @@ export class NutJSElectronOperator extends NutJSOperator {
7979

8080
return {
8181
base64: resized.toPNG().toString('base64'),
82-
width: physicalSize.width,
83-
height: physicalSize.height,
8482
scaleFactor,
8583
};
8684
}

0 commit comments

Comments
 (0)