Skip to content

Commit bbf9881

Browse files
fix(core): evaluation code (#1135)
* fix(core): evaluation code
* fix(shared): globalConfigManager init error in chrome extension

---------

Co-authored-by: EAGzzyCSL <eagzzycsl@hotmail.com>
1 parent d154153 commit bbf9881

File tree

16 files changed

+199
-93
lines changed

16 files changed

+199
-93
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ midscene_run
109109
midscene_run/report
110110
midscene_run/dump
111111

112+
*.ignore.png
112113
extension_output
113114
.cursor
114115
android-playground/static/

apps/site/docs/en/choose-a-model.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ You can use the `Doubao-1.5-thinking-vision-pro` model on [Volcano Engine](https
5353
```bash
5454
OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3"
5555
OPENAI_API_KEY="...."
56-
MIDSCENE_MODEL_NAME="ep-..." # Inference endpoint ID from Volcano Engine
56+
MIDSCENE_MODEL_NAME="ep-..." # Inference endpoint ID or model name from Volcano Engine
5757
MIDSCENE_USE_DOUBAO_VISION=1
5858
```
5959

@@ -64,7 +64,7 @@ You can use `doubao-1.5-ui-tars` on [Volcengine](https://www.volcengine.com):
6464
```bash
6565
OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3"
6666
OPENAI_API_KEY="...."
67-
MIDSCENE_MODEL_NAME="ep-2025..."
67+
MIDSCENE_MODEL_NAME="ep-2025..." # Inference endpoint ID or model name from Volcano Engine
6868
MIDSCENE_USE_VLM_UI_TARS=DOUBAO
6969
```
7070

apps/site/docs/zh/choose-a-model.mdx

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ MIDSCENE_USE_QWEN_VL=1
4242
```bash
4343
OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3"
4444
OPENAI_API_KEY="...."
45-
MIDSCENE_MODEL_NAME="ep-..." # 火山引擎的推理接入点ID
45+
MIDSCENE_MODEL_NAME="ep-..." # 火山引擎模型名称或推理接入点 ID
4646
MIDSCENE_USE_DOUBAO_VISION=1
4747
```
4848

@@ -53,7 +53,7 @@ MIDSCENE_USE_DOUBAO_VISION=1
5353
```bash
5454
OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3"
5555
OPENAI_API_KEY="...."
56-
MIDSCENE_MODEL_NAME="ep-2025..." # 火山引擎的推理接入点ID
56+
MIDSCENE_MODEL_NAME="ep-2025..." # 火山引擎模型名称或推理接入点 ID
5757
MIDSCENE_USE_VLM_UI_TARS=DOUBAO
5858
```
5959

@@ -132,7 +132,7 @@ Qwen-2.5-VL 具备操作规划(action planning)能力,可以自主控制
132132
**在 Midscene.js 中使用时的限制**
133133

134134
- **小图标识别能力较差**:为了准确识别小图标、小元素,你可能需要[开启 `deepThink` 参数](./blog-introducing-instant-actions-and-deep-think)并调优描述,否则识别效果无法保证。
135-
- **断言能力不足**:在某些情况下,Qwen-2.5-VL 的断言能力可能不如 `gpt-4o`
135+
- **断言能力不足**:在某些情况下,Qwen-2.5-VL 的断言能力不如 `gpt-4o`
136136

137137
**配置**
138138

@@ -185,7 +185,7 @@ UI-TARS 是一个专为 UI 自动化设计的开源模型。它仅以截图作
185185
**在 Midscene.js 中使用时的限制**
186186

187187
- **断言能力较差**:在某些情况下,UI-TARS 的断言能力可能不如 `gpt-4o`
188-
- **无法固定操作路径**:UI-TARS 具备较强的探索能力,它可能会主动多次尝试不同的操作路径,这可能会导致每次调用时操作路径不固定
188+
- **无法固定操作路径**:UI-TARS 具备较强的探索能力,它可能会主动多次尝试不同的操作路径,继而导致每次调用时操作路径不固定
189189

190190
**配置**
191191

@@ -202,7 +202,7 @@ MIDSCENE_USE_VLM_UI_TARS=1 # 别忘了配置这项用于 UI-TARS 模式!
202202

203203
在火山引擎平台上,有一个已经部署完成的 `doubao-1.5-ui-tars` 模型,开发者可以通过 API 调用、按使用量付费。模型文档帮助:https://www.volcengine.com/docs/82379/1536429
204204

205-
在使用火山引擎版本模型时,需要创建推理接入点(形如 `ep-2025...`)。集齐 API Key 和推理点信息后,配置文件类似如下:
205+
在使用火山引擎版本模型时,可能需要创建推理接入点(形如 `ep-2025...`)。集齐 API Key 和推理点信息后,配置文件类似如下:
206206

207207
```bash
208208
# 注意 URL 最后填写到 /v3 结束即可

packages/evaluation/package.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
"format": "cd ../.. && npm run lint"
2121
},
2222
"files": ["dist", "README.md"],
23-
"type": "module",
2423
"dependencies": {
2524
"@midscene/core": "workspace:*",
2625
"@midscene/shared": "workspace:*",

packages/evaluation/src/test-analyzer.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ ${errorMsg ? `Error: ${errorMsg}` : ''}
230230
}
231231

232232
// compare coordinates
233-
if (testCase.response_rect && vlLocateMode()) {
233+
if (testCase.response_rect && vlLocateMode({ intent: 'grounding' })) {
234234
const resultRect = (result as LocateResult).rect;
235235
if (!resultRect) {
236236
throw new Error(

packages/evaluation/tests/llm-locator.test.ts

Lines changed: 62 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
import { writeFileSync } from 'node:fs';
2-
import Insight, {
3-
type Rect,
4-
MIDSCENE_MODEL_NAME,
5-
getAIConfig,
6-
} from '@midscene/core';
2+
import Insight, { type Rect } from '@midscene/core';
73
import { sleep } from '@midscene/core/utils';
8-
import { vlLocateMode } from '@midscene/shared/env';
4+
import {
5+
getModelName,
6+
globalConfigManager,
7+
vlLocateMode,
8+
} from '@midscene/shared/env';
99
import { saveBase64Image } from '@midscene/shared/img';
10+
1011
import dotenv from 'dotenv';
11-
import { afterAll, expect, test } from 'vitest';
12+
import { afterAll, beforeAll, expect, test } from 'vitest';
1213
import { TestResultCollector } from '../src/test-analyzer';
1314
import { annotateRects, buildContext, getCases } from './util';
1415

@@ -27,20 +28,34 @@ const testSources = [
2728
'aweme-play',
2829
];
2930

30-
const positionModeTag = vlLocateMode() ? 'by_coordinates' : 'by_element';
31-
const resultCollector = new TestResultCollector(
32-
positionModeTag,
33-
getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified',
34-
);
31+
let resultCollector: TestResultCollector;
3532

3633
let failCaseThreshold = 2;
3734
if (process.env.CI) {
38-
failCaseThreshold = vlLocateMode() ? 2 : 3;
35+
failCaseThreshold = vlLocateMode({
36+
intent: 'grounding',
37+
})
38+
? 2
39+
: 3;
3940
}
4041

41-
if (process.env.MIDSCENE_EVALUATION_EXPECT_VL) {
42-
expect(vlLocateMode()).toBeTruthy();
43-
}
42+
beforeAll(async () => {
43+
await globalConfigManager.init();
44+
const positionModeTag = vlLocateMode({
45+
intent: 'grounding',
46+
})
47+
? 'by_coordinates'
48+
: 'by_element';
49+
50+
resultCollector = new TestResultCollector(
51+
positionModeTag,
52+
getModelName({ intent: 'grounding' }),
53+
);
54+
55+
if (process.env.MIDSCENE_EVALUATION_EXPECT_VL) {
56+
expect(vlLocateMode({ intent: 'grounding' })).toBeTruthy();
57+
}
58+
});
4459

4560
afterAll(async () => {
4661
await resultCollector.printSummary();
@@ -60,6 +75,9 @@ testSources.forEach((source) => {
6075
rect: Rect;
6176
}> = [];
6277
for (const [index, testCase] of cases.testCases.entries()) {
78+
console.log(
79+
`Processing ${source} ${index + 1} of ${cases.testCases.length}`,
80+
);
6381
const context = await buildContext(source);
6482

6583
const prompt = testCase.prompt;
@@ -72,7 +90,7 @@ testSources.forEach((source) => {
7290
result = await insight.locate({
7391
prompt,
7492
deepThink:
75-
vlLocateMode() === 'doubao-vision'
93+
vlLocateMode({ intent: 'grounding' }) === 'doubao-vision'
7694
? undefined
7795
: testCase.deepThink,
7896
});
@@ -91,31 +109,29 @@ testSources.forEach((source) => {
91109

92110
const { element, rect } = result;
93111

94-
if (process.env.UPDATE_ANSWER_DATA) {
95-
// const { elementById } = context;
96-
97-
if (rect) {
98-
const indexId = index + 1;
99-
testCase.response_rect = rect;
100-
testCase.annotation_index_id = indexId;
101-
annotations.push({
102-
indexId,
103-
rect,
104-
});
105-
106-
// // biome-ignore lint/performance/noDelete: <explanation>
107-
// delete (testCase as any).response_bbox;
108-
// // biome-ignore lint/performance/noDelete: <explanation>
109-
// delete (testCase as any).response;
110-
}
111-
112-
if (element) {
113-
testCase.response_element = {
114-
id: element.id,
115-
indexId: element.indexId,
116-
};
117-
}
112+
const shouldUpdateAnswerData = process.env.UPDATE_ANSWER_DATA;
113+
if (rect) {
114+
const indexId = index + 1;
115+
testCase.response_rect = rect;
116+
testCase.annotation_index_id = indexId;
117+
annotations.push({
118+
indexId,
119+
rect,
120+
});
118121

122+
// // biome-ignore lint/performance/noDelete: <explanation>
123+
// delete (testCase as any).response_bbox;
124+
// // biome-ignore lint/performance/noDelete: <explanation>
125+
// delete (testCase as any).response;
126+
}
127+
128+
if (element) {
129+
testCase.response_element = {
130+
id: element.id,
131+
indexId: element.indexId,
132+
};
133+
}
134+
if (shouldUpdateAnswerData) {
119135
// write testCase to file
120136
writeFileSync(aiDataPath, JSON.stringify(cases, null, 2));
121137
}
@@ -124,10 +140,14 @@ testSources.forEach((source) => {
124140
context.screenshotBase64,
125141
annotations.map((item) => item.rect),
126142
);
143+
const outputPath = shouldUpdateAnswerData
144+
? `${aiDataPath}-coordinates-annotated.png`
145+
: `${aiDataPath}-coordinates-annotated.ignore.png`;
127146
await saveBase64Image({
128147
base64Data: markedImage,
129-
outputPath: `${aiDataPath}-coordinates-annotated.png`,
148+
outputPath,
130149
});
150+
console.log(`Saved to ${outputPath}`);
131151
}
132152

133153
resultCollector.addResult(

packages/evaluation/tests/llm-planning.test.ts

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,30 +3,58 @@ import {
33
MIDSCENE_MODEL_NAME,
44
type PlanningAIResponse,
55
type Rect,
6-
getAIConfig,
76
plan,
87
} from '@midscene/core';
98
import { adaptBboxToRect } from '@midscene/core/ai-model';
9+
import {
10+
type DeviceAction,
11+
defineActionInput,
12+
defineActionKeyboardPress,
13+
defineActionTap,
14+
} from '@midscene/core/device';
1015
import { sleep } from '@midscene/core/utils';
11-
import { vlLocateMode } from '@midscene/shared/env';
16+
import {
17+
getModelName,
18+
globalConfigManager,
19+
vlLocateMode,
20+
} from '@midscene/shared/env';
1221
import { saveBase64Image } from '@midscene/shared/img';
1322
import dotenv from 'dotenv';
14-
import { afterEach, describe, expect, test } from 'vitest';
23+
import { afterEach, beforeAll, describe, expect, test } from 'vitest';
1524
import { TestResultCollector } from '../src/test-analyzer';
1625
import { annotateRects, buildContext, getCases } from './util';
1726
dotenv.config({
1827
debug: true,
1928
override: true,
2029
});
2130

22-
if (process.env.MIDSCENE_EVALUATION_EXPECT_VL) {
23-
expect(vlLocateMode()).toBeTruthy();
24-
}
25-
2631
const failCaseThreshold = process.env.CI ? 2 : 0;
2732
const testSources = ['todo'];
2833

29-
const vlMode = vlLocateMode();
34+
let actionSpace: DeviceAction[] = [];
35+
36+
let vlMode = false;
37+
beforeAll(async () => {
38+
await globalConfigManager.init();
39+
40+
vlMode = !!vlLocateMode({
41+
intent: 'grounding',
42+
});
43+
44+
if (process.env.MIDSCENE_EVALUATION_EXPECT_VL) {
45+
expect(
46+
vlLocateMode({
47+
intent: 'grounding',
48+
}),
49+
).toBeTruthy();
50+
}
51+
52+
actionSpace = [
53+
defineActionTap(async () => {}),
54+
defineActionInput(async () => {}),
55+
defineActionKeyboardPress(async () => {}),
56+
];
57+
});
3058

3159
describe.skipIf(vlMode)('ai planning - by element', () => {
3260
testSources.forEach((source) => {
@@ -42,7 +70,7 @@ describe.skipIf(vlMode)('ai planning - by element', () => {
4270

4371
const resultCollector = new TestResultCollector(
4472
`${caseGroupName}-planning`,
45-
getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified',
73+
getModelName({ intent: 'planning' }) || 'unspecified',
4674
);
4775

4876
for (const [, testCase] of cases.testCases.entries()) {
@@ -54,6 +82,7 @@ describe.skipIf(vlMode)('ai planning - by element', () => {
5482
const res = await plan(prompt, {
5583
context,
5684
interfaceType: 'puppeteer',
85+
actionSpace,
5786
});
5887

5988
if (process.env.UPDATE_ANSWER_DATA) {
@@ -87,7 +116,7 @@ const vlCases = [
87116

88117
const resultCollector = new TestResultCollector(
89118
'planning',
90-
getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified',
119+
getModelName({ intent: 'planning' }) || 'unspecified',
91120
);
92121

93122
afterEach(async () => {
@@ -124,6 +153,7 @@ describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
124153
context,
125154
actionContext: testCase.action_context,
126155
interfaceType: 'puppeteer',
156+
actionSpace,
127157
});
128158
} catch (error) {
129159
res = error as Error;
@@ -142,6 +172,7 @@ describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
142172
res.action.locate.bbox,
143173
context.size.width,
144174
context.size.height,
175+
{ intent: 'planning' },
145176
);
146177
testCase.annotation_index_id = indexId;
147178
annotations.push({

packages/evaluation/tsconfig.json

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,25 @@
11
{
22
"compilerOptions": {
33
"baseUrl": ".",
4+
"sourceMap": true,
45
"declaration": true,
56
"emitDeclarationOnly": true,
67
"esModuleInterop": true,
78
"forceConsistentCasingInFileNames": true,
89
"isolatedModules": true,
910
"jsx": "preserve",
1011
"lib": ["DOM", "ESNext"],
11-
"moduleResolution": "node",
12+
"moduleResolution": "bundler",
1213
"paths": {
1314
"@/*": ["./src/*"]
1415
},
15-
"target": "ESNext",
1616
"resolveJsonModule": true,
1717
"rootDir": "./",
1818
"skipLibCheck": true,
1919
"strict": true,
20-
"module": "ESNext"
20+
"module": "ESNext",
21+
"composite": true,
22+
"target": "es2018"
2123
},
2224
"exclude": ["node_modules"],
2325
"include": ["src", "tests", "./playwright.config.ts", "./vitest.config"]

packages/evaluation/vitest.config.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,7 @@ export default defineConfig({
2121
'@': path.resolve(__dirname, 'src'),
2222
},
2323
},
24+
ssr: {
25+
external: ['@silvia-odwyer/photon'],
26+
},
2427
});

0 commit comments

Comments
 (0)