web-infra-dev
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/apps/site/docs/en/choose-a-model.mdx b/apps/site/docs/en/choose-a-model.mdx
Lines changed: 2 additions & 2 deletions
diff --git a/apps/site/docs/zh/choose-a-model.mdx b/apps/site/docs/zh/choose-a-model.mdx
Lines changed: 5 additions & 5 deletions
diff --git a/packages/evaluation/package.json b/packages/evaluation/package.json
Lines changed: 0 additions & 1 deletion
diff --git a/packages/evaluation/src/test-analyzer.ts b/packages/evaluation/src/test-analyzer.ts
Lines changed: 1 addition & 1 deletion
diff --git a/packages/evaluation/tests/llm-locator.test.ts b/packages/evaluation/tests/llm-locator.test.ts
Lines changed: 62 additions & 42 deletions
diff --git a/packages/evaluation/tests/llm-planning.test.ts b/packages/evaluation/tests/llm-planning.test.ts
Lines changed: 41 additions & 10 deletions
diff --git a/packages/evaluation/tsconfig.json b/packages/evaluation/tsconfig.json
Lines changed: 5 additions & 3 deletions
diff --git a/packages/evaluation/vitest.config.ts b/packages/evaluation/vitest.config.ts
Lines changed: 3 additions & 0 deletions
@@ -109,6 +109,7 @@ midscene_run
 midscene_run/report
 midscene_run/dump
 
+*.ignore.png
 extension_output
 .cursor
 android-playground/static/
 
@@ -53,7 +53,7 @@ You can use the `Doubao-1.5-thinking-vision-pro` model on [Volcano Engine](https
 ```bash
 OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3" 
 OPENAI_API_KEY="...."
-MIDSCENE_MODEL_NAME="ep-..." # Inference endpoint ID from Volcano Engine
+MIDSCENE_MODEL_NAME="ep-..." # Inference endpoint ID or model name from Volcano Engine
 MIDSCENE_USE_DOUBAO_VISION=1
 ```
 
@@ -64,7 +64,7 @@ You can use `doubao-1.5-ui-tars` on [Volcengine](https://www.volcengine.com):
 ```bash
 OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3" 
 OPENAI_API_KEY="...."
-MIDSCENE_MODEL_NAME="ep-2025..."
+MIDSCENE_MODEL_NAME="ep-2025..." # Inference endpoint ID or model name from Volcano Engine
 MIDSCENE_USE_VLM_UI_TARS=DOUBAO
 ```
 
 
@@ -42,7 +42,7 @@ MIDSCENE_USE_QWEN_VL=1
 ```bash
 OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3" 
 OPENAI_API_KEY="...."
-MIDSCENE_MODEL_NAME="ep-..." # 火山引擎的推理接入点ID
+MIDSCENE_MODEL_NAME="ep-..." # 火山引擎模型名称或推理接入点 ID
 MIDSCENE_USE_DOUBAO_VISION=1
 ```
 
@@ -53,7 +53,7 @@ MIDSCENE_USE_DOUBAO_VISION=1
 ```bash
 OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3" 
 OPENAI_API_KEY="...."
-MIDSCENE_MODEL_NAME="ep-2025..." # 火山引擎的推理接入点ID
+MIDSCENE_MODEL_NAME="ep-2025..." # 火山引擎模型名称或推理接入点 ID
 MIDSCENE_USE_VLM_UI_TARS=DOUBAO
 ```
 
@@ -132,7 +132,7 @@ Qwen-2.5-VL 具备操作规划（action planning）能力，可以自主控制
 **在 Midscene.js 中使用时的限制**
 
 - **小图标识别能力较差**：为了准确识别小图标、小元素，你可能需要[开启 `deepThink` 参数](./blog-introducing-instant-actions-and-deep-think)并调优描述，否则识别效果无法保证。
-- **断言能力不足**：在某些情况下，Qwen-2.5-VL 的断言能力可能不如 `gpt-4o`。
+- **断言能力不足**：在某些情况下，Qwen-2.5-VL 的断言能力不如 `gpt-4o`。
 
 **配置**
 
@@ -185,7 +185,7 @@ UI-TARS 是一个专为 UI 自动化设计的开源模型。它仅以截图作
 **在 Midscene.js 中使用时的限制**
 
 - **断言能力较差**：在某些情况下，UI-TARS 的断言能力可能不如 `gpt-4o`。
-- **无法固定操作路径**：UI-TARS 具备较强的探索能力，它可能会主动多次尝试不同的操作路径，这可能会导致每次调用时操作路径不固定。
+- **无法固定操作路径**：UI-TARS 具备较强的探索能力，它可能会主动多次尝试不同的操作路径，继而导致每次调用时操作路径不固定。
 
 **配置**
 
@@ -202,7 +202,7 @@ MIDSCENE_USE_VLM_UI_TARS=1 # 别忘了配置这项用于 UI-TARS 模式！
 
 在火山引擎平台上，有一个已经部署完成的 `doubao-1.5-ui-tars` 模型，开发者可以通过 API 调用、按使用量付费。模型文档帮助：https://www.volcengine.com/docs/82379/1536429
 
-在使用火山引擎版本模型时，需要创建推理接入点（形如 `ep-2025...`）。集齐 API Key 和推理点信息后，配置文件类似如下：
+在使用火山引擎版本模型时，可能需要创建推理接入点（形如 `ep-2025...`）。集齐 API Key 和推理点信息后，配置文件类似如下：
 
 ```bash
 # 注意 URL 最后填写到 /v3 结束即可
 
@@ -20,7 +20,6 @@
     "format": "cd ../.. && npm run lint"
   },
   "files": ["dist", "README.md"],
-  "type": "module",
   "dependencies": {
     "@midscene/core": "workspace:*",
     "@midscene/shared": "workspace:*",
 
@@ -230,7 +230,7 @@ ${errorMsg ? `Error: ${errorMsg}` : ''}
     }
 
     // compare coordinates
-    if (testCase.response_rect && vlLocateMode()) {
+    if (testCase.response_rect && vlLocateMode({ intent: 'grounding' })) {
       const resultRect = (result as LocateResult).rect;
       if (!resultRect) {
         throw new Error(
 
@@ -1,14 +1,15 @@
 import { writeFileSync } from 'node:fs';
-import Insight, {
-  type Rect,
-  MIDSCENE_MODEL_NAME,
-  getAIConfig,
-} from '@midscene/core';
+import Insight, { type Rect } from '@midscene/core';
 import { sleep } from '@midscene/core/utils';
-import { vlLocateMode } from '@midscene/shared/env';
+import {
+  getModelName,
+  globalConfigManager,
+  vlLocateMode,
+} from '@midscene/shared/env';
 import { saveBase64Image } from '@midscene/shared/img';
+
 import dotenv from 'dotenv';
-import { afterAll, expect, test } from 'vitest';
+import { afterAll, beforeAll, expect, test } from 'vitest';
 import { TestResultCollector } from '../src/test-analyzer';
 import { annotateRects, buildContext, getCases } from './util';
 
@@ -27,20 +28,34 @@ const testSources = [
   'aweme-play',
 ];
 
-const positionModeTag = vlLocateMode() ? 'by_coordinates' : 'by_element';
-const resultCollector = new TestResultCollector(
-  positionModeTag,
-  getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified',
-);
+let resultCollector: TestResultCollector;
 
 let failCaseThreshold = 2;
 if (process.env.CI) {
-  failCaseThreshold = vlLocateMode() ? 2 : 3;
+  failCaseThreshold = vlLocateMode({
+    intent: 'grounding',
+  })
+    ? 2
+    : 3;
 }
 
-if (process.env.MIDSCENE_EVALUATION_EXPECT_VL) {
-  expect(vlLocateMode()).toBeTruthy();
-}
+beforeAll(async () => {
+  await globalConfigManager.init();
+  const positionModeTag = vlLocateMode({
+    intent: 'grounding',
+  })
+    ? 'by_coordinates'
+    : 'by_element';
+
+  resultCollector = new TestResultCollector(
+    positionModeTag,
+    getModelName({ intent: 'grounding' }),
+  );
+
+  if (process.env.MIDSCENE_EVALUATION_EXPECT_VL) {
+    expect(vlLocateMode({ intent: 'grounding' })).toBeTruthy();
+  }
+});
 
 afterAll(async () => {
   await resultCollector.printSummary();
@@ -60,6 +75,9 @@ testSources.forEach((source) => {
         rect: Rect;
       }> = [];
       for (const [index, testCase] of cases.testCases.entries()) {
+        console.log(
+          `Processing ${source} ${index + 1} of ${cases.testCases.length}`,
+        );
         const context = await buildContext(source);
 
         const prompt = testCase.prompt;
@@ -72,7 +90,7 @@ testSources.forEach((source) => {
           result = await insight.locate({
             prompt,
             deepThink:
-              vlLocateMode() === 'doubao-vision'
+              vlLocateMode({ intent: 'grounding' }) === 'doubao-vision'
                 ? undefined
                 : testCase.deepThink,
           });
@@ -91,31 +109,29 @@ testSources.forEach((source) => {
 
         const { element, rect } = result;
 
-        if (process.env.UPDATE_ANSWER_DATA) {
-          // const { elementById } = context;
-
-          if (rect) {
-            const indexId = index + 1;
-            testCase.response_rect = rect;
-            testCase.annotation_index_id = indexId;
-            annotations.push({
-              indexId,
-              rect,
-            });
-
-            // // biome-ignore lint/performance/noDelete: <explanation>
-            // delete (testCase as any).response_bbox;
-            // // biome-ignore lint/performance/noDelete: <explanation>
-            // delete (testCase as any).response;
-          }
-
-          if (element) {
-            testCase.response_element = {
-              id: element.id,
-              indexId: element.indexId,
-            };
-          }
+        const shouldUpdateAnswerData = process.env.UPDATE_ANSWER_DATA;
+        if (rect) {
+          const indexId = index + 1;
+          testCase.response_rect = rect;
+          testCase.annotation_index_id = indexId;
+          annotations.push({
+            indexId,
+            rect,
+          });
 
+          // // biome-ignore lint/performance/noDelete: <explanation>
+          // delete (testCase as any).response_bbox;
+          // // biome-ignore lint/performance/noDelete: <explanation>
+          // delete (testCase as any).response;
+        }
+
+        if (element) {
+          testCase.response_element = {
+            id: element.id,
+            indexId: element.indexId,
+          };
+        }
+        if (shouldUpdateAnswerData) {
           // write testCase to file
           writeFileSync(aiDataPath, JSON.stringify(cases, null, 2));
         }
@@ -124,10 +140,14 @@ testSources.forEach((source) => {
             context.screenshotBase64,
             annotations.map((item) => item.rect),
           );
+          const outputPath = shouldUpdateAnswerData
+            ? `${aiDataPath}-coordinates-annotated.png`
+            : `${aiDataPath}-coordinates-annotated.ignore.png`;
           await saveBase64Image({
             base64Data: markedImage,
-            outputPath: `${aiDataPath}-coordinates-annotated.png`,
+            outputPath,
           });
+          console.log(`Saved to ${outputPath}`);
         }
 
         resultCollector.addResult(
 
@@ -3,30 +3,58 @@ import {
   MIDSCENE_MODEL_NAME,
   type PlanningAIResponse,
   type Rect,
-  getAIConfig,
   plan,
 } from '@midscene/core';
 import { adaptBboxToRect } from '@midscene/core/ai-model';
+import {
+  type DeviceAction,
+  defineActionInput,
+  defineActionKeyboardPress,
+  defineActionTap,
+} from '@midscene/core/device';
 import { sleep } from '@midscene/core/utils';
-import { vlLocateMode } from '@midscene/shared/env';
+import {
+  getModelName,
+  globalConfigManager,
+  vlLocateMode,
+} from '@midscene/shared/env';
 import { saveBase64Image } from '@midscene/shared/img';
 import dotenv from 'dotenv';
-import { afterEach, describe, expect, test } from 'vitest';
+import { afterEach, beforeAll, describe, expect, test } from 'vitest';
 import { TestResultCollector } from '../src/test-analyzer';
 import { annotateRects, buildContext, getCases } from './util';
 dotenv.config({
   debug: true,
   override: true,
 });
 
-if (process.env.MIDSCENE_EVALUATION_EXPECT_VL) {
-  expect(vlLocateMode()).toBeTruthy();
-}
-
 const failCaseThreshold = process.env.CI ? 2 : 0;
 const testSources = ['todo'];
 
-const vlMode = vlLocateMode();
+let actionSpace: DeviceAction[] = [];
+
+let vlMode = false;
+beforeAll(async () => {
+  await globalConfigManager.init();
+
+  vlMode = !!vlLocateMode({
+    intent: 'grounding',
+  });
+
+  if (process.env.MIDSCENE_EVALUATION_EXPECT_VL) {
+    expect(
+      vlLocateMode({
+        intent: 'grounding',
+      }),
+    ).toBeTruthy();
+  }
+
+  actionSpace = [
+    defineActionTap(async () => {}),
+    defineActionInput(async () => {}),
+    defineActionKeyboardPress(async () => {}),
+  ];
+});
 
 describe.skipIf(vlMode)('ai planning - by element', () => {
   testSources.forEach((source) => {
@@ -42,7 +70,7 @@ describe.skipIf(vlMode)('ai planning - by element', () => {
 
         const resultCollector = new TestResultCollector(
           `${caseGroupName}-planning`,
-          getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified',
+          getModelName({ intent: 'planning' }) || 'unspecified',
         );
 
         for (const [, testCase] of cases.testCases.entries()) {
@@ -54,6 +82,7 @@ describe.skipIf(vlMode)('ai planning - by element', () => {
           const res = await plan(prompt, {
             context,
             interfaceType: 'puppeteer',
+            actionSpace,
           });
 
           if (process.env.UPDATE_ANSWER_DATA) {
@@ -87,7 +116,7 @@ const vlCases = [
 
 const resultCollector = new TestResultCollector(
   'planning',
-  getAIConfig(MIDSCENE_MODEL_NAME) || 'unspecified',
+  getModelName({ intent: 'planning' }) || 'unspecified',
 );
 
 afterEach(async () => {
@@ -124,6 +153,7 @@ describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
               context,
               actionContext: testCase.action_context,
               interfaceType: 'puppeteer',
+              actionSpace,
             });
           } catch (error) {
             res = error as Error;
@@ -142,6 +172,7 @@ describe.skipIf(!vlMode)('ai planning - by coordinates', () => {
                   res.action.locate.bbox,
                   context.size.width,
                   context.size.height,
+                  { intent: 'planning' },
                 );
                 testCase.annotation_index_id = indexId;
                 annotations.push({
 
@@ -1,23 +1,25 @@
 {
   "compilerOptions": {
     "baseUrl": ".",
+    "sourceMap": true,
     "declaration": true,
     "emitDeclarationOnly": true,
     "esModuleInterop": true,
     "forceConsistentCasingInFileNames": true,
     "isolatedModules": true,
     "jsx": "preserve",
     "lib": ["DOM", "ESNext"],
-    "moduleResolution": "node",
+    "moduleResolution": "bundler",
     "paths": {
       "@/*": ["./src/*"]
     },
-    "target": "ESNext",
     "resolveJsonModule": true,
     "rootDir": "./",
     "skipLibCheck": true,
     "strict": true,
-    "module": "ESNext"
+    "module": "ESNext",
+    "composite": true,
+    "target": "es2018"
   },
   "exclude": ["node_modules"],
   "include": ["src", "tests", "./playwright.config.ts", "./vitest.config"]
 
@@ -21,4 +21,7 @@ export default defineConfig({
       '@': path.resolve(__dirname, 'src'),
     },
   },
+  ssr: {
+    external: ['@silvia-odwyer/photon'],
+  },
 });
Original file line number	Diff line number	Diff line change
@@ -230,7 +230,7 @@ ${errorMsg ? `Error: ${errorMsg}` : ''}
`230`	`230`	`    }`
`231`	`231`
`232`	`232`	`    // compare coordinates`
`233`		`-    if (testCase.response_rect && vlLocateMode()) {`
	`233`	`+    if (testCase.response_rect && vlLocateMode({ intent: 'grounding' })) {`
`234`	`234`	`      const resultRect = (result as LocateResult).rect;`
`235`	`235`	`      if (!resultRect) {`
`236`	`236`	`        throw new Error(`