diff --git a/config/gni/devtools_grd_files.gni b/config/gni/devtools_grd_files.gni index 286948d1f26..32df9ee3567 100644 --- a/config/gni/devtools_grd_files.gni +++ b/config/gni/devtools_grd_files.gni @@ -645,6 +645,10 @@ grd_files_bundled_sources = [ "front_end/panels/ai_chat/common/log.js", "front_end/panels/ai_chat/common/context.js", "front_end/panels/ai_chat/common/page.js", + "front_end/panels/ai_chat/common/WebSocketRPCClient.js", + "front_end/panels/ai_chat/common/EvaluationConfig.js", + "front_end/panels/ai_chat/evaluation/EvaluationProtocol.js", + "front_end/panels/ai_chat/evaluation/EvaluationAgent.js", "front_end/panels/ai_chat/tracing/TracingProvider.js", "front_end/panels/ai_chat/tracing/LangfuseProvider.js", "front_end/panels/ai_chat/tracing/TracingConfig.js", diff --git a/eval-server/.env.example b/eval-server/.env.example new file mode 100644 index 00000000000..1e8a74879ce --- /dev/null +++ b/eval-server/.env.example @@ -0,0 +1,16 @@ +# WebSocket Server Configuration +PORT=8080 +HOST=localhost + +# LLM Judge Configuration +OPENAI_API_KEY=your-openai-api-key-here +JUDGE_MODEL=gpt-4 +JUDGE_TEMPERATURE=0.1 + +# Logging Configuration +LOG_LEVEL=info +LOG_DIR=./logs + +# RPC Configuration +RPC_TIMEOUT=30000 +MAX_CONCURRENT_EVALUATIONS=10 \ No newline at end of file diff --git a/eval-server/.gitignore b/eval-server/.gitignore new file mode 100644 index 00000000000..97aca2ea1cd --- /dev/null +++ b/eval-server/.gitignore @@ -0,0 +1,2 @@ +.env +node_modules \ No newline at end of file diff --git a/eval-server/CLAUDE.md b/eval-server/CLAUDE.md new file mode 100644 index 00000000000..5db83421a3f --- /dev/null +++ b/eval-server/CLAUDE.md @@ -0,0 +1,103 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +bo-eval-server is a WebSocket-based evaluation server for LLM agents that implements an LLM-as-a-judge evaluation system. 
The server accepts connections from AI agents, sends them evaluation tasks via RPC calls, collects their responses, and uses an LLM to judge the quality of responses. + +## Commands + +### Development +- `npm start` - Start the WebSocket server +- `npm run dev` - Start server with file watching for development +- `npm run cli` - Start interactive CLI for server management and testing +- `npm test` - Run example agent client for testing + +### Installation +- `npm install` - Install dependencies +- Copy `.env.example` to `.env` and configure environment variables + +### Required Environment Variables +- `OPENAI_API_KEY` - OpenAI API key for LLM judge functionality +- `PORT` - WebSocket server port (default: 8080) + +## Architecture + +### Core Components + +**WebSocket Server** (`src/server.js`) +- Accepts connections from LLM agents +- Manages agent lifecycle (connect, ready, disconnect) +- Orchestrates evaluation sessions +- Handles bidirectional RPC communication + +**RPC Client** (`src/rpc-client.js`) +- Implements JSON-RPC 2.0 protocol for server-to-client calls +- Manages request/response correlation with unique IDs +- Handles timeouts and error conditions +- Calls `Evaluate(request: String) -> String` method on connected agents + +**LLM Evaluator** (`src/evaluator.js`) +- Integrates with OpenAI API for LLM-as-a-judge functionality +- Evaluates agent responses on multiple criteria (correctness, completeness, clarity, relevance, helpfulness) +- Returns structured JSON evaluation with scores and reasoning + +**Logger** (`src/logger.js`) +- Structured logging using Winston +- Separate log files for different event types +- JSON format for easy parsing and analysis +- Logs all RPC calls, evaluations, and connection events + +### Evaluation Flow + +1. Agent connects to WebSocket server +2. Agent sends "ready" signal +3. Server calls agent's `Evaluate` method with a task +4. Agent processes task and returns response +5. 
Server sends response to LLM judge for evaluation +6. Results are logged as JSON with scores and detailed feedback + +### Project Structure + +``` +src/ +├── server.js # Main WebSocket server and evaluation orchestration +├── rpc-client.js # JSON-RPC client for calling agent methods +├── evaluator.js # LLM judge integration (OpenAI) +├── logger.js # Structured logging and result storage +├── config.js # Configuration management +└── cli.js # Interactive CLI for testing and management + +logs/ # Log files (created automatically) +├── combined.log # All log events +├── error.log # Error events only +└── evaluations.jsonl # Evaluation results in JSON Lines format +``` + +### Key Features + +- **Bidirectional RPC**: Server can call methods on connected clients +- **LLM-as-a-Judge**: Automated evaluation of agent responses using GPT-4 +- **Concurrent Evaluations**: Support for multiple agents and parallel evaluations +- **Structured Logging**: All interactions logged as JSON for analysis +- **Interactive CLI**: Built-in CLI for testing and server management +- **Connection Management**: Robust handling of agent connections and disconnections +- **Timeout Handling**: Configurable timeouts for RPC calls and evaluations + +### Agent Protocol + +Agents must implement: +- WebSocket connection to server +- JSON-RPC 2.0 protocol support +- `Evaluate(task: string) -> string` method +- "ready" message to signal availability for evaluations + +### Configuration + +All configuration is managed through environment variables and `src/config.js`. 
Key settings: +- Server port and host +- OpenAI API configuration +- RPC timeouts +- Logging levels and directories +- Maximum concurrent evaluations \ No newline at end of file diff --git a/eval-server/README.md b/eval-server/README.md new file mode 100644 index 00000000000..3179bccf573 --- /dev/null +++ b/eval-server/README.md @@ -0,0 +1,47 @@ +# bo-eval-server + +A WebSocket-based evaluation server for LLM agents using LLM-as-a-judge methodology. + +## Quick Start + +1. **Install dependencies** + ```bash + npm install + ``` + +2. **Configure environment** + ```bash + cp .env.example .env + # Edit .env and add your OPENAI_API_KEY + ``` + +3. **Start the server** + ```bash + npm start + ``` + +4. **Use interactive CLI** (alternative to step 3) + ```bash + npm run cli + ``` + +## Features + +- 🔌 WebSocket server for real-time agent connections +- 🤖 Bidirectional RPC calls to connected agents +- ⚖️ LLM-as-a-judge evaluation using OpenAI GPT-4 +- 📊 Structured JSON logging of all evaluations +- 🖥️ Interactive CLI for testing and management +- ⚡ Support for concurrent agent evaluations + +## Agent Protocol + +Your agent needs to: + +1. Connect to the WebSocket server (default: `ws://localhost:8080`) +2. Send a `{"type": "ready"}` message when ready for evaluations +3. Implement the `Evaluate` RPC method that accepts a string task and returns a string response + +## For more details + +See [CLAUDE.md](./CLAUDE.md) for comprehensive documentation of the architecture and implementation. 
\ No newline at end of file diff --git a/eval-server/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml b/eval-server/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml new file mode 100644 index 00000000000..f5b865f5b55 --- /dev/null +++ b/eval-server/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml @@ -0,0 +1,12 @@ +client: + id: 1233ae25-9f9e-4f77-924d-865f7d615cef + name: DevTools Client 1233ae25 + secret_key: hello + description: Auto-generated DevTools evaluation client +settings: + max_concurrent_evaluations: 3 + default_timeout: 45000 + retry_policy: + max_retries: 2 + backoff_multiplier: 2 + initial_delay: 1000 diff --git a/eval-server/docs/CLIENT_SETUP.md b/eval-server/docs/CLIENT_SETUP.md new file mode 100644 index 00000000000..53502ae074d --- /dev/null +++ b/eval-server/docs/CLIENT_SETUP.md @@ -0,0 +1,445 @@ +# Client Setup Guide + +## Overview + +This guide explains how to set up a new evaluation client to connect to the evaluation server. Clients can be any application that implements the WebSocket evaluation protocol, such as Chrome DevTools or custom test agents. + +## Prerequisites + +- WebSocket client library +- JSON-RPC 2.0 implementation +- UUID v4 generator +- Tools/agents to execute evaluations + +## Setup Steps + +### 1. Generate Client ID + +Generate a unique UUID v4 for your client: + +```javascript +// JavaScript example +import { v4 as uuidv4 } from 'uuid'; +const clientId = uuidv4(); // e.g., "550e8400-e29b-41d4-a716-446655440000" +``` + +Store this ID persistently - it will be used for all connections. + +### 2. Request YAML Configuration + +Contact the evaluation server administrator to: +1. Create a YAML evaluation file for your client ID +2. Optionally set up a secret key for authentication +3. 
Configure appropriate evaluations for your client + +Example request: +``` +Client ID: 550e8400-e29b-41d4-a716-446655440000 +Client Name: Chrome DevTools Production +Tools Available: extract_schema_data, research_agent, action_agent +Purpose: Automated regression testing +``` + +### 3. Implement WebSocket Connection + +```javascript +class EvaluationClient { + constructor(serverUrl, clientId, secretKey) { + this.serverUrl = serverUrl; + this.clientId = clientId; + this.secretKey = secretKey; + this.ws = null; + } + + connect() { + this.ws = new WebSocket(this.serverUrl); + + this.ws.onopen = () => { + console.log('Connected to evaluation server'); + }; + + this.ws.onmessage = (event) => { + this.handleMessage(JSON.parse(event.data)); + }; + + this.ws.onerror = (error) => { + console.error('WebSocket error:', error); + }; + } +} +``` + +### 4. Implement Protocol Messages + +#### Handle Welcome Message +```javascript +handleMessage(message) { + switch (message.type) { + case 'welcome': + // Server is ready, send registration + this.register(); + break; + + case 'registration_ack': + if (message.status === 'accepted') { + console.log(`Registered! ${message.evaluationsCount} evaluations assigned`); + this.sendReady(); + } else { + console.error('Registration rejected:', message.reason); + } + break; + + default: + // Handle other messages... + } +} +``` + +#### Send Registration +```javascript +register() { + this.send({ + type: 'register', + clientId: this.clientId, + secretKey: this.secretKey, // Optional + capabilities: { + tools: ['extract_schema_data', 'research_agent'], + maxConcurrency: 3, + version: '1.0.0' + } + }); +} +``` + +#### Send Ready Signal +```javascript +sendReady() { + this.send({ + type: 'ready', + timestamp: new Date().toISOString() + }); +} +``` + +### 5. Implement RPC Handler + +```javascript +handleMessage(message) { + // ... existing code ... 
+ + // Handle JSON-RPC requests + if (message.jsonrpc === '2.0' && message.method) { + this.handleRpcRequest(message); + } +} + +async handleRpcRequest(request) { + if (request.method === 'evaluate') { + try { + const result = await this.executeEvaluation(request.params); + + this.send({ + jsonrpc: '2.0', + result: { + status: 'success', + output: result.output, + executionTime: result.duration, + toolCalls: result.toolCalls, + metadata: result.metadata + }, + id: request.id + }); + } catch (error) { + this.send({ + jsonrpc: '2.0', + error: { + code: -32000, + message: error.message, + data: { + tool: request.params.tool, + error: error.toString(), + timestamp: new Date().toISOString() + } + }, + id: request.id + }); + } + } +} +``` + +### 6. Implement Tool Execution + +```javascript +async executeEvaluation(params) { + const startTime = Date.now(); + + // Send status update + this.send({ + type: 'status', + evaluationId: params.evaluationId, + status: 'running', + progress: 0.1, + message: 'Starting evaluation...' + }); + + // Execute the appropriate tool + let result; + switch (params.tool) { + case 'extract_schema_data': + result = await this.extractSchema(params.url, params.input); + break; + + case 'research_agent': + result = await this.runResearchAgent(params.url, params.input); + break; + + default: + throw new Error(`Unknown tool: ${params.tool}`); + } + + const executionTime = Date.now() - startTime; + + return { + output: result, + duration: executionTime, + toolCalls: [{ + tool: params.tool, + timestamp: new Date().toISOString(), + duration: executionTime, + status: 'success' + }], + metadata: { + url: params.url, + toolVersion: '1.0.0' + } + }; +} +``` + +## Chrome DevTools Integration + +For Chrome DevTools specifically: + +### 1. 
Update EvaluationConfig + +```typescript +// In EvaluationConfig.ts +interface EvaluationConfiguration { + enabled: boolean; + endpoint: string; + secretKey?: string; + clientId?: string; // Add client ID field +} + +// Generate and store client ID +function ensureClientId(): string { + let clientId = localStorage.getItem('ai_chat_evaluation_client_id'); + if (!clientId) { + clientId = generateUUID(); + localStorage.setItem('ai_chat_evaluation_client_id', clientId); + } + return clientId; +} +``` + +### 2. Create Evaluation Agent + +```typescript +// EvaluationAgent.ts +import { WebSocketRPCClient } from '../common/WebSocketRPCClient.js'; +import { ToolRegistry } from '../agent_framework/ConfigurableAgentTool.js'; + +export class EvaluationAgent { + private client: WebSocketRPCClient; + private clientId: string; + + constructor(config: EvaluationConfiguration) { + this.clientId = config.clientId || ensureClientId(); + this.client = new WebSocketRPCClient({ + endpoint: config.endpoint, + secretKey: config.secretKey + }); + + this.setupHandlers(); + } + + private setupHandlers(): void { + this.client.on('connected', () => { + this.register(); + }); + + // Handle RPC requests + this.client.on('rpc-request', async (request) => { + if (request.method === 'evaluate') { + const result = await this.handleEvaluation(request.params); + return result; + } + }); + } + + private async handleEvaluation(params: any): Promise { + // Record the start time so executionTime below can be computed + const startTime = Date.now(); + + const tool = ToolRegistry.getRegisteredTool(params.tool); + if (!tool) { + throw new Error(`Tool not found: ${params.tool}`); + } + + // Execute tool with params.input + const result = await tool.execute(params.input); + + return { + status: 'success', + output: result, + executionTime: Date.now() - startTime + }; + } +} +``` + +## Testing Your Client + +### 1. Local Testing + +Use the example agent to test your server setup: + +```bash +# In bo-eval-server directory +npm test +``` + +### 2. 
Connection Test + +```javascript +// Quick connection test +const client = new EvaluationClient( + 'ws://localhost:8080', + 'your-client-id', + 'optional-secret' +); + +client.connect(); + +// Should see: +// Connected to evaluation server +// Registered! X evaluations assigned +``` + +### 3. Manual Evaluation Test + +You can trigger evaluations manually through the server's CLI: + +```bash +npm run cli +eval-server> run your-client-id evaluation-id +``` + +## Troubleshooting + +### Connection Issues + +1. **Check server is running** + ```bash + curl -i -N -H "Connection: Upgrade" -H "Upgrade: websocket" http://localhost:8080 + ``` + +2. **Verify client ID exists** + - Check `clients/{your-client-id}.yaml` exists on server + - Ensure client ID format is valid UUID v4 + +3. **Authentication failures** + - Verify secret key matches server configuration + - Check for typos in client ID or secret + +### Evaluation Failures + +1. **Tool not found** + - Ensure tool name in YAML matches client capabilities + - Verify tool is registered in your client + +2. **Timeouts** + - Increase timeout in YAML configuration + - Check for infinite loops in tool execution + +3. **Invalid input** + - Validate input against expected schema + - Check for required fields + +## Security Best Practices + +1. **Store credentials securely** + - Never hardcode secret keys + - Use environment variables or secure storage + +2. **Validate inputs** + - Sanitize URLs before navigation + - Validate schemas before execution + +3. **Resource limits** + - Implement timeout handling + - Limit concurrent evaluations + +4. 
**Use WSS in production** + ```javascript + const client = new EvaluationClient( + 'wss://eval-server.example.com', // Use WSS + clientId, + secretKey + ); + ``` + +## Example: Minimal Client + +```javascript +// minimal-client.js +import WebSocket from 'ws'; + +const CLIENT_ID = 'your-uuid-here'; +const SECRET_KEY = 'your-secret-here'; + +const ws = new WebSocket('ws://localhost:8080'); + +ws.on('open', () => { + console.log('Connected'); +}); + +ws.on('message', async (data) => { + const msg = JSON.parse(data); + + if (msg.type === 'welcome') { + // Register + ws.send(JSON.stringify({ + type: 'register', + clientId: CLIENT_ID, + secretKey: SECRET_KEY, + capabilities: { + tools: ['extract_schema_data'], + maxConcurrency: 1, + version: '1.0.0' + } + })); + } + + if (msg.type === 'registration_ack' && msg.status === 'accepted') { + // Send ready + ws.send(JSON.stringify({ + type: 'ready', + timestamp: new Date().toISOString() + })); + } + + if (msg.jsonrpc && msg.method === 'evaluate') { + // Simple evaluation response + ws.send(JSON.stringify({ + jsonrpc: '2.0', + result: { + status: 'success', + output: { message: 'Evaluation completed' }, + executionTime: 1000 + }, + id: msg.id + })); + } +}); + +ws.on('error', console.error); +``` \ No newline at end of file diff --git a/eval-server/docs/PROTOCOL.md b/eval-server/docs/PROTOCOL.md new file mode 100644 index 00000000000..694e58a69d1 --- /dev/null +++ b/eval-server/docs/PROTOCOL.md @@ -0,0 +1,310 @@ +# WebSocket Evaluation Protocol + +## Overview + +This document describes the WebSocket communication protocol between evaluation clients (e.g., Chrome DevTools) and the evaluation server. The protocol supports client registration, authentication, and bidirectional evaluation task execution using JSON-RPC 2.0. 
+ +## Connection Flow + +``` +Client Server + | | + |------ WebSocket Connect ------>| + | | + |<----- Welcome Message ---------| + | | + |------ Register Message ------->| + | | + |<----- Registration ACK ---------| + | | + |------ Ready Signal ----------->| + | | + |<===== Evaluation Loop ========>| +``` + +## Message Types + +### 1. Client → Server Messages + +#### 1.1 Registration Message +Sent immediately after receiving the welcome message to register the client with the server. + +```json +{ + "type": "register", + "clientId": "550e8400-e29b-41d4-a716-446655440000", + "secretKey": "optional-secret-key", // Optional field for authentication + "capabilities": { + "tools": ["extract_schema_data", "research_agent", "action_agent"], + "maxConcurrency": 3, + "version": "1.0.0" + } +} +``` + +**Fields:** +- `type`: Must be "register" +- `clientId`: UUID v4 format, unique identifier for the client +- `secretKey`: Optional authentication key +- `capabilities`: Object describing client capabilities + - `tools`: Array of tool names the client can execute + - `maxConcurrency`: Maximum number of concurrent evaluations + - `version`: Client version string + +#### 1.2 Ready Signal +Indicates the client is ready to receive evaluation tasks. + +```json +{ + "type": "ready", + "timestamp": "2024-01-01T00:00:00Z" +} +``` + +#### 1.3 Status Update +Provides progress updates for running evaluations. + +```json +{ + "type": "status", + "evaluationId": "eval-123", + "status": "running" | "completed" | "failed", + "progress": 0.5, // Optional, value between 0 and 1 + "message": "Processing page content..." // Optional status message +} +``` + +#### 1.4 Heartbeat (Ping) +Keep-alive message to maintain connection. + +```json +{ + "type": "ping", + "timestamp": "2024-01-01T00:00:00Z" +} +``` + +### 2. Server → Client Messages + +#### 2.1 Welcome Message +Sent immediately after WebSocket connection is established. 
+ +```json +{ + "type": "welcome", + "serverId": "server-001", + "version": "1.0.0", + "timestamp": "2024-01-01T00:00:00Z" +} +``` + +#### 2.2 Registration Acknowledgment +Response to client registration. + +```json +{ + "type": "registration_ack", + "clientId": "550e8400-e29b-41d4-a716-446655440000", + "status": "accepted" | "rejected", + "message": "Client registered successfully", + "evaluationsCount": 5, // Number of evaluations assigned to this client + "reason": "Invalid secret key" // Only present if status is "rejected" +} +``` + +#### 2.3 Heartbeat Response (Pong) +Response to client ping. + +```json +{ + "type": "pong", + "timestamp": "2024-01-01T00:00:00Z" +} +``` + +## JSON-RPC 2.0 Evaluation Protocol + +The evaluation tasks are sent using JSON-RPC 2.0 protocol over the WebSocket connection. + +### 3. Evaluation Request (Server → Client) + +#### 3.1 Evaluate Method +Requests the client to execute an evaluation task. + +```json +{ + "jsonrpc": "2.0", + "method": "evaluate", + "params": { + "evaluationId": "wikipedia-chrome-devtools-001", + "name": "Extract Chrome DevTools Wikipedia Article", + "url": "https://en.wikipedia.org/wiki/Chrome_DevTools", + "tool": "extract_schema_data", + "input": { + "schema": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "summary": {"type": "string"}, + "tableOfContents": { + "type": "array", + "items": {"type": "string"} + } + } + } + }, + "timeout": 30000, // Timeout in milliseconds + "metadata": { + "tags": ["schema-extraction", "wikipedia"], + "retries": 2, + "priority": "normal" + } + }, + "id": "rpc-001" +} +``` + +**Parameters:** +- `evaluationId`: Unique identifier for this evaluation (from YAML definition) +- `name`: Human-readable name of the evaluation +- `url`: Target URL for the evaluation +- `tool`: Name of the tool to execute +- `input`: Tool-specific input parameters +- `timeout`: Maximum execution time in milliseconds +- `metadata`: Additional evaluation metadata + +### 4. 
Evaluation Response (Client → Server) + +#### 4.1 Success Response +Sent when evaluation completes successfully. + +```json +{ + "jsonrpc": "2.0", + "result": { + "status": "success", + "output": { + "title": "Chrome DevTools", + "summary": "Chrome DevTools is a set of web developer tools built directly into the Google Chrome browser.", + "tableOfContents": [ + "Overview", + "Features", + "History", + "Usage" + ] + }, + "executionTime": 2500, // Total execution time in milliseconds + "toolCalls": [ + { + "tool": "extract_schema_data", + "timestamp": "2024-01-01T00:00:00Z", + "duration": 2400, + "status": "success" + } + ], + "metadata": { + "pageLoadTime": 800, + "extractionTime": 1700, + "retryCount": 0 + } + }, + "id": "rpc-001" +} +``` + +#### 4.2 Error Response +Sent when evaluation fails. + +```json +{ + "jsonrpc": "2.0", + "error": { + "code": -32000, + "message": "Tool execution failed", + "data": { + "tool": "extract_schema_data", + "error": "Page load timeout after 30000ms", + "url": "https://en.wikipedia.org/wiki/Chrome_DevTools", + "timestamp": "2024-01-01T00:00:00Z", + "stackTrace": "Error: Timeout...\n at PageLoader.load..." 
// Optional + } + }, + "id": "rpc-001" +} +``` + +## Error Codes + +Standard JSON-RPC 2.0 error codes: +- `-32700`: Parse error - Invalid JSON was received +- `-32600`: Invalid request - JSON is not a valid request object +- `-32601`: Method not found - Method does not exist +- `-32602`: Invalid params - Invalid method parameters +- `-32603`: Internal error - Internal JSON-RPC error + +Custom error codes for evaluation: +- `-32000`: Tool execution error - Tool failed during execution +- `-32001`: Timeout error - Evaluation exceeded timeout +- `-32002`: Authentication error - Invalid or missing credentials +- `-32003`: Rate limit exceeded - Too many requests +- `-32004`: Invalid tool - Requested tool not available +- `-32005`: Resource error - Unable to access required resources + +## Connection Management + +### Reconnection +- Clients should implement automatic reconnection with exponential backoff +- On reconnection, clients must re-register with the same clientId +- Server maintains evaluation state across reconnections + +### Timeouts +- Default connection timeout: 60 seconds +- Ping interval: 30 seconds +- Evaluation timeout: Specified per evaluation in YAML + +### Rate Limiting +- Server may implement rate limiting per client +- Rate limit errors use code `-32003` +- Clients should respect rate limit headers in error responses + +## Security Considerations + +1. **Authentication**: Clients may use optional secret keys for authentication +2. **Transport Security**: Production deployments should use WSS (WebSocket Secure) +3. **Input Validation**: All inputs should be validated against schemas +4. **Resource Limits**: Enforce timeouts and memory limits for evaluations + +## Examples + +### Complete Flow Example + +1. 
**Client connects and registers:** +```json +// Client → Server +{"type": "register", "clientId": "550e8400-e29b-41d4-a716-446655440000", "capabilities": {"tools": ["extract_schema_data"], "maxConcurrency": 3, "version": "1.0.0"}} + +// Server → Client +{"type": "registration_ack", "clientId": "550e8400-e29b-41d4-a716-446655440000", "status": "accepted", "message": "Client registered successfully", "evaluationsCount": 2} +``` + +2. **Client signals ready:** +```json +// Client → Server +{"type": "ready", "timestamp": "2024-01-01T00:00:00Z"} +``` + +3. **Server sends evaluation:** +```json +// Server → Client +{"jsonrpc": "2.0", "method": "evaluate", "params": {"evaluationId": "test-001", "url": "https://example.com", "tool": "extract_schema_data", "input": {"schema": {"type": "object", "properties": {"title": {"type": "string"}}}}, "timeout": 30000}, "id": "rpc-001"} +``` + +4. **Client returns result:** +```json +// Client → Server +{"jsonrpc": "2.0", "result": {"status": "success", "output": {"title": "Example Domain"}, "executionTime": 1500}, "id": "rpc-001"} +``` + +## Version History + +- **1.0.0** (2024-01-01): Initial protocol version \ No newline at end of file diff --git a/eval-server/docs/TRIGGERING_EVALUATIONS.md b/eval-server/docs/TRIGGERING_EVALUATIONS.md new file mode 100644 index 00000000000..61604da488a --- /dev/null +++ b/eval-server/docs/TRIGGERING_EVALUATIONS.md @@ -0,0 +1,334 @@ +# How to Trigger Evaluations + +This guide explains all the different ways to trigger evaluations in the system. + +## Prerequisites + +1. **Server Running**: Make sure the evaluation server is running: + ```bash + npm start + ``` + +2. **Client Connected**: A DevTools client must be connected and ready. 
You'll see logs like: + ``` + [info]: Client registered successfully {"clientId":"550e8400...","capabilities":"extract_schema_data, research_agent"} + [info]: Client ready for evaluations {"clientId":"550e8400..."} + ``` + +## Method 1: Interactive CLI + +Start the interactive CLI: +```bash +npm run cli +``` + +### Available Commands + +#### List Clients and Evaluations +```bash +eval-server> clients +``` +This shows all registered clients and their available evaluations with current status. + +#### Run Specific Evaluation +```bash +eval-server> run CLIENT_ID EVALUATION_ID +``` +Example: +```bash +eval-server> run 550e8400-e29b-41d4-a716-446655440000 wikipedia-chrome-devtools-001 +``` + +#### Run All Evaluations for a Client +```bash +eval-server> run-all CLIENT_ID +``` +Example: +```bash +eval-server> run-all 550e8400-e29b-41d4-a716-446655440000 +``` + +#### Check Status +```bash +eval-server> status +``` +Shows server status, connected clients, and active evaluations. + +#### Get Help +```bash +eval-server> help +``` + +## Method 2: HTTP API + +The server also exposes an HTTP API on port 8081. 
+ +### Get Server Status +```bash +curl http://localhost:8081/status +``` + +### List All Clients +```bash +curl http://localhost:8081/clients +``` + +### Get Client Evaluations +```bash +curl "http://localhost:8081/clients/:id/evaluations?id=550e8400-e29b-41d4-a716-446655440000" +``` + +### Trigger Specific Evaluation +```bash +curl -X POST http://localhost:8081/evaluate \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "550e8400-e29b-41d4-a716-446655440000", + "evaluationId": "wikipedia-chrome-devtools-001" + }' +``` + +### Trigger All Evaluations for a Client +```bash +curl -X POST http://localhost:8081/evaluate \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "550e8400-e29b-41d4-a716-446655440000", + "runAll": true + }' +``` + +## Method 3: Automatic Scheduling (YAML Configuration) + +Evaluations can be configured to run automatically based on their schedule in the YAML file. + +### Schedule Types + +#### On-Demand (Manual Only) +```yaml +schedule: + type: "on_demand" +``` +Only runs when manually triggered. + +#### Periodic (Automatic) +```yaml +schedule: + type: "periodic" + interval: 86400000 # Run every 24 hours (in milliseconds) +``` +Runs automatically at the specified interval. + +#### One-Time (Automatic) +```yaml +schedule: + type: "once" + run_at: "2024-12-25T09:00:00Z" # Run once at specific time +``` +Runs once at the specified time. 
+ +## Method 4: Programmatic Integration + +You can integrate the evaluation system into your own applications: + +### Node.js Example +```javascript +import { EvaluationServer } from './src/server.js'; + +const server = new EvaluationServer(); +server.start(); + +// Wait for client to connect +setTimeout(async () => { + const clientId = '550e8400-e29b-41d4-a716-446655440000'; + const evaluationId = 'wikipedia-chrome-devtools-001'; + + // Get client connection + const connection = server.connectedAgents.get(clientId); + if (connection && connection.ready) { + // Get evaluation + const evaluation = server.getClientManager() + .getClientEvaluations(clientId) + .find(e => e.id === evaluationId); + + if (evaluation) { + // Execute evaluation + await server.executeEvaluation(connection, evaluation); + console.log('Evaluation completed!'); + } + } +}, 5000); +``` + +### Python Example (using HTTP API) +```python +import requests +import json + +def trigger_evaluation(client_id, evaluation_id): + response = requests.post('http://localhost:8081/evaluate', + headers={'Content-Type': 'application/json'}, + json={ + 'clientId': client_id, + 'evaluationId': evaluation_id + }) + + if response.status_code == 200: + return response.json() + else: + raise Exception(f"Failed to trigger evaluation: {response.text}") + +# Example usage +result = trigger_evaluation( + '550e8400-e29b-41d4-a716-446655440000', + 'wikipedia-chrome-devtools-001' +) +print(json.dumps(result, indent=2)) +``` + +## Method 5: Webhook Integration + +You can set up webhooks to trigger evaluations from external systems: + +### GitHub Actions Example +```yaml +name: Run Evaluations +on: + schedule: + - cron: '0 9 * * *' # Daily at 09:00 UTC + workflow_dispatch: # Manual trigger + +jobs: + evaluate: + runs-on: ubuntu-latest + steps: + - name: Trigger Evaluation + run: | + curl -X POST ${{ secrets.EVAL_SERVER_URL }}/evaluate \ + -H "Content-Type: application/json" \ + -d '{ + "clientId": "${{ secrets.CLIENT_ID }}", 
"runAll": true + }' +``` + +### Slack Bot Example +```javascript +// Slack bot command: /eval wikipedia +app.command('/eval', async ({ command, ack, respond }) => { + await ack(); + + const evaluationId = command.text.trim(); + const clientId = process.env.DEFAULT_CLIENT_ID; + + try { + const response = await fetch('http://localhost:8081/evaluate', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ clientId, evaluationId }) + }); + + const result = await response.json(); + await respond(`✅ Evaluation '${evaluationId}' completed successfully!`); + } catch (error) { + await respond(`❌ Evaluation failed: ${error.message}`); + } +}); +``` + +## Monitoring Evaluation Results + +### Real-time Logs +Monitor the server logs to see evaluation progress: +```bash +tail -f logs/combined.log +``` + +### Status Checking +Check evaluation status via API: +```bash +# Get all evaluations for a client +curl "http://localhost:8081/clients/:id/evaluations?id=CLIENT_ID" + +# Check server status +curl http://localhost:8081/status +``` + +### Log Files +Evaluation results are logged to: +- `logs/combined.log` - All logs +- `logs/error.log` - Error logs only + +## Troubleshooting + +### Client Not Connected +``` +❌ Client 'CLIENT_ID' is not connected or not ready +``` +**Solutions:** +1. Make sure DevTools is running and connected +2. Check that the client ID matches +3. Verify the WebSocket connection is working + +### Evaluation Not Found +``` +❌ Evaluation 'EVAL_ID' not found for client 'CLIENT_ID' +``` +**Solutions:** +1. Check the YAML file for the correct evaluation ID +2. Ensure the evaluation is enabled (`enabled: true`) +3. Reload the server if you changed the YAML file + +### Tool Not Available +``` +Tool execution failed: Tool not found: tool_name +``` +**Solutions:** +1. Verify the tool is registered in DevTools +2. Check that the tool name matches exactly +3. 
Ensure DevTools has the required capabilities + +### Connection Timeout +``` +WebSocket connection failed +``` +**Solutions:** +1. Check if the server is running on the correct port +2. Verify firewall settings +3. Check network connectivity + +## Best Practices + +1. **Start Simple**: Begin with on-demand evaluations before setting up automation +2. **Monitor Logs**: Always monitor logs when running evaluations +3. **Test Connections**: Use the `status` command to verify everything is connected +4. **Gradual Rollout**: Test individual evaluations before running batch operations +5. **Error Handling**: Implement proper error handling in automated systems +6. **Rate Limiting**: Don't run too many evaluations simultaneously + +## Example Workflow + +Here's a typical workflow for triggering evaluations: + +```bash +# 1. Start the server +npm start + +# 2. In another terminal, start the CLI +npm run cli + +# 3. Check status and clients +eval-server> status +eval-server> clients + +# 4. Run a specific evaluation +eval-server> run 550e8400-e29b-41d4-a716-446655440000 wikipedia-chrome-devtools-001 + +# 5. Check results in logs +# (Monitor the server logs for detailed results) + +# 6. Run all evaluations if needed +eval-server> run-all 550e8400-e29b-41d4-a716-446655440000 +``` + +This comprehensive guide covers all the ways to trigger and monitor evaluations in your system! \ No newline at end of file diff --git a/eval-server/docs/YAML_SCHEMA.md b/eval-server/docs/YAML_SCHEMA.md new file mode 100644 index 00000000000..eecb185cc45 --- /dev/null +++ b/eval-server/docs/YAML_SCHEMA.md @@ -0,0 +1,328 @@ +# YAML Evaluation Schema Documentation + +## Overview + +This document describes the YAML schema used to define evaluations for each client. Each client has a dedicated YAML file stored in the `clients/` directory, named after their client ID. 
+ +## File Location + +``` +bo-eval-server/ +└── clients/ + ├── 550e8400-e29b-41d4-a716-446655440000.yaml + ├── 771f9500-f39c-42e5-b827-557766551111.yaml + └── ... +``` + +## Schema Structure + +### Root Level + +```yaml +# Client identification and authentication +client: + id: "550e8400-e29b-41d4-a716-446655440000" # Required: UUID v4 + name: "Chrome DevTools Agent" # Required: Human-readable name + secret_key: "optional-secret-key" # Optional: Authentication key + description: "Production DevTools instance" # Optional: Client description + +# Client-specific settings +settings: + max_concurrent_evaluations: 3 # Maximum parallel evaluations + default_timeout: 30000 # Default timeout in milliseconds + retry_policy: + max_retries: 2 # Maximum retry attempts + backoff_multiplier: 2 # Exponential backoff multiplier + initial_delay: 1000 # Initial retry delay in ms + +# List of evaluations assigned to this client +evaluations: + - id: "eval-001" + # ... evaluation definition + - id: "eval-002" + # ... 
evaluation definition +``` + +### Evaluation Definition + +Each evaluation in the `evaluations` array follows this structure: + +```yaml +- id: "wikipedia-chrome-devtools-001" # Required: Unique evaluation ID + name: "Extract Chrome DevTools Wikipedia" # Required: Display name + description: "Extract structured data" # Optional: Detailed description + enabled: true # Optional: Enable/disable (default: true) + + # Target configuration + target: + url: "https://en.wikipedia.org/wiki/Chrome_DevTools" # Required: Target URL + wait_for: "networkidle" # Optional: Wait condition (load|domcontentloaded|networkidle) + wait_timeout: 5000 # Optional: Wait timeout in ms + + # Tool configuration + tool: "extract_schema_data" # Required: Tool to execute + timeout: 30000 # Optional: Override default timeout + + # Tool-specific input + input: + schema: # For extract_schema_data tool + type: "object" + properties: + title: + type: "string" + summary: + type: "string" + + # Scheduling configuration + schedule: + type: "on_demand" # on_demand|periodic|once + # For periodic: + interval: 3600000 # Interval in milliseconds + # For once: + run_at: "2024-01-01T00:00:00Z" # ISO timestamp + + # Validation configuration + validation: + type: "llm-judge" # llm-judge|snapshot|hybrid + + # For llm-judge validation + llm_judge: + model: "gpt-4o-mini" # LLM model to use + temperature: 0.3 # Model temperature + criteria: # Evaluation criteria + - "Title should be accurately extracted" + - "Summary should be comprehensive" + - "All required fields should be present" + + # Visual verification settings + visual_verification: + enabled: true + capture_before: true # Screenshot before tool execution + capture_after: true # Screenshot after tool execution + prompts: # Custom verification prompts + - "Verify the title matches the page header" + + # For snapshot validation + snapshot: + structure_only: false # Compare structure only + exclude_paths: # Paths to exclude from comparison + - "timestamp" + - 
"random_id" + sanitizers: # Value sanitization rules + - path: "date" + pattern: "\\d{4}-\\d{2}-\\d{2}" + replacement: "YYYY-MM-DD" + + # For hybrid validation (both llm-judge and snapshot) + hybrid: + weight_llm: 0.7 # Weight for LLM score + weight_snapshot: 0.3 # Weight for snapshot score + + # Metadata and tags + metadata: + tags: # Categorization tags + - "schema-extraction" + - "wikipedia" + - "regression" + priority: "normal" # low|normal|high + owner: "team-browser" # Responsible team/person + created: "2024-01-01" # Creation date + modified: "2024-01-15" # Last modification date +``` + +## Tool-Specific Input Schemas + +### extract_schema_data + +```yaml +input: + schema: # JSON Schema for extraction + type: "object" + properties: + title: + type: "string" + items: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + price: + type: "number" +``` + +### research_agent + +```yaml +input: + query: "Research the latest AI developments" # Research query + max_iterations: 5 # Maximum agent iterations + include_sources: true # Include source URLs + depth: "comprehensive" # shallow|moderate|comprehensive +``` + +### action_agent + +```yaml +input: + task: "Fill out the contact form" # Task description + form_data: # Data to use + name: "Test User" + email: "test@example.com" + verify_completion: true # Verify task completion +``` + +### web_task_agent + +```yaml +input: + instructions: | # Multi-line instructions + 1. Navigate to the products page + 2. Search for "laptop" + 3. Filter by price < $1000 + 4. 
Extract the first 5 results + expected_outcome: "List of laptops under $1000" + max_steps: 10 # Maximum action steps +``` + +## Complete Example + +```yaml +client: + id: "550e8400-e29b-41d4-a716-446655440000" + name: "Chrome DevTools Production Agent" + secret_key: "sk-prod-abc123" + description: "Production DevTools instance for continuous evaluation" + +settings: + max_concurrent_evaluations: 5 + default_timeout: 45000 + retry_policy: + max_retries: 3 + backoff_multiplier: 2 + initial_delay: 2000 + +evaluations: + # Schema extraction evaluation + - id: "schema-extract-wiki-001" + name: "Wikipedia Chrome DevTools Schema Extraction" + description: "Test schema extraction on Wikipedia article" + enabled: true + + target: + url: "https://en.wikipedia.org/wiki/Chrome_DevTools" + wait_for: "networkidle" + wait_timeout: 5000 + + tool: "extract_schema_data" + timeout: 30000 + + input: + schema: + type: "object" + properties: + title: + type: "string" + summary: + type: "string" + features: + type: "array" + items: + type: "string" + lastModified: + type: "string" + + schedule: + type: "periodic" + interval: 86400000 # Daily + + validation: + type: "hybrid" + llm_judge: + model: "gpt-4o" + criteria: + - "All schema fields must be populated" + - "Summary should be at least 100 characters" + - "Features should contain at least 5 items" + snapshot: + exclude_paths: + - "lastModified" + hybrid: + weight_llm: 0.6 + weight_snapshot: 0.4 + + metadata: + tags: ["schema", "wikipedia", "daily"] + priority: "high" + owner: "qa-team" + + # Research agent evaluation + - id: "research-agent-news-001" + name: "Research Latest Tech News" + description: "Test research agent on current tech news" + enabled: true + + target: + url: "https://news.ycombinator.com" + + tool: "research_agent" + timeout: 60000 + + input: + query: "What are the top 3 technology stories today?" 
+ max_iterations: 5 + include_sources: true + depth: "moderate" + + schedule: + type: "on_demand" + + validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Response includes 3 distinct technology stories" + - "Each story has a clear summary" + - "Sources are provided for each story" + - "Information is current (from today)" + + metadata: + tags: ["research", "news", "tech"] + priority: "normal" +``` + +## Validation Rules + +1. **Client ID**: Must be valid UUID v4 format +2. **Evaluation IDs**: Must be unique within the file +3. **Tool names**: Must match registered tools in the client +4. **URLs**: Must be valid HTTP/HTTPS URLs +5. **Timeouts**: Must be positive integers (milliseconds) +6. **Schedule intervals**: Must be at least 60000ms (1 minute) + +## YAML Best Practices + +1. Use meaningful IDs that describe the evaluation +2. Group related evaluations together +3. Use tags consistently for categorization +4. Document complex input schemas with comments +5. Keep validation criteria specific and measurable +6. Use anchors and aliases for repeated configurations: + +```yaml +# Define anchor +defaults: &defaults + timeout: 30000 + retry_policy: + max_retries: 2 + +# Use alias +evaluations: + - id: "eval-001" + <<: *defaults # Inherits timeout and retry_policy + name: "Test 1" + # ... 
+``` \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-a11y-001.yaml b/eval-server/evals/action-agent/action-agent-a11y-001.yaml new file mode 100644 index 00000000000..95265515737 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-a11y-001.yaml @@ -0,0 +1,46 @@ +# Accessibility action test +id: "action-agent-a11y-001" +name: "Click Using ARIA Label" +description: "Test clicking an element identified primarily by ARIA attributes" +enabled: true + +target: + url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the button with aria-label \"Print Page\"" + reasoning: "Testing action selection using accessibility attributes" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Used accessibility tree to find elements" + - "Correctly identified element by ARIA label" + - "Successfully clicked the target button" + - "Demonstrated understanding of accessibility attributes" + - "No reliance on visual appearance alone" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the Print Page button was successfully clicked" + - "Check if any print dialog or print preview appeared" + - "Confirm the button showed visual feedback (pressed state)" + - "Ensure the action was performed on the correct accessibility-labeled element" + +metadata: + tags: ["action", "accessibility", "aria", "click", "a11y"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-accordion-001.yaml b/eval-server/evals/action-agent/action-agent-accordion-001.yaml new file mode 100644 index 00000000000..f2df3430523 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-accordion-001.yaml @@ -0,0 
+1,46 @@ +# Accordion expansion test +id: "action-agent-accordion-001" +name: "Expand Accordion Section" +description: "Test clicking to expand an accordion panel" +enabled: true + +target: + url: "https://jqueryui.com/accordion/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click to expand the \"Section 2\" accordion panel" + reasoning: "Testing accordion expand/collapse interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Section 2 accordion header" + - "Successfully clicked to expand the section" + - "Section 2 content became visible" + - "Other sections collapsed appropriately" + - "Accordion animation completed smoothly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify Section 2 is now expanded and content visible" + - "Check if other accordion sections collapsed" + - "Confirm the expansion animation completed" + - "Ensure Section 2 header shows expanded state" + +metadata: + tags: ["action", "accordion", "expand", "collapse", "ui"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-autocomplete-001.yaml b/eval-server/evals/action-agent/action-agent-autocomplete-001.yaml new file mode 100644 index 00000000000..c22bfc737c0 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-autocomplete-001.yaml @@ -0,0 +1,46 @@ +# Autocomplete search test +id: "action-agent-autocomplete-001" +name: "Use Autocomplete Search" +description: "Test typing in autocomplete field and selecting from suggestions" +enabled: true + +target: + url: "https://jqueryui.com/autocomplete/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Type \"Java\" in the autocomplete field and select 
\"JavaScript\" from suggestions" + reasoning: "Testing autocomplete/typeahead interaction patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the autocomplete input field" + - "Typed \"Java\" to trigger suggestions" + - "Autocomplete dropdown appeared with suggestions" + - "Selected \"JavaScript\" from the suggestion list" + - "Input field shows the selected value" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify \"JavaScript\" appears in the input field" + - "Check if autocomplete suggestions appeared" + - "Confirm the correct suggestion was selected" + - "Ensure dropdown closed after selection" + +metadata: + tags: ["action", "autocomplete", "typeahead", "search", "suggestions"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-checkbox-001.yaml b/eval-server/evals/action-agent/action-agent-checkbox-001.yaml new file mode 100644 index 00000000000..b76f3072005 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-checkbox-001.yaml @@ -0,0 +1,46 @@ +# Checkbox/radio button test +id: "action-agent-checkbox-001" +name: "Toggle Newsletter Checkbox" +description: "Test clicking checkbox elements for form options" +enabled: true + +target: + url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Click the checkbox labeled \"I have a bike\" to check it" + reasoning: "Testing interaction with checkbox form elements" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identified the correct checkbox among multiple options" + - "Used click action on the checkbox element" + - "Checkbox state changed from unchecked to checked" + - 
"Handled the iframe structure if present" + - "No errors with form element interaction" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify the checkbox state changed from unchecked to checked" + - "Confirm the \"I have a bike\" checkbox now shows a checkmark" + - "Verify the checkbox visual indicator (checkmark) is clearly visible" + - "Ensure no other checkboxes were accidentally modified" + +metadata: + tags: ["action", "checkbox", "form", "w3schools", "input"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-checkbox-002.yaml b/eval-server/evals/action-agent/action-agent-checkbox-002.yaml new file mode 100644 index 00000000000..0b25fa8195b --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-checkbox-002.yaml @@ -0,0 +1,47 @@ +# Toggle checkbox test - using HTML form test site +id: "action-agent-checkbox-002" +name: "Check Extra Cheese Checkbox" +description: "Test checking a specific checkbox using the check method" +enabled: true + +target: + url: "https://httpbin.org/forms/post" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Find and check the \"Extra Cheese\" checkbox in the Pizza Toppings section" + reasoning: "Testing checkbox interaction functionality using check method" + hint: "Look for the Extra Cheese checkbox and use the check method to select it" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Extra Cheese checkbox in the Pizza Toppings section" + - "Used the check method instead of click for better reliability" + - "Checkbox became checked (if it wasn't already)" + - "No errors occurred during checkbox interaction" + - "Form maintained its structure after checkbox selection" + visual_verification: + 
enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the Extra Cheese checkbox is now checked (shows checkmark)" + - "Check that the checkbox shows proper visual feedback for checked state" + - "Confirm the form structure remained intact" + - "Ensure the checkbox for Extra Cheese was specifically targeted and checked" + +metadata: + tags: ["action", "checkbox", "check", "form", "httpbin"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-click-001.yaml b/eval-server/evals/action-agent/action-agent-click-001.yaml new file mode 100644 index 00000000000..e9af6cfdf23 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-click-001.yaml @@ -0,0 +1,47 @@ +# Basic search interaction test +id: "action-agent-click-001" +name: "Search with Text Entry and Click" +description: "Test entering text in search field and clicking search button" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button" + reasoning: "Testing multi-step interaction: text input followed by button click" + hint: "First fill the search input field, then find and click the search button" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Successfully located the search input field" + - "Entered \"DevTools automation\" text in the search box" + - "Located the Google Search button after entering text" + - "Successfully clicked the search button" + - "Search was executed and results page loaded" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify text \"DevTools automation\" was entered in the search field" + - "Check if search results page 
loaded with relevant results" + - "Confirm the search was executed (URL changed to results page)" + - "Ensure search results are related to \"DevTools automation\"" + +metadata: + tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"] + priority: "high" + timeout: 90000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-context-001.yaml b/eval-server/evals/action-agent/action-agent-context-001.yaml new file mode 100644 index 00000000000..61626977f4d --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-context-001.yaml @@ -0,0 +1,46 @@ +# Right click context menu test +id: "action-agent-context-001" +name: "Right Click Context Menu" +description: "Test right-clicking to open context menu" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/context_menu" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Right-click on the context menu area to open the context menu" + reasoning: "Testing right-click context menu interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the designated context menu area" + - "Performed right-click action correctly" + - "Context menu appeared with options" + - "Successfully triggered the right-click event" + - "Alert or confirmation appeared as expected" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify right-click was performed on correct area" + - "Check if context menu or alert appeared" + - "Confirm right-click event was properly triggered" + - "Ensure the expected response occurred" + +metadata: + tags: ["action", "context-menu", "right-click", "mouse", "menu"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git 
a/eval-server/evals/action-agent/action-agent-datepicker-001.yaml b/eval-server/evals/action-agent/action-agent-datepicker-001.yaml new file mode 100644 index 00000000000..f4abbf7ac33 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-datepicker-001.yaml @@ -0,0 +1,46 @@ +# Date picker test +id: "action-agent-datepicker-001" +name: "Select Date from Calendar" +description: "Test clicking date input and selecting a specific date from calendar popup" +enabled: true + +target: + url: "https://jqueryui.com/datepicker/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the date input field and select March 15, 2024 from the calendar picker" + reasoning: "Testing interaction with calendar popup widgets" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located and clicked the date input field" + - "Calendar popup opened successfully" + - "Navigated to correct month/year if needed" + - "Selected the specific date (March 15, 2024)" + - "Date input field shows the selected date" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the date input field contains the selected date" + - "Check if the calendar widget opened and closed properly" + - "Confirm the correct date was highlighted and selected" + - "Ensure the date format matches expected output" + +metadata: + tags: ["action", "datepicker", "calendar", "form", "popup"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-daterange-001.yaml b/eval-server/evals/action-agent/action-agent-daterange-001.yaml new file mode 100644 index 00000000000..4581a472052 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-daterange-001.yaml @@ -0,0 +1,46 @@ +# Date range picker test +id: "action-agent-daterange-001" 
+name: "Select Date Range" +description: "Test selecting a date range with start and end dates" +enabled: true + +target: + url: "https://www.daterangepicker.com/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Select a date range from February 1, 2024 to February 28, 2024" + reasoning: "Testing complex date range selection with start and end dates" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Opened the date range picker interface" + - "Selected the start date (February 1, 2024)" + - "Selected the end date (February 28, 2024)" + - "Date range was properly applied" + - "Input field shows the complete date range" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify both start and end dates are displayed in the input" + - "Check if the date range picker shows the selected range" + - "Confirm the format matches expected date range display" + - "Ensure both dates were selected in sequence" + +metadata: + tags: ["action", "daterange", "date-picker", "form", "complex"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-dropdown-001.yaml b/eval-server/evals/action-agent/action-agent-dropdown-001.yaml new file mode 100644 index 00000000000..b37b91c3e3f --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-dropdown-001.yaml @@ -0,0 +1,46 @@ +# Dropdown selection test +id: "action-agent-dropdown-001" +name: "Select Dropdown Option" +description: "Test selecting an option from a dropdown menu" +enabled: true + +target: + url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Select \"Audi\" from the car brands dropdown menu" + reasoning: 
"Testing dropdown selection interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the dropdown/select element" + - "Identified the correct option to select" + - "Successfully selected the Audi option" + - "Dropdown value changed to the selected option" + - "Handled select element interaction properly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify the dropdown selection changed" + - "Confirm \"Audi\" is now displayed as the selected option" + - "Check if the dropdown is closed after selection" + - "Verify no other form elements were affected by the selection" + +metadata: + tags: ["action", "dropdown", "select", "form", "w3schools"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-dynamic-001.yaml b/eval-server/evals/action-agent/action-agent-dynamic-001.yaml new file mode 100644 index 00000000000..a4380f33f3d --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-dynamic-001.yaml @@ -0,0 +1,46 @@ +# Dynamic content interaction test +id: "action-agent-dynamic-001" +name: "Click Dynamic Load Button" +description: "Test clicking a button that loads dynamic content" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/dynamic_loading/1" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Click the \"Start\" button to trigger dynamic content loading" + reasoning: "Testing interaction with dynamically loaded content" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Found and clicked the Start button" + - "Handled the dynamic loading process" + - "Recognized that content changes after clicking" + - "No timing issues with the dynamic content" + - 
"Successfully triggered the loading animation" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify dynamic content loaded after clicking Start" + - "Check if loading animation or spinner was displayed" + - "Confirm new content appeared that was previously hidden" + - "Verify the Start button state changed or was replaced after clicking" + +metadata: + tags: ["action", "dynamic", "click", "ajax", "loading"] + priority: "high" + timeout: 90000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-ecommerce-001.yaml b/eval-server/evals/action-agent/action-agent-ecommerce-001.yaml new file mode 100644 index 00000000000..503c157d37f --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-ecommerce-001.yaml @@ -0,0 +1,46 @@ +# E-commerce action test +id: "action-agent-ecommerce-001" +name: "Add Product to Cart" +description: "Test clicking \"Add to Cart\" button on an e-commerce product page" +enabled: true + +target: + url: "https://www.homedepot.com/p/Husky-20-Gal-Professional-Duty-Waterproof-Storage-Container-with-Hinged-Lid-in-Red-249160/313799634" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 180000 + +input: + objective: "Click the \"Add to Cart\" button for this storage container" + reasoning: "Testing e-commerce interaction with product cart functionality" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Add to Cart button on the product page" + - "Successfully clicked the button" + - "Handled any popups or confirmations that appeared" + - "Verified the item was added (cart count changed or confirmation shown)" + - "Dealt with page dynamics after clicking" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify the Add 
to Cart button was clicked" + - "Check if cart count indicator increased or shows the item was added" + - "Look for any confirmation popup or notification about the item being added" + - "Verify the button state changed (e.g., to \"Added to Cart\" or disabled)" + +metadata: + tags: ["action", "ecommerce", "click", "homedepot", "cart"] + priority: "high" + timeout: 180000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-error-001.yaml b/eval-server/evals/action-agent/action-agent-error-001.yaml new file mode 100644 index 00000000000..43c95e6d0ff --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-error-001.yaml @@ -0,0 +1,47 @@ +# Error recovery test +id: "action-agent-error-001" +name: "Handle Missing Element" +description: "Test agent behavior when target element is not found" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the \"Sign Up\" button" + reasoning: "Testing error handling when element does not exist" + hint: "There is no Sign Up button on Google homepage - agent should handle gracefully" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Attempted to find the requested element" + - "Recognized that the element does not exist" + - "Provided clear error message or explanation" + - "Did not crash or produce confusing output" + - "Suggested alternatives or explained the issue" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the page remains in a stable state despite the missing element" + - "Confirm no error dialogs or broken UI elements appeared" + - "Check that the agent handled the missing element gracefully" + - "Ensure the page was properly analyzed even though the target was not found" + +metadata: + tags: 
["action", "error-handling", "missing-element", "recovery", "edge-case"] + priority: "high" + timeout: 60000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-filter-001.yaml b/eval-server/evals/action-agent/action-agent-filter-001.yaml new file mode 100644 index 00000000000..77829993599 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-filter-001.yaml @@ -0,0 +1,46 @@ +# Search filter application test +id: "action-agent-filter-001" +name: "Apply Search Filters" +description: "Test applying search filters to modify results" +enabled: true + +target: + url: "https://www.w3schools.com/howto/howto_js_filter_lists.asp" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Type \"Anna\" in the search filter to filter the list" + reasoning: "Testing search filter application" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the search filter input" + - "Typed \"Anna\" in the filter field" + - "List items filtered to show only matching results" + - "Non-matching items were hidden or removed from view" + - "Filter functionality worked as expected" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify search input contains \"Anna\"" + - "Check if list shows only items containing \"Anna\"" + - "Confirm non-matching items are not visible" + - "Ensure filter functionality reduced the visible list items" + +metadata: + tags: ["action", "filter", "search", "list", "dynamic"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-form-001.yaml b/eval-server/evals/action-agent/action-agent-form-001.yaml new file mode 100644 index 00000000000..61d036f683d --- /dev/null +++ 
b/eval-server/evals/action-agent/action-agent-form-001.yaml @@ -0,0 +1,46 @@ +# Form fill action test +id: "action-agent-form-001" +name: "Fill Search Query" +description: "Test filling a search input field with specific text" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Fill the search box with \"Chrome DevTools automation testing\"" + reasoning: "Testing form input capability with a specific search query" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Successfully identified the search input field" + - "Used perform_action with fill method" + - "Correctly filled the field with the specified text" + - "Verified the field accepted the input" + - "No formatting or encoding issues with the text" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to confirm text was entered in the search field" + - "Verify the exact text \"Chrome DevTools automation testing\" is visible" + - "Check if search suggestions or autocomplete dropdown appeared" + - "Ensure no input validation errors are shown" + +metadata: + tags: ["action", "form-fill", "input", "google", "basic"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-hover-001.yaml b/eval-server/evals/action-agent/action-agent-hover-001.yaml new file mode 100644 index 00000000000..ed98fbf6ef6 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-hover-001.yaml @@ -0,0 +1,46 @@ +# Hover action test +id: "action-agent-hover-001" +name: "Hover to Reveal Menu" +description: "Test hovering over an element to reveal hidden content" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/hovers" + wait_for: "networkidle" + wait_timeout: 5000 
+ +tool: "action_agent" +timeout: 60000 + +input: + objective: "Hover over the first user avatar image to reveal the hidden caption" + reasoning: "Testing hover interaction to reveal dynamic content" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the first user avatar image" + - "Used appropriate hover action method" + - "Successfully triggered the hover state" + - "Hidden caption became visible after hover" + - "Handled mouse interaction correctly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify hover revealed hidden content" + - "Check that caption or overlay appeared over the first avatar" + - "Confirm the hover state is visually active on the image" + - "Verify user information or caption text is now visible" + +metadata: + tags: ["action", "hover", "mouse", "dynamic", "reveal"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-keyboard-001.yaml b/eval-server/evals/action-agent/action-agent-keyboard-001.yaml new file mode 100644 index 00000000000..6bfceac0b24 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-keyboard-001.yaml @@ -0,0 +1,46 @@ +# Keyboard tab navigation test +id: "action-agent-keyboard-001" +name: "Keyboard Tab Navigation" +description: "Test using keyboard navigation to move between elements" +enabled: true + +target: + url: "https://www.w3.org/WAI/ARIA/apg/patterns/menubar/examples/menubar-navigation/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Use Tab key to navigate between menu items and Enter to activate" + reasoning: "Testing keyboard-only navigation patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Successfully used 
keyboard navigation" + - "Tab key moved focus between menu items" + - "Focus indicators were visible during navigation" + - "Enter key activated the focused menu item" + - "Keyboard navigation followed accessibility standards" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify focus indicators are visible on menu items" + - "Check if keyboard navigation moved focus correctly" + - "Confirm Enter key activated the focused item" + - "Ensure accessibility navigation patterns worked" + +metadata: + tags: ["action", "keyboard", "navigation", "accessibility", "focus"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-login-001.yaml b/eval-server/evals/action-agent/action-agent-login-001.yaml new file mode 100644 index 00000000000..1b705ce8dee --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-login-001.yaml @@ -0,0 +1,47 @@ +# Login form test +id: "action-agent-login-001" +name: "Fill Login Credentials" +description: "Test filling username and password fields in a login form" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/login" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Fill the username field with \"tomsmith\" and password field with \"SuperSecretPassword!\"" + reasoning: "Testing form fill with multiple fields including password type" + input_data: "<username>tomsmith</username><password>SuperSecretPassword!</password>"
+ +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identified both username and password fields" + - "Filled username field with correct value" + - "Filled password field with correct value" + - "Handled password field type appropriately" + - "Used the provided input_data XML format correctly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the username field shows \"tomsmith\" entered" + - "Confirm the password field has dots/asterisks indicating password entry" + - "Check that both fields are properly filled before submission" + - "Ensure no validation errors are shown for the filled fields" + +metadata: + tags: ["action", "login", "form-fill", "authentication", "multi-field"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-modal-001.yaml b/eval-server/evals/action-agent/action-agent-modal-001.yaml new file mode 100644 index 00000000000..1324fee7cf4 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-modal-001.yaml @@ -0,0 +1,46 @@ +# Modal dialog test +id: "action-agent-modal-001" +name: "Open and Close Modal" +description: "Test opening modal dialog and closing it with X button" +enabled: true + +target: + url: "https://getbootstrap.com/docs/5.0/components/modal/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click to open the modal dialog, then close it using the X button" + reasoning: "Testing modal dialog interaction patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located and clicked the modal trigger button" + - "Modal dialog opened successfully" + - "Modal content was visible and accessible" + - "Found and clicked the close (X) button" + - "Modal closed and page returned to 
normal state" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify modal opened with visible content" + - "Check if modal overlay appeared correctly" + - "Confirm modal was closed after clicking X" + - "Ensure page background is accessible again" + +metadata: + tags: ["action", "modal", "dialog", "popup", "overlay"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-multiselect-001.yaml b/eval-server/evals/action-agent/action-agent-multiselect-001.yaml new file mode 100644 index 00000000000..fed3f78d278 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-multiselect-001.yaml @@ -0,0 +1,46 @@ +# Multi-select dropdown test +id: "action-agent-multiselect-001" +name: "Select Multiple Options" +description: "Test selecting multiple options from a multi-select dropdown" +enabled: true + +target: + url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select_multiple" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Select both \"Volvo\" and \"Audi\" from the multi-select dropdown" + reasoning: "Testing multiple selection in select elements" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the multi-select dropdown element" + - "Successfully selected Volvo option" + - "Successfully selected Audi option" + - "Both options remain selected simultaneously" + - "Used appropriate multi-select interaction method" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify both Volvo and Audi appear selected" + - "Check if both options are highlighted/marked" + - "Confirm multi-select functionality worked correctly" + - "Ensure no other options were accidentally selected" + +metadata: + tags: ["action", 
"multi-select", "dropdown", "form", "multiple"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-multistep-001.yaml b/eval-server/evals/action-agent/action-agent-multistep-001.yaml new file mode 100644 index 00000000000..31514dde101 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-multistep-001.yaml @@ -0,0 +1,47 @@ +# Multi-step form test +id: "action-agent-multistep-001" +name: "Complete Search and Submit" +description: "Test filling a search form and then clicking the submit button" +enabled: true + +target: + url: "https://www.bing.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Fill the search box with \"automated testing tools\" and then click the search button" + reasoning: "Testing multi-step form interaction combining fill and click actions" + hint: "This requires two actions: first fill the search field, then click the search button" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Recognized this requires multiple actions" + - "First filled the search input correctly" + - "Then located and clicked the search button" + - "Both actions completed successfully in sequence" + - "Search was initiated with the correct query" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the search input contains \"automated testing tools\" text" + - "Confirm the search was submitted and results page loaded" + - "Check that search results are related to the query" + - "Ensure the multi-step action completed fully with both fill and click" + +metadata: + tags: ["action", "multi-step", "form-fill", "click", "bing", "search"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git 
a/eval-server/evals/action-agent/action-agent-nav-001.yaml b/eval-server/evals/action-agent/action-agent-nav-001.yaml new file mode 100644 index 00000000000..f49a0cf9b89 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-nav-001.yaml @@ -0,0 +1,46 @@ +# Complex navigation test +id: "action-agent-nav-001" +name: "Navigate via Menu Click" +description: "Test clicking navigation menu items to navigate between pages" +enabled: true + +target: + url: "https://www.wikipedia.org" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the \"English\" language link to navigate to English Wikipedia" + reasoning: "Testing navigation through link clicks on a multilingual site" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Identified the correct language link among many options" + - "Successfully clicked the English link" + - "Navigation occurred to the English Wikipedia" + - "Used appropriate tools to verify navigation success" + - "Handled the multilingual page structure correctly" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify navigation from Wikipedia homepage to English Wikipedia" + - "Check if the page language and content changed to English" + - "Verify the URL changed to en.wikipedia.org" + - "Confirm the English Wikipedia main page is displayed" + +metadata: + tags: ["action", "navigation", "click", "wikipedia", "multilingual"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-radio-001.yaml b/eval-server/evals/action-agent/action-agent-radio-001.yaml new file mode 100644 index 00000000000..07d6ef88805 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-radio-001.yaml @@ -0,0 +1,47 @@ +# Radio button selection test 
+id: "action-agent-radio-001" +name: "Select Radio Button Option" +description: "Test selecting a specific radio button option using click method" +enabled: true + +target: + url: "https://httpbin.org/forms/post" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 45000 + +input: + objective: "Select the \"Medium\" pizza size from the Pizza Size radio button group" + reasoning: "Testing radio button selection functionality" + hint: "Look for the Medium radio button in the Pizza Size section and click it to select" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Medium radio button in the Pizza Size section" + - "Successfully clicked the Medium radio button" + - "Radio button became selected (checked state)" + - "Other radio buttons in the same group became unselected" + - "Form maintained its structure after radio button selection" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the Medium radio button is now selected (shows filled circle)" + - "Check that other pizza size options (Small, Large) are no longer selected" + - "Confirm the form structure remained intact" + - "Ensure the Medium pizza size radio button was specifically targeted" + +metadata: + tags: ["action", "radio", "click", "form", "httpbin"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-slider-001.yaml b/eval-server/evals/action-agent/action-agent-slider-001.yaml new file mode 100644 index 00000000000..c3706587f07 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-slider-001.yaml @@ -0,0 +1,46 @@ +# Range slider test +id: "action-agent-slider-001" +name: "Adjust Range Slider" +description: "Test moving slider to set a specific value" +enabled: true + +target: + url: "https://jqueryui.com/slider/" + 
wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Move the slider to set the value to 75" + reasoning: "Testing slider/range input manipulation" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the slider control element" + - "Successfully moved the slider handle" + - "Set the slider value to approximately 75" + - "Slider position reflects the target value" + - "Any associated display shows the correct value" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify slider handle moved to represent value 75" + - "Check if value display shows 75 or close to it" + - "Confirm slider position visually matches target" + - "Ensure slider interaction was smooth and successful" + +metadata: + tags: ["action", "slider", "range", "form", "drag"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-tableselect-001.yaml b/eval-server/evals/action-agent/action-agent-tableselect-001.yaml new file mode 100644 index 00000000000..d78e66ca6fb --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-tableselect-001.yaml @@ -0,0 +1,46 @@ +# Table row selection test +id: "action-agent-tableselect-001" +name: "Select Table Row" +description: "Test clicking to select a table row" +enabled: true + +target: + url: "https://datatables.net/examples/api/select_single_row.html" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the first row to select it" + reasoning: "Testing table row selection patterns" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the first table row" + - "Successfully clicked the row" + - "Row became highlighted/selected" + - "Selection 
state is visually apparent" + - "Only one row is selected at a time" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the first row is now highlighted/selected" + - "Check if row selection visual feedback is clear" + - "Confirm only the clicked row is selected" + - "Ensure row selection styling is properly applied" + +metadata: + tags: ["action", "table", "select", "row", "highlight"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-tablesort-001.yaml b/eval-server/evals/action-agent/action-agent-tablesort-001.yaml new file mode 100644 index 00000000000..e3e31764939 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-tablesort-001.yaml @@ -0,0 +1,46 @@ +# Table column sorting test +id: "action-agent-tablesort-001" +name: "Sort Table Column" +description: "Test clicking table column header to sort data" +enabled: true + +target: + url: "https://datatables.net/examples/basic_init/zero_configuration.html" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the \"Name\" column header to sort the table by name" + reasoning: "Testing table column sorting interaction" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Name column header" + - "Successfully clicked the column header" + - "Table data reordered by name alphabetically" + - "Sort indicator appeared on the Name column" + - "Table sorting completed without errors" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify table rows are now sorted alphabetically by name" + - "Check if sort arrow/indicator appears on Name column" + - "Confirm the data order changed from before to after" + - "Ensure table structure remained intact after 
sorting" + +metadata: + tags: ["action", "table", "sort", "column", "data"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-tabs-001.yaml b/eval-server/evals/action-agent/action-agent-tabs-001.yaml new file mode 100644 index 00000000000..22db60cd572 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-tabs-001.yaml @@ -0,0 +1,46 @@ +# Tab panel navigation test +id: "action-agent-tabs-001" +name: "Navigate Tab Panels" +description: "Test clicking tab to switch between tab panels" +enabled: true + +target: + url: "https://jqueryui.com/tabs/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click on the \"Nunc tincidunt\" tab to switch to that panel" + reasoning: "Testing tab panel navigation" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the \"Nunc tincidunt\" tab button" + - "Successfully clicked the tab" + - "Tab panel content switched to the selected tab" + - "Active tab visual state changed appropriately" + - "Content area updated to show the new panel" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the \"Nunc tincidunt\" tab is now active/highlighted" + - "Check if the content panel changed to show new content" + - "Confirm the tab switching animation completed" + - "Ensure the correct tab content is visible" + +metadata: + tags: ["action", "tabs", "navigation", "panels", "ui"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-timepicker-001.yaml b/eval-server/evals/action-agent/action-agent-timepicker-001.yaml new file mode 100644 index 00000000000..056fbe9c792 --- /dev/null +++ 
b/eval-server/evals/action-agent/action-agent-timepicker-001.yaml @@ -0,0 +1,46 @@ +# Time picker test +id: "action-agent-timepicker-001" +name: "Select Time from Picker" +description: "Test setting time using time picker controls" +enabled: true + +target: + url: "https://timepicker.co/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Set the time to 2:30 PM using the time picker controls" + reasoning: "Testing time selection with hour/minute controls" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the time picker interface" + - "Set the hour to 2 (14 for 24-hour format)" + - "Set the minutes to 30" + - "Selected PM or appropriate time format" + - "Time input shows 2:30 PM or equivalent" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the time input displays 2:30 PM or 14:30" + - "Check if hour and minute were set correctly" + - "Confirm AM/PM selection if applicable" + - "Ensure the time picker interface was properly used" + +metadata: + tags: ["action", "timepicker", "time", "form", "clock"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-upload-001.yaml b/eval-server/evals/action-agent/action-agent-upload-001.yaml new file mode 100644 index 00000000000..518515d61d4 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-upload-001.yaml @@ -0,0 +1,46 @@ +# File upload test +id: "action-agent-upload-001" +name: "Upload File via Input" +description: "Test clicking file input and uploading a test file" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/upload" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Click the file input and upload a test file" + reasoning: 
"Testing file upload interaction through input elements" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the file input element" + - "Triggered file selection dialog" + - "Selected a file for upload" + - "File name appears in the input field" + - "Upload process initiated successfully" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify file name appears in the upload input field" + - "Check if file selection was successful" + - "Confirm upload button is available or file is ready" + - "Ensure no upload errors are displayed" + +metadata: + tags: ["action", "upload", "file", "input", "form"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-video-001.yaml b/eval-server/evals/action-agent/action-agent-video-001.yaml new file mode 100644 index 00000000000..ba21b28e53c --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-video-001.yaml @@ -0,0 +1,47 @@ +# Video playback controls test +id: "action-agent-video-001" +name: "Control Video Playback" +description: "Test starting video playback using click + spacebar" +enabled: true + +target: + url: "https://www.w3schools.com/html/html5_video.asp" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Click the video element to focus it, then press spacebar to start playback" + reasoning: "Testing video control using standard keyboard interaction (click to focus + spacebar to play)" + hint: "First click the Video element to focus it, then use keyboard input to press the spacebar key to start playback" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Located the Video element in the accessibility tree" + - "Successfully clicked the Video element to focus it" + 
- "Used keyboard input to press spacebar" + - "Video playback started after spacebar press" + - "No errors occurred during the interaction sequence" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify video player is visible on the page" + - "Check if the video element was clicked and playback was toggled (controls may show pause button after)" + - "Look for visual indicators that video started playing" + - "Ensure no error messages appeared during video interaction" + +metadata: + tags: ["action", "video", "media", "controls", "playback"] + priority: "high" + timeout: 90000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/action-agent/action-agent-video-002.yaml b/eval-server/evals/action-agent/action-agent-video-002.yaml new file mode 100644 index 00000000000..d7188ecd592 --- /dev/null +++ b/eval-server/evals/action-agent/action-agent-video-002.yaml @@ -0,0 +1,47 @@ +# Video play button specific targeting test +id: "action-agent-video-002" +name: "Click Video Play Button Specifically" +description: "Test clicking the specific play button (not the video element)" +enabled: true + +target: + url: "https://www.w3schools.com/html/html5_video.asp" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Find and click the button that has name=\"play\" (not the Video element itself)" + reasoning: "Testing specific targeting of the play button element" + hint: "Target the button element with text or label \"play\", do not click the Video element" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Found a button element (not Video element) with \"play\" in the name" + - "Successfully clicked the play button specifically" + - "Did not click on the Video element itself" + - "Play button click was executed correctly" + - "Video responded to the button click" + visual_verification: +
enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify the play button (not video element) was clicked" + - "Check if video started playing after button click" + - "Confirm the target was the button, not the video container" + - "Look for changes in video player state" + +metadata: + tags: ["action", "video", "button", "specific-targeting"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/research-agent/research-agent-basic-001.yaml b/eval-server/evals/research-agent/research-agent-basic-001.yaml new file mode 100644 index 00000000000..85743d55c38 --- /dev/null +++ b/eval-server/evals/research-agent/research-agent-basic-001.yaml @@ -0,0 +1,39 @@ +# Basic research test - stable topic with clear sources +id: "research-agent-basic-001" +name: "Research Chrome DevTools History" +description: "Research the history and development of Chrome DevTools" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 180000 + +input: + query: "History and development of Chrome DevTools browser developer tools" + reasoning: "Testing basic research capabilities on a well-documented technical topic" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0 + criteria: + - "Research covers the origins and early development of Chrome DevTools" + - "Information includes key milestones and major feature additions" + - "Sources include official documentation or reliable technical sources" + - "At least 3-5 different sources were consulted" + - "Information is factually accurate and up-to-date" + - "Research demonstrates understanding of the topic evolution" + - "Handoff to content_writer_agent occurred with comprehensive data" + +metadata: + tags: ["basic", "technical", "stable", "documentation"] + priority: "high" + timeout: 180000 + retries: 2 + 
flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/research-agent/research-agent-business-001.yaml b/eval-server/evals/research-agent/research-agent-business-001.yaml new file mode 100644 index 00000000000..defeed16e6f --- /dev/null +++ b/eval-server/evals/research-agent/research-agent-business-001.yaml @@ -0,0 +1,39 @@ +# Business research test +id: "research-agent-business-001" +name: "Research Remote Work Productivity" +description: "Research remote work impact on productivity and business outcomes" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 240000 + +input: + query: "Remote work productivity statistics impact business outcomes 2024 studies" + reasoning: "Testing business research requiring statistical data and multiple perspectives" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Research includes statistical data and survey results" + - "Covers multiple perspectives (employee, employer, industry)" + - "Sources include business publications, research studies, and reports" + - "Information addresses both positive and negative impacts" + - "Data is recent and relevant to current work trends" + - "Research demonstrates understanding of business implications" + - "Statistics and claims are properly sourced" + +metadata: + tags: ["business", "statistics", "workplace", "comprehensive"] + priority: "high" + timeout: 240000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/research-agent/research-agent-comparison-001.yaml b/eval-server/evals/research-agent/research-agent-comparison-001.yaml new file mode 100644 index 00000000000..a433a58d886 --- /dev/null +++ b/eval-server/evals/research-agent/research-agent-comparison-001.yaml @@ -0,0 +1,39 @@ +# Comparative research test +id: 
"research-agent-comparison-001" +name: "Compare JavaScript vs TypeScript" +description: "Research and compare JavaScript and TypeScript for web development" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 200000 + +input: + query: "JavaScript vs TypeScript comparison web development pros cons differences" + reasoning: "Testing comparative research requiring balanced analysis of multiple options" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Research covers both JavaScript and TypeScript comprehensively" + - "Includes clear comparison points (syntax, features, ecosystem)" + - "Presents advantages and disadvantages of each language" + - "Sources include technical documentation and developer resources" + - "Information is balanced and objective, not biased toward one option" + - "Demonstrates understanding of use cases for each language" + - "Research data is well-organized for comparative analysis" + +metadata: + tags: ["comparison", "technical", "programming", "balanced"] + priority: "high" + timeout: 200000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/research-agent/research-agent-current-001.yaml b/eval-server/evals/research-agent/research-agent-current-001.yaml new file mode 100644 index 00000000000..198c981c829 --- /dev/null +++ b/eval-server/evals/research-agent/research-agent-current-001.yaml @@ -0,0 +1,40 @@ +# Current events research test +id: "research-agent-current-001" +name: "Research Latest AI Development Trends" +description: "Research recent developments in AI and machine learning (last 6 months)" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 240000 + +input: + query: "Latest AI artificial intelligence developments breakthroughs 2024 2025" 
+ reasoning: "Testing research on current events and rapidly evolving topics" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + include_url: true + criteria: + - "Research focuses on recent developments (within last 6 months)" + - "Covers multiple aspects of AI development (models, applications, research)" + - "Sources are current and from reputable news or research outlets" + - "Information includes specific examples or case studies" + - "Demonstrates ability to identify current trends vs older information" + - "Successfully gathered information from diverse source types" + - "Data is properly organized for content writer handoff" + +metadata: + tags: ["current-events", "ai", "dynamic", "trends"] + priority: "high" + timeout: 240000 + retries: 1 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/research-agent/research-agent-edge-001.yaml b/eval-server/evals/research-agent/research-agent-edge-001.yaml new file mode 100644 index 00000000000..234c832fe97 --- /dev/null +++ b/eval-server/evals/research-agent/research-agent-edge-001.yaml @@ -0,0 +1,39 @@ +# No-results edge case test +id: "research-agent-edge-001" +name: "Research Obscure Fictional Topic" +description: "Test handling of queries with very limited or no reliable sources" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 180000 + +input: + query: "quantum bluetooth watermelon encryption algorithm 2024" + reasoning: "Testing edge case handling when query yields no meaningful results" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Agent recognizes when query yields limited or unreliable results" + - "Demonstrates appropriate search strategy modification" + - "Does not fabricate information when sources are unavailable" + - "Gracefully handles lack of substantive results" 
+ - "Still attempts handoff to content writer with available information" + - "Maintains professional approach despite limited data" + - "Shows appropriate uncertainty when information is sparse" + +metadata: + tags: ["edge-case", "no-results", "error-handling", "fictional"] + priority: "high" + timeout: 180000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/research-agent/research-agent-technical-001.yaml b/eval-server/evals/research-agent/research-agent-technical-001.yaml new file mode 100644 index 00000000000..c5e25408d53 --- /dev/null +++ b/eval-server/evals/research-agent/research-agent-technical-001.yaml @@ -0,0 +1,39 @@ +# Deep technical research test +id: "research-agent-technical-001" +name: "Research WebAssembly Performance" +description: "Deep dive research into WebAssembly performance characteristics and use cases" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 900000 + +input: + query: "WebAssembly WASM performance benchmarks use cases implementation details" + reasoning: "Testing deep technical research requiring specialized knowledge synthesis" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Research covers technical details of WebAssembly architecture" + - "Includes performance benchmarks and comparison data" + - "Discusses practical use cases and implementation scenarios" + - "Sources include technical specifications, benchmarks, and expert analysis" + - "Information demonstrates deep understanding of the technology" + - "Research addresses both benefits and limitations" + - "Technical accuracy is maintained throughout" + +metadata: + tags: ["technical", "deep-dive", "performance", "webassembly"] + priority: "high" + timeout: 900000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git 
a/eval-server/evals/research-agent/research-agent-tools-001.yaml b/eval-server/evals/research-agent/research-agent-tools-001.yaml new file mode 100644 index 00000000000..44da108d763 --- /dev/null +++ b/eval-server/evals/research-agent/research-agent-tools-001.yaml @@ -0,0 +1,40 @@ +# Tool orchestration test - focuses on how well the agent uses available tools +id: "research-agent-tools-001" +name: "Research Python Framework Comparison" +description: "Research comparing Django vs Flask Python frameworks with focus on tool usage" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "research_agent" +timeout: 240000 + +input: + query: "Django vs Flask Python web framework comparison features performance" + reasoning: "Testing effective orchestration of navigation, extraction, and fetching tools" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Agent effectively used navigate_url to access search engines" + - "Schema-based extraction was used to gather structured search results" + - "Fetcher tool was used to collect content from multiple URLs" + - "Navigation strategy was logical and systematic" + - "Tool usage demonstrated purposeful research progression" + - "Information from different tools was effectively synthesized" + - "At least 3-5 different sources were accessed and processed" + - "Final handoff included comprehensive data from all tools" + +metadata: + tags: ["tool-orchestration", "systematic", "python", "frameworks"] + priority: "high" + timeout: 240000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/amazon-product-001.yaml b/eval-server/evals/schema-extractor/amazon-product-001.yaml new file mode 100644 index 00000000000..bfeb975979c --- /dev/null +++ b/eval-server/evals/schema-extractor/amazon-product-001.yaml @@ -0,0 +1,78 @@ +# E-commerce product extraction 
test +id: "amazon-product-001" +name: "Extract Amazon Product Details" +description: "Extract product information from an Amazon product page" +enabled: true + +target: + url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_data" +timeout: 60000 + +input: + schema: + type: "object" + properties: + product: + type: "object" + properties: + title: + type: "string" + brand: + type: "string" + price: + type: "object" + properties: + current: + type: "number" + currency: + type: "string" + rating: + type: "object" + properties: + average: + type: "number" + count: + type: "number" + images: + type: "array" + items: + type: "string" + format: "url" + features: + type: "array" + items: + type: "string" + required: + - "title" + - "price" + availability: + type: "string" + required: + - "product" + instruction: "Extract comprehensive product information including pricing, ratings, and key features" + reasoning: "Testing extraction from a dynamic e-commerce page with complex structure" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Product title is accurate and complete" + - "Price information is current and properly formatted" + - "Rating data includes both average and review count" + - "Image URLs are valid and accessible" + - "Key product features are captured" + - "All URLs are properly resolved (not node IDs)" + +metadata: + tags: ["ecommerce", "amazon", "product", "dynamic"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/bbc-news-001.yaml b/eval-server/evals/schema-extractor/bbc-news-001.yaml new file mode 100644 index 00000000000..e434d2a874a --- /dev/null +++ b/eval-server/evals/schema-extractor/bbc-news-001.yaml @@ -0,0 +1,69 @@ +# News article extraction test +id: "bbc-news-001" 
+name: "Extract BBC News Article" +description: "Extract article content and metadata from a BBC News page" +enabled: true + +target: + url: "https://www.bbc.com/news/technology" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_data" +timeout: 30000 + +input: + schema: + type: "object" + properties: + headlines: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + summary: + type: "string" + url: + type: "string" + format: "url" + category: + type: "string" + required: + - "title" + mainStory: + type: "object" + properties: + headline: + type: "string" + summary: + type: "string" + url: + type: "string" + format: "url" + required: + - "headlines" + instruction: "Extract the main headlines and featured stories from the BBC Technology news section" + reasoning: "Testing extraction from a news aggregation page with multiple articles" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + include_url: true + criteria: + - "Headlines are current and relevant to technology news" + - "Article summaries provide meaningful context" + - "URLs link to valid BBC news articles" + - "Main story is properly identified" + - "All extracted content is in English" + +metadata: + tags: ["news", "bbc", "aggregation", "dynamic"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/bing-search-001.yaml b/eval-server/evals/schema-extractor/bing-search-001.yaml new file mode 100644 index 00000000000..8488f341b43 --- /dev/null +++ b/eval-server/evals/schema-extractor/bing-search-001.yaml @@ -0,0 +1,70 @@ +# Bing Search results extraction test +id: "bing-search-001" +name: "Extract Bing Search Results" +description: "Extract search results from Bing search page" +enabled: true + +target: + url: "https://www.bing.com/search?q=web+scraping+best+practices" + wait_for: "networkidle" + 
wait_timeout: 5000 + +tool: "extract_schema_data" +timeout: 45000 + +input: + schema: + type: "object" + properties: + query: + type: "string" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + datePublished: + type: "string" + required: + - "title" + - "url" + - "snippet" + sidebarInfo: + type: "object" + properties: + title: + type: "string" + description: + type: "string" + source: + type: "string" + required: + - "searchResults" + instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing" + reasoning: "Testing extraction from Bing search results with different layout than Google" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results match the query intent" + - "Results include valid URLs and meaningful snippets" + - "Sidebar information is extracted when present" + - "No duplicate results in the list" + +metadata: + tags: ["search", "bing", "serp", "dynamic"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/github-repo-001-streamlined.yaml b/eval-server/evals/schema-extractor/github-repo-001-streamlined.yaml new file mode 100644 index 00000000000..07532e7d13f --- /dev/null +++ b/eval-server/evals/schema-extractor/github-repo-001-streamlined.yaml @@ -0,0 +1,66 @@ +# Simple structured data test (Streamlined version) +id: "github-repo-001-streamlined" +name: "Extract GitHub Repository Info (Streamlined)" +description: "Extract basic repository information from a GitHub page using streamlined extractor" +enabled: true + +target: + url: "https://github.com/microsoft/TypeScript" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 30000 + +input: + schema: + type: "object" + 
properties: + name: + type: "string" + description: + type: "string" + language: + type: "string" + stars: + type: "number" + forks: + type: "number" + topics: + type: "array" + items: + type: "string" + readme: + type: "object" + properties: + summary: + type: "string" + required: + - "name" + - "description" + instruction: "Extract repository metadata and basic statistics" + reasoning: "Testing extraction from a well-structured GitHub repository page" + +validation: + type: "hybrid" + snapshot: + exclude_paths: + - "stars" + - "forks" + structure_only: false + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Repository name matches the GitHub page" + - "Description accurately reflects the project purpose" + - "Programming language is correctly identified" + - "Topic tags are relevant to the project" + +metadata: + tags: ["github", "repository", "structured", "streamlined"] + priority: "high" + timeout: 30000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/github-repo-001.yaml b/eval-server/evals/schema-extractor/github-repo-001.yaml new file mode 100644 index 00000000000..7a01a14043e --- /dev/null +++ b/eval-server/evals/schema-extractor/github-repo-001.yaml @@ -0,0 +1,66 @@ +# Simple structured data test +id: "github-repo-001" +name: "Extract GitHub Repository Info" +description: "Extract basic repository information from a GitHub page" +enabled: true + +target: + url: "https://github.com/microsoft/TypeScript" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_data" +timeout: 30000 + +input: + schema: + type: "object" + properties: + name: + type: "string" + description: + type: "string" + language: + type: "string" + stars: + type: "number" + forks: + type: "number" + topics: + type: "array" + items: + type: "string" + readme: + type: "object" + properties: + summary: + type: "string" + required: + - "name" + - "description" + instruction: 
"Extract repository metadata and basic statistics" + reasoning: "Testing extraction from a well-structured GitHub repository page" + +validation: + type: "hybrid" + snapshot: + exclude_paths: + - "stars" + - "forks" + structure_only: false + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Repository name matches the GitHub page" + - "Description accurately reflects the project purpose" + - "Programming language is correctly identified" + - "Topic tags are relevant to the project" + +metadata: + tags: ["github", "repository", "structured"] + priority: "high" + timeout: 30000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/google-flights-001.yaml b/eval-server/evals/schema-extractor/google-flights-001.yaml new file mode 100644 index 00000000000..80da1bb7bb5 --- /dev/null +++ b/eval-server/evals/schema-extractor/google-flights-001.yaml @@ -0,0 +1,106 @@ +# Google Flights search extraction test +id: "google-flights-001" +name: "Extract Google Flights Search Results" +description: "Extract flight options from Google Flights search" +enabled: true + +target: + url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_data" +timeout: 60000 + +input: + schema: + type: "object" + properties: + searchCriteria: + type: "object" + properties: + origin: + type: "string" + destination: + type: "string" + departureDate: + type: "string" + returnDate: + type: "string" + tripType: + type: "string" + passengers: + type: "number" + flights: + type: "array" + items: + type: "object" + properties: + airline: + type: "string" + flightNumber: + type: "string" + departureTime: + type: "string" + arrivalTime: + type: "string" + duration: + type: "string" + stops: + type: "number" + 
price: + type: "object" + properties: + amount: + type: "number" + currency: + type: "string" + cabin: + type: "string" + bookingUrl: + type: "string" + format: "url" + legroom: + type: "string" + amenities: + type: "array" + items: + type: "string" + required: + - "airline" + - "departureTime" + - "arrivalTime" + - "price" + priceInsights: + type: "object" + properties: + trend: + type: "string" + recommendation: + type: "string" + averagePrice: + type: "number" + required: + - "flights" + instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results" + reasoning: "Testing extraction from complex travel search interface with dynamic pricing" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Flight times are in proper format" + - "Prices are numeric values with currency" + - "Airlines and flight numbers are accurate" + - "Stop information is correctly identified" + - "Duration is in readable format" + +metadata: + tags: ["travel", "flights", "google", "booking"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/google-search-001.yaml b/eval-server/evals/schema-extractor/google-search-001.yaml new file mode 100644 index 00000000000..7e6f0e6a4eb --- /dev/null +++ b/eval-server/evals/schema-extractor/google-search-001.yaml @@ -0,0 +1,76 @@ +# Google Search results extraction test +id: "google-search-001" +name: "Extract Google Search Results" +description: "Extract search results from Google search page" +enabled: true + +target: + url: "https://www.google.com/search?q=chrome+devtools+tutorial" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_data" +timeout: 45000 + +input: + schema: + type: "object" + properties: + query: + type: "string" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + 
type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + domain: + type: "string" + required: + - "title" + - "url" + - "snippet" + featuredSnippet: + type: "object" + properties: + content: + type: "string" + source: + type: "string" + url: + type: "string" + format: "url" + relatedSearches: + type: "array" + items: + type: "string" + required: + - "searchResults" + instruction: "Extract the top 10 search results with titles, URLs, and snippets. Also extract featured snippet if present and related searches" + reasoning: "Testing extraction from Google search results page with various result types" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results are relevant to the query" + - "Each result has a valid title, URL, and snippet" + - "URLs are properly resolved and not node IDs" + - "Related searches are extracted if present" + - "Featured snippet is captured when available" + +metadata: + tags: ["search", "google", "serp", "dynamic"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/homedepot-001.yaml b/eval-server/evals/schema-extractor/homedepot-001.yaml new file mode 100644 index 00000000000..4e8b835b66d --- /dev/null +++ b/eval-server/evals/schema-extractor/homedepot-001.yaml @@ -0,0 +1,92 @@ +# Home Depot product search extraction test +id: "homedepot-001" +name: "Extract Home Depot Product Search" +description: "Extract product listings from Home Depot search results" +enabled: true + +target: + url: "https://www.homedepot.com/s/power%2520drill" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_data" +timeout: 60000 + +input: + schema: + type: "object" + properties: + searchQuery: + type: "string" + totalResults: + type: "number" + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + 
brand: + type: "string" + price: + type: "number" + originalPrice: + type: "number" + savings: + type: "number" + rating: + type: "number" + reviewCount: + type: "number" + productUrl: + type: "string" + format: "url" + imageUrl: + type: "string" + format: "url" + availability: + type: "string" + features: + type: "array" + items: + type: "string" + required: + - "name" + - "price" + - "productUrl" + filters: + type: "object" + properties: + brands: + type: "array" + items: + type: "string" + priceRanges: + type: "array" + items: + type: "string" + required: + - "products" + instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability" + reasoning: "Testing extraction from e-commerce search results with product cards and filters" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Products are relevant to the search query" + - "Prices are numeric values in USD" + - "Product URLs link to Home Depot product pages" + - "Ratings are on a 5-star scale" + - "Key product features are captured" + +metadata: + tags: ["ecommerce", "homedepot", "products", "search"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/macys-001.yaml b/eval-server/evals/schema-extractor/macys-001.yaml new file mode 100644 index 00000000000..23a4e37dec0 --- /dev/null +++ b/eval-server/evals/schema-extractor/macys-001.yaml @@ -0,0 +1,106 @@ +# Macy's product listing extraction test +id: "macys-001" +name: "Extract Macy's Product Listings" +description: "Extract fashion products from Macy's category page" +enabled: true + +target: + url: "https://www.macys.com/shop/womens-clothing/womens-dresses" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_data" +timeout: 60000 + +input: + schema: + type: "object" + properties: + category: + type: "string" + 
totalProducts: + type: "number" + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + brand: + type: "string" + currentPrice: + type: "number" + originalPrice: + type: "number" + discount: + type: "string" + colors: + type: "array" + items: + type: "string" + sizes: + type: "array" + items: + type: "string" + rating: + type: "number" + reviewCount: + type: "number" + productUrl: + type: "string" + format: "url" + imageUrl: + type: "string" + format: "url" + promotions: + type: "array" + items: + type: "string" + required: + - "name" + - "brand" + - "currentPrice" + refinements: + type: "object" + properties: + brands: + type: "array" + items: + type: "string" + sizes: + type: "array" + items: + type: "string" + colors: + type: "array" + items: + type: "string" + priceRanges: + type: "array" + items: + type: "string" + required: + - "products" + instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's" + reasoning: "Testing extraction from fashion e-commerce with complex product attributes" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Products are from the correct category" + - "Prices reflect current and sale prices" + - "Color and size options are captured" + - "Brand names are accurately extracted" + - "Promotional text is included when present" + +metadata: + tags: ["ecommerce", "macys", "fashion", "products"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/schema-extractor/wikipedia-search-001.yaml b/eval-server/evals/schema-extractor/wikipedia-search-001.yaml new file mode 100644 index 00000000000..ad5f2f43b82 --- /dev/null +++ b/eval-server/evals/schema-extractor/wikipedia-search-001.yaml @@ -0,0 +1,77 @@ +# Wikipedia search results extraction test +id: "wikipedia-search-001" +name: "Extract Wikipedia Search 
Results" +description: "Extract search results from Wikipedia search" +enabled: true + +target: + url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_data" +timeout: 30000 + +input: + schema: + type: "object" + properties: + searchTerm: + type: "string" + resultCount: + type: "number" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + category: + type: "string" + wordCount: + type: "number" + lastEdited: + type: "string" + required: + - "title" + - "url" + - "snippet" + suggestedArticles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + required: + - "searchResults" + instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date" + reasoning: "Testing extraction from Wikipedia's internal search with rich metadata" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results are Wikipedia articles" + - "Each result has a valid Wikipedia URL" + - "Snippets contain relevant content highlights" + - "Metadata like word count is extracted when available" + +metadata: + tags: ["search", "wikipedia", "encyclopedia"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/screenshot-verification/dynamic-content-verification-001.yaml b/eval-server/evals/screenshot-verification/dynamic-content-verification-001.yaml new file mode 100644 index 00000000000..ce271614e03 --- /dev/null +++ b/eval-server/evals/screenshot-verification/dynamic-content-verification-001.yaml @@ -0,0 +1,47 @@ +# Dynamic content visual verification test +id: 
"dynamic-content-verification-001" +name: "Dynamic Content Visual Verification" +description: "Test visual verification of dynamic content loading using screenshots" +enabled: true + +target: + url: "https://the-internet.herokuapp.com/dynamic_loading/1" + +tool: "action_agent" +timeout: 90000 + +input: + objective: "Take a screenshot, click the Start button, wait for content to load, then take another screenshot to verify the dynamic content appeared" + reasoning: "Testing visual verification of dynamic content changes using screenshot comparison" + hint: "Use take_screenshot before clicking Start, then again after the dynamic content loads" + +schedule: + type: "on_demand" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Initial screenshot captured the page before dynamic loading" + - "Start button was successfully clicked" + - "Agent waited for dynamic content to fully load" + - "Final screenshot shows the revealed dynamic content" + - "Visual comparison demonstrates successful content loading verification" + - "Screenshots show clear before/after difference in content visibility" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare screenshots to verify dynamic content loading" + - "Confirm the first screenshot shows hidden content area" + - "Verify the second screenshot shows the revealed 'Hello World!' 
text" + - "Check that the loading animation or process is properly captured" + +metadata: + tags: ["screenshot", "dynamic-content", "visual-verification", "loading"] + priority: "high" + timeout: 90000 + retries: 2 + flaky: true \ No newline at end of file diff --git a/eval-server/evals/screenshot-verification/screenshot-error-handling-001.yaml b/eval-server/evals/screenshot-verification/screenshot-error-handling-001.yaml new file mode 100644 index 00000000000..4a98da0d81a --- /dev/null +++ b/eval-server/evals/screenshot-verification/screenshot-error-handling-001.yaml @@ -0,0 +1,44 @@ +# Screenshot error handling test +id: "screenshot-error-handling-001" +name: "Screenshot Error Handling" +description: "Test screenshot tool error handling and recovery" +enabled: true + +target: + url: "https://httpstat.us/500" + +tool: "take_screenshot" +timeout: 30000 + +input: + fullPage: false + +schedule: + type: "on_demand" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Screenshot tool handled the error page gracefully" + - "Either successfully captured the error page or reported appropriate error" + - "No crashes or undefined behavior occurred" + - "Tool response is meaningful regardless of page loading issues" + - "Error handling demonstrates robustness of screenshot functionality" + visual_verification: + enabled: true + capture_before: false + capture_after: true + prompts: + - "If screenshot was taken, verify it shows the error page content" + - "Check that the tool handled the HTTP 500 error appropriately" + - "Confirm no blank or corrupted screenshots were produced" + - "Ensure error scenarios are handled professionally" + +metadata: + tags: ["screenshot", "error-handling", "robustness", "edge-case"] + priority: "normal" + timeout: 30000 + retries: 1 + flaky: true \ No newline at end of file diff --git a/eval-server/evals/screenshot-verification/screenshot-fullpage-001.yaml 
b/eval-server/evals/screenshot-verification/screenshot-fullpage-001.yaml new file mode 100644 index 00000000000..b592f8c6bc0 --- /dev/null +++ b/eval-server/evals/screenshot-verification/screenshot-fullpage-001.yaml @@ -0,0 +1,45 @@ +# Full page screenshot verification test +id: "screenshot-fullpage-001" +name: "Take Full Page Screenshot" +description: "Test taking full page screenshot and verify functionality" +enabled: true + +target: + url: "https://en.wikipedia.org/wiki/Chrome_DevTools" + +tool: "take_screenshot" +timeout: 45000 + +input: + fullPage: true + +schedule: + type: "on_demand" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Full page screenshot was successfully captured" + - "Data URL contains valid image data" + - "Screenshot captures the entire page content including areas below the fold" + - "Image size is larger than viewport-only screenshot would be" + - "No errors occurred during full page capture" + - "Screenshot includes both header and footer content" + visual_verification: + enabled: true + capture_before: false + capture_after: true + prompts: + - "Verify the screenshot shows the complete Wikipedia article page" + - "Check that content above and below the fold is captured" + - "Confirm the image is taller than a typical viewport" + - "Ensure no content is cut off at the bottom" + +metadata: + tags: ["screenshot", "fullpage", "visual", "verification", "wikipedia"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false \ No newline at end of file diff --git a/eval-server/evals/screenshot-verification/screenshot-viewport-001.yaml b/eval-server/evals/screenshot-verification/screenshot-viewport-001.yaml new file mode 100644 index 00000000000..54833d68f26 --- /dev/null +++ b/eval-server/evals/screenshot-verification/screenshot-viewport-001.yaml @@ -0,0 +1,44 @@ +# Viewport screenshot verification test +id: "screenshot-viewport-001" +name: "Take Viewport Screenshot" +description: "Test taking viewport 
screenshot and verify functionality" +enabled: true + +target: + url: "https://www.google.com" + +tool: "take_screenshot" +timeout: 30000 + +input: + fullPage: false + +schedule: + type: "on_demand" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Screenshot was successfully captured" + - "Data URL is properly formatted and contains image data" + - "Screenshot shows the viewport content correctly" + - "No errors occurred during screenshot capture" + - "Image data length indicates a valid screenshot was taken" + visual_verification: + enabled: true + capture_before: false + capture_after: true + prompts: + - "Verify the screenshot shows the Google homepage" + - "Check that the screenshot is not empty or corrupted" + - "Confirm the image quality is appropriate for verification" + - "Ensure the screenshot captures the current viewport accurately" + +metadata: + tags: ["screenshot", "viewport", "visual", "verification"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: false \ No newline at end of file diff --git a/eval-server/evals/screenshot-verification/visual-comparison-001.yaml b/eval-server/evals/screenshot-verification/visual-comparison-001.yaml new file mode 100644 index 00000000000..035447993f2 --- /dev/null +++ b/eval-server/evals/screenshot-verification/visual-comparison-001.yaml @@ -0,0 +1,47 @@ +# Visual comparison verification test +id: "visual-comparison-001" +name: "Visual Comparison Before and After Action" +description: "Test visual verification by comparing screenshots before and after an action" +enabled: true + +target: + url: "https://www.google.com" + +tool: "action_agent" +timeout: 60000 + +input: + objective: "Take a screenshot, then type 'DevTools testing' in the search box, and take another screenshot to compare" + reasoning: "Testing visual verification workflow with before/after screenshot comparison" + hint: "Use take_screenshot tool before and after performing the search input action" + +schedule: 
+ type: "on_demand" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Initial screenshot was taken before performing any actions" + - "Search text was successfully entered into the search field" + - "Second screenshot was taken after the text input" + - "Visual comparison shows the difference between before and after states" + - "Search field contains the entered text in the final screenshot" + - "Screenshots demonstrate successful action verification workflow" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Compare the before and after screenshots" + - "Verify the search field is empty in the first screenshot" + - "Confirm the search field contains 'DevTools testing' in the second screenshot" + - "Check that the visual changes accurately reflect the performed action" + +metadata: + tags: ["screenshot", "visual-comparison", "action-verification", "before-after"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: false \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/amazon-product-001.yaml b/eval-server/evals/streamlined-schema-extractor/amazon-product-001.yaml new file mode 100644 index 00000000000..b1544549e4e --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/amazon-product-001.yaml @@ -0,0 +1,78 @@ +# E-commerce product extraction test (Streamlined) +id: "amazon-product-001" +name: "Extract Amazon Product Details" +description: "Extract product information from an Amazon product page" +enabled: true + +target: + url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 60000 + +input: + schema: + type: "object" + properties: + product: + type: "object" + properties: + title: + type: "string" + brand: + type: "string" + price: + type: "object" + properties: + current: + type: "number" + 
currency: + type: "string" + rating: + type: "object" + properties: + average: + type: "number" + count: + type: "number" + images: + type: "array" + items: + type: "string" + format: "url" + features: + type: "array" + items: + type: "string" + required: + - "title" + - "price" + availability: + type: "string" + required: + - "product" + instruction: "Extract comprehensive product information including pricing, ratings, and key features" + reasoning: "Testing extraction from a dynamic e-commerce page with complex structure" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Product title is accurate and complete" + - "Price information is current and properly formatted" + - "Rating data includes both average and review count" + - "Image URLs are valid and accessible" + - "Key product features are captured" + - "All URLs are properly resolved (not node IDs)" + +metadata: + tags: ["ecommerce", "amazon", "product", "dynamic"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/bbc-news-001.yaml b/eval-server/evals/streamlined-schema-extractor/bbc-news-001.yaml new file mode 100644 index 00000000000..31ef2883ecd --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/bbc-news-001.yaml @@ -0,0 +1,69 @@ +# News article extraction test (Streamlined) +id: "bbc-news-001" +name: "Extract BBC News Article" +description: "Extract article content and metadata from a BBC News page" +enabled: true + +target: + url: "https://www.bbc.com/news/technology" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 30000 + +input: + schema: + type: "object" + properties: + headlines: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + summary: + type: "string" + url: + type: "string" + format: "url" + category: + type: "string" 
+ required: + - "title" + mainStory: + type: "object" + properties: + headline: + type: "string" + summary: + type: "string" + url: + type: "string" + format: "url" + required: + - "headlines" + instruction: "Extract the main headlines and featured stories from the BBC Technology news section" + reasoning: "Testing extraction from a news aggregation page with multiple articles" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + include_url: true + criteria: + - "Headlines are current and relevant to technology news" + - "Article summaries provide meaningful context" + - "URLs link to valid BBC news articles" + - "Main story is properly identified" + - "All extracted content is in English" + +metadata: + tags: ["news", "bbc", "aggregation", "dynamic"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/bing-search-001.yaml b/eval-server/evals/streamlined-schema-extractor/bing-search-001.yaml new file mode 100644 index 00000000000..e9f3b6edb33 --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/bing-search-001.yaml @@ -0,0 +1,70 @@ +# Bing Search results extraction test +id: "bing-search-001" +name: "Extract Bing Search Results" +description: "Extract search results from Bing search page" +enabled: true + +target: + url: "https://www.bing.com/search?q=web+scraping+best+practices" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 45000 + +input: + schema: + type: "object" + properties: + query: + type: "string" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + datePublished: + type: "string" + required: + - "title" + - "url" + - "snippet" + sidebarInfo: + type: "object" + properties: + title: + type: "string" + description: + 
type: "string" + source: + type: "string" + required: + - "searchResults" + instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing" + reasoning: "Testing extraction from Bing search results with different layout than Google" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results match the query intent" + - "Results include valid URLs and meaningful snippets" + - "Sidebar information is extracted when present" + - "No duplicate results in the list" + +metadata: + tags: ["search", "bing", "serp", "dynamic"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/github-repo-001.yaml b/eval-server/evals/streamlined-schema-extractor/github-repo-001.yaml new file mode 100644 index 00000000000..5c496c518f5 --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/github-repo-001.yaml @@ -0,0 +1,66 @@ +# Simple structured data test (Streamlined) +id: "github-repo-001" +name: "Extract GitHub Repository Info" +description: "Extract basic repository information from a GitHub page" +enabled: true + +target: + url: "https://github.com/microsoft/TypeScript" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 30000 + +input: + schema: + type: "object" + properties: + name: + type: "string" + description: + type: "string" + language: + type: "string" + stars: + type: "number" + forks: + type: "number" + topics: + type: "array" + items: + type: "string" + readme: + type: "object" + properties: + summary: + type: "string" + required: + - "name" + - "description" + instruction: "Extract repository metadata and basic statistics" + reasoning: "Testing extraction from a well-structured GitHub repository page" + +validation: + type: "hybrid" + snapshot: + exclude_paths: + - "stars" + - "forks" + 
structure_only: false + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Repository name matches the GitHub page" + - "Description accurately reflects the project purpose" + - "Programming language is correctly identified" + - "Topic tags are relevant to the project" + +metadata: + tags: ["github", "repository", "structured"] + priority: "high" + timeout: 30000 + retries: 1 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/google-flights-001.yaml b/eval-server/evals/streamlined-schema-extractor/google-flights-001.yaml new file mode 100644 index 00000000000..981ccbd48dc --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/google-flights-001.yaml @@ -0,0 +1,106 @@ +# Google Flights search extraction test +id: "google-flights-001" +name: "Extract Google Flights Search Results" +description: "Extract flight options from Google Flights search" +enabled: true + +target: + url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 60000 + +input: + schema: + type: "object" + properties: + searchCriteria: + type: "object" + properties: + origin: + type: "string" + destination: + type: "string" + departureDate: + type: "string" + returnDate: + type: "string" + tripType: + type: "string" + passengers: + type: "number" + flights: + type: "array" + items: + type: "object" + properties: + airline: + type: "string" + flightNumber: + type: "string" + departureTime: + type: "string" + arrivalTime: + type: "string" + duration: + type: "string" + stops: + type: "number" + price: + type: "object" + properties: + amount: + type: "number" + currency: + type: "string" + cabin: + type: "string" + bookingUrl: + type: "string" + format: "url" + legroom: 
+ type: "string" + amenities: + type: "array" + items: + type: "string" + required: + - "airline" + - "departureTime" + - "arrivalTime" + - "price" + priceInsights: + type: "object" + properties: + trend: + type: "string" + recommendation: + type: "string" + averagePrice: + type: "number" + required: + - "flights" + instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results" + reasoning: "Testing extraction from complex travel search interface with dynamic pricing" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Flight times are in proper format" + - "Prices are numeric values with currency" + - "Airlines and flight numbers are accurate" + - "Stop information is correctly identified" + - "Duration is in readable format" + +metadata: + tags: ["travel", "flights", "google", "booking"] + priority: "high" + timeout: 60000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/google-search-001.yaml b/eval-server/evals/streamlined-schema-extractor/google-search-001.yaml new file mode 100644 index 00000000000..c1725d481d6 --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/google-search-001.yaml @@ -0,0 +1,76 @@ +# Google Search results extraction test +id: "google-search-001" +name: "Extract Google Search Results" +description: "Extract search results from Google search page" +enabled: true + +target: + url: "https://www.google.com/search?q=chrome+devtools+tutorial" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 45000 + +input: + schema: + type: "object" + properties: + query: + type: "string" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + domain: + type: "string" + required: + - "title" + - 
"url" + - "snippet" + featuredSnippet: + type: "object" + properties: + content: + type: "string" + source: + type: "string" + url: + type: "string" + format: "url" + relatedSearches: + type: "array" + items: + type: "string" + required: + - "searchResults" + instruction: "Extract the top 10 search results with titles, URLs, and snippets. Also extract featured snippet if present and related searches" + reasoning: "Testing extraction from Google search results page with various result types" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results are relevant to the query" + - "Each result has a valid title, URL, and snippet" + - "URLs are properly resolved and not node IDs" + - "Related searches are extracted if present" + - "Featured snippet is captured when available" + +metadata: + tags: ["search", "google", "serp", "dynamic"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/homedepot-001.yaml b/eval-server/evals/streamlined-schema-extractor/homedepot-001.yaml new file mode 100644 index 00000000000..1d268488a3b --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/homedepot-001.yaml @@ -0,0 +1,92 @@ +# Home Depot product search extraction test +id: "homedepot-001" +name: "Extract Home Depot Product Search" +description: "Extract product listings from Home Depot search results" +enabled: true + +target: + url: "https://www.homedepot.com/s/power%2520drill" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 60000 + +input: + schema: + type: "object" + properties: + searchQuery: + type: "string" + totalResults: + type: "number" + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + brand: + type: "string" + price: + type: "number" + originalPrice: + type: "number" + savings: + 
type: "number" + rating: + type: "number" + reviewCount: + type: "number" + productUrl: + type: "string" + format: "url" + imageUrl: + type: "string" + format: "url" + availability: + type: "string" + features: + type: "array" + items: + type: "string" + required: + - "name" + - "price" + - "productUrl" + filters: + type: "object" + properties: + brands: + type: "array" + items: + type: "string" + priceRanges: + type: "array" + items: + type: "string" + required: + - "products" + instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability" + reasoning: "Testing extraction from e-commerce search results with product cards and filters" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Products are relevant to the search query" + - "Prices are numeric values in USD" + - "Product URLs link to Home Depot product pages" + - "Ratings are on a 5-star scale" + - "Key product features are captured" + +metadata: + tags: ["ecommerce", "homedepot", "products", "search"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/macys-001.yaml b/eval-server/evals/streamlined-schema-extractor/macys-001.yaml new file mode 100644 index 00000000000..28a2c1056c1 --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/macys-001.yaml @@ -0,0 +1,106 @@ +# Macy's product listing extraction test +id: "macys-001" +name: "Extract Macy's Product Listings" +description: "Extract fashion products from Macy's category page" +enabled: true + +target: + url: "https://www.macys.com/shop/womens-clothing/womens-dresses" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 60000 + +input: + schema: + type: "object" + properties: + category: + type: "string" + totalProducts: + type: "number" + products: + type: "array" + 
items: + type: "object" + properties: + name: + type: "string" + brand: + type: "string" + currentPrice: + type: "number" + originalPrice: + type: "number" + discount: + type: "string" + colors: + type: "array" + items: + type: "string" + sizes: + type: "array" + items: + type: "string" + rating: + type: "number" + reviewCount: + type: "number" + productUrl: + type: "string" + format: "url" + imageUrl: + type: "string" + format: "url" + promotions: + type: "array" + items: + type: "string" + required: + - "name" + - "brand" + - "currentPrice" + refinements: + type: "object" + properties: + brands: + type: "array" + items: + type: "string" + sizes: + type: "array" + items: + type: "string" + colors: + type: "array" + items: + type: "string" + priceRanges: + type: "array" + items: + type: "string" + required: + - "products" + instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's" + reasoning: "Testing extraction from fashion e-commerce with complex product attributes" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Products are from the correct category" + - "Prices reflect current and sale prices" + - "Color and size options are captured" + - "Brand names are accurately extracted" + - "Promotional text is included when present" + +metadata: + tags: ["ecommerce", "macys", "fashion", "products"] + priority: "high" + timeout: 60000 + retries: 3 + flaky: true + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/wikipedia-001.yaml b/eval-server/evals/streamlined-schema-extractor/wikipedia-001.yaml new file mode 100644 index 00000000000..88983bd32c6 --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/wikipedia-001.yaml @@ -0,0 +1,76 @@ +# Wikipedia article extraction test (Streamlined) +id: "wikipedia-chrome-devtools-001" +name: "Extract Chrome DevTools Wikipedia Article" +description: 
"Extract structured information from the Chrome DevTools Wikipedia page" +enabled: true + +target: + url: "https://en.wikipedia.org/wiki/Chrome_DevTools" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 45000 + +input: + schema: + type: "object" + properties: + title: + type: "string" + summary: + type: "string" + tableOfContents: + type: "array" + items: + type: "string" + infobox: + type: "object" + properties: + developer: + type: "string" + initialRelease: + type: "string" + operatingSystem: + type: "string" + license: + type: "string" + externalLinks: + type: "array" + items: + type: "object" + properties: + text: + type: "string" + url: + type: "string" + format: "url" + required: + - "title" + - "summary" + instruction: "Extract the main article information including title, summary, table of contents, and infobox details" + reasoning: "Testing extraction from a stable, well-structured Wikipedia page" + +validation: + type: "hybrid" + snapshot: + exclude_paths: + - "externalLinks[*].url" + structure_only: false + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Article title matches the Wikipedia page title" + - "Summary captures the main description of Chrome DevTools" + - "Table of contents includes major sections" + - "Infobox contains key technical details" + - "External links are properly resolved URLs" + +metadata: + tags: ["wikipedia", "documentation", "stable"] + priority: "high" + timeout: 45000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/streamlined-schema-extractor/wikipedia-search-001.yaml b/eval-server/evals/streamlined-schema-extractor/wikipedia-search-001.yaml new file mode 100644 index 00000000000..c432c20d4f0 --- /dev/null +++ b/eval-server/evals/streamlined-schema-extractor/wikipedia-search-001.yaml @@ -0,0 +1,77 @@ +# Wikipedia search results extraction test +id: "wikipedia-search-001" +name: "Extract 
Wikipedia Search Results" +description: "Extract search results from Wikipedia search" +enabled: true + +target: + url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "extract_schema_streamlined" +timeout: 30000 + +input: + schema: + type: "object" + properties: + searchTerm: + type: "string" + resultCount: + type: "number" + searchResults: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + snippet: + type: "string" + category: + type: "string" + wordCount: + type: "number" + lastEdited: + type: "string" + required: + - "title" + - "url" + - "snippet" + suggestedArticles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + url: + type: "string" + format: "url" + required: + - "searchResults" + instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date" + reasoning: "Testing extraction from Wikipedia's internal search with rich metadata" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4.1-mini" + temperature: 0.3 + criteria: + - "Search results are Wikipedia articles" + - "Each result has a valid Wikipedia URL" + - "Snippets contain relevant content highlights" + - "Metadata like word count is extracted when available" + +metadata: + tags: ["search", "wikipedia", "encyclopedia"] + priority: "high" + timeout: 30000 + retries: 2 + flaky: false + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-booking-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-booking-001.yaml new file mode 100644 index 00000000000..fab657a67fe --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-booking-001.yaml @@ -0,0 +1,47 @@ +# Hotel Search Workflow - Web Task Agent +id: "web-task-agent-booking-001" +name: "Hotel 
Search Workflow" +description: "Test web task agent orchestrating complex multi-step booking search" +enabled: true + +target: + url: "https://www.booking.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for hotels in San Francisco for 2 adults, check-in March 15, check-out March 17" + reasoning: "Customer is looking for travel booking" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully searched for hotels in San Francisco" + - "Results show hotels available for March 15-17 dates" + - "Guest count of 2 adults is reflected in the search results" + - "Returned multiple hotel options with relevant details" + - "Each hotel includes essential information (name, price, location)" + - "Results are presented in a clear, readable format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify hotel search results are displayed for San Francisco" + - "Check that dates March 15-17 are correctly selected" + - "Confirm guest count shows 2 adults" + - "Ensure search results show hotels with availability for specified dates" + +metadata: + tags: ["web-task", "booking", "workflow", "multi-step", "travel", "complex"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-ecommerce-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-ecommerce-001.yaml new file mode 100644 index 00000000000..b05bab0b54f --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-ecommerce-001.yaml @@ -0,0 +1,55 @@ +# E-commerce web task evaluation (matches DevTools test case) +id: "web-task-agent-ecommerce-001" +name: "E-commerce Product Search" +description: "Test web task agent handling product search on shopping site" +enabled: true + +target: + url: 
"https://www.amazon.com" + +tool: "web_task_agent" +timeout: 90000 + +input: + task: "Search Amazon for \"wireless headphones\" and find products under $100" + reasoning: "Testing e-commerce search workflow with price filtering" + context: "User wants to find wireless headphones with specific price constraint" + extraction_schema: + type: "object" + properties: + products: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + price: + type: "string" + rating: + type: "string" + url: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "hybrid" + llm_judge: + model: "gpt-4o" + criteria: + - "Successfully navigated to product search" + - "Applied appropriate filters correctly" + - "Extracted product details accurately" + - "Provided meaningful comparison of features" + - "Stayed within specified price range" + snapshot: + structure_only: true + exclude_paths: + - "timestamp" + - "sessionId" + +metadata: + tags: ["web-task", "multi-step", "ecommerce", "search"] + priority: "high" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-error-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-error-001.yaml new file mode 100644 index 00000000000..bb7c2645f00 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-error-001.yaml @@ -0,0 +1,47 @@ +# Error Recovery Workflow - Web Task Agent +id: "web-task-agent-error-001" +name: "Error Recovery Workflow" +description: "Test web task agent handling action_agent failures and retry logic" +enabled: true + +target: + url: "https://www.google.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"nonexistent test query 12345\" and handle any issues that arise" + reasoning: "Customer is asking for this response" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - 
"Attempted to search for the unusual query \"nonexistent test query 12345\"" + - "Either found some results OR provided clear explanation why no results were found" + - "Response handles the edge case gracefully without errors" + - "If no results found, suggested alternative actions or explanations" + - "Maintained professional tone despite unusual request" + - "Final output is coherent and helpful to the user" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Check if search was attempted despite unusual query" + - "Verify error handling did not break the page interaction" + - "Confirm agent attempted to complete the task or provided clear error info" + - "Ensure page is still functional after error recovery attempts" + +metadata: + tags: ["web-task", "error-recovery", "retry", "orchestration", "robustness"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-extract-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-extract-001.yaml new file mode 100644 index 00000000000..7dda4681661 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-extract-001.yaml @@ -0,0 +1,62 @@ +# Structured Data Extraction - Web Task Agent +id: "web-task-agent-extract-001" +name: "Structured Data Extraction" +description: "Test web task agent extracting structured data from search results" +enabled: true + +target: + url: "https://news.ycombinator.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the top 5 Hacker News stories with their titles, scores, and comment counts" + reasoning: "User is looking to understand the top stories on Hacker News" + extraction_schema: + type: "object" + properties: + stories: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + score: + type: "number" + comments: + type: 
"number" + url: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully returned exactly 5 Hacker News stories in structured text format" + - "Each story is numbered (1., 2., 3., 4., 5.) with title, score, comments, and URL" + - "Results are presented in readable text format similar to the example provided" + - "Response includes all required fields: title, score, comments count, URL" + - "Maintained proper orchestration pattern throughout the extraction process" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Hacker News homepage is loaded and displaying stories" + - "Check that top stories are visible with scores and comment counts" + - "Confirm story titles and metadata are clearly displayed" + - "Ensure page structure allows for data extraction" + +metadata: + tags: ["web-task", "data-extraction", "structured-data", "hackernews", "schema"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-finance-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-finance-001.yaml new file mode 100644 index 00000000000..ad873ab9172 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-finance-001.yaml @@ -0,0 +1,70 @@ +# Stock Information Research - Web Task Agent +id: "web-task-agent-finance-001" +name: "Stock Information Research" +description: "Test extracting stock prices and financial information" +enabled: true + +target: + url: "https://finance.yahoo.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for Apple (AAPL) stock information and extract current price, market cap, and recent performance" + reasoning: "Users need automated financial data collection for investment decisions" + extraction_schema: + 
type: "object" + properties: + stock_info: + type: "object" + properties: + symbol: + type: "string" + company_name: + type: "string" + current_price: + type: "string" + change: + type: "string" + change_percent: + type: "string" + market_cap: + type: "string" + pe_ratio: + type: "string" + volume: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found Apple (AAPL) stock information" + - "Current stock price is clearly stated" + - "Market cap information is included" + - "Price change and percentage change are provided" + - "Additional metrics (PE ratio, volume) included when available" + - "Financial data is current and presented in readable text format (not JSON)" + - "Stock information is well-organized and easy to understand" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Yahoo Finance shows Apple (AAPL) stock page" + - "Check that current stock price and change are visible" + - "Confirm market cap and trading volume are displayed" + - "Ensure financial metrics and charts are shown" + +metadata: + tags: ["web-task", "finance", "stocks", "yahoo-finance", "investment", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-flight-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-flight-001.yaml new file mode 100644 index 00000000000..bf79e2419c6 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-flight-001.yaml @@ -0,0 +1,47 @@ +# Complex Flight Search - Web Task Agent +id: "web-task-agent-flight-001" +name: "Complex Flight Search" +description: "Test web task agent handling complex flight search with multiple criteria" +enabled: true + +target: + url: "https://www.kayak.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool:
"web_task_agent" +timeout: 180000 + +input: + task: "Search for round-trip flights from Seattle (SEA) to Tokyo (NRT) departing March 20, returning March 30" + reasoning: "Customer is looking for the best flight options" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found round-trip flights from Seattle (SEA) to Tokyo (NRT)" + - "Flight results show March 20 departure date" + - "Flight results show March 30 return date" + - "Returned multiple flight options with airlines and prices" + - "Each flight includes essential details (times, airlines, prices)" + - "Results clearly distinguish between outbound and return flights" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify flight search results are displayed" + - "Check SEA to NRT route is correctly selected" + - "Confirm dates March 20 departure and March 30 return" + - "Ensure flight options are showing with prices and airlines" + +metadata: + tags: ["web-task", "flight", "travel", "multi-step", "kayak", "round-trip"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-food-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-food-001.yaml new file mode 100644 index 00000000000..8bbf0324bb9 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-food-001.yaml @@ -0,0 +1,70 @@ +# Restaurant Search and Menu Extraction - Web Task Agent +id: "web-task-agent-food-001" +name: "Restaurant Search and Menu Extraction" +description: "Test searching restaurants and extracting menu information" +enabled: true + +target: + url: "https://www.yelp.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"Italian restaurants near me\" in San Francisco and extract
restaurant details" + reasoning: "Users want to quickly compare restaurants, menus, and reviews" + extraction_schema: + type: "object" + properties: + restaurants: + type: "array" + items: + type: "object" + properties: + name: + type: "string" + rating: + type: "string" + price_range: + type: "string" + cuisine: + type: "string" + address: + type: "string" + phone: + type: "string" + hours: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found Italian restaurants in San Francisco" + - "Each restaurant includes name, rating, and price range" + - "Location/address information is provided for each restaurant" + - "Contact details (phone/hours) included when available" + - "All restaurants listed serve Italian cuisine" + - "Results are presented in clear, structured text format (not JSON)" + - "Restaurants are numbered or organized clearly for easy comparison" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Yelp search results for Italian restaurants" + - "Check that restaurants show ratings and price ranges" + - "Confirm location filter shows San Francisco results" + - "Ensure restaurant listings include contact information" + +metadata: + tags: ["web-task", "restaurants", "yelp", "food", "local-search", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-iframe-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-iframe-001.yaml new file mode 100644 index 00000000000..fe38d9cffad --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-iframe-001.yaml @@ -0,0 +1,85 @@ +# ANA Airlines Iframe Content Extraction - Web Task Agent +id: "web-task-agent-iframe-001" +name: "ANA Airlines Iframe Content Extraction" +description: "Test web task agent handling 
iframe-heavy airline booking sites like ANA Airlines" +enabled: true + +target: + url: "https://aswbe.ana.co.jp/webapps/reservation/flight-search?CONNECTION_KIND=SEA&LANG=en&hiddenSearchMode=ROUND_TRIP&departureDate:field=20260320&returnDate:field=20260330&departureAirportCode:field=SEA&arrivalAirportCode:field=NRT&adultCount=1&youngAdultCount=0&childCount=0&infantCount=0&boardingClass=INTY001&searchFlag=1" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Navigate the ANA Airlines flight search page and extract available flight options from Seattle (SEA) to Tokyo Narita (NRT) for March 20-30, 2026. Handle any iframe content and booking interface elements." + reasoning: "Testing iframe content extraction and complex airline booking site navigation" + extraction_schema: + type: "object" + properties: + flights: + type: "array" + items: + type: "object" + properties: + flight_number: + type: "string" + airline: + type: "string" + departure_time: + type: "string" + arrival_time: + type: "string" + departure_date: + type: "string" + arrival_date: + type: "string" + duration: + type: "string" + aircraft: + type: "string" + price: + type: "string" + cabin_class: + type: "string" + stops: + type: "string" + booking_interface_status: + type: "string" + iframe_content_found: + type: "boolean" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully navigated ANA Airlines booking interface" + - "Handled iframe content correctly (iframe_content_found should be true if iframes detected)" + - "Extracted flight information from ANA flight search results" + - "Flight details include ANA flight numbers and accurate route (SEA to NRT)" + - "Extracted pricing information in appropriate currency" + - "Handled any booking interface elements, popups, or navigation flows" + - "Results show flights for the correct dates (March 20-30, 
2026)" + - "Successfully demonstrated iframe content extraction capabilities" + - "Booking interface status indicates successful page interaction" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify ANA Airlines flight search page loaded correctly" + - "Check that search parameters show SEA to NRT route" + - "Confirm flight results are displayed (may be in iframes)" + - "Ensure booking interface elements are functional" + - "Verify flight information is accessible and extractable" + +metadata: + tags: ["web-task", "iframe", "ana-airlines", "complex-booking", "international-flight", "airline-specific"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-jobs-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-jobs-001.yaml new file mode 100644 index 00000000000..06de5beb368 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-jobs-001.yaml @@ -0,0 +1,70 @@ +# Job Search Workflow - Web Task Agent +id: "web-task-agent-jobs-001" +name: "Job Search Workflow" +description: "Test web task agent orchestrating job search on LinkedIn" +enabled: true + +target: + url: "https://www.linkedin.com/jobs" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"Software Engineer\" jobs in \"San Francisco\" and extract details for the first 5 results" + reasoning: "User wants to find job opportunities in tech industry" + extraction_schema: + type: "object" + properties: + jobs: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + company: + type: "string" + location: + type: "string" + salary: + type: "string" + description: + type: "string" + url: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - 
"Either used construct_direct_url for LinkedIn job search OR used traditional form interaction" + - "If using direct URL: constructed proper LinkedIn job search URL with keywords and location" + - "If using forms: delegated keyword and location input to action_agent" + - "Extracted job listings using schema_based_extractor" + - "Returned structured job data in readable text format (not JSON)" + - "Each job listing includes title, company, location, and other relevant fields" + - "Results are numbered or organized clearly for easy reading" + - "Demonstrated proper workflow orchestration for job search" + - "Never used direct browser interaction tools" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify LinkedIn job search results are displayed" + - "Check that search shows Software Engineer jobs in San Francisco" + - "Confirm job listings include company names and titles" + - "Ensure at least 5 job results are visible" + +metadata: + tags: ["web-task", "jobs", "linkedin", "search", "career", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-learning-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-learning-001.yaml new file mode 100644 index 00000000000..58dec4d06cc --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-learning-001.yaml @@ -0,0 +1,71 @@ +# Online Course Search - Web Task Agent +id: "web-task-agent-learning-001" +name: "Online Course Search" +description: "Test searching and extracting course information from learning platforms" +enabled: true + +target: + url: "https://www.coursera.org" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"Machine Learning\" courses and extract details for top 5 results" + reasoning: "Users want to compare courses across platforms for learning 
decisions" + extraction_schema: + type: "object" + properties: + courses: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + instructor: + type: "string" + university: + type: "string" + rating: + type: "string" + duration: + type: "string" + price: + type: "string" + description: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully found Machine Learning courses on Coursera" + - "Returned details for top 5 courses as requested" + - "Each course includes title, instructor, university, and rating" + - "Duration and pricing information included for each course" + - "Course descriptions or key topics are provided" + - "Results are presented in structured text format (not JSON)" + - "Courses are numbered (1-5) and well-organized for easy comparison" + - "Each course entry is clearly formatted and readable" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Coursera search results for Machine Learning" + - "Check that courses show titles, instructors, and ratings" + - "Confirm course details include duration and pricing" + - "Ensure search results are relevant to Machine Learning" + +metadata: + tags: ["web-task", "education", "coursera", "courses", "learning", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-nav-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-nav-001.yaml new file mode 100644 index 00000000000..313d1fcdaab --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-nav-001.yaml @@ -0,0 +1,48 @@ +# Site Navigation Workflow - Web Task Agent +id: "web-task-agent-nav-001" +name: "Site Navigation Workflow" +description: "Test web task agent orchestrating navigation between different sections of a site" +enabled: true 
+ +target: + url: "https://www.wikipedia.org" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 90000 + +input: + task: "Navigate to the Wikipedia homepage, search for \"artificial intelligence\", and find information about machine learning" + reasoning: "User is looking to explore Wikipedia content through structured navigation" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Orchestrated Wikipedia search via action_agent calls" + - "Navigated to artificial intelligence article through action_agent" + - "Located machine learning section via action_agent coordination" + - "Extracted relevant information about machine learning" + - "Demonstrated multi-step navigation workflow" + - "Maintained orchestration pattern throughout navigation" + - "Provided structured summary of found information" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify navigation reached artificial intelligence Wikipedia page" + - "Check that machine learning section or content is visible" + - "Confirm successful navigation through multiple page sections" + - "Ensure content related to machine learning is displayed" + +metadata: + tags: ["web-task", "navigation", "multi-step", "wikipedia", "content-exploration"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-news-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-news-001.yaml new file mode 100644 index 00000000000..412a45ec32d --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-news-001.yaml @@ -0,0 +1,66 @@ +# News Article Aggregation - Web Task Agent +id: "web-task-agent-news-001" +name: "News Article Aggregation" +description: "Test aggregating news headlines and summaries from news sites" +enabled: true + +target: + url: 
"https://news.ycombinator.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the top 10 Hacker News stories with titles, scores, and first few comments" + reasoning: "Users want automated news monitoring for research and awareness" + extraction_schema: + type: "object" + properties: + articles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + score: + type: "number" + comments_count: + type: "number" + url: + type: "string" + top_comment: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully extracted 10 Hacker News stories as requested" + - "Each story includes title, score, and comment count" + - "URLs are provided for each story" + - "Stories appear to be from the current top/front page" + - "Results are presented in clear, numbered text format (1-10), not JSON" + - "All required fields are present and properly formatted in readable text" + - "Each story is clearly separated and easy to read" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Hacker News stories are visible with scores" + - "Check that story titles and comment counts are shown" + - "Confirm top stories section is properly displayed" + - "Ensure story metadata is accessible for extraction" + +metadata: + tags: ["web-task", "news", "hackernews", "aggregation", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-realestate-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-realestate-001.yaml new file mode 100644 index 00000000000..9cf2b947a28 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-realestate-001.yaml @@ -0,0 +1,72 @@ +# Real Estate Property Search - Web Task Agent +id: 
"web-task-agent-realestate-001" +name: "Real Estate Property Search" +description: "Test property search workflow on real estate platforms" +enabled: true + +target: + url: "https://www.zillow.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for houses for sale in Austin, Texas under $500k and extract property details" + reasoning: "User wants to find affordable housing options in a specific location" + extraction_schema: + type: "object" + properties: + properties: + type: "array" + items: + type: "object" + properties: + address: + type: "string" + price: + type: "string" + bedrooms: + type: "number" + bathrooms: + type: "number" + sqft: + type: "string" + lot_size: + type: "string" + year_built: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Orchestrated location search via action_agent" + - "Delegated price filter setting to action_agent" + - "Coordinated property type selection through action_agent" + - "Applied search filters through proper action_agent calls" + - "Extracted property listings with schema_based_extractor" + - "Returned structured property data in readable text format (not JSON)" + - "Each property includes address, price, bedrooms, bathrooms, and other key details" + - "Properties are clearly numbered or organized for easy comparison" + - "Demonstrated complex real estate search workflow orchestration" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Zillow search results for Austin, Texas properties" + - "Check that properties shown are under $500k" + - "Confirm property listings show price, beds, baths info" + - "Ensure search results match the specified criteria" + +metadata: + tags: ["web-task", "real-estate", "zillow", "property-search", "popular"] + priority: "high" + owner: 
"devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-001.yaml new file mode 100644 index 00000000000..f90cd8f8526 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-001.yaml @@ -0,0 +1,63 @@ +# Infinite Scroll Content Loading - Web Task Agent +id: "web-task-agent-scroll-001" +name: "Infinite Scroll Content Loading" +description: "Test web task agent handling infinite scroll pages to load more content" +enabled: true + +target: + url: "https://twitter.com" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Scroll down the Twitter feed to load at least 20 tweets and extract their content" + reasoning: "Testing infinite scroll functionality for dynamic content loading" + extraction_schema: + type: "object" + properties: + tweets: + type: "array" + items: + type: "object" + properties: + author: + type: "string" + content: + type: "string" + likes: + type: "string" + retweets: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully used scroll_page tool to scroll down the page" + - "Loaded additional content through scrolling actions" + - "Extracted at least 20 tweets from the feed" + - "Each tweet includes author and content information" + - "Demonstrated proper handling of dynamically loaded content" + - "Results are presented in clear, numbered text format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify initial Twitter feed is loaded" + - "Check that scrolling action loaded additional tweets" + - "Confirm at least 20 tweets are visible after scrolling" + - "Ensure page scrolled down significantly from initial position" + +metadata: + tags: ["web-task", "scrolling", 
"infinite-scroll", "dynamic-content", "twitter"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-002.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-002.yaml new file mode 100644 index 00000000000..858178e216a --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-002.yaml @@ -0,0 +1,67 @@ +# Product Review Scrolling - Web Task Agent +id: "web-task-agent-scroll-002" +name: "Product Review Scrolling" +description: "Test scrolling to load more product reviews on e-commerce sites" +enabled: true + +target: + url: "https://www.amazon.com/dp/B08N5WRWNW" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Scroll down to the reviews section and load more reviews by scrolling, then extract review details" + reasoning: "Users need to see multiple reviews beyond initial visible ones" + extraction_schema: + type: "object" + properties: + reviews: + type: "array" + items: + type: "object" + properties: + rating: + type: "string" + title: + type: "string" + author: + type: "string" + date: + type: "string" + verified: + type: "boolean" + content: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Used scroll_page tool to navigate to reviews section" + - "Scrolled within reviews area to load additional reviews" + - "Extracted multiple product reviews with ratings" + - "Each review includes rating, author, and content" + - "Successfully handled lazy-loaded review content" + - "Presented reviews in structured, readable format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Amazon product page is loaded" + - "Check that page scrolled to reviews section" + - "Confirm additional reviews loaded after scrolling" + - 
"Ensure review content is fully visible" + +metadata: + tags: ["web-task", "scrolling", "reviews", "amazon", "e-commerce"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-003.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-003.yaml new file mode 100644 index 00000000000..c1b3597e642 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-003.yaml @@ -0,0 +1,63 @@ +# News Article Progressive Loading - Web Task Agent +id: "web-task-agent-scroll-003" +name: "News Article Progressive Loading" +description: "Test scrolling through news sites that load articles progressively" +enabled: true + +target: + url: "https://medium.com/topic/technology" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Scroll down to load more technology articles and extract titles and authors for at least 15 articles" + reasoning: "Testing progressive content loading on news/blog platforms" + extraction_schema: + type: "object" + properties: + articles: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + author: + type: "string" + reading_time: + type: "string" + preview: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Used scroll_page tool multiple times to load content" + - "Successfully loaded at least 15 articles through scrolling" + - "Extracted article titles and author information" + - "Handled Medium's progressive loading mechanism" + - "Articles are from technology topic as requested" + - "Results presented in clear, numbered format" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Medium technology page is loaded" + - "Check that initial articles are visible" + - "Confirm scrolling 
loaded additional articles" + - "Ensure at least 15 articles are visible after scrolling" + +metadata: + tags: ["web-task", "scrolling", "progressive-loading", "medium", "articles"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-004.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-004.yaml new file mode 100644 index 00000000000..96cf5798c3b --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-004.yaml @@ -0,0 +1,63 @@ +# Search Results Infinite Scroll - Web Task Agent +id: "web-task-agent-scroll-004" +name: "Search Results Infinite Scroll" +description: "Test handling search results that use infinite scroll instead of pagination" +enabled: true + +target: + url: "https://www.pinterest.com/search/pins/?q=web%20design" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Search for \"web design\" pins and scroll to load at least 30 results, then extract pin details" + reasoning: "Testing infinite scroll on visual search platforms" + extraction_schema: + type: "object" + properties: + pins: + type: "array" + items: + type: "object" + properties: + title: + type: "string" + description: + type: "string" + saves: + type: "string" + source: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully performed search for \"web design\" pins" + - "Used scroll_page tool to trigger infinite scroll loading" + - "Loaded at least 30 pins through scrolling actions" + - "Extracted pin titles and metadata" + - "Handled Pinterest's masonry layout and lazy loading" + - "Results are well-organized and readable" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Pinterest search results for web design" + - "Check 
initial pins are displayed" + - "Confirm scrolling loaded many more pins" + - "Ensure grid layout shows 30+ pins after scrolling" + +metadata: + tags: ["web-task", "scrolling", "infinite-scroll", "pinterest", "visual-search"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-005.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-005.yaml new file mode 100644 index 00000000000..169befe8606 --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-005.yaml @@ -0,0 +1,75 @@ +# Google Flights Scroll and Show More - Web Task Agent +id: "web-task-agent-scroll-005" +name: "Google Flights Scroll and Show More" +description: "Test scrolling and clicking \"Show more flights\" button on Google Flights to load additional flight options" +enabled: true + +target: + url: "https://www.google.com/travel/flights?sca_esv=646eedf97dcc8cf2&source=flun&uitype=cuAA&hl=en&gl=us&curr=USD&tfs=CAEQAhoeEgoyMDI2LTAzLTIwagcIARIDU0VBcgcIARIDTlJUGh4SCjIwMjYtMDMtMzBqBwgBEgNOUlRyBwgBEgNTRUF6aENqUklhVFJJTVVwVlZVOXpNakJCUTJodGVFRkNSeTB0TFMwdExTMHRjR3BpYjI4eE0wRkJRVUZCUjJoc1lsWlZRV2RYUlZsQkVnTmpTMFVhQ3dqUXNnVVFBaG9EVlZORU9EQncwTElG" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the initial flight results, then scroll down and click \"Show more flights\" button to load additional flights. Extract at least 20 total flight options from Seattle to Tokyo." 
+ reasoning: "Testing combination of scrolling and button clicking to load more flight results on Google Flights" + extraction_schema: + type: "object" + properties: + flights: + type: "array" + items: + type: "object" + properties: + airline: + type: "string" + departure_time: + type: "string" + arrival_time: + type: "string" + duration: + type: "string" + stops: + type: "string" + price: + type: "string" + aircraft: + type: "string" + total_flights_found: + type: "number" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully extracted initial flight results from Google Flights" + - "Used scroll_page tool to scroll down the flight results list" + - "Located and clicked \"Show more flights\" button using action_agent" + - "Loaded additional flight options beyond the initial set" + - "Extracted at least 20 total flights from Seattle (SEA) to Tokyo (NRT)" + - "Each flight includes airline, times, duration, stops, and price" + - "Flights are for the correct dates (March 20-30, 2026)" + - "Results are presented in clear, numbered format" + - "Successfully combined scrolling and clicking actions to load more content" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Google Flights page shows SEA to NRT flights" + - "Check that initial flight results are displayed" + - "Confirm scrolling occurred and \"Show more flights\" button was visible" + - "Ensure additional flights loaded after clicking the button" + - "Verify at least 20 flight options are now visible" + +metadata: + tags: ["web-task", "scrolling", "google-flights", "click-action", "load-more", "travel"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-search-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-search-001.yaml new file mode 
100644 index 00000000000..b431605b93c --- /dev/null +++ b/eval-server/evals/web-task-agent/web-task-agent-search-001.yaml @@ -0,0 +1,43 @@ +# Basic web task search evaluation (matches DevTools test case) +id: "web-task-agent-search-001" +name: "Site-Specific Search Task" +description: "Test web task agent orchestrating a search workflow on a specific site" +enabled: true + +target: + url: "chrome://new-tab-page" + +tool: "web_task_agent" +timeout: 60000 + +input: + task: "Search Google for \"Chrome DevTools automation\" and extract the top 3 search results" + reasoning: "Testing basic site-specific search workflow orchestration" + context: "Need to demonstrate web_task_agent can coordinate multiple action_agent calls for a complete search workflow" + +schedule: + type: "on_demand" + +validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o" + criteria: + - "Successfully returned exactly 3 search results in structured text format" + - "Each result is numbered (1., 2., 3.) and contains a title related to \"Chrome DevTools automation\"" + - "Each result includes a URL in the format \"URL: [link]\"" + - "Results are presented in a clear, readable text format (not JSON)" + - "Response includes a brief summary or conclusion statement" + visual_verification: + enabled: true + capture_before: true + capture_after: true + prompts: + - "Verify search was completed and results page is showing" + - "Check that search results are related to \"Chrome DevTools automation\"" + - "Confirm at least 3 search results are visible on the page" + - "Ensure the search workflow was completed successfully" + +metadata: + tags: ["web-task", "orchestration", "search", "workflow", "google", "basic"] + priority: "normal" \ No newline at end of file diff --git a/eval-server/evals/web-task-agent/web-task-agent-social-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-social-001.yaml new file mode 100644 index 00000000000..3f913c77ba3 --- /dev/null +++ 
b/eval-server/evals/web-task-agent/web-task-agent-social-001.yaml @@ -0,0 +1,62 @@ +# Social Media Content Extraction - Web Task Agent +id: "web-task-agent-social-001" +name: "Social Media Content Extraction" +description: "Test extracting trending topics and posts from social media" +enabled: true + +target: + url: "https://twitter.com/explore" + wait_for: "networkidle" + wait_timeout: 5000 + +tool: "web_task_agent" +timeout: 180000 + +input: + task: "Extract the top 5 trending topics from Twitter/X explore page" + reasoning: "User wants to stay updated on current trends" + extraction_schema: + type: "object" + properties: + trends: + type: "array" + items: + type: "object" + properties: + topic: + type: "string" + posts_count: + type: "string" + category: + type: "string" + +schedule: + type: "on_demand" + +validation: + type: "llm_judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Successfully accessed Twitter/X explore page and found trending topics" + - "Returned exactly 5 trending topics as requested" + - "Each topic includes the trend name/hashtag" + - "Post counts or metrics are included when available" + - "Topics are current/recent trends (not outdated)" + - "Results are presented in clear, numbered text format (not JSON)" + - "Each trend is properly numbered (1., 2., 3., etc.) 
for readability" + visual_verification: + enabled: true + capture_before_action: true + capture_after_action: true + verification_prompts: + - "Verify Twitter/X explore page is loaded" + - "Check that trending topics section is visible" + - "Confirm trending topics show names and post counts" + - "Ensure page shows current trending content" + +metadata: + tags: ["web-task", "social-media", "twitter", "trends", "extraction", "popular"] + priority: "high" + owner: "devtools-team" \ No newline at end of file diff --git a/eval-server/package-lock.json b/eval-server/package-lock.json new file mode 100644 index 00000000000..494fa5e41b8 --- /dev/null +++ b/eval-server/package-lock.json @@ -0,0 +1,829 @@ +{ + "name": "bo-eval-server", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "bo-eval-server", + "version": "1.0.0", + "license": "MIT", + "dependencies": { + "dotenv": "^16.3.1", + "js-yaml": "^4.1.0", + "openai": "^4.24.1", + "uuid": "^9.0.1", + "winston": "^3.11.0", + "ws": "^8.16.0" + }, + "devDependencies": { + "@types/ws": "^8.5.10" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@colors/colors": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.6.0.tgz", + "integrity": "sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA==", + "license": "MIT", + "engines": { + "node": ">=0.1.90" + } + }, + "node_modules/@dabh/diagnostics": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/@dabh/diagnostics/-/diagnostics-2.0.3.tgz", + "integrity": "sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==", + "license": "MIT", + "dependencies": { + "colorspace": "1.1.x", + "enabled": "2.0.x", + "kuler": "^2.0.0" + } + }, + "node_modules/@types/node": { + "version": "24.0.13", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.13.tgz", + "integrity": 
"sha512-Qm9OYVOFHFYg3wJoTSrz80hoec5Lia/dPp84do3X7dZvLikQvM1YpmvTBEdIr/e+U8HTkFjLHLnl78K/qjf+jQ==", + "license": "MIT", + "dependencies": { + "undici-types": "~7.8.0" + } + }, + "node_modules/@types/node-fetch": { + "version": "2.6.12", + "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.12.tgz", + "integrity": "sha512-8nneRWKCg3rMtF69nLQJnOYUcbafYeFSjqkw3jCRLsqkWFlHaoQrr5mXmofFGOx3DKn7UfmBMyov8ySvLRVldA==", + "license": "MIT", + "dependencies": { + "@types/node": "*", + "form-data": "^4.0.0" + } + }, + "node_modules/@types/triple-beam": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.5.tgz", + "integrity": "sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==", + "license": "MIT" + }, + "node_modules/@types/ws": { + "version": "8.18.1", + "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz", + "integrity": "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "license": "MIT", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, + "node_modules/agentkeepalive": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.6.0.tgz", + "integrity": "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==", + "license": "MIT", + "dependencies": { + "humanize-ms": "^1.2.1" + }, + "engines": { + "node": ">= 8.0.0" + } + }, + "node_modules/argparse": { + "version": "2.0.1", + "resolved": 
"https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", + "license": "Python-2.0" + }, + "node_modules/async": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz", + "integrity": "sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==", + "license": "MIT" + }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/color": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/color/-/color-3.2.1.tgz", + "integrity": "sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==", + "license": "MIT", + "dependencies": { + "color-convert": "^1.9.3", + "color-string": "^1.6.0" + } + }, + "node_modules/color-convert": { + "version": "1.9.3", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", + "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==", + "license": "MIT", + "dependencies": { + "color-name": "1.1.3" + } + }, + "node_modules/color-name": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", + "integrity": 
"sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==", + "license": "MIT" + }, + "node_modules/color-string": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz", + "integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==", + "license": "MIT", + "dependencies": { + "color-name": "^1.0.0", + "simple-swizzle": "^0.2.2" + } + }, + "node_modules/colorspace": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.4.tgz", + "integrity": "sha512-BgvKJiuVu1igBUF2kEjRCZXol6wiiGbY5ipL/oVPwm0BL9sIpMIzM8IK7vwuxIIzOXMV3Ey5w+vxhm0rR/TN8w==", + "license": "MIT", + "dependencies": { + "color": "^3.1.3", + "text-hex": "1.0.x" + } + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/dotenv": { + "version": "16.6.1", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz", + "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": 
"https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/enabled": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/enabled/-/enabled-2.0.0.tgz", + "integrity": "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ==", + "license": "MIT" + }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + 
"has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/fecha": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/fecha/-/fecha-4.2.3.tgz", + "integrity": "sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw==", + "license": "MIT" + }, + "node_modules/fn.name": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/fn.name/-/fn.name-1.1.0.tgz", + "integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==", + "license": "MIT" + }, + "node_modules/form-data": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.3.tgz", + "integrity": "sha512-qsITQPfmvMOSAdeyZ+12I1c+CKSstAFAwu+97zrnWAbIr5u8wfsExUzCesVLC8NgHuRUqNN4Zy6UPWUTRGslcA==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/form-data-encoder": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz", + "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==", + "license": "MIT" + }, + "node_modules/formdata-node": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz", + "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==", + "license": "MIT", + "dependencies": { + 
"node-domexception": "1.0.0", + "web-streams-polyfill": "4.0.0-beta.3" + }, + "engines": { + "node": ">= 12.20" + } + }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": 
"https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "license": "MIT", + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/humanize-ms": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz", + "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==", + "license": "MIT", + "dependencies": { + "ms": "^2.0.0" + } + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "license": "ISC" + }, + "node_modules/is-arrayish": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", + "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==", + "license": "MIT" + }, + "node_modules/is-stream": { + "version": "2.0.1", 
+ "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", + "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", + "license": "MIT", + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/js-yaml": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", + "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "license": "MIT", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/kuler": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz", + "integrity": "sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==", + "license": "MIT" + }, + "node_modules/logform": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/logform/-/logform-2.7.0.tgz", + "integrity": "sha512-TFYA4jnP7PVbmlBIfhlSe+WKxs9dklXMTEGcBCIvLhE/Tn3H6Gk1norupVW7m5Cnd4bLcr08AytbyV/xj7f/kQ==", + "license": "MIT", + "dependencies": { + "@colors/colors": "1.6.0", + "@types/triple-beam": "^1.3.2", + "fecha": "^4.2.0", + "ms": "^2.1.1", + "safe-stable-stringify": "^2.3.1", + "triple-beam": "^1.3.0" + }, + "engines": { + "node": ">= 12.0.0" + } + }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": 
"MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + "node_modules/node-domexception": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz", + "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==", + "deprecated": "Use your platform's native DOMException instead", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/jimmywarting" + }, + { + "type": "github", + "url": "https://paypal.me/jimmywarting" + } + ], + "license": "MIT", + "engines": { + "node": ">=10.5.0" + } + }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/one-time": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/one-time/-/one-time-1.0.0.tgz", + "integrity": "sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==", + "license": "MIT", + "dependencies": { + 
"fn.name": "1.x.x" + } + }, + "node_modules/openai": { + "version": "4.104.0", + "resolved": "https://registry.npmjs.org/openai/-/openai-4.104.0.tgz", + "integrity": "sha512-p99EFNsA/yX6UhVO93f5kJsDRLAg+CTA2RBqdHK4RtK8u5IJw32Hyb2dTGKbnnFmnuoBv5r7Z2CURI9sGZpSuA==", + "license": "Apache-2.0", + "dependencies": { + "@types/node": "^18.11.18", + "@types/node-fetch": "^2.6.4", + "abort-controller": "^3.0.0", + "agentkeepalive": "^4.2.1", + "form-data-encoder": "1.7.2", + "formdata-node": "^4.3.2", + "node-fetch": "^2.6.7" + }, + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, + "node_modules/openai/node_modules/@types/node": { + "version": "18.19.118", + "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.118.tgz", + "integrity": "sha512-hIPK0hSrrcaoAu/gJMzN3QClXE4QdCdFvaenJ0JsjIbExP1JFFVH+RHcBt25c9n8bx5dkIfqKE+uw6BmBns7ug==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/openai/node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, + "node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "license": "MIT", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": 
"sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/safe-stable-stringify": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.5.0.tgz", + "integrity": "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA==", + "license": "MIT", + "engines": { + "node": ">=10" + } + }, + "node_modules/simple-swizzle": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", + "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==", + "license": "MIT", + "dependencies": { + "is-arrayish": "^0.3.1" + } + }, + "node_modules/stack-trace": { + "version": "0.0.10", + "resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz", + "integrity": "sha512-KGzahc7puUKkzyMt+IqAep+TVNbKP+k2Lmwhub39m1AsTSkaDutx56aDCo+HLDzf/D26BIHTJWNiTG1KAJiQCg==", + "license": "MIT", + "engines": { + "node": "*" + } + }, + "node_modules/string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.2.0" + } + }, + "node_modules/text-hex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz", + "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==", + "license": "MIT" + }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": 
"https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "license": "MIT" + }, + "node_modules/triple-beam": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz", + "integrity": "sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==", + "license": "MIT", + "engines": { + "node": ">= 14.0.0" + } + }, + "node_modules/undici-types": { + "version": "7.8.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.8.0.tgz", + "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==", + "license": "MIT" + }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "license": "MIT" + }, + "node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/web-streams-polyfill": { + "version": "4.0.0-beta.3", + "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz", + "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": 
"sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "license": "BSD-2-Clause" + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, + "node_modules/winston": { + "version": "3.17.0", + "resolved": "https://registry.npmjs.org/winston/-/winston-3.17.0.tgz", + "integrity": "sha512-DLiFIXYC5fMPxaRg832S6F5mJYvePtmO5G9v9IgUFPhXm9/GkXarH/TUrBAVzhTCzAj9anE/+GjrgXp/54nOgw==", + "license": "MIT", + "dependencies": { + "@colors/colors": "^1.6.0", + "@dabh/diagnostics": "^2.0.2", + "async": "^3.2.3", + "is-stream": "^2.0.0", + "logform": "^2.7.0", + "one-time": "^1.0.0", + "readable-stream": "^3.4.0", + "safe-stable-stringify": "^2.3.1", + "stack-trace": "0.0.x", + "triple-beam": "^1.3.0", + "winston-transport": "^4.9.0" + }, + "engines": { + "node": ">= 12.0.0" + } + }, + "node_modules/winston-transport": { + "version": "4.9.0", + "resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.9.0.tgz", + "integrity": "sha512-8drMJ4rkgaPo1Me4zD/3WLfI/zPdA9o2IipKODunnGDcuqbHwjsbB79ylv04LCGGzU0xQ6vTznOMpQGaLhhm6A==", + "license": "MIT", + "dependencies": { + "logform": "^2.7.0", + "readable-stream": "^3.6.2", + "triple-beam": "^1.3.0" + }, + "engines": { + "node": ">= 12.0.0" + } + }, + "node_modules/ws": { + "version": "8.18.3", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz", + "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==", + "license": "MIT", + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + 
"utf-8-validate": { + "optional": true + } + } + } + } +} diff --git a/eval-server/package.json b/eval-server/package.json new file mode 100644 index 00000000000..de89d99b27d --- /dev/null +++ b/eval-server/package.json @@ -0,0 +1,29 @@ +{ + "name": "bo-eval-server", + "version": "1.0.0", + "description": "WebSocket server for evaluating LLM agents with LLM-as-a-judge", + "main": "src/server.js", + "type": "module", + "scripts": { + "start": "node src/server.js", + "dev": "node --watch src/server.js", + "cli": "node src/cli.js" + }, + "keywords": ["websocket", "llm", "evaluation", "rpc"], + "author": "", + "license": "MIT", + "dependencies": { + "ws": "^8.16.0", + "uuid": "^9.0.1", + "winston": "^3.11.0", + "dotenv": "^16.3.1", + "openai": "^4.24.1", + "js-yaml": "^4.1.0" + }, + "devDependencies": { + "@types/ws": "^8.5.10" + }, + "engines": { + "node": ">=18.0.0" + } +} \ No newline at end of file diff --git a/eval-server/schemas/client.schema.json b/eval-server/schemas/client.schema.json new file mode 100644 index 00000000000..5093155b971 --- /dev/null +++ b/eval-server/schemas/client.schema.json @@ -0,0 +1,337 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "title": "Client Configuration Schema", + "description": "Schema for validating client YAML configuration files", + "required": ["client", "settings", "evaluations"], + "properties": { + "client": { + "type": "object", + "required": ["id", "name"], + "properties": { + "id": { + "type": "string", + "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$", + "description": "UUID v4 format client identifier" + }, + "name": { + "type": "string", + "minLength": 1, + "maxLength": 100, + "description": "Human-readable client name" + }, + "secret_key": { + "type": "string", + "description": "Optional authentication secret key" + }, + "description": { + "type": "string", + "description": "Optional client description" + } + } + }, + "settings": { + "type": 
"object", + "properties": { + "max_concurrent_evaluations": { + "type": "integer", + "minimum": 1, + "maximum": 10, + "default": 3 + }, + "default_timeout": { + "type": "integer", + "minimum": 5000, + "maximum": 300000, + "default": 30000, + "description": "Default timeout in milliseconds" + }, + "retry_policy": { + "type": "object", + "properties": { + "max_retries": { + "type": "integer", + "minimum": 0, + "maximum": 5, + "default": 2 + }, + "backoff_multiplier": { + "type": "number", + "minimum": 1, + "maximum": 5, + "default": 2 + }, + "initial_delay": { + "type": "integer", + "minimum": 100, + "maximum": 10000, + "default": 1000, + "description": "Initial delay in milliseconds" + } + } + } + } + }, + "evaluations": { + "type": "array", + "items": { + "$ref": "#/definitions/evaluation" + } + } + }, + "definitions": { + "evaluation": { + "type": "object", + "required": ["id", "name", "tool", "input"], + "properties": { + "id": { + "type": "string", + "pattern": "^[a-zA-Z0-9-_]+$", + "minLength": 1, + "maxLength": 100, + "description": "Unique evaluation identifier" + }, + "name": { + "type": "string", + "minLength": 1, + "maxLength": 200, + "description": "Human-readable evaluation name" + }, + "description": { + "type": "string", + "description": "Optional evaluation description" + }, + "enabled": { + "type": "boolean", + "default": true, + "description": "Whether this evaluation is enabled" + }, + "target": { + "type": "object", + "properties": { + "url": { + "type": "string", + "format": "uri", + "description": "Target URL for the evaluation" + }, + "wait_for": { + "type": "string", + "enum": ["load", "domcontentloaded", "networkidle"], + "default": "networkidle" + }, + "wait_timeout": { + "type": "integer", + "minimum": 1000, + "maximum": 30000, + "default": 5000 + } + } + }, + "tool": { + "type": "string", + "enum": [ + "extract_schema_data", + "extract_schema_streamlined", + "research_agent", + "action_agent", + "web_task_agent" + ], + "description": "Tool 
to execute for this evaluation" + }, + "timeout": { + "type": "integer", + "minimum": 5000, + "maximum": 300000, + "description": "Evaluation timeout in milliseconds" + }, + "input": { + "type": "object", + "description": "Tool-specific input parameters" + }, + "schedule": { + "type": "object", + "required": ["type"], + "properties": { + "type": { + "type": "string", + "enum": ["on_demand", "periodic", "once"] + }, + "interval": { + "type": "integer", + "minimum": 60000, + "description": "Interval in milliseconds for periodic schedules" + }, + "run_at": { + "type": "string", + "format": "date-time", + "description": "ISO timestamp for one-time schedules" + } + }, + "if": { + "properties": { + "type": { "const": "periodic" } + } + }, + "then": { + "required": ["interval"] + }, + "else": { + "if": { + "properties": { + "type": { "const": "once" } + } + }, + "then": { + "required": ["run_at"] + } + } + }, + "validation": { + "type": "object", + "required": ["type"], + "properties": { + "type": { + "type": "string", + "enum": ["llm-judge", "snapshot", "hybrid"] + }, + "llm_judge": { + "$ref": "#/definitions/llm_judge_config" + }, + "snapshot": { + "$ref": "#/definitions/snapshot_config" + }, + "hybrid": { + "type": "object", + "properties": { + "weight_llm": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "weight_snapshot": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + } + } + } + }, + "metadata": { + "type": "object", + "properties": { + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "priority": { + "type": "string", + "enum": ["low", "normal", "high"], + "default": "normal" + }, + "owner": { + "type": "string", + "description": "Responsible team or person" + }, + "created": { + "type": "string", + "format": "date" + }, + "modified": { + "type": "string", + "format": "date" + } + } + } + } + }, + "llm_judge_config": { + "type": "object", + "required": ["criteria"], + "properties": { + "model": { + "type": "string", + 
"default": "gpt-4o-mini", + "description": "LLM model to use for evaluation" + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2, + "default": 0.3 + }, + "criteria": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 1, + "description": "List of evaluation criteria" + }, + "visual_verification": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "default": false + }, + "capture_before": { + "type": "boolean", + "default": true + }, + "capture_after": { + "type": "boolean", + "default": true + }, + "prompts": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + } + }, + "snapshot_config": { + "type": "object", + "properties": { + "structure_only": { + "type": "boolean", + "default": false, + "description": "Compare only structure, not values" + }, + "exclude_paths": { + "type": "array", + "items": { + "type": "string" + }, + "description": "JSONPath expressions for fields to exclude" + }, + "sanitizers": { + "type": "array", + "items": { + "type": "object", + "required": ["path"], + "properties": { + "path": { + "type": "string", + "description": "JSONPath to the field" + }, + "pattern": { + "type": "string", + "description": "Regex pattern to match" + }, + "replacement": { + "type": "string", + "description": "Replacement string" + } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/eval-server/src/api-server.js b/eval-server/src/api-server.js new file mode 100644 index 00000000000..8011dea31f8 --- /dev/null +++ b/eval-server/src/api-server.js @@ -0,0 +1,221 @@ +import http from 'http'; +import url from 'url'; +import { EvaluationServer } from './server.js'; +import logger from './logger.js'; + +class APIServer { + constructor(evaluationServer, port = 8081) { + this.evaluationServer = evaluationServer; + this.port = port; + this.server = null; + } + + start() { + this.server = http.createServer((req, res) => { + // Enable CORS + 
res.setHeader('Access-Control-Allow-Origin', '*'); + res.setHeader('Access-Control-Allow-Methods', 'GET, POST, OPTIONS'); + res.setHeader('Access-Control-Allow-Headers', 'Content-Type'); + + if (req.method === 'OPTIONS') { + res.writeHead(200); + res.end(); + return; + } + + this.handleRequest(req, res); + }); + + this.server.listen(this.port, () => { + logger.info(`API server started on http://localhost:${this.port}`); + }); + } + + async handleRequest(req, res) { + const parsedUrl = url.parse(req.url, true); + const path = parsedUrl.pathname; + const method = req.method; + + try { + // Get body for POST requests + let body = ''; + if (method === 'POST') { + for await (const chunk of req) { + body += chunk; + } + } + + let result; + + switch (path) { + case '/status': + result = this.getStatus(); + break; + + case '/clients': + result = this.getClients(); + break; + + case '/clients/:id/evaluations': + const clientId = parsedUrl.query.id; + result = this.getClientEvaluations(clientId); + break; + + case '/evaluate': + if (method !== 'POST') { + this.sendError(res, 405, 'Method not allowed'); + return; + } + result = await this.triggerEvaluation(JSON.parse(body)); + break; + + default: + this.sendError(res, 404, 'Not found'); + return; + } + + this.sendResponse(res, 200, result); + + } catch (error) { + logger.error('API error:', error); + this.sendError(res, 500, error.message); + } + } + + getStatus() { + const status = this.evaluationServer.getStatus(); + const clients = this.evaluationServer.getClientManager().getAllClients(); + + return { + server: status, + clients: clients.map(client => ({ + id: client.id, + name: client.name, + connected: this.evaluationServer.connectedClients.has(client.id), + ready: this.evaluationServer.connectedClients.get(client.id)?.ready || false + })) + }; + } + + getClients() { + const clients = this.evaluationServer.getClientManager().getAllClients(); + + return clients.map(client => { + const evaluations = 
this.evaluationServer.getClientManager().getClientEvaluations(client.id); + const connection = this.evaluationServer.connectedClients.get(client.id); + + return { + id: client.id, + name: client.name, + description: client.description, + connected: !!connection, + ready: connection?.ready || false, + evaluations: evaluations.map(evaluation => ({ + id: evaluation.id, + name: evaluation.name, + tool: evaluation.tool, + status: evaluation.status || 'pending', + enabled: evaluation.enabled !== false + })) + }; + }); + } + + getClientEvaluations(clientId) { + if (!clientId) { + throw new Error('Client ID is required'); + } + + const evaluations = this.evaluationServer.getClientManager().getClientEvaluations(clientId); + return { + clientId, + evaluations: evaluations.map(evaluation => ({ + id: evaluation.id, + name: evaluation.name, + description: evaluation.description, + tool: evaluation.tool, + status: evaluation.status || 'pending', + enabled: evaluation.enabled !== false, + lastRun: evaluation.lastRun, + lastResult: evaluation.lastResult + })) + }; + } + + async triggerEvaluation(payload) { + const { clientId, evaluationId, runAll = false } = payload; + + if (!clientId) { + throw new Error('Client ID is required'); + } + + // Check if client is connected + const connection = this.evaluationServer.connectedClients.get(clientId); + if (!connection || !connection.ready) { + throw new Error(`Client '${clientId}' is not connected or not ready`); + } + + if (runAll) { + // Run all evaluations for the client + const evaluations = this.evaluationServer.getClientManager().getClientEvaluations(clientId); + const results = []; + + for (const evaluation of evaluations) { + try { + this.evaluationServer.getClientManager().updateEvaluationStatus(clientId, evaluation.id, 'pending'); + await this.evaluationServer.executeEvaluation(connection, evaluation); + results.push({ id: evaluation.id, status: 'completed' }); + } catch (error) { + results.push({ id: evaluation.id, status: 
'failed', error: error.message }); + } + } + + return { + clientId, + type: 'batch', + results + }; + } else { + // Run specific evaluation + if (!evaluationId) { + throw new Error('Evaluation ID is required when runAll is false'); + } + + const evaluation = this.evaluationServer.getClientManager().getClientEvaluations(clientId) + .find(e => e.id === evaluationId); + + if (!evaluation) { + throw new Error(`Evaluation '${evaluationId}' not found for client '${clientId}'`); + } + + this.evaluationServer.getClientManager().updateEvaluationStatus(clientId, evaluationId, 'pending'); + await this.evaluationServer.executeEvaluation(connection, evaluation); + + return { + clientId, + evaluationId, + type: 'single', + status: 'completed' + }; + } + } + + sendResponse(res, statusCode, data) { + res.writeHead(statusCode, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify(data, null, 2)); + } + + sendError(res, statusCode, message) { + res.writeHead(statusCode, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ error: message })); + } + + stop() { + if (this.server) { + this.server.close(); + logger.info('API server stopped'); + } + } +} + +export { APIServer }; \ No newline at end of file diff --git a/eval-server/src/cli.js b/eval-server/src/cli.js new file mode 100644 index 00000000000..6ad66dc9a5c --- /dev/null +++ b/eval-server/src/cli.js @@ -0,0 +1,307 @@ +#!/usr/bin/env node + +import { EvaluationServer } from './server.js'; +import readline from 'readline'; + +class EvaluationCLI { + constructor() { + this.server = new EvaluationServer(); + this.rl = readline.createInterface({ + input: process.stdin, + output: process.stdout + }); + } + + async start() { + console.log('🚀 Starting Evaluation Server CLI'); + console.log('===================================='); + + // Start the server + this.server.start(); + + // Wait a moment for server to start + await new Promise(resolve => setTimeout(resolve, 1000)); + + this.showHelp(); + 
this.startInteractiveMode(); + } + + showHelp() { + console.log('\\nAvailable commands:'); + console.log(' status - Show server status'); + console.log(' clients - List all clients and their evaluations'); + console.log(' run - Run specific evaluation for a client'); + console.log(' run-all - Run all evaluations for a client'); + console.log(' eval - Run specific evaluation on all connected clients'); + console.log(' eval all - Run all pending evaluations on all clients'); + console.log(' clients-connected - List connected clients'); + console.log(' help - Show this help'); + console.log(' quit - Exit the CLI'); + console.log(''); + } + + startInteractiveMode() { + this.rl.question('eval-server> ', (input) => { + this.handleCommand(input.trim()); + }); + } + + async handleCommand(input) { + const [command, ...args] = input.split(' '); + + try { + switch (command) { + case 'status': + this.showStatus(); + break; + case 'clients': + this.listClients(); + break; + case 'run': + if (args.length < 2) { + console.log('Usage: run '); + } else { + await this.runSpecificEvaluation(args[0], args[1]); + } + break; + case 'run-all': + if (args.length < 1) { + console.log('Usage: run-all '); + } else { + await this.runAllEvaluations(args[0]); + } + break; + case 'eval': + if (args.length === 0) { + console.log('Usage: eval OR eval all'); + } else { + await this.runEvaluation(args.join(' ')); + } + break; + case 'clients-connected': + this.listConnectedClients(); + break; + case 'help': + this.showHelp(); + break; + case 'quit': + case 'exit': + this.quit(); + return; + case '': + break; + default: + console.log(`Unknown command: ${command}. 
Type 'help' for available commands.`); + } + } catch (error) { + console.error('Error:', error.message); + } + + this.startInteractiveMode(); + } + + showStatus() { + const status = this.server.getStatus(); + console.log('\\n📊 Server Status:'); + console.log(` Connected clients: ${status.connectedClients}`); + console.log(` Ready clients: ${status.readyClients}`); + console.log(` Active evaluations: ${status.activeEvaluations}`); + console.log(''); + } + + listConnectedClients() { + const clients = Array.from(this.server.connectedClients.values()); + console.log('\\n👥 Connected Clients:'); + + if (clients.length === 0) { + console.log(' No clients connected'); + } else { + clients.forEach(client => { + console.log(` ID: ${client.clientId || client.id}`); + console.log(` Connected: ${client.connectedAt}`); + console.log(` Ready: ${client.ready ? 'Yes' : 'No'}`); + console.log(` Registered: ${client.registered ? 'Yes' : 'No'}`); + console.log(` Address: ${client.remoteAddress}`); + console.log(''); + }); + } + } + + async runEvaluation(task) { + if (task && task.includes('-')) { + console.log(`\\n🔍 Running specific evaluation: "${task}"`); + } else if (task === 'all') { + console.log(`\\n🔍 Running all pending evaluations`); + } else { + console.log(`\\n🔍 Running evaluation: "${task}"`); + } + console.log('====================================='); + + try { + const results = await this.server.evaluateAllClients(task); + + console.log('\\n📋 Evaluation Results:'); + results.forEach((result, index) => { + console.log(`\\n Client ${index + 1} (${result.clientId || 'unknown'}):`); + + if (result.error) { + console.log(` ❌ Error: ${result.error}`); + } else { + console.log(` ✅ Success`); + if (result.evaluationId) { + console.log(` Evaluation ID: ${result.evaluationId}`); + } + if (result.duration) { + console.log(` Duration: ${result.duration}ms`); + } + + if (result.judgeEvaluation?.overall_score) { + console.log(` Overall Score: 
${result.judgeEvaluation.overall_score}/10`); + } + + if (result.clientResponse) { + const preview = result.clientResponse.length > 100 + ? result.clientResponse.substring(0, 100) + '...' + : result.clientResponse; + console.log(` Response: ${preview}`); + } + } + }); + + console.log('\\n✅ Evaluation completed'); + } catch (error) { + console.log(`\\n❌ Evaluation failed: ${error.message}`); + } + } + + listClients() { + const clients = this.server.getClientManager().getAllClients(); + console.log('\\n👥 Registered Clients:'); + + if (clients.length === 0) { + console.log(' No clients registered'); + return; + } + + clients.forEach(client => { + console.log(`\\n 📋 ${client.name} (${client.id})`); + console.log(` Description: ${client.description || 'N/A'}`); + console.log(` Secret Key: ${client.secretKey ? '***' : 'None'}`); + + const evaluations = this.server.getClientManager().getClientEvaluations(client.id); + console.log(` Evaluations: ${evaluations.length}`); + + evaluations.forEach(evaluation => { + const status = evaluation.status || 'pending'; + const statusIcon = status === 'completed' ? '✅' : status === 'running' ? '🔄' : status === 'failed' ? 
'❌' : '⏳'; + console.log(` ${statusIcon} ${evaluation.id}: ${evaluation.name}`); + }); + }); + console.log(''); + } + + async runSpecificEvaluation(clientId, evaluationId) { + console.log(`\\n🎯 Running evaluation '${evaluationId}' for client '${clientId}'...`); + + try { + // Check if client is connected + const connection = this.server.connectedClients.get(clientId); + if (!connection || !connection.ready) { + console.log(`❌ Client '${clientId}' is not connected or not ready`); + return; + } + + // Get the evaluation + const evaluation = this.server.getClientManager().getClientEvaluations(clientId) + .find(e => e.id === evaluationId); + + if (!evaluation) { + console.log(`❌ Evaluation '${evaluationId}' not found for client '${clientId}'`); + return; + } + + // Reset evaluation status to pending + this.server.getClientManager().updateEvaluationStatus(clientId, evaluationId, 'pending'); + + // Execute the evaluation + await this.server.executeEvaluation(connection, evaluation); + + console.log(`✅ Evaluation '${evaluationId}' completed successfully`); + } catch (error) { + console.log(`❌ Evaluation failed: ${error.message}`); + } + } + + async runAllEvaluations(clientId) { + console.log(`\\n🚀 Running all evaluations for client '${clientId}'...`); + + try { + // Check if client is connected + const connection = this.server.connectedClients.get(clientId); + if (!connection || !connection.ready) { + console.log(`❌ Client '${clientId}' is not connected or not ready`); + return; + } + + // Get all evaluations for this client + const evaluations = this.server.getClientManager().getClientEvaluations(clientId); + + if (evaluations.length === 0) { + console.log(`❌ No evaluations found for client '${clientId}'`); + return; + } + + console.log(`Found ${evaluations.length} evaluations to run...`); + + let completed = 0; + let failed = 0; + + for (const evaluation of evaluations) { + console.log(`\\n🔄 Running: ${evaluation.name} (${evaluation.id})`); + + try { + // Reset 
evaluation status to pending + this.server.getClientManager().updateEvaluationStatus(clientId, evaluation.id, 'pending'); + + // Execute the evaluation + await this.server.executeEvaluation(connection, evaluation); + + console.log(` ✅ Completed: ${evaluation.name}`); + completed++; + } catch (error) { + console.log(` ❌ Failed: ${evaluation.name} - ${error.message}`); + failed++; + } + + // Add a small delay between evaluations + await new Promise(resolve => setTimeout(resolve, 2000)); + } + + console.log(`\\n📊 Results: ${completed} completed, ${failed} failed`); + } catch (error) { + console.log(`❌ Batch evaluation failed: ${error.message}`); + } + } + + + quit() { + console.log('\\n👋 Shutting down...'); + this.server.stop(); + this.rl.close(); + process.exit(0); + } +} + +// Start CLI if this file is run directly +if (import.meta.url === `file://${process.argv[1]}`) { + const cli = new EvaluationCLI(); + + process.on('SIGINT', () => { + cli.quit(); + }); + + cli.start().catch(error => { + console.error('Failed to start CLI:', error.message); + process.exit(1); + }); +} \ No newline at end of file diff --git a/eval-server/src/client-manager.js b/eval-server/src/client-manager.js new file mode 100644 index 00000000000..3b1ec3bd6b1 --- /dev/null +++ b/eval-server/src/client-manager.js @@ -0,0 +1,310 @@ +import fs from 'fs'; +import path from 'path'; +import yaml from 'js-yaml'; +import { v4 as uuidv4 } from 'uuid'; +import logger from './logger.js'; + +class ClientManager { + constructor(clientsDir = './clients', evalsDir = './evals') { + this.clientsDir = path.resolve(clientsDir); + this.evalsDir = path.resolve(evalsDir); + this.clients = new Map(); + this.evaluations = new Map(); // clientId -> evaluations array + + // Ensure directories exist + if (!fs.existsSync(this.clientsDir)) { + fs.mkdirSync(this.clientsDir, { recursive: true }); + } + if (!fs.existsSync(this.evalsDir)) { + fs.mkdirSync(this.evalsDir, { recursive: true }); + } + + this.loadAllClients(); + 
this.loadAllEvaluations(); + } + + /** + * Load all client YAML files on startup + */ + loadAllClients() { + try { + const files = fs.readdirSync(this.clientsDir) + .filter(f => f.endsWith('.yaml') || f.endsWith('.yml')); + + for (const file of files) { + const clientId = path.basename(file, path.extname(file)); + try { + this.loadClient(clientId); + } catch (error) { + logger.error(`Failed to load client ${clientId}:`, error); + } + } + + logger.info(`Loaded ${this.clients.size} clients`); + } catch (error) { + logger.error('Failed to load clients:', error); + } + } + + /** + * Load a specific client's YAML configuration + */ + loadClient(clientId) { + const yamlPath = path.join(this.clientsDir, `${clientId}.yaml`); + + if (!fs.existsSync(yamlPath)) { + throw new Error(`Client YAML not found: ${yamlPath}`); + } + + const yamlContent = fs.readFileSync(yamlPath, 'utf8'); + const config = yaml.load(yamlContent); + + // Validate client configuration + if (!config.client || config.client.id !== clientId) { + throw new Error(`Invalid client configuration: ID mismatch`); + } + + // Store client info + this.clients.set(clientId, { + id: config.client.id, + name: config.client.name, + secretKey: config.client.secret_key, + description: config.client.description, + settings: config.settings || {}, + yamlPath + }); + + // Note: Evaluations are now loaded separately from the evals directory + // Initialize empty evaluations array for this client + if (!this.evaluations.has(clientId)) { + this.evaluations.set(clientId, []); + } + + logger.info(`Loaded client ${clientId}`); + return config; + } + + /** + * Load all evaluations from the evals directory structure + */ + loadAllEvaluations() { + try { + // Find all category directories + const categories = fs.readdirSync(this.evalsDir) + .filter(dir => fs.statSync(path.join(this.evalsDir, dir)).isDirectory()); + + let totalEvaluations = 0; + + for (const category of categories) { + const categoryDir = path.join(this.evalsDir, 
category); + const evalFiles = fs.readdirSync(categoryDir) + .filter(f => f.endsWith('.yaml') || f.endsWith('.yml')); + + for (const file of evalFiles) { + try { + const evalPath = path.join(categoryDir, file); + const yamlContent = fs.readFileSync(evalPath, 'utf8'); + const evaluation = yaml.load(yamlContent); + + if (evaluation.enabled !== false) { + // Add evaluation to all clients for now + // In the future, you might want to have client-specific evaluation assignments + for (const [clientId] of this.clients) { + const clientEvals = this.evaluations.get(clientId) || []; + clientEvals.push({ + ...evaluation, + clientId, + status: 'pending', + category, + filePath: evalPath + }); + this.evaluations.set(clientId, clientEvals); + } + totalEvaluations++; + } + } catch (error) { + logger.error(`Failed to load evaluation ${file}:`, error); + } + } + } + + // Update the client evaluation counts + for (const [clientId] of this.clients) { + const evalCount = this.evaluations.get(clientId)?.length || 0; + logger.info(`Loaded client ${clientId} with ${evalCount} evaluations`); + } + + logger.info(`Loaded ${totalEvaluations} evaluations from ${categories.length} categories`); + } catch (error) { + logger.error('Failed to load evaluations:', error); + } + } + + /** + * Register a new client with authentication + */ + registerClient(clientId, secretKey, capabilities, skipSecretValidation = false) { + const client = this.clients.get(clientId); + + if (!client) { + throw new Error(`Client ${clientId} not found. 
Please create a YAML configuration file.`); + } + + // Verify secret key if configured (unless we're skipping validation) + if (!skipSecretValidation && client.secretKey && client.secretKey !== secretKey) { + throw new Error('Invalid secret key'); + } + + // Update client capabilities + client.capabilities = capabilities; + client.lastRegistered = new Date().toISOString(); + + return { + success: true, + clientName: client.name, + evaluationsCount: this.evaluations.get(clientId)?.length || 0 + }; + } + + /** + * Get client information + */ + getClient(clientId) { + return this.clients.get(clientId); + } + + /** + * Get evaluations for a client + */ + getClientEvaluations(clientId) { + return this.evaluations.get(clientId) || []; + } + + /** + * Get next pending evaluation for a client + */ + getNextEvaluation(clientId) { + const evaluations = this.evaluations.get(clientId) || []; + return evaluations.find(e => e.status === 'pending'); + } + + /** + * Update evaluation status + */ + updateEvaluationStatus(clientId, evaluationId, status, result = null) { + const evaluations = this.evaluations.get(clientId); + if (!evaluations) return; + + const evaluation = evaluations.find(e => e.id === evaluationId); + if (evaluation) { + evaluation.status = status; + evaluation.lastRun = new Date().toISOString(); + if (result) { + evaluation.lastResult = result; + } + } + } + + /** + * Create a new client with default configuration + */ + async createClient(clientName, secretKey = null) { + const clientId = uuidv4(); + return this.createClientWithId(clientId, clientName, secretKey); + } + + /** + * Create a new client with a specific ID + */ + async createClientWithId(clientId, clientName, secretKey = null) { + const yamlPath = path.join(this.clientsDir, `${clientId}.yaml`); + + // Create simplified client configuration (evaluations come from evals directory) + const defaultConfig = { + client: { + id: clientId, + name: clientName, + secret_key: secretKey, + description: 
`Auto-generated DevTools evaluation client` + }, + settings: { + max_concurrent_evaluations: 3, + default_timeout: 45000, + retry_policy: { + max_retries: 2, + backoff_multiplier: 2, + initial_delay: 1000 + } + } + }; + + // Write YAML file + const yamlContent = yaml.dump(defaultConfig, { indent: 2 }); + fs.writeFileSync(yamlPath, yamlContent); + + // Load the new client + this.loadClient(clientId); + + // Load evaluations for the new client + this.loadAllEvaluations(); + + logger.info(`Created new client: ${clientId}`); + return { clientId, yamlPath }; + } + + /** + * Reload a specific client's configuration + */ + reloadClient(clientId) { + try { + this.loadClient(clientId); + logger.info(`Reloaded client: ${clientId}`); + return true; + } catch (error) { + logger.error(`Failed to reload client ${clientId}:`, error); + return false; + } + } + + /** + * Get all active clients + */ + getAllClients() { + return Array.from(this.clients.values()); + } + + /** + * Validate client exists and is authorized + */ + validateClient(clientId, secretKey = null, skipSecretValidation = false) { + const client = this.clients.get(clientId); + + logger.debug('validateClient', { + clientId, + clientExists: !!client, + hasSecretKey: !!secretKey, + skipSecretValidation, + clientSecretKey: client ? 
'[REDACTED]' : 'N/A' + }); + + if (!client) { + logger.debug('Client not found', { clientId }); + return { valid: false, reason: 'Client not found' }; + } + + // Skip secret key validation if explicitly requested (for new auth flow) + if (!skipSecretValidation && secretKey !== null && client.secretKey && client.secretKey !== secretKey) { + logger.warn('Secret key mismatch', { + clientId, + hasProvidedKey: !!secretKey, + hasStoredKey: !!client.secretKey + }); + return { valid: false, reason: 'Invalid secret key' }; + } + + logger.debug('Client validation successful', { clientId }); + return { valid: true }; + } +} + +export { ClientManager }; \ No newline at end of file diff --git a/eval-server/src/config.js b/eval-server/src/config.js new file mode 100644 index 00000000000..0f6c9f3e9b3 --- /dev/null +++ b/eval-server/src/config.js @@ -0,0 +1,40 @@ +import { config } from 'dotenv'; + +config(); + +export const CONFIG = { + server: { + port: parseInt(process.env.PORT) || 8080, + host: process.env.HOST || 'localhost' + }, + + llm: { + apiKey: process.env.OPENAI_API_KEY, + model: process.env.JUDGE_MODEL || 'gpt-4', + temperature: parseFloat(process.env.JUDGE_TEMPERATURE) || 0.1 + }, + + logging: { + level: process.env.LOG_LEVEL || 'info', + dir: process.env.LOG_DIR || './logs' + }, + + rpc: { + timeout: parseInt(process.env.RPC_TIMEOUT) || 30000, + maxConcurrentEvaluations: parseInt(process.env.MAX_CONCURRENT_EVALUATIONS) || 10 + } +}; + +export function validateConfig() { + const errors = []; + + if (!CONFIG.llm.apiKey) { + errors.push('OPENAI_API_KEY is required'); + } + + if (CONFIG.server.port < 1 || CONFIG.server.port > 65535) { + errors.push('PORT must be between 1 and 65535'); + } + + return errors; +} \ No newline at end of file diff --git a/eval-server/src/evaluator.js b/eval-server/src/evaluator.js new file mode 100644 index 00000000000..95ac14ab9f1 --- /dev/null +++ b/eval-server/src/evaluator.js @@ -0,0 +1,117 @@ +import OpenAI from 'openai'; +import { 
CONFIG } from './config.js'; +import logger from './logger.js'; + +export class LLMEvaluator { + constructor() { + if (!CONFIG.llm.apiKey) { + throw new Error('OpenAI API key is required'); + } + + this.openai = new OpenAI({ + apiKey: CONFIG.llm.apiKey + }); + } + + async evaluate(task, agentResponse) { + try { + const prompt = this.buildEvaluationPrompt(task, agentResponse); + + const completion = await this.openai.chat.completions.create({ + model: CONFIG.llm.model, + messages: [ + { + role: 'system', + content: 'You are an expert evaluator of AI agent responses. Provide objective, detailed evaluations.' + }, + { + role: 'user', + content: prompt + } + ], + temperature: CONFIG.llm.temperature, + max_tokens: 1000 + }); + + const evaluation = completion.choices[0].message.content; + const usage = completion.usage; + + logger.info('LLM evaluation completed', { + tokens_used: usage.total_tokens, + model: CONFIG.llm.model + }); + + return this.parseEvaluation(evaluation); + } catch (error) { + logger.error('LLM evaluation failed', { error: error.message }); + throw error; + } + } + + buildEvaluationPrompt(task, agentResponse) { + return `Please evaluate the following AI agent response to a given task. + +TASK: +${task} + +AGENT RESPONSE: +${agentResponse} + +Please evaluate the response on the following criteria and provide a JSON response: + +1. **Correctness**: Is the response factually accurate and correct? +2. **Completeness**: Does the response fully address the task? +3. **Clarity**: Is the response clear and well-structured? +4. **Relevance**: Is the response relevant to the task? +5. **Helpfulness**: How helpful is the response to the user? 
+ +Provide your evaluation in the following JSON format: +{ + "overall_score": , + "criteria_scores": { + "correctness": , + "completeness": , + "clarity": , + "relevance": , + "helpfulness": + }, + "reasoning": "", + "strengths": [""], + "weaknesses": [""], + "suggestions": [""] +}`; + } + + parseEvaluation(evaluationText) { + try { + // Try to extract JSON from the response + const jsonMatch = evaluationText.match(/\{[\s\S]*\}/); + if (jsonMatch) { + return JSON.parse(jsonMatch[0]); + } + + // If no JSON found, return a structured response with the raw text + return { + overall_score: null, + criteria_scores: {}, + reasoning: evaluationText, + strengths: [], + weaknesses: [], + suggestions: [], + raw_evaluation: evaluationText + }; + } catch (error) { + logger.warn('Failed to parse evaluation JSON', { error: error.message }); + return { + overall_score: null, + criteria_scores: {}, + reasoning: evaluationText, + strengths: [], + weaknesses: [], + suggestions: [], + raw_evaluation: evaluationText, + parse_error: error.message + }; + } + } +} \ No newline at end of file diff --git a/eval-server/src/logger.js b/eval-server/src/logger.js new file mode 100644 index 00000000000..5452cffbb41 --- /dev/null +++ b/eval-server/src/logger.js @@ -0,0 +1,102 @@ +import winston from 'winston'; +import { existsSync, mkdirSync } from 'fs'; +import { CONFIG } from './config.js'; + +// Ensure logs directory exists +if (!existsSync(CONFIG.logging.dir)) { + mkdirSync(CONFIG.logging.dir, { recursive: true }); +} + +const logger = winston.createLogger({ + level: CONFIG.logging.level, + format: winston.format.combine( + winston.format.timestamp(), + winston.format.errors({ stack: true }), + winston.format.json() + ), + defaultMeta: { service: 'bo-eval-server' }, + transports: [ + new winston.transports.File({ + filename: `${CONFIG.logging.dir}/error.log`, + level: 'error' + }), + new winston.transports.File({ + filename: `${CONFIG.logging.dir}/combined.log` + }), + new 
winston.transports.Console({ + format: winston.format.combine( + winston.format.colorize(), + winston.format.simple() + ) + }) + ] +}); + +export function logEvaluation(evaluationData) { + const logEntry = { + type: 'evaluation', + timestamp: new Date().toISOString(), + ...evaluationData + }; + + // Pretty print evaluation summary to console + console.log('\n' + '='.repeat(80)); + console.log(`📊 EVALUATION COMPLETED: ${evaluationData.name}`); + console.log('='.repeat(80)); + console.log(`🆔 ID: ${evaluationData.evaluationId}`); + console.log(`🔧 Tool: ${evaluationData.tool}`); + console.log(`⏱️ Duration: ${evaluationData.duration}ms`); + console.log(`👤 Client: ${evaluationData.clientId}`); + + if (evaluationData.response?.output?.output) { + console.log(`\n📝 Output:\n${evaluationData.response.output.output}`); + } + + if (evaluationData.validation?.result) { + const val = evaluationData.validation.result; + console.log(`\n📋 Validation:`); + console.log(` ✅ Passed: ${evaluationData.validation.passed ? 
'YES' : 'NO'}`); + console.log(` 📊 Overall Score: ${val.overall_score}/10`); + if (val.strengths?.length > 0) { + console.log(` 💪 Strengths: ${val.strengths.join(', ')}`); + } + if (val.weaknesses?.length > 0) { + console.log(` ⚠️ Weaknesses: ${val.weaknesses.join(', ')}`); + } + } + + console.log('='.repeat(80) + '\n'); + + // Also log structured data for file logs + logger.info('Evaluation completed', logEntry); + + // Also save to dedicated evaluation log + const evaluationLogger = winston.createLogger({ + format: winston.format.json(), + transports: [ + new winston.transports.File({ + filename: `${CONFIG.logging.dir}/evaluations.jsonl` + }) + ] + }); + + evaluationLogger.info(logEntry); +} + +export function logRpcCall(callData) { + logger.info('RPC call', { + type: 'rpc', + timestamp: new Date().toISOString(), + ...callData + }); +} + +export function logConnection(connectionData) { + logger.info('Connection event', { + type: 'connection', + timestamp: new Date().toISOString(), + ...connectionData + }); +} + +export default logger; \ No newline at end of file diff --git a/eval-server/src/rpc-client.js b/eval-server/src/rpc-client.js new file mode 100644 index 00000000000..8de13cac81b --- /dev/null +++ b/eval-server/src/rpc-client.js @@ -0,0 +1,122 @@ +import { v4 as uuidv4 } from 'uuid'; +import { CONFIG } from './config.js'; +import { logRpcCall } from './logger.js'; + +export class RpcClient { + constructor() { + this.pendingRequests = new Map(); + } + + async callMethod(ws, method, params, timeout = CONFIG.rpc.timeout) { + return new Promise((resolve, reject) => { + const id = uuidv4(); + const request = { + jsonrpc: '2.0', + method, + params, + id + }; + + // Set up timeout + const timeoutId = setTimeout(() => { + this.pendingRequests.delete(id); + logRpcCall({ + id, + method, + params, + status: 'timeout', + error: 'Request timeout' + }); + reject(new Error(`RPC call timeout after ${timeout}ms`)); + }, timeout); + + // Store the request for correlation + 
this.pendingRequests.set(id, { + resolve, + reject, + timeoutId, + method, + params, + timestamp: Date.now() + }); + + // Send the request + try { + ws.send(JSON.stringify(request)); + logRpcCall({ + id, + method, + params, + status: 'sent' + }); + } catch (error) { + this.pendingRequests.delete(id); + clearTimeout(timeoutId); + logRpcCall({ + id, + method, + params, + status: 'error', + error: error.message + }); + reject(error); + } + }); + } + + handleResponse(message) { + try { + const response = JSON.parse(message); + + // Check if it's a valid JSON-RPC response + if (response.jsonrpc !== '2.0' || !response.id) { + return false; + } + + const pendingRequest = this.pendingRequests.get(response.id); + if (!pendingRequest) { + return false; + } + + // Clean up + this.pendingRequests.delete(response.id); + clearTimeout(pendingRequest.timeoutId); + + // Handle response + if (response.error) { + logRpcCall({ + id: response.id, + method: pendingRequest.method, + params: pendingRequest.params, + status: 'error', + error: response.error, + duration: Date.now() - pendingRequest.timestamp + }); + pendingRequest.reject(new Error(response.error.message || 'RPC error')); + } else { + logRpcCall({ + id: response.id, + method: pendingRequest.method, + params: pendingRequest.params, + status: 'success', + result: response.result, + duration: Date.now() - pendingRequest.timestamp + }); + pendingRequest.resolve(response.result); + } + + return true; + } catch (error) { + return false; + } + } + + cleanup() { + // Cleanup any pending requests + for (const [id, request] of this.pendingRequests) { + clearTimeout(request.timeoutId); + request.reject(new Error('Connection closed')); + } + this.pendingRequests.clear(); + } +} \ No newline at end of file diff --git a/eval-server/src/server.js b/eval-server/src/server.js new file mode 100644 index 00000000000..f15f269242c --- /dev/null +++ b/eval-server/src/server.js @@ -0,0 +1,635 @@ +import { WebSocketServer } from 'ws'; +import { v4 
as uuidv4 } from 'uuid'; +import { CONFIG, validateConfig } from './config.js'; +import { RpcClient } from './rpc-client.js'; +import { LLMEvaluator } from './evaluator.js'; +import { logConnection, logEvaluation } from './logger.js'; +import logger from './logger.js'; +import { ClientManager } from './client-manager.js'; +import { APIServer } from './api-server.js'; + +class EvaluationServer { + constructor() { + this.connectedClients = new Map(); + this.rpcClient = new RpcClient(); + this.evaluator = new LLMEvaluator(); + this.evaluationQueue = []; + this.activeEvaluations = 0; + this.clientManager = new ClientManager('./clients', './evals'); + this.apiServer = new APIServer(this); + } + + start() { + // Validate configuration + const configErrors = validateConfig(); + if (configErrors.length > 0) { + logger.error('Configuration errors:', configErrors); + process.exit(1); + } + + // Create WebSocket server + this.wss = new WebSocketServer({ + port: CONFIG.server.port, + host: CONFIG.server.host + }); + + this.wss.on('connection', this.handleConnection.bind(this)); + this.wss.on('error', (error) => { + logger.error('WebSocket server error', { error: error.message }); + }); + + logger.info(`Evaluation server started on ws://${CONFIG.server.host}:${CONFIG.server.port}`); + + // Start API server + this.apiServer.start(); + + this.startEvaluationProcessor(); + } + + handleConnection(ws, request) { + const connectionId = uuidv4(); // Temporary ID until registration + const connection = { + id: connectionId, + ws, + rpcClient: new RpcClient(), + connectedAt: new Date().toISOString(), + remoteAddress: request.socket.remoteAddress, + registered: false, + clientId: null + }; + + // Store temporarily with connection ID + this.connectedClients.set(connectionId, connection); + + logConnection({ + event: 'connected', + connectionId, + remoteAddress: connection.remoteAddress, + totalConnections: this.connectedClients.size + }); + + ws.on('message', (message) => { + 
this.handleMessage(connection, message); + }); + + ws.on('close', () => { + this.handleDisconnection(connection); + }); + + ws.on('error', (error) => { + logger.error('WebSocket connection error', { + connectionId: connection.id, + clientId: connection.clientId, + error: error.message + }); + }); + + // Send welcome message + this.sendMessage(ws, { + type: 'welcome', + serverId: 'server-001', + version: '1.0.0', + timestamp: new Date().toISOString() + }); + } + + async handleMessage(connection, message) { + try { + // Parse message first + const data = JSON.parse(message); + + // Try to handle as RPC response first + if (data.jsonrpc === '2.0' && (data.result || data.error) && data.id) { + if (connection.rpcClient.handleResponse(message)) { + return; + } + // If RPC client couldn't handle it, log but don't treat as unknown + logger.debug('RPC response could not be handled', { + connectionId: connection.id, + clientId: connection.clientId, + id: data.id + }); + return; + } + + // Handle other message types + switch (data.type) { + case 'register': + await this.handleRegistration(connection, data); + break; + case 'ping': + this.sendMessage(connection.ws, { + type: 'pong', + timestamp: new Date().toISOString() + }); + break; + case 'ready': + if (!connection.registered) { + logger.warn('Received ready signal from unregistered client', { + connectionId: connection.id + }); + return; + } + connection.ready = true; + logger.info('Client ready for evaluations', { + clientId: connection.clientId + }); + // Don't automatically start evaluations - wait for manual trigger + // this.processClientEvaluations(connection.clientId); + break; + case 'status': + this.handleStatusUpdate(connection, data); + break; + case 'auth_verify': + this.handleAuthVerification(connection, data); + break; + default: + logger.warn('Unknown message type', { + connectionId: connection.id, + clientId: connection.clientId, + type: data.type, + messageKeys: Object.keys(data) + }); + } + } catch 
(error) { + logger.warn('Failed to parse message', { + connectionId: connection.id, + error: error.message, + messageLength: message.length + }); + } + } + + async handleRegistration(connection, data) { + try { + const { clientId, secretKey, capabilities } = data; + + logger.info('Registration attempt', { + clientId, + hasSecretKey: !!secretKey, + secretKey: secretKey ? '[REDACTED]' : 'none' + }); + + // Check if client exists (don't validate secret key yet - that happens later) + const validation = this.clientManager.validateClient(clientId, null, true); + if (!validation.valid) { + if (validation.reason === 'Client not found') { + // Auto-create new client configuration + try { + logger.info('Auto-creating new client configuration', { clientId }); + await this.clientManager.createClientWithId(clientId, `DevTools Client ${clientId.substring(0, 8)}`, 'hello'); + + // Send rejection for first-time registration to allow server to set secret key + this.sendMessage(connection.ws, { + type: 'registration_ack', + clientId, + status: 'rejected', + reason: 'New client created. 
Please reconnect to complete registration.', + newClient: true + }); + logger.info('New client configuration created, requesting reconnection', { clientId }); + return; + } catch (error) { + this.sendMessage(connection.ws, { + type: 'registration_ack', + clientId, + status: 'rejected', + reason: `Failed to create client configuration: ${error.message}` + }); + logger.error('Failed to auto-create client', { clientId, error: error.message }); + return; + } + } else { + this.sendMessage(connection.ws, { + type: 'registration_ack', + clientId, + status: 'rejected', + reason: validation.reason + }); + logger.warn('Client registration rejected', { + clientId, + reason: validation.reason + }); + return; + } + } + + // Get client info including the server's secret key for this client + const client = this.clientManager.getClient(clientId); + if (!client) { + this.sendMessage(connection.ws, { + type: 'registration_ack', + clientId, + status: 'rejected', + reason: 'Client configuration not found' + }); + return; + } + + // Send server's secret key to client for verification + this.sendMessage(connection.ws, { + type: 'registration_ack', + clientId, + status: 'auth_required', + serverSecretKey: client.secretKey || '', + message: 'Please verify secret key' + }); + + // Store connection info but don't register yet + connection.clientId = clientId; + connection.capabilities = capabilities; + connection.awaitingAuth = true; + + logger.info('Client registered successfully', { + clientId, + capabilities: capabilities?.tools?.join(', ') + }); + + } catch (error) { + logger.error('Registration error', { error: error.message }); + this.sendMessage(connection.ws, { + type: 'registration_ack', + clientId: data.clientId, + status: 'rejected', + reason: error.message + }); + } + } + + handleStatusUpdate(connection, data) { + if (!connection.registered) return; + + const { evaluationId, status, progress, message } = data; + + logger.info('Evaluation status update', { + clientId: 
connection.clientId, + evaluationId, + status, + progress, + message + }); + + // Update evaluation status in client manager + this.clientManager.updateEvaluationStatus( + connection.clientId, + evaluationId, + status + ); + } + + handleAuthVerification(connection, data) { + if (!connection.awaitingAuth) { + logger.warn('Received auth verification from non-awaiting connection', { + connectionId: connection.id, + clientId: connection.clientId + }); + return; + } + + const { clientId, verified } = data; + + if (verified) { + // Authentication successful - complete registration (skip secret validation since already verified) + const result = this.clientManager.registerClient(clientId, '', connection.capabilities, true); + + connection.registered = true; + connection.awaitingAuth = false; + + // Move connection to use clientId as key + this.connectedClients.delete(connection.id); + this.connectedClients.set(clientId, connection); + + // Send final acknowledgment + this.sendMessage(connection.ws, { + type: 'registration_ack', + clientId, + status: 'accepted', + message: result.clientName ? 
`Welcome ${result.clientName}` : 'Client authenticated successfully', + evaluationsCount: result.evaluationsCount + }); + + logger.info('Client authenticated and registered', { clientId }); + } else { + // Authentication failed + this.sendMessage(connection.ws, { + type: 'registration_ack', + clientId, + status: 'rejected', + reason: 'Secret key verification failed' + }); + + logger.warn('Client authentication failed', { clientId }); + connection.ws.close(1008, 'Authentication failed'); + } + } + + handleDisconnection(connection) { + connection.rpcClient.cleanup(); + + // Remove by connection ID or client ID + if (connection.registered && connection.clientId) { + this.connectedClients.delete(connection.clientId); + } else { + this.connectedClients.delete(connection.id); + } + + logConnection({ + event: 'disconnected', + connectionId: connection.id, + clientId: connection.clientId, + totalConnections: this.connectedClients.size + }); + } + + sendMessage(ws, data) { + if (ws.readyState === ws.OPEN) { + ws.send(JSON.stringify(data)); + } + } + + async processClientEvaluations(clientId) { + const client = this.connectedClients.get(clientId); + if (!client || !client.ready) return; + + // Get next pending evaluation for this client + const evaluation = this.clientManager.getNextEvaluation(clientId); + if (!evaluation) { + logger.info('No pending evaluations for client', { clientId }); + return; + } + + // Execute the evaluation + try { + await this.executeEvaluation(client, evaluation); + + // Process next evaluation after a delay + setTimeout(() => { + this.processClientEvaluations(clientId); + }, 1000); + } catch (error) { + logger.error('Failed to execute evaluation', { + clientId, + evaluationId: evaluation.id, + error: error.message + }); + } + } + + async executeEvaluation(client, evaluation) { + const startTime = Date.now(); + const rpcId = `rpc-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`; + + try { + logger.info('Starting evaluation', { + 
clientId: client.clientId, + evaluationId: evaluation.id, + tool: evaluation.tool + }); + + // Update status to running + this.clientManager.updateEvaluationStatus( + client.clientId, + evaluation.id, + 'running' + ); + + // Prepare RPC request + const rpcRequest = { + jsonrpc: '2.0', + method: 'evaluate', + params: { + evaluationId: evaluation.id, + name: evaluation.name, + url: evaluation.target?.url || evaluation.url, + tool: evaluation.tool, + input: evaluation.input, + timeout: evaluation.timeout || 30000, + metadata: { + tags: evaluation.metadata?.tags || [], + retries: evaluation.settings?.retry_policy?.max_retries || 0 + } + }, + id: rpcId + }; + + // Send RPC request with proper timeout + const response = await client.rpcClient.callMethod( + client.ws, + 'evaluate', + rpcRequest.params, + evaluation.timeout || 45000 + ); + + logger.info('Evaluation response received', { + clientId: client.clientId, + evaluationId: evaluation.id, + executionTime: response.executionTime + }); + + // Validate response based on YAML configuration + let validationResult = null; + if (evaluation.validation) { + validationResult = await this.validateResponse( + response, + evaluation + ); + } + + // Update evaluation status + this.clientManager.updateEvaluationStatus( + client.clientId, + evaluation.id, + 'completed', + { + response, + validation: validationResult, + duration: Date.now() - startTime + } + ); + + // Log evaluation + logEvaluation({ + evaluationId: evaluation.id, + clientId: client.clientId, + name: evaluation.name, + tool: evaluation.tool, + response, + validation: validationResult, + timestamp: new Date().toISOString(), + duration: Date.now() - startTime + }); + + } catch (error) { + logger.error('Evaluation failed', { + clientId: client.clientId, + evaluationId: evaluation.id, + error: error.message + }); + + // Update status to failed + this.clientManager.updateEvaluationStatus( + client.clientId, + evaluation.id, + 'failed', + { + error: error.message, + 
duration: Date.now() - startTime + } + ); + + throw error; + } + } + + async validateResponse(response, evaluation) { + const validation = evaluation.validation; + + if (validation.type === 'llm-judge' || validation.type === 'hybrid') { + const llmConfig = validation.llm_judge || validation.llm_judge; + + // Prepare prompt with criteria + const criteria = llmConfig.criteria || []; + const task = `${evaluation.name} - ${evaluation.description || ''}`; + + // Use LLM evaluator + const judgeResult = await this.evaluator.evaluate( + task, + JSON.stringify(response.output || response), + { + criteria, + model: llmConfig.model + } + ); + + return { + type: 'llm-judge', + result: judgeResult, + passed: judgeResult.score >= 0.7 // Configurable threshold + }; + } + + // Add other validation types as needed + return null; + } + + async evaluateAllClients(task) { + const readyClients = Array.from(this.connectedClients.values()) + .filter(client => client.ready); + + if (readyClients.length === 0) { + throw new Error('No ready clients available'); + } + + logger.info(`Starting evaluation for ${readyClients.length} clients`, { task }); + + // If task looks like an evaluation ID, run that specific evaluation + if (task && task.includes('-')) { + const evaluationPromises = readyClients.map(async (client) => { + try { + // Find the specific evaluation by ID + const evaluation = this.clientManager.getClientEvaluations(client.clientId) + .find(e => e.id === task); + + if (!evaluation) { + logger.warn(`Evaluation '${task}' not found for client ${client.clientId}`); + return { + error: `Evaluation '${task}' not found`, + clientId: client.clientId + }; + } + + // Reset evaluation status to pending + this.clientManager.updateEvaluationStatus(client.clientId, evaluation.id, 'pending'); + + // Execute the specific evaluation + await this.executeEvaluation(client, evaluation); + + return { + success: true, + clientId: client.clientId, + evaluationId: evaluation.id + }; + } catch (error) { 
+ return { + error: error.message, + clientId: client.clientId + }; + } + }); + + const results = await Promise.all(evaluationPromises); + + logger.info('Specific evaluation completed', { + evaluationId: task, + totalClients: readyClients.length, + successfulEvaluations: results.filter(r => !r.error).length, + failedEvaluations: results.filter(r => r.error).length + }); + + return results; + } + + // Otherwise, process all pending evaluations (original behavior) + const evaluationPromises = readyClients.map(client => + this.processClientEvaluations(client.clientId).catch(error => ({ + error: error.message, + clientId: client.clientId + })) + ); + + const results = await Promise.all(evaluationPromises); + + logger.info('Batch evaluation completed', { + totalClients: readyClients.length, + successfulEvaluations: results.filter(r => !r.error).length, + failedEvaluations: results.filter(r => r.error).length + }); + + return results; + } + + startEvaluationProcessor() { + // This method can be extended to process evaluation queues + // For now, it's a placeholder for future batch processing functionality + logger.info('Evaluation processor started'); + } + + getStatus() { + return { + connectedClients: this.connectedClients.size, + readyClients: Array.from(this.connectedClients.values()) + .filter(client => client.ready).length, + activeEvaluations: this.activeEvaluations + }; + } + + getClientManager() { + return this.clientManager; + } + + stop() { + if (this.wss) { + this.wss.close(); + logger.info('Evaluation server stopped'); + } + + if (this.apiServer) { + this.apiServer.stop(); + } + } +} + +// Start the server if this file is run directly +if (import.meta.url === `file://${process.argv[1]}`) { + const server = new EvaluationServer(); + + process.on('SIGINT', () => { + logger.info('Received SIGINT, shutting down gracefully'); + server.stop(); + process.exit(0); + }); + + server.start(); +} + +export { EvaluationServer }; \ No newline at end of file diff --git 
a/eval-server/templates/default-client.yaml b/eval-server/templates/default-client.yaml new file mode 100644 index 00000000000..a74e0defebf --- /dev/null +++ b/eval-server/templates/default-client.yaml @@ -0,0 +1,58 @@ +# Default client configuration template +# This file is used as a template when creating new clients + +client: + id: "{CLIENT_ID}" + name: "{CLIENT_NAME}" + secret_key: "{SECRET_KEY}" # Optional + description: "Auto-generated client configuration" + +settings: + max_concurrent_evaluations: 3 + default_timeout: 30000 + retry_policy: + max_retries: 2 + backoff_multiplier: 2 + initial_delay: 1000 + +evaluations: + # Example evaluation - disabled by default + - id: "example-schema-extraction" + name: "Example Schema Extraction" + description: "A sample evaluation for schema extraction" + enabled: false + + target: + url: "https://example.com" + wait_for: "networkidle" + wait_timeout: 5000 + + tool: "extract_schema_data" + timeout: 30000 + + input: + schema: + type: "object" + properties: + title: + type: "string" + description: "Page title" + content: + type: "string" + description: "Main content" + + schedule: + type: "on_demand" + + validation: + type: "llm-judge" + llm_judge: + model: "gpt-4o-mini" + temperature: 0.3 + criteria: + - "Title should be extracted correctly" + - "Content should be meaningful and not empty" + + metadata: + tags: ["example", "schema-extraction"] + priority: "normal" \ No newline at end of file diff --git a/front_end/panels/ai_chat/BUILD.gn b/front_end/panels/ai_chat/BUILD.gn index 9abdfea6200..2443386510d 100644 --- a/front_end/panels/ai_chat/BUILD.gn +++ b/front_end/panels/ai_chat/BUILD.gn @@ -83,6 +83,10 @@ devtools_module("ai_chat") { "common/log.ts", "common/context.ts", "common/page.ts", + "common/WebSocketRPCClient.ts", + "common/EvaluationConfig.ts", + "evaluation/EvaluationProtocol.ts", + "evaluation/EvaluationAgent.ts", "tracing/TracingProvider.ts", "tracing/LangfuseProvider.ts", "tracing/TracingConfig.ts", @@ 
-174,6 +178,10 @@ _ai_chat_sources = [ "common/log.ts", "common/context.ts", "common/page.ts", + "common/WebSocketRPCClient.ts", + "common/EvaluationConfig.ts", + "evaluation/EvaluationProtocol.ts", + "evaluation/EvaluationAgent.ts", "tracing/TracingProvider.ts", "tracing/LangfuseProvider.ts", "tracing/TracingConfig.ts", diff --git a/front_end/panels/ai_chat/LLM/LLMErrorHandler.ts b/front_end/panels/ai_chat/LLM/LLMErrorHandler.ts index e45b4bb6e1f..5b9d926be9b 100644 --- a/front_end/panels/ai_chat/LLM/LLMErrorHandler.ts +++ b/front_end/panels/ai_chat/LLM/LLMErrorHandler.ts @@ -35,7 +35,7 @@ const ERROR_SPECIFIC_RETRY_CONFIGS: ErrorRetryConfig = { baseDelayMs: 60000, // 60 seconds for rate limits maxDelayMs: 300000, // Max 5 minutes backoffMultiplier: 1, // No exponential backoff for rate limits - jitterMs: 5000, // Small jitter to avoid thundering herd + jitterMs: 1000, // Small jitter to avoid thundering herd }, [ErrorType.NETWORK_ERROR]: { @@ -228,10 +228,7 @@ export class LLMRetryManager { const result = await operation(); if (attempt > 1 && this.config.enableLogging) { - logger.info(`Operation succeeded on attempt ${attempt}`, { - context: options.context, - totalTime: Date.now() - startTime, - }); + logger.info(`Operation succeeded on attempt ${attempt}${options.context ? ` (context: ${options.context})` : ''} - total time: ${Date.now() - startTime}ms`); } return result; @@ -240,11 +237,7 @@ export class LLMRetryManager { const errorType = LLMErrorClassifier.classifyError(lastError); if (this.config.enableLogging) { - logger.error(`Operation failed on attempt ${attempt}:`, { - error: lastError.message, - errorType, - context: options.context, - }); + logger.error(`Operation failed on attempt ${attempt}: ${lastError instanceof Error ? lastError.message : String(lastError)} (type: ${errorType}${options.context ? 
`, context: ${options.context}` : ''})`); } // Check if we should retry this error type diff --git a/front_end/panels/ai_chat/LLM/OpenAIProvider.ts b/front_end/panels/ai_chat/LLM/OpenAIProvider.ts index f0e4f881160..9bd558b157a 100644 --- a/front_end/panels/ai_chat/LLM/OpenAIProvider.ts +++ b/front_end/panels/ai_chat/LLM/OpenAIProvider.ts @@ -301,7 +301,7 @@ export class OpenAIProvider extends LLMBaseProvider { if (!response.ok) { const errorData = await response.json(); - logger.error('OpenAI API error:', errorData); + logger.error('OpenAI API error:', JSON.stringify(errorData)); const error = new Error(`OpenAI API error: ${response.statusText} - ${errorData?.error?.message || 'Unknown error'}`); // Create tracing observation for API errors @@ -319,7 +319,7 @@ export class OpenAIProvider extends LLMBaseProvider { return data; } catch (error) { - logger.error('OpenAI API request failed:', error); + logger.error('OpenAI API request failed:', error instanceof Error ? error.message : String(error)); // Create tracing observation for network/fetch errors if (error instanceof Error) { diff --git a/front_end/panels/ai_chat/common/EvaluationConfig.ts b/front_end/panels/ai_chat/common/EvaluationConfig.ts new file mode 100644 index 00000000000..8c1e7e7aada --- /dev/null +++ b/front_end/panels/ai_chat/common/EvaluationConfig.ts @@ -0,0 +1,258 @@ +// Copyright 2025 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +import { createLogger } from '../core/Logger.js'; +import { WebSocketRPCClient } from './WebSocketRPCClient.js'; +import { createAndConnectEvaluationAgent, getEvaluationAgent, disconnectEvaluationAgent } from '../evaluation/EvaluationAgent.js'; + +const logger = createLogger('EvaluationConfig'); + +// Localized strings for evaluation configuration +// TODO: Move to proper i18n system when evaluation UI is added to SettingsDialog +const EvaluationStrings = { + testingConnection: 'Testing connection...', + connectionSuccessful: 'Connection successful', + connectionFailed: 'Connection failed', + unknownConnectionError: 'Unknown connection error', + evaluationNotEnabled: 'Evaluation is not enabled', + clientIdNotAvailable: 'Client ID not available', +}; + +export interface EvaluationConfiguration { + enabled: boolean; + endpoint: string; + secretKey?: string; + clientId?: string; +} + +class EvaluationConfigStore { + private static instance: EvaluationConfigStore; + private config: EvaluationConfiguration = { + enabled: false, + endpoint: 'ws://localhost:8080', + secretKey: '', + clientId: '' + }; + private rpcClient: WebSocketRPCClient | null = null; + + private constructor() { + this.loadFromLocalStorage(); + this.ensureClientId(); + } + + static getInstance(): EvaluationConfigStore { + if (!EvaluationConfigStore.instance) { + EvaluationConfigStore.instance = new EvaluationConfigStore(); + } + return EvaluationConfigStore.instance; + } + + private loadFromLocalStorage(): void { + try { + const enabled = localStorage.getItem('ai_chat_evaluation_enabled') === 'true'; + const endpoint = localStorage.getItem('ai_chat_evaluation_endpoint') || 'ws://localhost:8080'; + const secretKey = localStorage.getItem('ai_chat_evaluation_secret_key') || ''; + const clientId = localStorage.getItem('ai_chat_evaluation_client_id') || ''; + + this.config = { + enabled, + endpoint, + secretKey, + clientId + }; + + logger.info('Loaded evaluation config from localStorage'); + } catch 
(error) { + logger.warn('Failed to load evaluation config from localStorage:', error); + } + } + + getConfig(): EvaluationConfiguration { + return { ...this.config }; + } + + setConfig(newConfig: EvaluationConfiguration): void { + // Preserve existing client ID if new config doesn't have one + const preservedClientId = newConfig.clientId || this.config.clientId; + + this.config = { ...newConfig, clientId: preservedClientId }; + + // Ensure we have a client ID (generate if needed) + this.ensureClientId(); + + logger.info('Evaluation configuration updated', { + enabled: this.config.enabled, + endpoint: this.config.endpoint, + clientId: this.config.clientId + }); + + // Save to localStorage for persistence + try { + localStorage.setItem('ai_chat_evaluation_enabled', String(this.config.enabled)); + localStorage.setItem('ai_chat_evaluation_endpoint', this.config.endpoint); + localStorage.setItem('ai_chat_evaluation_secret_key', this.config.secretKey || ''); + localStorage.setItem('ai_chat_evaluation_client_id', this.config.clientId || ''); + } catch (error) { + logger.warn('Failed to save evaluation config to localStorage:', error); + } + + // Disconnect existing client if configuration changed + if (this.rpcClient) { + this.rpcClient.disconnect(); + this.rpcClient = null; + } + } + + private ensureClientId(): void { + if (!this.config.clientId) { + // Generate a unique client ID for this installation + const clientId = this.generateUUID(); + this.config.clientId = clientId; + + try { + localStorage.setItem('ai_chat_evaluation_client_id', clientId); + logger.info('Generated and saved new client ID:', clientId); + } catch (error) { + logger.warn('Failed to save client ID to localStorage:', error); + } + } else { + logger.debug('Using existing client ID:', this.config.clientId); + } + } + + private generateUUID(): string { + // Generate UUID v4 + return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { + const r = Math.random() * 16 | 0; + const v = c 
=== 'x' ? r : (r & 0x3 | 0x8); + return v.toString(16); + }); + } + + isEnabled(): boolean { + return this.config.enabled; + } + + async connect(): Promise { + if (!this.config.enabled) { + throw new Error(EvaluationStrings.evaluationNotEnabled); + } + + // Ensure client ID exists + this.ensureClientId(); + + if (!this.config.clientId) { + throw new Error(EvaluationStrings.clientIdNotAvailable); + } + + // Check if already connected + const existingAgent = getEvaluationAgent(); + if (existingAgent && existingAgent.isConnected()) { + logger.info('Already connected to evaluation service'); + return; + } + + // Create and connect evaluation agent + await createAndConnectEvaluationAgent( + this.config.clientId, + this.config.endpoint, + this.config.secretKey + ); + + logger.info('Connected to evaluation service with client ID:', this.config.clientId); + } + + disconnect(): void { + disconnectEvaluationAgent(); + logger.info('Disconnected from evaluation service'); + } + + getClientId(): string | undefined { + return this.config.clientId; + } + + isConnected(): boolean { + const agent = getEvaluationAgent(); + return agent ? agent.isConnected() : false; + } + + async testConnection(): Promise<{ success: boolean; message: string }> { + try { + const client = new WebSocketRPCClient({ + endpoint: this.config.endpoint, + secretKey: this.config.secretKey, + connectionTimeout: 5000 + }); + + await client.connect(); + + // Try to make a ping call to test the connection + try { + await client.call('ping', {}, 5000); + } catch (error) { + // Ping might not be implemented, that's okay + logger.debug('Ping method not available, connection still valid'); + } + + client.disconnect(); + return { success: true, message: EvaluationStrings.connectionSuccessful }; + } catch (error) { + const message = error instanceof Error ? 
error.message : EvaluationStrings.unknownConnectionError; + logger.error('Connection test failed:', error); + return { success: false, message }; + } + } +} + +export function getEvaluationConfig(): EvaluationConfiguration { + return EvaluationConfigStore.getInstance().getConfig(); +} + +export function setEvaluationConfig(config: EvaluationConfiguration): void { + EvaluationConfigStore.getInstance().setConfig(config); +} + +export function isEvaluationEnabled(): boolean { + return EvaluationConfigStore.getInstance().isEnabled(); +} + +export async function connectToEvaluationService(): Promise { + return EvaluationConfigStore.getInstance().connect(); +} + +export function disconnectFromEvaluationService(): void { + EvaluationConfigStore.getInstance().disconnect(); +} + +export function getEvaluationClientId(): string | undefined { + return EvaluationConfigStore.getInstance().getClientId(); +} + +export function isEvaluationConnected(): boolean { + return EvaluationConfigStore.getInstance().isConnected(); +} + +export async function testEvaluationConnection(): Promise<{ success: boolean; message: string }> { + return EvaluationConfigStore.getInstance().testConnection(); +} + +// Expose configuration functions globally for console access +declare global { + interface Window { + getEvaluationConfig?: typeof getEvaluationConfig; + setEvaluationConfig?: typeof setEvaluationConfig; + isEvaluationEnabled?: typeof isEvaluationEnabled; + connectToEvaluationService?: typeof connectToEvaluationService; + disconnectFromEvaluationService?: typeof disconnectFromEvaluationService; + } +} + +// Make functions available globally in development +if (typeof window !== 'undefined') { + window.getEvaluationConfig = getEvaluationConfig; + window.setEvaluationConfig = setEvaluationConfig; + window.isEvaluationEnabled = isEvaluationEnabled; + window.connectToEvaluationService = connectToEvaluationService; + window.disconnectFromEvaluationService = disconnectFromEvaluationService; +} \ No 
newline at end of file diff --git a/front_end/panels/ai_chat/common/WebSocketRPCClient.ts b/front_end/panels/ai_chat/common/WebSocketRPCClient.ts new file mode 100644 index 00000000000..197e2d6a3c9 --- /dev/null +++ b/front_end/panels/ai_chat/common/WebSocketRPCClient.ts @@ -0,0 +1,290 @@ +// Copyright 2025 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +import { createLogger } from '../core/Logger.js'; + +const logger = createLogger('WebSocketRPCClient'); + +export interface RPCRequest { + id: string; + method: string; + params?: any; +} + +export interface RPCResponse { + id: string; + result?: any; + error?: { + code: number; + message: string; + data?: any; + }; +} + +export interface WebSocketRPCClientOptions { + endpoint: string; + secretKey?: string; + reconnectAttempts?: number; + reconnectDelay?: number; + connectionTimeout?: number; +} + +export class WebSocketRPCClient { + private websocket: WebSocket | null = null; + private endpoint: string; + private secretKey?: string; + private reconnectAttempts: number; + private reconnectDelay: number; + private connectionTimeout: number; + private currentReconnectAttempt = 0; + private reconnectTimeoutId: number | null = null; + private pendingRequests = new Map void; + reject: (error: Error) => void; + timeout: number; + }>(); + private isConnecting = false; + private isConnected = false; + private eventListeners = new Map>(); + + constructor(options: WebSocketRPCClientOptions) { + this.endpoint = options.endpoint; + this.secretKey = options.secretKey; + this.reconnectAttempts = options.reconnectAttempts ?? 3; + this.reconnectDelay = options.reconnectDelay ?? 1000; + this.connectionTimeout = options.connectionTimeout ?? 
5000; + } + + public async connect(): Promise { + if (this.isConnecting || this.isConnected) { + logger.warn('Already connecting or connected'); + return; + } + + this.isConnecting = true; + + return new Promise((resolve, reject) => { + const connectionTimer = setTimeout(() => { + this.isConnecting = false; + reject(new Error('Connection timeout')); + }, this.connectionTimeout); + + try { + this.websocket = new WebSocket(this.endpoint); + + this.websocket.onopen = () => { + clearTimeout(connectionTimer); + this.isConnecting = false; + this.isConnected = true; + this.currentReconnectAttempt = 0; + logger.info('WebSocket connected', { endpoint: this.endpoint, readyState: this.websocket?.readyState }); + this.emit('connected'); + + // Note: Authentication is handled via the register message in the evaluation protocol + + resolve(); + }; + + this.websocket.onmessage = (event) => { + logger.debug('Received WebSocket message:', event.data); + this.handleMessage(event); + }; + + this.websocket.onclose = (event) => { + this.isConnected = false; + logger.warn('WebSocket connection closed', { + code: event.code, + reason: event.reason, + wasClean: event.wasClean, + endpoint: this.endpoint + }); + this.emit('disconnected'); + + if (!event.wasClean && this.currentReconnectAttempt < this.reconnectAttempts) { + this.scheduleReconnect(); + } + }; + + this.websocket.onerror = (error) => { + clearTimeout(connectionTimer); + this.isConnecting = false; + const errorDetails = { + type: error.type, + readyState: this.websocket?.readyState, + url: this.endpoint, + timestamp: new Date().toISOString(), + message: 'WebSocket connection error' + }; + logger.error('WebSocket error:', JSON.stringify(errorDetails)); + this.emit('error', errorDetails); + + if (this.isConnecting) { + reject(new Error('WebSocket connection failed')); + } + }; + + } catch (error) { + clearTimeout(connectionTimer); + this.isConnecting = false; + reject(error); + } + }); + } + + public disconnect(): void { + if 
(this.reconnectTimeoutId) { + clearTimeout(this.reconnectTimeoutId); + this.reconnectTimeoutId = null; + } + + if (this.websocket) { + this.websocket.close(1000, 'Manual disconnect'); + this.websocket = null; + } + + this.isConnected = false; + this.isConnecting = false; + this.currentReconnectAttempt = 0; + + // Reject all pending requests + for (const [id, request] of this.pendingRequests) { + clearTimeout(request.timeout); + request.reject(new Error('Connection closed')); + } + this.pendingRequests.clear(); + } + + public async call(method: string, params?: any, timeout = 30000): Promise { + if (!this.isConnected) { + throw new Error('WebSocket not connected'); + } + + const id = this.generateRequestId(); + const request: RPCRequest = { id, method, params }; + + return new Promise((resolve, reject) => { + const timeoutId = setTimeout(() => { + this.pendingRequests.delete(id); + reject(new Error(`RPC call timeout: ${method}`)); + }, timeout); + + this.pendingRequests.set(id, { + resolve, + reject, + timeout: timeoutId, + }); + + try { + this.websocket!.send(JSON.stringify(request)); + logger.debug('Sent RPC request', { method, id }); + } catch (error) { + this.pendingRequests.delete(id); + clearTimeout(timeoutId); + reject(error); + } + }); + } + + public send(message: any): void { + if (!this.isConnected || !this.websocket) { + throw new Error('WebSocket not connected'); + } + + this.websocket.send(JSON.stringify(message)); + logger.debug('Sent message:', message); + } + + public isConnectionReady(): boolean { + return this.isConnected && this.websocket?.readyState === WebSocket.OPEN; + } + + public on(event: string, callback: Function): void { + if (!this.eventListeners.has(event)) { + this.eventListeners.set(event, new Set()); + } + this.eventListeners.get(event)!.add(callback); + } + + public off(event: string, callback: Function): void { + const listeners = this.eventListeners.get(event); + if (listeners) { + listeners.delete(callback); + } + } + + private 
emit(event: string, data?: any): void { + const listeners = this.eventListeners.get(event); + if (listeners) { + for (const callback of listeners) { + try { + callback(data); + } catch (error) { + logger.error('Error in event listener:', error); + } + } + } + } + + private handleMessage(event: MessageEvent): void { + try { + const message = JSON.parse(event.data); + logger.debug('Received WebSocket message', { type: message.type, id: message.id }); + + // Check if this is an RPC response (has id and either result or error) + if (message.id && (message.hasOwnProperty('result') || message.hasOwnProperty('error'))) { + const response: RPCResponse = message; + + const pendingRequest = this.pendingRequests.get(response.id); + if (!pendingRequest) { + logger.warn('Received response for unknown request ID:', response.id); + return; + } + + this.pendingRequests.delete(response.id); + clearTimeout(pendingRequest.timeout); + + if (response.error) { + pendingRequest.reject(new Error(`RPC Error: ${response.error.message} (Code: ${response.error.code})`)); + } else { + pendingRequest.resolve(response.result); + } + } else { + // This is a general WebSocket message (like welcome, evaluation requests, etc.) 
+ this.emit('message', message); + } + + } catch (error) { + logger.error('Failed to parse WebSocket message:', error); + } + } + + + private scheduleReconnect(): void { + if (this.reconnectTimeoutId) { + return; + } + + this.currentReconnectAttempt++; + const delay = this.reconnectDelay * Math.pow(2, this.currentReconnectAttempt - 1); + + logger.info(`Scheduling reconnect attempt ${this.currentReconnectAttempt}/${this.reconnectAttempts} in ${delay}ms`); + + this.reconnectTimeoutId = setTimeout(() => { + this.reconnectTimeoutId = null; + this.connect().catch((error) => { + logger.error('Reconnect failed:', error); + if (this.currentReconnectAttempt < this.reconnectAttempts) { + this.scheduleReconnect(); + } else { + logger.error('Max reconnect attempts reached'); + this.emit('reconnect_failed'); + } + }); + }, delay); + } + + private generateRequestId(): string { + return `rpc_${Date.now()}_${Math.random().toString(36).substring(2, 9)}`; + } +} \ No newline at end of file diff --git a/front_end/panels/ai_chat/evaluation/EvaluationAgent.ts b/front_end/panels/ai_chat/evaluation/EvaluationAgent.ts new file mode 100644 index 00000000000..768fbf0cceb --- /dev/null +++ b/front_end/panels/ai_chat/evaluation/EvaluationAgent.ts @@ -0,0 +1,674 @@ +// Copyright 2025 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +import { WebSocketRPCClient } from '../common/WebSocketRPCClient.js'; +import { getEvaluationConfig, getEvaluationClientId } from '../common/EvaluationConfig.js'; +import { ToolRegistry } from '../agent_framework/ConfigurableAgentTool.js'; +import { AgentService } from '../core/AgentService.js'; +import { createLogger } from '../core/Logger.js'; +import { createTracingProvider, withTracingContext, isTracingEnabled, getTracingConfig } from '../tracing/TracingConfig.js'; +import type { TracingProvider, TracingContext } from '../tracing/TracingProvider.js'; +import { + RegisterMessage, + ReadyMessage, + StatusMessage, + WelcomeMessage, + RegistrationAckMessage, + AuthVerifyMessage, + EvaluationRequest, + EvaluationSuccessResponse, + EvaluationErrorResponse, + ErrorCodes, + isWelcomeMessage, + isRegistrationAckMessage, + isEvaluationRequest, + isPongMessage, + createRegisterMessage, + createReadyMessage, + createAuthVerifyMessage, + createStatusMessage, + createSuccessResponse, + createErrorResponse +} from './EvaluationProtocol.js'; + +const logger = createLogger('EvaluationAgent'); + +export interface EvaluationAgentOptions { + clientId: string; + endpoint: string; + secretKey?: string; +} + +export class EvaluationAgent { + private client: WebSocketRPCClient | null = null; + private clientId: string; + private endpoint: string; + private secretKey?: string; + private registered = false; + private ready = false; + private activeEvaluations = new Map(); + private heartbeatInterval: number | null = null; + private authPromise: Promise | null = null; + private authResolve: ((value?: void) => void) | null = null; + private authReject: ((reason?: any) => void) | null = null; + private tracingProvider: TracingProvider; + + constructor(options: EvaluationAgentOptions) { + this.clientId = options.clientId; + this.endpoint = options.endpoint; + this.secretKey = options.secretKey; + this.tracingProvider = createTracingProvider(); + + logger.info('EvaluationAgent created 
with tracing provider', {
+      clientId: this.clientId,
+      providerType: this.tracingProvider.constructor.name,
+      tracingEnabled: isTracingEnabled(),
+      tracingConfig: getTracingConfig()
+    });
+  }
+
+  /**
+   * Connects to the evaluation server and waits for the registration
+   * handshake to finish.
+   *
+   * Creates a fresh WebSocketRPCClient, installs the event handlers, opens the
+   * socket, then waits on the auth promise that handleRegistrationAck /
+   * handleAuthRequest / disconnect settle. Returns immediately (after a
+   * warning) when the agent is already connected and registered.
+   *
+   * Rejects if the socket cannot be opened, if registration or key
+   * verification fails, or if disconnect() is called before the handshake
+   * completes.
+   */
+  public async connect(): Promise<void> {
+    if (this.client && this.client.isConnectionReady() && this.registered) {
+      logger.warn('Already connected and authenticated');
+      return;
+    }
+
+    logger.info('Connecting to evaluation server', {
+      endpoint: this.endpoint,
+      clientId: this.clientId
+    });
+
+    // Create authentication promise
+    this.authPromise = new Promise<void>((resolve, reject) => {
+      this.authResolve = resolve;
+      this.authReject = reject;
+    });
+    // disconnect() may reject this promise before the code below has started
+    // awaiting it (e.g. while the socket is still opening). A no-op handler
+    // keeps that from being reported as an unhandled rejection; the `await`
+    // at the end of this method still observes the rejection.
+    this.authPromise.catch(() => {});
+
+    this.client = new WebSocketRPCClient({
+      endpoint: this.endpoint,
+      secretKey: this.secretKey,
+      reconnectAttempts: 5,
+      reconnectDelay: 2000
+    });
+
+    // Setup event handlers
+    this.setupEventHandlers();
+
+    // Connect to server
+    await this.client.connect();
+
+    // Wait for authentication to complete
+    await this.authPromise;
+  }
+
+  /**
+   * Tears the agent down: stops the heartbeat, closes the RPC client, resets
+   * the registered/ready flags, forgets active evaluations and fails any
+   * connect() call that is still waiting for the handshake.
+   */
+  public disconnect(): void {
+    if (this.heartbeatInterval) {
+      clearInterval(this.heartbeatInterval);
+      this.heartbeatInterval = null;
+    }
+
+    if (this.client) {
+      this.client.disconnect();
+      this.client = null;
+    }
+
+    this.registered = false;
+    this.ready = false;
+    this.activeEvaluations.clear();
+
+    // A connect() caller may still be awaiting the handshake. Settle its
+    // promise so it gets an error instead of waiting forever. This is a no-op
+    // when the handshake already resolved or rejected, because those paths
+    // null out authReject.
+    if (this.authReject) {
+      this.authReject(new Error('Disconnected before authentication completed'));
+      this.authResolve = null;
+      this.authReject = null;
+    }
+
+    logger.info('Disconnected from evaluation server');
+  }
+
+  public isConnected(): boolean {
+    return (this.client?.isConnectionReady() && this.registered) || false;
+  }
+
+  public isRegistered(): boolean {
+    return this.registered;
+  }
+
+  public isReady(): boolean {
+    return this.ready;
+  }
+
+  private setupEventHandlers(): void {
+    if (!this.client) return;
+
+    this.client.on('connected', () => {
+      logger.info('WebSocket connected, waiting for welcome message');
+    });
+
+    this.client.on('disconnected', () => {
+      logger.info('WebSocket disconnected');
+      this.registered = false;
+      this.ready = false;
+      this.stopHeartbeat();
+    });
+
+    this.client.on('message', (data: any) => {
+
this.handleMessage(data); + }); + + this.client.on('error', (error: any) => { + logger.error('WebSocket error:', typeof error === 'object' ? JSON.stringify(error) : error); + }); + } + + private async handleMessage(message: any): Promise { + try { + if (isWelcomeMessage(message)) { + logger.info('Received welcome message from server', { + serverId: message.serverId, + version: message.version + }); + await this.register(); + } + else if (isRegistrationAckMessage(message)) { + this.handleRegistrationAck(message); + } + else if (isEvaluationRequest(message)) { + await this.handleEvaluationRequest(message); + } + else if (isPongMessage(message)) { + logger.debug('Received pong'); + } + else { + logger.warn('Unknown message type:', message); + } + } catch (error) { + logger.error('Error handling message:', error instanceof Error ? error.message : String(error)); + } + } + + private async register(): Promise { + if (!this.client) return; + + const tools: string[] = []; + + const registerMessage = createRegisterMessage( + this.clientId, + { + tools, + maxConcurrency: 3, + version: '1.0.0' + } + // Note: No secret key sent - server will send its key for client verification + ); + + logger.info('Registering with server', { + clientId: this.clientId, + tools: tools.join(', ') + }); + + this.client.send(registerMessage); + } + + private handleRegistrationAck(message: RegistrationAckMessage): void { + if (message.status === 'accepted') { + logger.info('Registration accepted', { + evaluationsCount: message.evaluationsCount + }); + this.registered = true; + this.sendReady(); + this.startHeartbeat(); + + // Resolve auth promise - connection is complete + if (this.authResolve) { + this.authResolve(); + this.authResolve = null; + this.authReject = null; + } + } else if (message.status === 'auth_required') { + logger.info('Server requesting authentication verification'); + this.handleAuthRequest(message); + } else { + if (message.newClient) { + logger.info('New client created, will 
retry connection', {
+          reason: message.reason
+        });
+        // For new clients, the server created the config and asks to reconnect
+        // We can attempt to reconnect after a short delay
+        setTimeout(() => {
+          if (this.client) {
+            this.register();
+          }
+        }, 1000);
+      } else {
+        logger.error('Registration rejected', {
+          reason: message.reason
+        });
+
+        // Reject auth promise - authentication failed
+        if (this.authReject) {
+          this.authReject(new Error(`Registration rejected: ${message.reason}`));
+          this.authResolve = null;
+          this.authReject = null;
+        }
+
+        this.disconnect();
+      }
+    }
+  }
+
+  /**
+   * Rejects the pending authentication promise, if there is one, and clears
+   * both settle callbacks so it cannot be settled a second time.
+   */
+  private rejectPendingAuth(error: Error): void {
+    if (this.authReject) {
+      this.authReject(error);
+      this.authResolve = null;
+      this.authReject = null;
+    }
+  }
+
+  /**
+   * Handles a registration ack with status 'auth_required'.
+   *
+   * Compares the secret key sent by the server with the key from
+   * getEvaluationConfig() and reports the outcome to the server in an
+   * auth_verify message. Every failure path rejects the pending
+   * authentication promise so a waiting connect() call does not hang. The
+   * caller does not await this method, so it must not throw.
+   *
+   * @param message Registration ack carrying `serverSecretKey` and `clientId`.
+   */
+  private async handleAuthRequest(message: RegistrationAckMessage): Promise<void> {
+    if (!message.serverSecretKey) {
+      logger.error('Server did not provide secret key for verification');
+      // Fail the handshake explicitly before tearing the connection down.
+      this.rejectPendingAuth(new Error('Server did not provide secret key for verification'));
+      this.disconnect();
+      return;
+    }
+
+    // Get the client's configured secret key from EvaluationConfig
+    const config = getEvaluationConfig();
+    const clientSecretKey = config.secretKey || '';
+
+    // Verify if the server's secret key matches the client's configured key
+    const verified = clientSecretKey === message.serverSecretKey;
+
+    logger.info('Verifying secret key', {
+      hasClientKey: !!clientSecretKey,
+      hasServerKey: !!message.serverSecretKey,
+      verified
+    });
+
+    // Send verification response
+    const authMessage = createAuthVerifyMessage(message.clientId, verified);
+    try {
+      this.client?.send(authMessage);
+    } catch (error) {
+      // send() throws when the socket is no longer connected. Without this
+      // catch the error would surface as an unhandled rejection and the
+      // authentication promise would stay pending.
+      logger.error('Failed to send auth verification:', error instanceof Error ? error.message : String(error));
+      this.rejectPendingAuth(new Error('Failed to send auth verification'));
+      return;
+    }
+
+    if (!verified) {
+      logger.error('Secret key verification failed - keys do not match');
+      // Reject auth promise immediately since we know auth will fail
+      this.rejectPendingAuth(new Error('Secret key verification failed - keys do not match'));
+    }
+  }
+
+  /** Tells the server this client can accept evaluations; only sent once registered. */
+  private sendReady(): void {
+    if (!this.client || !this.registered) return;
+
+    const readyMessage = createReadyMessage();
+    this.client.send(readyMessage);
+    this.ready = true;
+
+    logger.info('Sent ready signal to server');
+  }
+
+  private async handleEvaluationRequest(request:
EvaluationRequest): Promise { + const { params, id } = request; + const startTime = Date.now(); + + logger.info('Received evaluation request', { + evaluationId: params.evaluationId, + tool: params.tool, + url: params.url + }); + + // Track active evaluation + this.activeEvaluations.set(params.evaluationId, { + startTime, + tool: params.tool + }); + + // Create a trace for this evaluation + const traceId = `eval-${params.evaluationId}-${Date.now()}`; + const sessionId = `eval-session-${Date.now()}`; + const tracingContext: TracingContext = { + traceId, + sessionId, + parentObservationId: undefined + }; + + try { + // Initialize tracing provider if not already done + await this.tracingProvider.initialize(); + + // Create session for this evaluation + await this.tracingProvider.createSession(sessionId, { + type: 'evaluation', + source: 'evaluation-server', + evaluationId: params.evaluationId + }); + + // Create root trace for the evaluation + await this.tracingProvider.createTrace( + traceId, + sessionId, + `Evaluation: ${params.tool}`, + params.input, + { + evaluationId: params.evaluationId, + tool: params.tool, + url: params.url, + source: 'evaluation-server' + }, + 'evaluation-agent', + ['evaluation', params.tool] + ); + + logger.info('Trace created successfully for evaluation', { + traceId, + sessionId, + evaluationId: params.evaluationId + }); + } catch (error) { + logger.warn('Failed to create trace:', error); + } + + try { + // Send status update + this.sendStatus(params.evaluationId, 'running', 0.1, 'Starting evaluation...'); + + // Get the tool from registry + const tool = ToolRegistry.getRegisteredTool(params.tool); + if (!tool) { + throw new Error(`Tool not found: ${params.tool}`); + } + + // Navigate to URL if needed + if (params.url) { + this.sendStatus(params.evaluationId, 'running', 0.2, 'Navigating to URL...'); + + try { + // Use the correct navigate_url tool from registry + const navigateUrlTool = ToolRegistry.getRegisteredTool('navigate_url'); + if 
(navigateUrlTool) { + logger.info('Navigating to URL using navigate_url tool', { url: params.url }); + const navigationResult = await this.executeToolWithTimeout( + navigateUrlTool, + { + url: params.url, + reasoning: `Navigate to ${params.url} for evaluation ${params.evaluationId}` + }, + 15000, // 15 second timeout for navigation + tracingContext, + 'navigate_url' + ); + logger.info('Navigation result', { navigationResult }); + this.sendStatus(params.evaluationId, 'running', 0.3, 'Navigation completed successfully'); + } else { + // Fallback: try action_agent for navigation + const actionTool = ToolRegistry.getRegisteredTool('action_agent'); + if (actionTool) { + logger.info('Navigating to URL using action_agent fallback', { url: params.url }); + const navigationResult = await this.executeToolWithTimeout( + actionTool, + { + task: `Navigate to ${params.url}`, + reasoning: 'Navigation required for evaluation' + }, + 15000, // 15 second timeout for navigation + tracingContext, + 'action_agent' + ); + logger.info('Action agent navigation result', { navigationResult }); + this.sendStatus(params.evaluationId, 'running', 0.3, 'Navigation completed via action agent'); + } else { + logger.error('No navigation tools available in registry'); + this.sendStatus(params.evaluationId, 'running', 0.3, 'ERROR: No navigation tools available'); + throw new Error('Navigation failed: No navigation tools available'); + } + } + } catch (error) { + logger.error('Navigation failed', { url: params.url, error: error instanceof Error ? error.message : error }); + this.sendStatus(params.evaluationId, 'running', 0.3, `Navigation failed: ${error instanceof Error ? 
error.message : 'Unknown error'} - continuing with current page`); + // Continue with evaluation even if navigation fails, but log the issue prominently + } + } + + // Execute the tool + this.sendStatus(params.evaluationId, 'running', 0.5, `Executing ${params.tool}...`); + + const toolResult = await this.executeToolWithTimeout( + tool, + params.input, + params.timeout || 30000, + tracingContext, + params.tool + ); + + const executionTime = Date.now() - startTime; + + // Send JSON-RPC success response + const rpcResponse = createSuccessResponse( + id, + toolResult, + executionTime, + [{ + tool: params.tool, + timestamp: new Date().toISOString(), + duration: executionTime, + status: 'success' + }], + { + url: params.url, + evaluationId: params.evaluationId + } + ); + + if (this.client) { + this.client.send(rpcResponse); + } + + this.sendStatus(params.evaluationId, 'completed', 1.0, 'Evaluation completed successfully'); + + // Update trace with success + try { + await this.tracingProvider.finalizeTrace(traceId, { + output: toolResult, + statusMessage: 'completed', + metadata: { + executionTime, + evaluationId: params.evaluationId + } + }); + } catch (error) { + logger.warn('Failed to update trace:', error); + } + + logger.info('Evaluation completed successfully', { + evaluationId: params.evaluationId, + executionTime + }); + + } catch (error) { + const executionTime = Date.now() - startTime; + const errorMessage = error instanceof Error ? 
error.message : 'Unknown error'; + + logger.error(`Evaluation failed: ${errorMessage} (evaluationId: ${params.evaluationId})`); + + // Send JSON-RPC error response + const rpcResponse = createErrorResponse( + id, + ErrorCodes.TOOL_EXECUTION_ERROR, + 'Tool execution failed', + { + tool: params.tool, + error: errorMessage, + url: params.url, + timestamp: new Date().toISOString() + } + ); + + if (this.client) { + this.client.send(rpcResponse); + } + + this.sendStatus(params.evaluationId, 'failed', 1.0, errorMessage); + + // Update trace with error + try { + await this.tracingProvider.finalizeTrace(traceId, { + error: errorMessage, + statusMessage: 'failed', + metadata: { + executionTime, + evaluationId: params.evaluationId + } + }); + } catch (updateError) { + logger.warn('Failed to update trace with error:', updateError); + } + + } finally { + this.activeEvaluations.delete(params.evaluationId); + } + } + + private async executeToolWithTimeout( + tool: any, + input: any, + timeout: number, + tracingContext?: TracingContext, + toolName?: string + ): Promise { + const spanId = `tool-exec-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`; + const startTime = new Date(); + + // Create tool execution span if tracing context is provided + if (tracingContext) { + try { + await this.tracingProvider.createObservation({ + id: spanId, + name: `Tool: ${toolName || 'unknown'}`, + type: 'span', + startTime, + input, + metadata: { + tool: toolName, + timeout + } + }, tracingContext.traceId); + } catch (error) { + logger.warn('Failed to create tool execution span:', error); + } + } + + return new Promise((resolve, reject) => { + const timer = setTimeout(() => { + // Update span with timeout error + if (tracingContext) { + this.tracingProvider.updateObservation(spanId, { + endTime: new Date(), + error: `Tool execution timeout after ${timeout}ms` + }).catch(err => logger.warn('Failed to update span with timeout:', err)); + } + reject(new Error(`Tool execution timeout after 
${timeout}ms`)); + }, timeout); + + // Execute tool with tracing context if available + const executePromise = tracingContext + ? withTracingContext(tracingContext, () => tool.execute(input)) + : tool.execute(input); + + executePromise + .then((result: any) => { + clearTimeout(timer); + + // Update span with success + if (tracingContext) { + this.tracingProvider.updateObservation(spanId, { + endTime: new Date(), + output: result + }).catch(err => logger.warn('Failed to update span with result:', err)); + } + + resolve(result); + }) + .catch((error: Error) => { + clearTimeout(timer); + + // Update span with error + if (tracingContext) { + this.tracingProvider.updateObservation(spanId, { + endTime: new Date(), + error: error.message + }).catch(err => logger.warn('Failed to update span with error:', err)); + } + + reject(error); + }); + }); + } + + private sendStatus( + evaluationId: string, + status: 'running' | 'completed' | 'failed', + progress?: number, + message?: string + ): void { + if (!this.client || !this.ready) return; + + const statusMessage = createStatusMessage( + evaluationId, + status, + progress, + message + ); + + this.client.send(statusMessage); + } + + private startHeartbeat(): void { + if (this.heartbeatInterval) return; + + this.heartbeatInterval = setInterval(() => { + if (this.client && this.ready) { + this.client.send({ + type: 'ping', + timestamp: new Date().toISOString() + }); + } + }, 30000); // Send ping every 30 seconds + } + + private stopHeartbeat(): void { + if (this.heartbeatInterval) { + clearInterval(this.heartbeatInterval); + this.heartbeatInterval = null; + } + } + + public getActiveEvaluationsCount(): number { + return this.activeEvaluations.size; + } + + public getActiveEvaluations(): string[] { + return Array.from(this.activeEvaluations.keys()); + } +} + +// Global instance management +let evaluationAgent: EvaluationAgent | null = null; + +export function getEvaluationAgent(): EvaluationAgent | null { + return evaluationAgent; 
+} + +export async function createAndConnectEvaluationAgent( + clientId: string, + endpoint: string, + secretKey?: string +): Promise { + if (evaluationAgent) { + evaluationAgent.disconnect(); + } + + evaluationAgent = new EvaluationAgent({ + clientId, + endpoint, + secretKey + }); + + await evaluationAgent.connect(); + return evaluationAgent; +} + +export function disconnectEvaluationAgent(): void { + if (evaluationAgent) { + evaluationAgent.disconnect(); + evaluationAgent = null; + } +} \ No newline at end of file diff --git a/front_end/panels/ai_chat/evaluation/EvaluationProtocol.ts b/front_end/panels/ai_chat/evaluation/EvaluationProtocol.ts new file mode 100644 index 00000000000..c21b24e43e8 --- /dev/null +++ b/front_end/panels/ai_chat/evaluation/EvaluationProtocol.ts @@ -0,0 +1,247 @@ +// Copyright 2025 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +/** + * Protocol definitions for WebSocket evaluation communication + */ + +// Client → Server Messages + +export interface RegisterMessage { + type: 'register'; + clientId: string; + secretKey?: string; + capabilities: ClientCapabilities; +} + +export interface ClientCapabilities { + tools: string[]; + maxConcurrency: number; + version: string; +} + +export interface ReadyMessage { + type: 'ready'; + timestamp: string; +} + +export interface StatusMessage { + type: 'status'; + evaluationId: string; + status: 'running' | 'completed' | 'failed'; + progress?: number; // 0-1 + message?: string; +} + +export interface PingMessage { + type: 'ping'; + timestamp: string; +} + +// Server → Client Messages + +export interface WelcomeMessage { + type: 'welcome'; + serverId: string; + version: string; + timestamp: string; +} + +export interface RegistrationAckMessage { + type: 'registration_ack'; + clientId: string; + status: 'accepted' | 'rejected' | 'auth_required'; + message?: string; + evaluationsCount?: number; + 
reason?: string; // Only present if rejected + serverSecretKey?: string; // Present when status is 'auth_required' + newClient?: boolean; // Present when a new client was created +} + +export interface AuthVerifyMessage { + type: 'auth_verify'; + clientId: string; + verified: boolean; +} + +export interface PongMessage { + type: 'pong'; + timestamp: string; +} + +// JSON-RPC Messages + +export interface EvaluationRequest { + jsonrpc: '2.0'; + method: 'evaluate'; + params: EvaluationParams; + id: string; +} + +export interface EvaluationParams { + evaluationId: string; + name: string; + url: string; + tool: string; + input: any; + timeout: number; + metadata: { + tags: string[]; + retries: number; + priority?: 'low' | 'normal' | 'high'; + }; +} + +export interface EvaluationSuccessResponse { + jsonrpc: '2.0'; + result: { + status: 'success'; + output: any; + executionTime: number; + toolCalls?: ToolCall[]; + metadata?: Record; + }; + id: string; +} + +export interface ToolCall { + tool: string; + timestamp: string; + duration: number; + status: 'success' | 'failed'; + error?: string; +} + +export interface EvaluationErrorResponse { + jsonrpc: '2.0'; + error: { + code: number; + message: string; + data?: { + tool: string; + error: string; + url?: string; + timestamp: string; + stackTrace?: string; + }; + }; + id: string; +} + +// Error codes +export const ErrorCodes = { + PARSE_ERROR: -32700, + INVALID_REQUEST: -32600, + METHOD_NOT_FOUND: -32601, + INVALID_PARAMS: -32602, + INTERNAL_ERROR: -32603, + + // Custom error codes + TOOL_EXECUTION_ERROR: -32000, + TIMEOUT_ERROR: -32001, + AUTHENTICATION_ERROR: -32002, + RATE_LIMIT_EXCEEDED: -32003, + INVALID_TOOL: -32004, + RESOURCE_ERROR: -32005 +} as const; + +// Type guards + +export function isWelcomeMessage(msg: any): msg is WelcomeMessage { + return msg?.type === 'welcome'; +} + +export function isRegistrationAckMessage(msg: any): msg is RegistrationAckMessage { + return msg?.type === 'registration_ack'; +} + +export 
function isEvaluationRequest(msg: any): msg is EvaluationRequest { + return msg?.jsonrpc === '2.0' && msg?.method === 'evaluate'; +} + +export function isPongMessage(msg: any): msg is PongMessage { + return msg?.type === 'pong'; +} + +// Helper functions + +export function createRegisterMessage( + clientId: string, + capabilities: ClientCapabilities, + secretKey?: string +): RegisterMessage { + return { + type: 'register', + clientId, + secretKey, + capabilities + }; +} + +export function createReadyMessage(): ReadyMessage { + return { + type: 'ready', + timestamp: new Date().toISOString() + }; +} + +export function createAuthVerifyMessage(clientId: string, verified: boolean): AuthVerifyMessage { + return { + type: 'auth_verify', + clientId, + verified + }; +} + +export function createStatusMessage( + evaluationId: string, + status: 'running' | 'completed' | 'failed', + progress?: number, + message?: string +): StatusMessage { + return { + type: 'status', + evaluationId, + status, + progress, + message + }; +} + +export function createSuccessResponse( + id: string, + output: any, + executionTime: number, + toolCalls?: ToolCall[], + metadata?: Record +): EvaluationSuccessResponse { + return { + jsonrpc: '2.0', + result: { + status: 'success', + output, + executionTime, + toolCalls, + metadata + }, + id + }; +} + +export function createErrorResponse( + id: string, + code: number, + message: string, + data?: any +): EvaluationErrorResponse { + return { + jsonrpc: '2.0', + error: { + code, + message, + data + }, + id + }; +} \ No newline at end of file diff --git a/front_end/panels/ai_chat/evaluation/framework/GenericToolEvaluator.ts b/front_end/panels/ai_chat/evaluation/framework/GenericToolEvaluator.ts index 157a8b9d581..723fab6889f 100644 --- a/front_end/panels/ai_chat/evaluation/framework/GenericToolEvaluator.ts +++ b/front_end/panels/ai_chat/evaluation/framework/GenericToolEvaluator.ts @@ -9,6 +9,8 @@ import { createLogger } from '../../core/Logger.js'; import { 
SanitizationUtils } from '../utils/SanitizationUtils.js'; import { ErrorHandlingUtils } from '../utils/ErrorHandlingUtils.js'; import type { ToolExecutionResult } from '../utils/EvaluationTypes.js'; +import { createTracingProvider } from '../../tracing/TracingConfig.js'; +import type { TracingProvider, TracingContext } from '../../tracing/TracingProvider.js'; const logger = createLogger('GenericToolEvaluator'); @@ -29,17 +31,19 @@ export class GenericToolEvaluator { private navigateTool: NavigateURLTool; private config: EvaluationConfig; private hooks?: TestExecutionHooks; + private tracingProvider: TracingProvider; constructor(config: EvaluationConfig, hooks?: TestExecutionHooks) { this.config = config; this.navigateTool = new NavigateURLTool(); this.hooks = hooks; + this.tracingProvider = createTracingProvider(); } /** * Run a test case for any tool */ - async runTest(testCase: TestCase, tool: Tool): Promise { + async runTest(testCase: TestCase, tool: Tool, tracingContext?: TracingContext): Promise { const startTime = Date.now(); // Use withErrorHandling wrapper for better error management @@ -56,7 +60,44 @@ export class GenericToolEvaluator { await this.hooks.beforeNavigation(testCase); } + // Create navigation span + const navSpanId = `nav-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`; + const navStartTime = new Date(); + + if (tracingContext) { + try { + await this.tracingProvider.createObservation({ + id: navSpanId, + name: 'Navigation', + type: 'span', + startTime: navStartTime, + input: { url: testCase.url }, + metadata: { + phase: 'navigation', + url: testCase.url, + testId: testCase.id || testCase.name + } + }, tracingContext.traceId); + } catch (error) { + logger.warn('Failed to create navigation span:', error); + } + } + const navResult = await this.navigateTool.execute({ url: testCase.url, reasoning: `Navigate to ${testCase.url} for test case ${testCase.name}` }); + + // Update navigation span + if (tracingContext) { + try { + await 
this.tracingProvider.updateObservation(navSpanId, { + endTime: new Date(), + output: navResult, + error: (navResult && typeof navResult === 'object' && 'error' in navResult) ? String(navResult.error) : undefined + }); + } catch (error) { + logger.warn('Failed to update navigation span:', error); + } + } + if ('error' in navResult) { throw new Error(`Navigation failed: ${navResult.error}`); } @@ -71,6 +112,28 @@ export class GenericToolEvaluator { } // 2. Execute the tool with the input - wrapped with error handling + const toolSpanId = `tool-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`; + const toolStartTime = new Date(); + + if (tracingContext) { + try { + await this.tracingProvider.createObservation({ + id: toolSpanId, + name: `Tool Execution: ${testCase.tool}`, + type: 'span', + startTime: toolStartTime, + input: testCase.input, + metadata: { + phase: 'tool-execution', + tool: testCase.tool, + testId: testCase.id || testCase.name + } + }, tracingContext.traceId); + } catch (error) { + logger.warn('Failed to create tool execution span:', error); + } + } + const toolResult = await ErrorHandlingUtils.withErrorHandling( async () => { return await tool.execute(testCase.input); @@ -79,6 +142,19 @@ export class GenericToolEvaluator { logger, `GenericToolEvaluator.toolExecution:${testCase.tool}` ); + + // Update tool execution span + if (tracingContext) { + try { + await this.tracingProvider.updateObservation(toolSpanId, { + endTime: new Date(), + output: toolResult, + error: (toolResult && typeof toolResult === 'object' && 'error' in toolResult) ? 
String(toolResult.error) : undefined + }); + } catch (error) { + logger.warn('Failed to update tool execution span:', error); + } + } // Call afterToolExecution hook @@ -151,7 +227,7 @@ export class GenericToolEvaluator { /** * Run a test with retry logic */ - private async runTestWithRetries(testCase: TestCase, tool: Tool): Promise { + private async runTestWithRetries(testCase: TestCase, tool: Tool, tracingContext?: TracingContext): Promise { const maxRetries = testCase.metadata?.retries || this.config.retries || 1; let lastResult: TestResult | null = null; let lastError: unknown = null; @@ -162,7 +238,7 @@ export class GenericToolEvaluator { await new Promise(resolve => setTimeout(resolve, 2000 * attempt)); // Exponential backoff } - lastResult = await this.runTest(testCase, tool); + lastResult = await this.runTest(testCase, tool, tracingContext); // Only retry on errors, not on test failures if (lastResult.status !== 'error') { diff --git a/front_end/panels/ai_chat/evaluation/runner/EvaluationRunner.ts b/front_end/panels/ai_chat/evaluation/runner/EvaluationRunner.ts index adb3f7853fc..b55507b809b 100644 --- a/front_end/panels/ai_chat/evaluation/runner/EvaluationRunner.ts +++ b/front_end/panels/ai_chat/evaluation/runner/EvaluationRunner.ts @@ -9,6 +9,8 @@ import { ToolRegistry } from '../../agent_framework/ConfigurableAgentTool.js'; import type { EvaluationConfig, TestResult, TestCase } from '../framework/types.js'; import { createLogger } from '../../core/Logger.js'; import { TIMING_CONSTANTS } from '../../core/Constants.js'; +import { createTracingProvider, isTracingEnabled, getTracingConfig } from '../../tracing/TracingConfig.js'; +import type { TracingProvider, TracingContext } from '../../tracing/TracingProvider.js'; const logger = createLogger('EvaluationRunner'); @@ -19,6 +21,8 @@ export class EvaluationRunner { private evaluator: GenericToolEvaluator; private llmEvaluator: LLMEvaluator; private config: EvaluationConfig; + private tracingProvider: 
TracingProvider; + private sessionId: string; constructor(judgeModel?: string) { // Get API key from AgentService @@ -46,36 +50,162 @@ export class EvaluationRunner { this.evaluator = new GenericToolEvaluator(this.config); this.llmEvaluator = new LLMEvaluator(this.config.evaluationApiKey, this.config.evaluationModel); + + // Initialize tracing + this.tracingProvider = createTracingProvider(); + this.sessionId = `evaluation-session-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`; + + logger.info('EvaluationRunner created with tracing provider', { + sessionId: this.sessionId, + providerType: this.tracingProvider.constructor.name, + tracingEnabled: isTracingEnabled(), + tracingConfig: getTracingConfig() + }); + + // Initialize tracing provider + this.initializeTracing(); + } + + private async initializeTracing(): Promise { + if (isTracingEnabled()) { + try { + logger.info('Initializing tracing for evaluation runner', { + sessionId: this.sessionId, + providerType: this.tracingProvider.constructor.name + }); + + await this.tracingProvider.initialize(); + await this.tracingProvider.createSession(this.sessionId, { + type: 'evaluation', + runner: 'EvaluationRunner', + timestamp: new Date().toISOString() + }); + + logger.info('Tracing initialized successfully for evaluation runner'); + } catch (error) { + logger.warn('Failed to initialize tracing for evaluation:', error); + } + } else { + logger.info('Tracing disabled, skipping initialization'); + } } /** * Run a single test case */ async runSingleTest(testCase: TestCase): Promise { + const traceId = `eval-${testCase.id || testCase.name}-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`; + const startTime = new Date(); logger.debug(`[EvaluationRunner] Running test: ${testCase.name}`); logger.debug(`[EvaluationRunner] URL: ${testCase.url}`); logger.debug(`[EvaluationRunner] Tool: ${testCase.tool}`); + // Create tracing context + const tracingContext: TracingContext = { + sessionId: this.sessionId, + 
traceId, + parentObservationId: undefined + }; + + // Create trace for this evaluation + if (isTracingEnabled()) { + try { + logger.info('Creating trace for evaluation', { + traceId, + sessionId: this.sessionId, + testName: testCase.name, + tool: testCase.tool, + providerType: this.tracingProvider.constructor.name + }); + + await this.tracingProvider.createTrace( + traceId, + this.sessionId, + `Evaluation: ${testCase.name}`, + { + testCase: { + id: testCase.id, + name: testCase.name, + tool: testCase.tool, + url: testCase.url, + description: testCase.description + } + }, + { + type: 'evaluation', + tool: testCase.tool, + url: testCase.url, + testId: testCase.id || testCase.name + }, + 'evaluation-runner', + ['evaluation', testCase.tool, 'test'] + ); + + logger.info('Trace created successfully'); + } catch (error) { + logger.error('Failed to create trace for evaluation:', error); + } + } else { + logger.info('Tracing disabled, skipping trace creation'); + } + // Get the tool instance from ToolRegistry based on what the test specifies const tool = ToolRegistry.getRegisteredTool(testCase.tool); if (!tool) { throw new Error(`Tool "${testCase.tool}" not found in ToolRegistry. 
Ensure it is properly registered.`); } - const result = await this.evaluator.runTest(testCase, tool as any); + const result = await this.evaluator.runTest(testCase, tool as any, tracingContext); // Add LLM evaluation if test passed if (result.status === 'passed' && result.output && testCase.validation.type !== 'snapshot') { logger.debug(`[EvaluationRunner] Adding LLM evaluation...`); + // Create span for LLM evaluation + const llmSpanId = `llm-judge-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`; + const llmStartTime = new Date(); + try { + if (isTracingEnabled()) { + await this.tracingProvider.createObservation({ + id: llmSpanId, + name: 'LLM Judge Evaluation', + type: 'generation', + startTime: llmStartTime, + input: { + output: result.output, + testCase: testCase.name, + validation: testCase.validation + }, + model: this.config.evaluationModel, + metadata: { + tool: testCase.tool, + testId: testCase.id || testCase.name, + phase: 'llm-evaluation' + } + }, traceId); + } + const llmJudgment = await this.llmEvaluator.evaluate( result.output, testCase, testCase.validation ); + // Update LLM evaluation span with result + if (isTracingEnabled()) { + await this.tracingProvider.updateObservation(llmSpanId, { + endTime: new Date(), + output: llmJudgment, + metadata: { + score: llmJudgment.score, + passed: llmJudgment.passed, + explanation: llmJudgment.explanation + } + }); + } + if (result.validation) { result.validation.llmJudge = llmJudgment; result.validation.passed = result.validation.passed && llmJudgment.passed; @@ -83,6 +213,31 @@ export class EvaluationRunner { } } catch (error) { console.warn('[EvaluationRunner] LLM evaluation failed:', error); + // Update span with error + if (isTracingEnabled()) { + try { + await this.tracingProvider.updateObservation(llmSpanId, { + endTime: new Date(), + error: error instanceof Error ? 
error.message : String(error) + }); + } catch (tracingError) { + logger.warn('Failed to update LLM evaluation span with error:', tracingError); + } + } + } + } + + // Finalize the trace + if (isTracingEnabled()) { + try { + await this.tracingProvider.finalizeTrace(traceId, { + status: result.status, + output: result.output, + duration: Date.now() - startTime.getTime(), + validation: result.validation + }); + } catch (error) { + logger.warn('Failed to finalize trace:', error); } } diff --git a/front_end/panels/ai_chat/evaluation/runner/VisionAgentEvaluationRunner.ts b/front_end/panels/ai_chat/evaluation/runner/VisionAgentEvaluationRunner.ts index fb9e11cc357..a76be741fc6 100644 --- a/front_end/panels/ai_chat/evaluation/runner/VisionAgentEvaluationRunner.ts +++ b/front_end/panels/ai_chat/evaluation/runner/VisionAgentEvaluationRunner.ts @@ -11,6 +11,8 @@ import type { EvaluationConfig, TestResult, TestCase, ValidationConfig } from '. import type { ScreenshotData } from '../utils/EvaluationTypes.js'; import { createLogger } from '../../core/Logger.js'; import { TIMING_CONSTANTS } from '../../core/Constants.js'; +import { createTracingProvider, isTracingEnabled } from '../../tracing/TracingConfig.js'; +import type { TracingProvider, TracingContext } from '../../tracing/TracingProvider.js'; const logger = createLogger('VisionAgentEvaluationRunner'); @@ -43,6 +45,7 @@ export class VisionAgentEvaluationRunner { private screenshotTool: TakeScreenshotTool; private config: EvaluationConfig; private globalVisionEnabled: boolean; + private tracingProvider: TracingProvider; constructor(visionEnabled: boolean = false, judgeModel?: string) { // Get API key from AgentService @@ -71,6 +74,7 @@ export class VisionAgentEvaluationRunner { this.llmEvaluator = new LLMEvaluator(this.config.evaluationApiKey, this.config.evaluationModel); this.screenshotTool = new TakeScreenshotTool(); this.globalVisionEnabled = visionEnabled; + this.tracingProvider = createTracingProvider(); } /** @@ -96,6 
+100,44 @@ export class VisionAgentEvaluationRunner { let beforeScreenshot: ScreenshotData | undefined; let afterScreenshot: ScreenshotData | undefined; + // Create a trace for this test + const traceId = `test-${testCase.id}-${Date.now()}`; + const tracingContext: TracingContext = { + traceId, + sessionId: `vision-session-${Date.now()}`, + parentObservationId: undefined + }; + + try { + // Create a root trace for the test + if (isTracingEnabled()) { + await this.tracingProvider.initialize(); + await this.tracingProvider.createSession(tracingContext.sessionId, { + type: 'vision-evaluation', + source: 'ui-dialog' + }); + + await this.tracingProvider.createTrace( + traceId, + tracingContext.sessionId, + `Vision Agent Evaluation: ${testCase.name}`, + testCase.input, + { + testId: testCase.id, + testName: testCase.name, + agent: toolName, + visionEnabled: shouldUseVision, + url: testCase.url, + tags: testCase.metadata?.tags || [] + }, + 'vision-agent-runner', + ['evaluation', 'vision', toolName] + ); + } + } catch (error) { + logger.warn('Failed to create trace:', error); + } + try { // Always create hooks for screenshot capture in VisionAgentEvaluationRunner const visualConfig = testCase.validation.llmJudge?.visualVerification; @@ -138,8 +180,8 @@ export class VisionAgentEvaluationRunner { // Always use evaluator with hooks in VisionAgentEvaluationRunner const evaluator = new GenericToolEvaluator(this.config, testHooks); - // Execute the agent action - const agentResult = await evaluator.runTest(testCase, agent as any); + // Execute the agent action with tracing context + const agentResult = await evaluator.runTest(testCase, agent as any, tracingContext); // Perform evaluation based on vision mode if (agentResult.status === 'passed' && agentResult.output && testCase.validation.type === 'llm-judge') { @@ -212,10 +254,45 @@ export class VisionAgentEvaluationRunner { } }; + // Update trace with final result + try { + if (isTracingEnabled()) { + await 
this.tracingProvider.finalizeTrace(traceId, { + output: agentResult, + statusMessage: agentResult.status, + metadata: { + ...(agentResult.validation?.llmJudge ? { + llmScore: agentResult.validation.llmJudge.score, + llmConfidence: agentResult.validation.llmJudge.confidence, + llmExplanation: agentResult.validation.llmJudge.explanation + } : {}), + toolsUsed: agentResult.output?.toolUsageStats?.toolsList || [], + toolCallCount: agentResult.output?.toolUsageStats?.totalCalls || 0, + duration: agentResult.duration + } + }); + } + } catch (error) { + logger.warn('Failed to update trace:', error); + } + return agentResult; } catch (error) { logger.error(`❌ Test failed with error:`, error); + + // Update trace with error + try { + if (isTracingEnabled()) { + await this.tracingProvider.finalizeTrace(traceId, { + error: error instanceof Error ? error.message : String(error), + statusMessage: 'error' + }); + } + } catch (updateError) { + logger.warn('Failed to update trace with error:', updateError); + } + return { testId: testCase.id, status: 'error', diff --git a/front_end/panels/ai_chat/evaluation/utils/ErrorHandlingUtils.ts b/front_end/panels/ai_chat/evaluation/utils/ErrorHandlingUtils.ts index 8a45a2a403d..965f8885111 100644 --- a/front_end/panels/ai_chat/evaluation/utils/ErrorHandlingUtils.ts +++ b/front_end/panels/ai_chat/evaluation/utils/ErrorHandlingUtils.ts @@ -59,7 +59,7 @@ export class ErrorHandlingUtils { try { return await operation(); } catch (error) { - logger.error(`[${context}] Operation failed:`, error); + logger.error(`[${context}] Operation failed:`, error instanceof Error ? 
error.message : String(error)); return errorBuilder(error); } } diff --git a/front_end/panels/ai_chat/tools/StreamlinedSchemaExtractorTool.ts b/front_end/panels/ai_chat/tools/StreamlinedSchemaExtractorTool.ts index 3d6a78e9781..ae0dba4b870 100644 --- a/front_end/panels/ai_chat/tools/StreamlinedSchemaExtractorTool.ts +++ b/front_end/panels/ai_chat/tools/StreamlinedSchemaExtractorTool.ts @@ -108,7 +108,7 @@ export class StreamlinedSchemaExtractorTool implements Tool setTimeout(resolve, this.RETRY_DELAY_MS)); } else { - logger.error(`JSON extraction failed after ${attempt} attempts:`, error); + logger.error(`JSON extraction failed after ${attempt} attempts:`, error instanceof Error ? error.message : String(error)); throw new Error(`Data extraction failed after ${attempt} attempts: ${error instanceof Error ? error.message : String(error)}`); } } @@ -408,7 +408,7 @@ CRITICAL: Only use nodeIds that you can actually see in the accessibility tree a return result; } catch (error) { - logger.error(`Error in URL retry attempt ${attemptNumber}:`, error); + logger.error(`Error in URL retry attempt ${attemptNumber}:`, error instanceof Error ? 
error.message : String(error)); return null; } } diff --git a/front_end/panels/ai_chat/ui/SettingsDialog.ts b/front_end/panels/ai_chat/ui/SettingsDialog.ts index cb670328784..3c1157eb22e 100644 --- a/front_end/panels/ai_chat/ui/SettingsDialog.ts +++ b/front_end/panels/ai_chat/ui/SettingsDialog.ts @@ -7,6 +7,7 @@ import * as UI from '../../../ui/legacy/legacy.js'; import { LLMClient } from '../LLM/LLMClient.js'; import { createLogger } from '../core/Logger.js'; import { getTracingConfig, setTracingConfig, isTracingEnabled } from '../tracing/TracingConfig.js'; +import { getEvaluationConfig, setEvaluationConfig, isEvaluationEnabled, testEvaluationConnection, connectToEvaluationService, getEvaluationClientId, isEvaluationConnected } from '../common/EvaluationConfig.js'; const logger = createLogger('SettingsDialog'); @@ -312,6 +313,42 @@ const UIStrings = { *@description Test tracing button */ testTracing: 'Test Connection', + /** + *@description Evaluation section title + */ + evaluationSection: 'Evaluation Configuration', + /** + *@description Evaluation enabled label + */ + evaluationEnabled: 'Enable Evaluation', + /** + *@description Evaluation enabled hint + */ + evaluationEnabledHint: 'Enable evaluation service connection for AI Chat interactions', + /** + *@description Evaluation endpoint label + */ + evaluationEndpoint: 'Evaluation Endpoint', + /** + *@description Evaluation endpoint hint + */ + evaluationEndpointHint: 'WebSocket endpoint for the evaluation service (e.g., ws://localhost:8080)', + /** + *@description Evaluation secret key label + */ + evaluationSecretKey: 'Evaluation Secret Key', + /** + *@description Evaluation secret key hint + */ + evaluationSecretKeyHint: 'Secret key for authentication with the evaluation service (optional)', + /** + *@description Connect to evaluation button + */ + connectEvaluation: 'Connect', + /** + *@description Test evaluation button + */ + testEvaluation: 'Test Connection', }; const str_ = 
i18n.i18n.registerUIStrings('panels/ai_chat/ui/SettingsDialog.ts', UIStrings); @@ -1886,6 +1923,271 @@ export class SettingsDialog { }, 5000); } }); + + // Add evaluation configuration section + const evaluationSection = document.createElement('div'); + evaluationSection.className = 'settings-section evaluation-section'; + contentDiv.appendChild(evaluationSection); + + const evaluationSectionTitle = document.createElement('h3'); + evaluationSectionTitle.className = 'settings-subtitle'; + evaluationSectionTitle.textContent = i18nString(UIStrings.evaluationSection); + evaluationSection.appendChild(evaluationSectionTitle); + + // Get current evaluation configuration + const currentEvaluationConfig = getEvaluationConfig(); + + // Evaluation enabled checkbox + const evaluationEnabledContainer = document.createElement('div'); + evaluationEnabledContainer.className = 'evaluation-enabled-container'; + evaluationSection.appendChild(evaluationEnabledContainer); + + const evaluationEnabledCheckbox = document.createElement('input'); + evaluationEnabledCheckbox.type = 'checkbox'; + evaluationEnabledCheckbox.id = 'evaluation-enabled'; + evaluationEnabledCheckbox.className = 'evaluation-checkbox'; + evaluationEnabledCheckbox.checked = isEvaluationEnabled(); + evaluationEnabledContainer.appendChild(evaluationEnabledCheckbox); + + const evaluationEnabledLabel = document.createElement('label'); + evaluationEnabledLabel.htmlFor = 'evaluation-enabled'; + evaluationEnabledLabel.className = 'evaluation-label'; + evaluationEnabledLabel.textContent = i18nString(UIStrings.evaluationEnabled); + evaluationEnabledContainer.appendChild(evaluationEnabledLabel); + + const evaluationEnabledHint = document.createElement('div'); + evaluationEnabledHint.className = 'settings-hint'; + evaluationEnabledHint.textContent = i18nString(UIStrings.evaluationEnabledHint); + evaluationSection.appendChild(evaluationEnabledHint); + + // Connection status indicator + const connectionStatusContainer = 
document.createElement('div'); + connectionStatusContainer.className = 'connection-status-container'; + connectionStatusContainer.style.display = 'flex'; + connectionStatusContainer.style.alignItems = 'center'; + connectionStatusContainer.style.gap = '8px'; + connectionStatusContainer.style.marginTop = '8px'; + connectionStatusContainer.style.fontSize = '13px'; + evaluationSection.appendChild(connectionStatusContainer); + + const connectionStatusDot = document.createElement('div'); + connectionStatusDot.className = 'connection-status-dot'; + connectionStatusDot.style.width = '8px'; + connectionStatusDot.style.height = '8px'; + connectionStatusDot.style.borderRadius = '50%'; + connectionStatusDot.style.flexShrink = '0'; + connectionStatusContainer.appendChild(connectionStatusDot); + + const connectionStatusText = document.createElement('span'); + connectionStatusText.className = 'connection-status-text'; + connectionStatusContainer.appendChild(connectionStatusText); + + // Function to update connection status + const updateConnectionStatus = () => { + const isConnected = isEvaluationConnected(); + + logger.debug('Updating connection status', { isConnected }); + + if (isConnected) { + connectionStatusDot.style.backgroundColor = 'var(--color-accent-green)'; + connectionStatusText.textContent = 'Connected to evaluation server'; + connectionStatusText.style.color = 'var(--color-accent-green)'; + } else { + connectionStatusDot.style.backgroundColor = 'var(--color-text-disabled)'; + connectionStatusText.textContent = 'Not connected'; + connectionStatusText.style.color = 'var(--color-text-disabled)'; + } + }; + + // Update status initially and when evaluation is enabled/disabled + updateConnectionStatus(); + + // Set up periodic status updates every 2 seconds + const statusUpdateInterval = setInterval(updateConnectionStatus, 2000); + + // Evaluation configuration container (shown when enabled) + const evaluationConfigContainer = document.createElement('div'); + 
evaluationConfigContainer.className = 'evaluation-config-container'; + evaluationConfigContainer.style.display = evaluationEnabledCheckbox.checked ? 'block' : 'none'; + evaluationSection.appendChild(evaluationConfigContainer); + + // Client ID display (read-only) + const clientIdLabel = document.createElement('div'); + clientIdLabel.className = 'settings-label'; + clientIdLabel.textContent = 'Client ID'; + evaluationConfigContainer.appendChild(clientIdLabel); + + const clientIdHint = document.createElement('div'); + clientIdHint.className = 'settings-hint'; + clientIdHint.textContent = 'Unique identifier for this DevTools instance'; + evaluationConfigContainer.appendChild(clientIdHint); + + const clientIdInput = document.createElement('input'); + clientIdInput.type = 'text'; + clientIdInput.className = 'settings-input'; + clientIdInput.value = currentEvaluationConfig.clientId || 'Auto-generated on first connection'; + clientIdInput.readOnly = true; + clientIdInput.style.backgroundColor = 'var(--color-background-elevation-1)'; + clientIdInput.style.cursor = 'default'; + evaluationConfigContainer.appendChild(clientIdInput); + + // Evaluation endpoint + const evaluationEndpointLabel = document.createElement('div'); + evaluationEndpointLabel.className = 'settings-label'; + evaluationEndpointLabel.textContent = i18nString(UIStrings.evaluationEndpoint); + evaluationConfigContainer.appendChild(evaluationEndpointLabel); + + const evaluationEndpointHint = document.createElement('div'); + evaluationEndpointHint.className = 'settings-hint'; + evaluationEndpointHint.textContent = i18nString(UIStrings.evaluationEndpointHint); + evaluationConfigContainer.appendChild(evaluationEndpointHint); + + const evaluationEndpointInput = document.createElement('input'); + evaluationEndpointInput.type = 'text'; + evaluationEndpointInput.className = 'settings-input'; + evaluationEndpointInput.placeholder = 'ws://localhost:8080'; + evaluationEndpointInput.value = 
currentEvaluationConfig.endpoint || 'ws://localhost:8080'; + evaluationConfigContainer.appendChild(evaluationEndpointInput); + + // Evaluation secret key + const evaluationSecretKeyLabel = document.createElement('div'); + evaluationSecretKeyLabel.className = 'settings-label'; + evaluationSecretKeyLabel.textContent = i18nString(UIStrings.evaluationSecretKey); + evaluationConfigContainer.appendChild(evaluationSecretKeyLabel); + + const evaluationSecretKeyHint = document.createElement('div'); + evaluationSecretKeyHint.className = 'settings-hint'; + evaluationSecretKeyHint.textContent = i18nString(UIStrings.evaluationSecretKeyHint); + evaluationConfigContainer.appendChild(evaluationSecretKeyHint); + + const evaluationSecretKeyInput = document.createElement('input'); + evaluationSecretKeyInput.type = 'password'; + evaluationSecretKeyInput.className = 'settings-input'; + evaluationSecretKeyInput.placeholder = 'Optional secret key'; + evaluationSecretKeyInput.value = currentEvaluationConfig.secretKey || ''; + evaluationConfigContainer.appendChild(evaluationSecretKeyInput); + + // Connect and Test buttons container + const evaluationButtonsContainer = document.createElement('div'); + evaluationButtonsContainer.className = 'evaluation-buttons-container'; + evaluationConfigContainer.appendChild(evaluationButtonsContainer); + + const connectEvaluationButton = document.createElement('button'); + connectEvaluationButton.className = 'settings-button connect-button'; + connectEvaluationButton.textContent = i18nString(UIStrings.connectEvaluation); + evaluationButtonsContainer.appendChild(connectEvaluationButton); + + const testEvaluationButton = document.createElement('button'); + testEvaluationButton.className = 'settings-button test-button'; + testEvaluationButton.textContent = i18nString(UIStrings.testEvaluation); + evaluationButtonsContainer.appendChild(testEvaluationButton); + + // Test status message + const testEvaluationStatus = document.createElement('div'); + 
testEvaluationStatus.className = 'settings-status'; + testEvaluationStatus.style.display = 'none'; + evaluationConfigContainer.appendChild(testEvaluationStatus); + + // Toggle evaluation config visibility + evaluationEnabledCheckbox.addEventListener('change', () => { + evaluationConfigContainer.style.display = evaluationEnabledCheckbox.checked ? 'block' : 'none'; + }); + + // Test evaluation connection + testEvaluationButton.addEventListener('click', async () => { + testEvaluationButton.disabled = true; + testEvaluationStatus.style.display = 'block'; + testEvaluationStatus.textContent = 'Testing connection...'; + testEvaluationStatus.style.backgroundColor = 'var(--color-background-elevation-1)'; + testEvaluationStatus.style.color = 'var(--color-text-primary)'; + + try { + const endpoint = evaluationEndpointInput.value.trim(); + const secretKey = evaluationSecretKeyInput.value.trim(); + + if (!endpoint) { + throw new Error('Endpoint is required for testing'); + } + + // Temporarily update config for testing + setEvaluationConfig({ + enabled: true, + endpoint, + secretKey + }); + + const result = await testEvaluationConnection(); + + if (result.success) { + testEvaluationStatus.textContent = '✓ Connection successful'; + testEvaluationStatus.style.backgroundColor = 'var(--color-accent-green-background)'; + testEvaluationStatus.style.color = 'var(--color-accent-green)'; + } else { + throw new Error(result.message); + } + } catch (error) { + testEvaluationStatus.textContent = `✗ ${error instanceof Error ? 
error.message : 'Connection failed'}`; + testEvaluationStatus.style.backgroundColor = 'var(--color-accent-red-background)'; + testEvaluationStatus.style.color = 'var(--color-accent-red)'; + } finally { + testEvaluationButton.disabled = false; + setTimeout(() => { + testEvaluationStatus.style.display = 'none'; + }, 5000); + } + }); + + // Connect evaluation service + connectEvaluationButton.addEventListener('click', async () => { + connectEvaluationButton.disabled = true; + testEvaluationStatus.style.display = 'block'; + testEvaluationStatus.textContent = 'Connecting...'; + testEvaluationStatus.style.backgroundColor = 'var(--color-background-elevation-1)'; + testEvaluationStatus.style.color = 'var(--color-text-primary)'; + + try { + const endpoint = evaluationEndpointInput.value.trim(); + const secretKey = evaluationSecretKeyInput.value.trim(); + + if (!endpoint) { + throw new Error('Endpoint is required for connection'); + } + + // Update config and connect + setEvaluationConfig({ + enabled: true, + endpoint, + secretKey + }); + + await connectToEvaluationService(); + + // Update client ID display after connection + const clientId = getEvaluationClientId(); + if (clientId) { + clientIdInput.value = clientId; + } + + testEvaluationStatus.textContent = '✓ Connected successfully'; + testEvaluationStatus.style.backgroundColor = 'var(--color-accent-green-background)'; + testEvaluationStatus.style.color = 'var(--color-accent-green)'; + + // Update connection status indicator with a small delay to ensure connection is established + setTimeout(updateConnectionStatus, 500); + } catch (error) { + testEvaluationStatus.textContent = `✗ ${error instanceof Error ? 
error.message : 'Connection failed'}`; + testEvaluationStatus.style.backgroundColor = 'var(--color-accent-red-background)'; + testEvaluationStatus.style.color = 'var(--color-accent-red)'; + + // Update connection status indicator + updateConnectionStatus(); + } finally { + connectEvaluationButton.disabled = false; + setTimeout(() => { + testEvaluationStatus.style.display = 'none'; + }, 5000); + } + }); // Add disclaimer section const disclaimerSection = document.createElement('div'); @@ -2067,6 +2369,13 @@ export class SettingsDialog { } else { setTracingConfig({ provider: 'disabled' }); } + + // Save evaluation configuration + setEvaluationConfig({ + enabled: evaluationEnabledCheckbox.checked, + endpoint: evaluationEndpointInput.value.trim() || 'ws://localhost:8080', + secretKey: evaluationSecretKeyInput.value.trim() + }); logger.debug('Settings saved successfully'); logger.debug('Mini Model:', localStorage.getItem(MINI_MODEL_STORAGE_KEY)); @@ -2447,6 +2756,58 @@ export class SettingsDialog { padding-left: 24px; border-left: 2px solid var(--color-details-hairline); } + + /* Evaluation section styles */ + .evaluation-section { + margin-top: 16px; + padding: 16px 20px; + border-bottom: 1px solid var(--color-details-hairline); + } + + .evaluation-enabled-container { + display: flex; + align-items: center; + gap: 8px; + margin-bottom: 8px; + } + + .evaluation-checkbox { + margin: 0; + } + + .evaluation-label { + font-weight: 500; + color: var(--color-text-primary); + cursor: pointer; + } + + .evaluation-config-container { + margin-top: 16px; + padding-left: 24px; + border-left: 2px solid var(--color-details-hairline); + } + + .evaluation-buttons-container { + display: flex; + gap: 8px; + margin-top: 16px; + } + + .connect-button { + background-color: var(--color-accent-blue-background); + color: var(--color-accent-blue); + border: 1px solid var(--color-accent-blue); + } + + .connect-button:hover { + background-color: var(--color-accent-blue); + color: 
var(--color-background); + } + + .connect-button:disabled { + opacity: 0.6; + cursor: not-allowed; + } `; dialog.contentElement.appendChild(styleElement);