diff --git a/config/gni/devtools_grd_files.gni b/config/gni/devtools_grd_files.gni
index 286948d1f26..32df9ee3567 100644
--- a/config/gni/devtools_grd_files.gni
+++ b/config/gni/devtools_grd_files.gni
@@ -645,6 +645,10 @@ grd_files_bundled_sources = [
   "front_end/panels/ai_chat/common/log.js",
   "front_end/panels/ai_chat/common/context.js",
   "front_end/panels/ai_chat/common/page.js",
+  "front_end/panels/ai_chat/common/WebSocketRPCClient.js",
+  "front_end/panels/ai_chat/common/EvaluationConfig.js",
+  "front_end/panels/ai_chat/evaluation/EvaluationProtocol.js",
+  "front_end/panels/ai_chat/evaluation/EvaluationAgent.js",
   "front_end/panels/ai_chat/tracing/TracingProvider.js",
   "front_end/panels/ai_chat/tracing/LangfuseProvider.js",
   "front_end/panels/ai_chat/tracing/TracingConfig.js",
diff --git a/eval-server/.env.example b/eval-server/.env.example
new file mode 100644
index 00000000000..1e8a74879ce
--- /dev/null
+++ b/eval-server/.env.example
@@ -0,0 +1,16 @@
+# WebSocket Server Configuration
+PORT=8080
+HOST=localhost
+
+# LLM Judge Configuration
+OPENAI_API_KEY=your-openai-api-key-here
+JUDGE_MODEL=gpt-4
+JUDGE_TEMPERATURE=0.1
+
+# Logging Configuration
+LOG_LEVEL=info
+LOG_DIR=./logs
+
+# RPC Configuration
+RPC_TIMEOUT=30000
+MAX_CONCURRENT_EVALUATIONS=10
\ No newline at end of file
diff --git a/eval-server/.gitignore b/eval-server/.gitignore
new file mode 100644
index 00000000000..97aca2ea1cd
--- /dev/null
+++ b/eval-server/.gitignore
@@ -0,0 +1,2 @@
+.env
+node_modules
\ No newline at end of file
diff --git a/eval-server/CLAUDE.md b/eval-server/CLAUDE.md
new file mode 100644
index 00000000000..5db83421a3f
--- /dev/null
+++ b/eval-server/CLAUDE.md
@@ -0,0 +1,103 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+bo-eval-server is a WebSocket-based evaluation server for LLM agents that implements an LLM-as-a-judge evaluation system. The server accepts connections from AI agents, sends them evaluation tasks via RPC calls, collects their responses, and uses an LLM to judge the quality of responses.
+
+## Commands
+
+### Development
+- `npm start` - Start the WebSocket server
+- `npm run dev` - Start server with file watching for development
+- `npm run cli` - Start interactive CLI for server management and testing
+- `npm test` - Run example agent client for testing
+
+### Installation
+- `npm install` - Install dependencies
+- Copy `.env.example` to `.env` and configure environment variables
+
+### Required Environment Variables
+- `OPENAI_API_KEY` - OpenAI API key for LLM judge functionality
+- `PORT` - WebSocket server port (optional; default: 8080)
+
+## Architecture
+
+### Core Components
+
+**WebSocket Server** (`src/server.js`)
+- Accepts connections from LLM agents
+- Manages agent lifecycle (connect, ready, disconnect)
+- Orchestrates evaluation sessions
+- Handles bidirectional RPC communication
+
+**RPC Client** (`src/rpc-client.js`)
+- Implements JSON-RPC 2.0 protocol for server-to-client calls
+- Manages request/response correlation with unique IDs
+- Handles timeouts and error conditions
+- Calls `Evaluate(request: String) -> String` method on connected agents
+
+**LLM Evaluator** (`src/evaluator.js`)
+- Integrates with OpenAI API for LLM-as-a-judge functionality
+- Evaluates agent responses on multiple criteria (correctness, completeness, clarity, relevance, helpfulness)
+- Returns structured JSON evaluation with scores and reasoning
+
+**Logger** (`src/logger.js`)
+- Structured logging using Winston
+- Separate log files for different event types
+- JSON format for easy parsing and analysis
+- Logs all RPC calls, evaluations, and connection events
+
+### Evaluation Flow
+
+1. Agent connects to WebSocket server
+2. Agent sends "ready" signal
+3. Server calls agent's `Evaluate` method with a task
+4. Agent processes task and returns response
+5. Server sends response to LLM judge for evaluation
+6. Results are logged as JSON with scores and detailed feedback
+
+### Project Structure
+
+```
+src/
+├── server.js          # Main WebSocket server and evaluation orchestration
+├── rpc-client.js      # JSON-RPC client for calling agent methods
+├── evaluator.js       # LLM judge integration (OpenAI)
+├── logger.js          # Structured logging and result storage
+├── config.js          # Configuration management
+└── cli.js             # Interactive CLI for testing and management
+
+logs/                  # Log files (created automatically)
+├── combined.log       # All log events
+├── error.log          # Error events only
+└── evaluations.jsonl  # Evaluation results in JSON Lines format
+```
+
+### Key Features
+
+- **Bidirectional RPC**: Server can call methods on connected clients
+- **LLM-as-a-Judge**: Automated evaluation of agent responses using GPT-4
+- **Concurrent Evaluations**: Support for multiple agents and parallel evaluations
+- **Structured Logging**: All interactions logged as JSON for analysis
+- **Interactive CLI**: Built-in CLI for testing and server management
+- **Connection Management**: Robust handling of agent connections and disconnections
+- **Timeout Handling**: Configurable timeouts for RPC calls and evaluations
+
+### Agent Protocol
+
+Agents must implement:
+- WebSocket connection to server
+- JSON-RPC 2.0 protocol support
+- `Evaluate(task: string) -> string` method
+- "ready" message to signal availability for evaluations
+
+### Configuration
+
+All configuration is managed through environment variables and `src/config.js`. Key settings:
+- Server port and host
+- OpenAI API configuration
+- RPC timeouts
+- Logging levels and directories
+- Maximum concurrent evaluations
\ No newline at end of file
diff --git a/eval-server/README.md b/eval-server/README.md
new file mode 100644
index 00000000000..3179bccf573
--- /dev/null
+++ b/eval-server/README.md
@@ -0,0 +1,47 @@
+# bo-eval-server
+
+A WebSocket-based evaluation server for LLM agents using LLM-as-a-judge methodology.
+
+## Quick Start
+
+1. **Install dependencies**
+   ```bash
+   npm install
+   ```
+
+2. **Configure environment**
+   ```bash
+   cp .env.example .env
+   # Edit .env and add your OPENAI_API_KEY
+   ```
+
+3. **Start the server**
+   ```bash
+   npm start
+   ```
+
+4. **Use interactive CLI** (alternative to step 3)
+   ```bash
+   npm run cli
+   ```
+
+## Features
+
+- 🔌 WebSocket server for real-time agent connections
+- 🤖 Bidirectional RPC calls to connected agents
+- ⚖️ LLM-as-a-judge evaluation using OpenAI GPT-4
+- 📊 Structured JSON logging of all evaluations
+- 🖥️ Interactive CLI for testing and management
+- ⚡ Support for concurrent agent evaluations
+
+## Agent Protocol
+
+Your agent needs to:
+
+1. Connect to the WebSocket server (default: `ws://localhost:8080`)
+2. Send a `{"type": "ready"}` message when ready for evaluations
+3. Implement the `Evaluate` RPC method that accepts a string task and returns a string response
+
+## For more details
+
+See [CLAUDE.md](./CLAUDE.md) for comprehensive documentation of the architecture and implementation.
\ No newline at end of file
diff --git a/eval-server/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml b/eval-server/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml
new file mode 100644
index 00000000000..f5b865f5b55
--- /dev/null
+++ b/eval-server/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml
@@ -0,0 +1,12 @@
+client:
+  id: 1233ae25-9f9e-4f77-924d-865f7d615cef
+  name: DevTools Client 1233ae25
+  secret_key: hello
+  description: Auto-generated DevTools evaluation client
+settings:
+  max_concurrent_evaluations: 3
+  default_timeout: 45000
+  retry_policy:
+    max_retries: 2
+    backoff_multiplier: 2
+    initial_delay: 1000
diff --git a/eval-server/docs/CLIENT_SETUP.md b/eval-server/docs/CLIENT_SETUP.md
new file mode 100644
index 00000000000..53502ae074d
--- /dev/null
+++ b/eval-server/docs/CLIENT_SETUP.md
@@ -0,0 +1,445 @@
+# Client Setup Guide
+
+## Overview
+
+This guide explains how to set up a new evaluation client to connect to the evaluation server. Clients can be any application that implements the WebSocket evaluation protocol, such as Chrome DevTools or custom test agents.
+
+## Prerequisites
+
+- WebSocket client library
+- JSON-RPC 2.0 implementation
+- UUID v4 generator
+- Tools/agents to execute evaluations
+
+## Setup Steps
+
+### 1. Generate Client ID
+
+Generate a unique UUID v4 for your client:
+
+```javascript
+// JavaScript example
+import { v4 as uuidv4 } from 'uuid';
+const clientId = uuidv4(); // e.g., "550e8400-e29b-41d4-a716-446655440000"
+```
+
+Store this ID persistently - it will be used for all connections.
+
+### 2. Request YAML Configuration
+
+Contact the evaluation server administrator to:
+1. Create a YAML evaluation file for your client ID
+2. Optionally set up a secret key for authentication
+3. Configure appropriate evaluations for your client
+
+Example request:
+```
+Client ID: 550e8400-e29b-41d4-a716-446655440000
+Client Name: Chrome DevTools Production
+Tools Available: extract_schema_data, research_agent, action_agent
+Purpose: Automated regression testing
+```
+
+### 3. Implement WebSocket Connection
+
+```javascript
+class EvaluationClient {
+  constructor(serverUrl, clientId, secretKey) {
+    this.serverUrl = serverUrl;
+    this.clientId = clientId;
+    this.secretKey = secretKey;
+    this.ws = null;
+  }
+
+  connect() {
+    this.ws = new WebSocket(this.serverUrl);
+    
+    this.ws.onopen = () => {
+      console.log('Connected to evaluation server');
+    };
+    
+    this.ws.onmessage = (event) => {
+      this.handleMessage(JSON.parse(event.data));
+    };
+    
+    this.ws.onerror = (error) => {
+      console.error('WebSocket error:', error);
+    };
+  }
+}
+```
+
+### 4. Implement Protocol Messages
+
+#### Handle Welcome Message
+```javascript
+handleMessage(message) {
+  switch (message.type) {
+    case 'welcome':
+      // Server is ready, send registration
+      this.register();
+      break;
+    
+    case 'registration_ack':
+      if (message.status === 'accepted') {
+        console.log(`Registered! ${message.evaluationsCount} evaluations assigned`);
+        this.sendReady();
+      } else {
+        console.error('Registration rejected:', message.reason);
+      }
+      break;
+    
+    default:
+      // Handle other messages...
+  }
+}
+```
+
+#### Send Registration
+```javascript
+register() {
+  this.send({
+    type: 'register',
+    clientId: this.clientId,
+    secretKey: this.secretKey, // Optional
+    capabilities: {
+      tools: ['extract_schema_data', 'research_agent'],
+      maxConcurrency: 3,
+      version: '1.0.0'
+    }
+  });
+}
+```
+
+#### Send Ready Signal
+```javascript
+sendReady() {
+  this.send({
+    type: 'ready',
+    timestamp: new Date().toISOString()
+  });
+}
+```
+
+### 5. Implement RPC Handler
+
+```javascript
+handleMessage(message) {
+  // ... existing code ...
+  
+  // Handle JSON-RPC requests
+  if (message.jsonrpc === '2.0' && message.method) {
+    this.handleRpcRequest(message);
+  }
+}
+
+async handleRpcRequest(request) {
+  if (request.method === 'evaluate') {
+    try {
+      const result = await this.executeEvaluation(request.params);
+      
+      this.send({
+        jsonrpc: '2.0',
+        result: {
+          status: 'success',
+          output: result.output,
+          executionTime: result.duration,
+          toolCalls: result.toolCalls,
+          metadata: result.metadata
+        },
+        id: request.id
+      });
+    } catch (error) {
+      this.send({
+        jsonrpc: '2.0',
+        error: {
+          code: -32000,
+          message: error.message,
+          data: {
+            tool: request.params.tool,
+            error: error.toString(),
+            timestamp: new Date().toISOString()
+          }
+        },
+        id: request.id
+      });
+    }
+  }
+}
+```
+
+### 6. Implement Tool Execution
+
+```javascript
+async executeEvaluation(params) {
+  const startTime = Date.now();
+  
+  // Send status update
+  this.send({
+    type: 'status',
+    evaluationId: params.evaluationId,
+    status: 'running',
+    progress: 0.1,
+    message: 'Starting evaluation...'
+  });
+  
+  // Execute the appropriate tool
+  let result;
+  switch (params.tool) {
+    case 'extract_schema_data':
+      result = await this.extractSchema(params.url, params.input);
+      break;
+    
+    case 'research_agent':
+      result = await this.runResearchAgent(params.url, params.input);
+      break;
+    
+    default:
+      throw new Error(`Unknown tool: ${params.tool}`);
+  }
+  
+  const executionTime = Date.now() - startTime;
+  
+  return {
+    output: result,
+    duration: executionTime,
+    toolCalls: [{
+      tool: params.tool,
+      timestamp: new Date().toISOString(),
+      duration: executionTime,
+      status: 'success'
+    }],
+    metadata: {
+      url: params.url,
+      toolVersion: '1.0.0'
+    }
+  };
+}
+```
+
+## Chrome DevTools Integration
+
+For Chrome DevTools specifically:
+
+### 1. Update EvaluationConfig
+
+```typescript
+// In EvaluationConfig.ts
+interface EvaluationConfiguration {
+  enabled: boolean;
+  endpoint: string;
+  secretKey?: string;
+  clientId?: string; // Add client ID field
+}
+
+// Generate and store client ID
+function ensureClientId(): string {
+  let clientId = localStorage.getItem('ai_chat_evaluation_client_id');
+  if (!clientId) {
+    clientId = generateUUID();
+    localStorage.setItem('ai_chat_evaluation_client_id', clientId);
+  }
+  return clientId;
+}
+```
+
+### 2. Create Evaluation Agent
+
+```typescript
+// EvaluationAgent.ts
+import { WebSocketRPCClient } from '../common/WebSocketRPCClient.js';
+import { ToolRegistry } from '../agent_framework/ConfigurableAgentTool.js';
+
+export class EvaluationAgent {
+  private client: WebSocketRPCClient;
+  private clientId: string;
+  
+  constructor(config: EvaluationConfiguration) {
+    this.clientId = config.clientId || ensureClientId();
+    this.client = new WebSocketRPCClient({
+      endpoint: config.endpoint,
+      secretKey: config.secretKey
+    });
+    
+    this.setupHandlers();
+  }
+  
+  private setupHandlers(): void {
+    this.client.on('connected', () => {
+      this.register();
+    });
+    
+    // Handle RPC requests
+    this.client.on('rpc-request', async (request) => {
+      if (request.method === 'evaluate') {
+        const result = await this.handleEvaluation(request.params);
+        return result;
+      }
+    });
+  }
+  
+  private async handleEvaluation(params: any): Promise<any> {
+    const tool = ToolRegistry.getRegisteredTool(params.tool);
+    if (!tool) {
+      throw new Error(`Tool not found: ${params.tool}`);
+    }
+    const startTime = Date.now(); // Start timing the tool run
+    // Execute tool with params.input
+    const result = await tool.execute(params.input);
+    
+    return {
+      status: 'success',
+      output: result,
+      executionTime: Date.now() - startTime
+    };
+  }
+}
+```
+
+## Testing Your Client
+
+### 1. Local Testing
+
+Use the example agent to test your server setup:
+
+```bash
+# In the eval-server directory
+npm test
+```
+
+### 2. Connection Test
+
+```javascript
+// Quick connection test
+const client = new EvaluationClient(
+  'ws://localhost:8080',
+  'your-client-id',
+  'optional-secret'
+);
+
+client.connect();
+
+// Should see:
+// Connected to evaluation server
+// Registered! X evaluations assigned
+```
+
+### 3. Manual Evaluation Test
+
+You can trigger evaluations manually through the server's CLI:
+
+```bash
+npm run cli
+eval-server> run your-client-id evaluation-id
+```
+
+## Troubleshooting
+
+### Connection Issues
+
+1. **Check server is running**
+   ```bash
+   curl -i -N -H "Connection: Upgrade" -H "Upgrade: websocket" http://localhost:8080
+   ```
+
+2. **Verify client ID exists**
+   - Check `clients/{your-client-id}.yaml` exists on server
+   - Ensure client ID format is valid UUID v4
+
+3. **Authentication failures**
+   - Verify secret key matches server configuration
+   - Check for typos in client ID or secret
+
+### Evaluation Failures
+
+1. **Tool not found**
+   - Ensure tool name in YAML matches client capabilities
+   - Verify tool is registered in your client
+
+2. **Timeouts**
+   - Increase timeout in YAML configuration
+   - Check for infinite loops in tool execution
+
+3. **Invalid input**
+   - Validate input against expected schema
+   - Check for required fields
+
+## Security Best Practices
+
+1. **Store credentials securely**
+   - Never hardcode secret keys
+   - Use environment variables or secure storage
+
+2. **Validate inputs**
+   - Sanitize URLs before navigation
+   - Validate schemas before execution
+
+3. **Resource limits**
+   - Implement timeout handling
+   - Limit concurrent evaluations
+
+4. **Use WSS in production**
+   ```javascript
+   const client = new EvaluationClient(
+     'wss://eval-server.example.com',  // Use WSS
+     clientId,
+     secretKey
+   );
+   ```
+
+## Example: Minimal Client
+
+```javascript
+// minimal-client.js
+import WebSocket from 'ws';
+
+const CLIENT_ID = 'your-uuid-here';
+const SECRET_KEY = 'your-secret-here';
+
+const ws = new WebSocket('ws://localhost:8080');
+
+ws.on('open', () => {
+  console.log('Connected');
+});
+
+ws.on('message', async (data) => {
+  const msg = JSON.parse(data);
+  
+  if (msg.type === 'welcome') {
+    // Register
+    ws.send(JSON.stringify({
+      type: 'register',
+      clientId: CLIENT_ID,
+      secretKey: SECRET_KEY,
+      capabilities: {
+        tools: ['extract_schema_data'],
+        maxConcurrency: 1,
+        version: '1.0.0'
+      }
+    }));
+  }
+  
+  if (msg.type === 'registration_ack' && msg.status === 'accepted') {
+    // Send ready
+    ws.send(JSON.stringify({
+      type: 'ready',
+      timestamp: new Date().toISOString()
+    }));
+  }
+  
+  if (msg.jsonrpc && msg.method === 'evaluate') {
+    // Simple evaluation response
+    ws.send(JSON.stringify({
+      jsonrpc: '2.0',
+      result: {
+        status: 'success',
+        output: { message: 'Evaluation completed' },
+        executionTime: 1000
+      },
+      id: msg.id
+    }));
+  }
+});
+
+ws.on('error', console.error);
+```
\ No newline at end of file
diff --git a/eval-server/docs/PROTOCOL.md b/eval-server/docs/PROTOCOL.md
new file mode 100644
index 00000000000..694e58a69d1
--- /dev/null
+++ b/eval-server/docs/PROTOCOL.md
@@ -0,0 +1,310 @@
+# WebSocket Evaluation Protocol
+
+## Overview
+
+This document describes the WebSocket communication protocol between evaluation clients (e.g., Chrome DevTools) and the evaluation server. The protocol supports client registration, authentication, and bidirectional evaluation task execution using JSON-RPC 2.0.
+
+## Connection Flow
+
+```
+Client                           Server
+  |                                |
+  |------ WebSocket Connect ------>|
+  |                                |
+  |<----- Welcome Message ---------|
+  |                                |
+  |------ Register Message ------->|
+  |                                |
+  |<----- Registration ACK --------|
+  |                                |
+  |------ Ready Signal ----------->|
+  |                                |
+  |<===== Evaluation Loop ========>|
+```
+
+## Message Types
+
+### 1. Client → Server Messages
+
+#### 1.1 Registration Message
+Sent immediately after receiving the welcome message to register the client with the server.
+
+```json
+{
+  "type": "register",
+  "clientId": "550e8400-e29b-41d4-a716-446655440000",
+  "secretKey": "optional-secret-key",  // Optional field for authentication
+  "capabilities": {
+    "tools": ["extract_schema_data", "research_agent", "action_agent"],
+    "maxConcurrency": 3,
+    "version": "1.0.0"
+  }
+}
+```
+
+**Fields:**
+- `type`: Must be "register"
+- `clientId`: UUID v4 format, unique identifier for the client
+- `secretKey`: Optional authentication key
+- `capabilities`: Object describing client capabilities
+  - `tools`: Array of tool names the client can execute
+  - `maxConcurrency`: Maximum number of concurrent evaluations
+  - `version`: Client version string
+
+#### 1.2 Ready Signal
+Indicates the client is ready to receive evaluation tasks.
+
+```json
+{
+  "type": "ready",
+  "timestamp": "2024-01-01T00:00:00Z"
+}
+```
+
+#### 1.3 Status Update
+Provides progress updates for running evaluations.
+
+```json
+{
+  "type": "status",
+  "evaluationId": "eval-123",
+  "status": "running" | "completed" | "failed",
+  "progress": 0.5,  // Optional, value between 0 and 1
+  "message": "Processing page content..."  // Optional status message
+}
+```
+
+#### 1.4 Heartbeat (Ping)
+Keep-alive message to maintain connection.
+
+```json
+{
+  "type": "ping",
+  "timestamp": "2024-01-01T00:00:00Z"
+}
+```
+
+### 2. Server → Client Messages
+
+#### 2.1 Welcome Message
+Sent immediately after WebSocket connection is established.
+
+```json
+{
+  "type": "welcome",
+  "serverId": "server-001",
+  "version": "1.0.0",
+  "timestamp": "2024-01-01T00:00:00Z"
+}
+```
+
+#### 2.2 Registration Acknowledgment
+Response to client registration.
+
+```json
+{
+  "type": "registration_ack",
+  "clientId": "550e8400-e29b-41d4-a716-446655440000",
+  "status": "accepted" | "rejected",
+  "message": "Client registered successfully",
+  "evaluationsCount": 5,  // Number of evaluations assigned to this client
+  "reason": "Invalid secret key"  // Only present if status is "rejected"
+}
+```
+
+#### 2.3 Heartbeat Response (Pong)
+Response to client ping.
+
+```json
+{
+  "type": "pong",
+  "timestamp": "2024-01-01T00:00:00Z"
+}
+```
+
+## JSON-RPC 2.0 Evaluation Protocol
+
+The evaluation tasks are sent using JSON-RPC 2.0 protocol over the WebSocket connection.
+
+### 3. Evaluation Request (Server → Client)
+
+#### 3.1 Evaluate Method
+Requests the client to execute an evaluation task.
+
+```json
+{
+  "jsonrpc": "2.0",
+  "method": "evaluate",
+  "params": {
+    "evaluationId": "wikipedia-chrome-devtools-001",
+    "name": "Extract Chrome DevTools Wikipedia Article",
+    "url": "https://en.wikipedia.org/wiki/Chrome_DevTools",
+    "tool": "extract_schema_data",
+    "input": {
+      "schema": {
+        "type": "object",
+        "properties": {
+          "title": {"type": "string"},
+          "summary": {"type": "string"},
+          "tableOfContents": {
+            "type": "array",
+            "items": {"type": "string"}
+          }
+        }
+      }
+    },
+    "timeout": 30000,  // Timeout in milliseconds
+    "metadata": {
+      "tags": ["schema-extraction", "wikipedia"],
+      "retries": 2,
+      "priority": "normal"
+    }
+  },
+  "id": "rpc-001"
+}
+```
+
+**Parameters:**
+- `evaluationId`: Unique identifier for this evaluation (from YAML definition)
+- `name`: Human-readable name of the evaluation
+- `url`: Target URL for the evaluation
+- `tool`: Name of the tool to execute
+- `input`: Tool-specific input parameters
+- `timeout`: Maximum execution time in milliseconds
+- `metadata`: Additional evaluation metadata
+
+### 4. Evaluation Response (Client → Server)
+
+#### 4.1 Success Response
+Sent when evaluation completes successfully.
+
+```json
+{
+  "jsonrpc": "2.0",
+  "result": {
+    "status": "success",
+    "output": {
+      "title": "Chrome DevTools",
+      "summary": "Chrome DevTools is a set of web developer tools built directly into the Google Chrome browser.",
+      "tableOfContents": [
+        "Overview",
+        "Features",
+        "History",
+        "Usage"
+      ]
+    },
+    "executionTime": 2500,  // Total execution time in milliseconds
+    "toolCalls": [
+      {
+        "tool": "extract_schema_data",
+        "timestamp": "2024-01-01T00:00:00Z",
+        "duration": 2400,
+        "status": "success"
+      }
+    ],
+    "metadata": {
+      "pageLoadTime": 800,
+      "extractionTime": 1700,
+      "retryCount": 0
+    }
+  },
+  "id": "rpc-001"
+}
+```
+
+#### 4.2 Error Response
+Sent when evaluation fails.
+
+```json
+{
+  "jsonrpc": "2.0",
+  "error": {
+    "code": -32000,
+    "message": "Tool execution failed",
+    "data": {
+      "tool": "extract_schema_data",
+      "error": "Page load timeout after 30000ms",
+      "url": "https://en.wikipedia.org/wiki/Chrome_DevTools",
+      "timestamp": "2024-01-01T00:00:00Z",
+      "stackTrace": "Error: Timeout...\n  at PageLoader.load..."  // Optional
+    }
+  },
+  "id": "rpc-001"
+}
+```
+
+## Error Codes
+
+Standard JSON-RPC 2.0 error codes:
+- `-32700`: Parse error - Invalid JSON was received
+- `-32600`: Invalid request - JSON is not a valid request object
+- `-32601`: Method not found - Method does not exist
+- `-32602`: Invalid params - Invalid method parameters
+- `-32603`: Internal error - Internal JSON-RPC error
+
+Custom error codes for evaluation:
+- `-32000`: Tool execution error - Tool failed during execution
+- `-32001`: Timeout error - Evaluation exceeded timeout
+- `-32002`: Authentication error - Invalid or missing credentials
+- `-32003`: Rate limit exceeded - Too many requests
+- `-32004`: Invalid tool - Requested tool not available
+- `-32005`: Resource error - Unable to access required resources
+
+## Connection Management
+
+### Reconnection
+- Clients should implement automatic reconnection with exponential backoff
+- On reconnection, clients must re-register with the same clientId
+- Server maintains evaluation state across reconnections
+
+### Timeouts
+- Default connection timeout: 60 seconds
+- Ping interval: 30 seconds
+- Evaluation timeout: Specified per evaluation in YAML
+
+### Rate Limiting
+- Server may implement rate limiting per client
+- Rate limit errors use code `-32003`
+- Clients should respect rate limit headers in error responses
+
+## Security Considerations
+
+1. **Authentication**: Clients may use optional secret keys for authentication
+2. **Transport Security**: Production deployments should use WSS (WebSocket Secure)
+3. **Input Validation**: All inputs should be validated against schemas
+4. **Resource Limits**: Enforce timeouts and memory limits for evaluations
+
+## Examples
+
+### Complete Flow Example
+
+1. **Client connects and registers:**
+```json
+// Client → Server
+{"type": "register", "clientId": "550e8400-e29b-41d4-a716-446655440000", "capabilities": {"tools": ["extract_schema_data"], "maxConcurrency": 3, "version": "1.0.0"}}
+
+// Server → Client
+{"type": "registration_ack", "clientId": "550e8400-e29b-41d4-a716-446655440000", "status": "accepted", "message": "Client registered successfully", "evaluationsCount": 2}
+```
+
+2. **Client signals ready:**
+```json
+// Client → Server
+{"type": "ready", "timestamp": "2024-01-01T00:00:00Z"}
+```
+
+3. **Server sends evaluation:**
+```json
+// Server → Client
+{"jsonrpc": "2.0", "method": "evaluate", "params": {"evaluationId": "test-001", "url": "https://example.com", "tool": "extract_schema_data", "input": {"schema": {"type": "object", "properties": {"title": {"type": "string"}}}}, "timeout": 30000}, "id": "rpc-001"}
+```
+
+4. **Client returns result:**
+```json
+// Client → Server
+{"jsonrpc": "2.0", "result": {"status": "success", "output": {"title": "Example Domain"}, "executionTime": 1500}, "id": "rpc-001"}
+```
+
+## Version History
+
+- **1.0.0** (2024-01-01): Initial protocol version
\ No newline at end of file
diff --git a/eval-server/docs/TRIGGERING_EVALUATIONS.md b/eval-server/docs/TRIGGERING_EVALUATIONS.md
new file mode 100644
index 00000000000..61604da488a
--- /dev/null
+++ b/eval-server/docs/TRIGGERING_EVALUATIONS.md
@@ -0,0 +1,334 @@
+# How to Trigger Evaluations
+
+This guide explains all the different ways to trigger evaluations in the system.
+
+## Prerequisites
+
+1. **Server Running**: Make sure the evaluation server is running:
+   ```bash
+   npm start
+   ```
+
+2. **Client Connected**: A DevTools client must be connected and ready. You'll see logs like:
+   ```
+   [info]: Client registered successfully {"clientId":"550e8400...","capabilities":"extract_schema_data, research_agent"}
+   [info]: Client ready for evaluations {"clientId":"550e8400..."}
+   ```
+
+## Method 1: Interactive CLI
+
+Start the interactive CLI:
+```bash
+npm run cli
+```
+
+### Available Commands
+
+#### List Clients and Evaluations
+```bash
+eval-server> clients
+```
+This shows all registered clients and their available evaluations with current status.
+
+#### Run Specific Evaluation
+```bash
+eval-server> run <client-id> <evaluation-id>
+```
+Example:
+```bash
+eval-server> run 550e8400-e29b-41d4-a716-446655440000 wikipedia-chrome-devtools-001
+```
+
+#### Run All Evaluations for a Client
+```bash
+eval-server> run-all <client-id>
+```
+Example:
+```bash
+eval-server> run-all 550e8400-e29b-41d4-a716-446655440000
+```
+
+#### Check Status
+```bash
+eval-server> status
+```
+Shows server status, connected clients, and active evaluations.
+
+#### Get Help
+```bash
+eval-server> help
+```
+
+## Method 2: HTTP API
+
+The server also exposes an HTTP API on port 8081.
+
+### Get Server Status
+```bash
+curl http://localhost:8081/status
+```
+
+### List All Clients
+```bash
+curl http://localhost:8081/clients
+```
+
+### Get Client Evaluations
+```bash
+curl "http://localhost:8081/clients/:id/evaluations?id=550e8400-e29b-41d4-a716-446655440000"
+```
+
+### Trigger Specific Evaluation
+```bash
+curl -X POST http://localhost:8081/evaluate \
+  -H "Content-Type: application/json" \
+  -d '{
+    "clientId": "550e8400-e29b-41d4-a716-446655440000",
+    "evaluationId": "wikipedia-chrome-devtools-001"
+  }'
+```
+
+### Trigger All Evaluations for a Client
+```bash
+curl -X POST http://localhost:8081/evaluate \
+  -H "Content-Type: application/json" \
+  -d '{
+    "clientId": "550e8400-e29b-41d4-a716-446655440000",
+    "runAll": true
+  }'
+```
+
+## Method 3: Automatic Scheduling (YAML Configuration)
+
+Evaluations can be configured to run automatically based on their schedule in the YAML file.
+
+### Schedule Types
+
+#### On-Demand (Manual Only)
+```yaml
+schedule:
+  type: "on_demand"
+```
+Only runs when manually triggered.
+
+#### Periodic (Automatic)
+```yaml
+schedule:
+  type: "periodic"
+  interval: 86400000  # Run every 24 hours (in milliseconds)
+```
+Runs automatically at the specified interval.
+
+#### One-Time (Automatic)
+```yaml
+schedule:
+  type: "once"
+  run_at: "2024-12-25T09:00:00Z"  # Run once at specific time
+```
+Runs once at the specified time.
+
+## Method 4: Programmatic Integration
+
+You can integrate the evaluation system into your own applications:
+
+### Node.js Example
+```javascript
+import { EvaluationServer } from './src/server.js';
+
+const server = new EvaluationServer();
+server.start();
+
+// Wait for client to connect
+setTimeout(async () => {
+  const clientId = '550e8400-e29b-41d4-a716-446655440000';
+  const evaluationId = 'wikipedia-chrome-devtools-001';
+  
+  // Get client connection
+  const connection = server.connectedAgents.get(clientId);
+  if (connection && connection.ready) {
+    // Get evaluation
+    const evaluation = server.getClientManager()
+      .getClientEvaluations(clientId)
+      .find(e => e.id === evaluationId);
+    
+    if (evaluation) {
+      // Execute evaluation
+      await server.executeEvaluation(connection, evaluation);
+      console.log('Evaluation completed!');
+    }
+  }
+}, 5000);
+```
+
+### Python Example (using HTTP API)
+```python
+import requests
+import json
+
+def trigger_evaluation(client_id, evaluation_id):
+    response = requests.post('http://localhost:8081/evaluate', 
+        headers={'Content-Type': 'application/json'},
+        json={
+            'clientId': client_id,
+            'evaluationId': evaluation_id
+        })
+    
+    if response.status_code == 200:
+        return response.json()
+    else:
+        raise Exception(f"Failed to trigger evaluation: {response.text}")
+
+# Example usage
+result = trigger_evaluation(
+    '550e8400-e29b-41d4-a716-446655440000',
+    'wikipedia-chrome-devtools-001'
+)
+print(json.dumps(result, indent=2))
+```
+
+## Method 5: Webhook Integration
+
+You can set up webhooks to trigger evaluations from external systems:
+
+### GitHub Actions Example
+```yaml
+name: Run Evaluations
+on:
+  schedule:
+    - cron: '0 9 * * *'  # Daily at 09:00 UTC (GitHub Actions cron schedules run in UTC)
+  workflow_dispatch:  # Manual trigger
+
+jobs:
+  evaluate:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Trigger Evaluation
+        run: |
+          curl -X POST ${{ secrets.EVAL_SERVER_URL }}/evaluate \
+            -H "Content-Type: application/json" \
+            -d '{
+              "clientId": "${{ secrets.CLIENT_ID }}",
+              "runAll": true
+            }'
+```
+
+### Slack Bot Example
+```javascript
+// Slack bot command: /eval wikipedia
+app.command('/eval', async ({ command, ack, respond }) => {
+  await ack();
+  
+  const evaluationId = command.text.trim();
+  const clientId = process.env.DEFAULT_CLIENT_ID;
+  
+  try {
+    const response = await fetch('http://localhost:8081/evaluate', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ clientId, evaluationId })
+    });
+    
+    if (!response.ok) throw new Error(`server responded with HTTP ${response.status}`);
+    await respond(`✅ Evaluation '${evaluationId}' completed successfully!`);
+  } catch (error) {
+    await respond(`❌ Evaluation failed: ${error.message}`);
+  }
+});
+```
+
+## Monitoring Evaluation Results
+
+### Real-time Logs
+Monitor the server logs to see evaluation progress:
+```bash
+tail -f logs/combined.log
+```
+
+### Status Checking
+Check evaluation status via API:
+```bash
+# Get all evaluations for a client
+curl "http://localhost:8081/clients/:id/evaluations?id=CLIENT_ID"
+
+# Check server status
+curl http://localhost:8081/status
+```
+
+### Log Files
+Evaluation results are logged to:
+- `logs/combined.log` - All logs
+- `logs/error.log` - Error logs only
+
+## Troubleshooting
+
+### Client Not Connected
+```
+❌ Client 'CLIENT_ID' is not connected or not ready
+```
+**Solutions:**
+1. Make sure DevTools is running and connected
+2. Check that the client ID matches
+3. Verify the WebSocket connection is working
+
+### Evaluation Not Found
+```
+❌ Evaluation 'EVAL_ID' not found for client 'CLIENT_ID'
+```
+**Solutions:**
+1. Check the YAML file for the correct evaluation ID
+2. Ensure the evaluation is enabled (`enabled: true`)
+3. Reload the server if you changed the YAML file
+
+### Tool Not Available
+```
+Tool execution failed: Tool not found: tool_name
+```
+**Solutions:**
+1. Verify the tool is registered in DevTools
+2. Check that the tool name matches exactly
+3. Ensure DevTools has the required capabilities
+
+### Connection Timeout
+```
+WebSocket connection failed
+```
+**Solutions:**
+1. Check if the server is running on the correct port
+2. Verify firewall settings
+3. Check network connectivity
+
+## Best Practices
+
+1. **Start Simple**: Begin with on-demand evaluations before setting up automation
+2. **Monitor Logs**: Always monitor logs when running evaluations
+3. **Test Connections**: Use the `status` command to verify everything is connected
+4. **Gradual Rollout**: Test individual evaluations before running batch operations
+5. **Error Handling**: Implement proper error handling in automated systems
+6. **Rate Limiting**: Don't run too many evaluations simultaneously
+
+## Example Workflow
+
+Here's a typical workflow for triggering evaluations:
+
+```bash
+# 1. Start the server
+npm start
+
+# 2. In another terminal, start the CLI
+npm run cli
+
+# 3. Check status and clients
+eval-server> status
+eval-server> clients
+
+# 4. Run a specific evaluation
+eval-server> run 550e8400-e29b-41d4-a716-446655440000 wikipedia-chrome-devtools-001
+
+# 5. Check results in logs
+# (Monitor the server logs for detailed results)
+
+# 6. Run all evaluations if needed
+eval-server> run-all 550e8400-e29b-41d4-a716-446655440000
+```
+
+This comprehensive guide covers all the ways to trigger and monitor evaluations in your system!
\ No newline at end of file
diff --git a/eval-server/docs/YAML_SCHEMA.md b/eval-server/docs/YAML_SCHEMA.md
new file mode 100644
index 00000000000..eecb185cc45
--- /dev/null
+++ b/eval-server/docs/YAML_SCHEMA.md
@@ -0,0 +1,328 @@
+# YAML Evaluation Schema Documentation
+
+## Overview
+
+This document describes the YAML schema used to define evaluations for each client. Each client has a dedicated YAML file stored in the `clients/` directory, named after their client ID.
+
+## File Location
+
+```
+bo-eval-server/
+└── clients/
+    ├── 550e8400-e29b-41d4-a716-446655440000.yaml
+    ├── 771f9500-f39c-42e5-b827-557766551111.yaml
+    └── ...
+```
+
+## Schema Structure
+
+### Root Level
+
+```yaml
+# Client identification and authentication
+client:
+  id: "550e8400-e29b-41d4-a716-446655440000"  # Required: UUID v4
+  name: "Chrome DevTools Agent"                 # Required: Human-readable name
+  secret_key: "optional-secret-key"            # Optional: Authentication key
+  description: "Production DevTools instance"   # Optional: Client description
+
+# Client-specific settings
+settings:
+  max_concurrent_evaluations: 3     # Maximum parallel evaluations
+  default_timeout: 30000           # Default timeout in milliseconds
+  retry_policy:
+    max_retries: 2                 # Maximum retry attempts
+    backoff_multiplier: 2          # Exponential backoff multiplier
+    initial_delay: 1000            # Initial retry delay in ms
+
+# List of evaluations assigned to this client
+evaluations:
+  - id: "eval-001"
+    # ... evaluation definition
+  - id: "eval-002"
+    # ... evaluation definition
+```
+
+### Evaluation Definition
+
+Each evaluation in the `evaluations` array follows this structure:
+
+```yaml
+- id: "wikipedia-chrome-devtools-001"        # Required: Unique evaluation ID
+  name: "Extract Chrome DevTools Wikipedia"  # Required: Display name
+  description: "Extract structured data"     # Optional: Detailed description
+  enabled: true                             # Optional: Enable/disable (default: true)
+  
+  # Target configuration
+  target:
+    url: "https://en.wikipedia.org/wiki/Chrome_DevTools"  # Required: Target URL
+    wait_for: "networkidle"    # Optional: Wait condition (load|domcontentloaded|networkidle)
+    wait_timeout: 5000         # Optional: Wait timeout in ms
+  
+  # Tool configuration
+  tool: "extract_schema_data"   # Required: Tool to execute
+  timeout: 30000               # Optional: Override default timeout
+  
+  # Tool-specific input
+  input:
+    schema:                    # For extract_schema_data tool
+      type: "object"
+      properties:
+        title:
+          type: "string"
+        summary:
+          type: "string"
+    
+  # Scheduling configuration
+  schedule:
+    type: "on_demand"          # on_demand|periodic|once
+    # For periodic:
+    interval: 3600000          # Interval in milliseconds
+    # For once:
+    run_at: "2024-01-01T00:00:00Z"  # ISO timestamp
+  
+  # Validation configuration
+  validation:
+    type: "llm-judge"          # llm-judge|snapshot|hybrid
+    
+    # For llm-judge validation
+    llm_judge:
+      model: "gpt-4o-mini"     # LLM model to use
+      temperature: 0.3         # Model temperature
+      criteria:                # Evaluation criteria
+        - "Title should be accurately extracted"
+        - "Summary should be comprehensive"
+        - "All required fields should be present"
+      
+      # Visual verification settings
+      visual_verification:
+        enabled: true
+        capture_before: true   # Screenshot before tool execution
+        capture_after: true    # Screenshot after tool execution
+        prompts:              # Custom verification prompts
+          - "Verify the title matches the page header"
+    
+    # For snapshot validation
+    snapshot:
+      structure_only: false    # Compare structure only
+      exclude_paths:          # Paths to exclude from comparison
+        - "timestamp"
+        - "random_id"
+      sanitizers:             # Value sanitization rules
+        - path: "date"
+          pattern: "\\d{4}-\\d{2}-\\d{2}"
+          replacement: "YYYY-MM-DD"
+    
+    # For hybrid validation (both llm-judge and snapshot)
+    hybrid:
+      weight_llm: 0.7         # Weight for LLM score
+      weight_snapshot: 0.3    # Weight for snapshot score
+  
+  # Metadata and tags
+  metadata:
+    tags:                     # Categorization tags
+      - "schema-extraction"
+      - "wikipedia"
+      - "regression"
+    priority: "normal"        # low|normal|high
+    owner: "team-browser"     # Responsible team/person
+    created: "2024-01-01"     # Creation date
+    modified: "2024-01-15"    # Last modification date
+```
+
+## Tool-Specific Input Schemas
+
+### extract_schema_data
+
+```yaml
+input:
+  schema:                     # JSON Schema for extraction
+    type: "object"
+    properties:
+      title:
+        type: "string"
+      items:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            name:
+              type: "string"
+            price:
+              type: "number"
+```
+
+### research_agent
+
+```yaml
+input:
+  query: "Research the latest AI developments"  # Research query
+  max_iterations: 5          # Maximum agent iterations
+  include_sources: true      # Include source URLs
+  depth: "comprehensive"     # shallow|moderate|comprehensive
+```
+
+### action_agent
+
+```yaml
+input:
+  task: "Fill out the contact form"  # Task description
+  form_data:                         # Data to use
+    name: "Test User"
+    email: "test@example.com"
+  verify_completion: true            # Verify task completion
+```
+
+### web_task_agent
+
+```yaml
+input:
+  instructions: |                    # Multi-line instructions
+    1. Navigate to the products page
+    2. Search for "laptop"
+    3. Filter by price < $1000
+    4. Extract the first 5 results
+  expected_outcome: "List of laptops under $1000"
+  max_steps: 10                     # Maximum action steps
+```
+
+## Complete Example
+
+```yaml
+client:
+  id: "550e8400-e29b-41d4-a716-446655440000"
+  name: "Chrome DevTools Production Agent"
+  secret_key: "sk-prod-abc123"
+  description: "Production DevTools instance for continuous evaluation"
+
+settings:
+  max_concurrent_evaluations: 5
+  default_timeout: 45000
+  retry_policy:
+    max_retries: 3
+    backoff_multiplier: 2
+    initial_delay: 2000
+
+evaluations:
+  # Schema extraction evaluation
+  - id: "schema-extract-wiki-001"
+    name: "Wikipedia Chrome DevTools Schema Extraction"
+    description: "Test schema extraction on Wikipedia article"
+    enabled: true
+    
+    target:
+      url: "https://en.wikipedia.org/wiki/Chrome_DevTools"
+      wait_for: "networkidle"
+      wait_timeout: 5000
+    
+    tool: "extract_schema_data"
+    timeout: 30000
+    
+    input:
+      schema:
+        type: "object"
+        properties:
+          title:
+            type: "string"
+          summary:
+            type: "string"
+          features:
+            type: "array"
+            items:
+              type: "string"
+          lastModified:
+            type: "string"
+    
+    schedule:
+      type: "periodic"
+      interval: 86400000  # Daily
+    
+    validation:
+      type: "hybrid"
+      llm_judge:
+        model: "gpt-4o"
+        criteria:
+          - "All schema fields must be populated"
+          - "Summary should be at least 100 characters"
+          - "Features should contain at least 5 items"
+      snapshot:
+        exclude_paths:
+          - "lastModified"
+      hybrid:
+        weight_llm: 0.6
+        weight_snapshot: 0.4
+    
+    metadata:
+      tags: ["schema", "wikipedia", "daily"]
+      priority: "high"
+      owner: "qa-team"
+
+  # Research agent evaluation
+  - id: "research-agent-news-001"
+    name: "Research Latest Tech News"
+    description: "Test research agent on current tech news"
+    enabled: true
+    
+    target:
+      url: "https://news.ycombinator.com"
+    
+    tool: "research_agent"
+    timeout: 60000
+    
+    input:
+      query: "What are the top 3 technology stories today?"
+      max_iterations: 5
+      include_sources: true
+      depth: "moderate"
+    
+    schedule:
+      type: "on_demand"
+    
+    validation:
+      type: "llm-judge"
+      llm_judge:
+        model: "gpt-4o-mini"
+        temperature: 0.3
+        criteria:
+          - "Response includes 3 distinct technology stories"
+          - "Each story has a clear summary"
+          - "Sources are provided for each story"
+          - "Information is current (from today)"
+    
+    metadata:
+      tags: ["research", "news", "tech"]
+      priority: "normal"
+```
+
+## Validation Rules
+
+1. **Client ID**: Must be valid UUID v4 format
+2. **Evaluation IDs**: Must be unique within the file
+3. **Tool names**: Must match registered tools in the client
+4. **URLs**: Must be valid HTTP/HTTPS URLs
+5. **Timeouts**: Must be positive integers (milliseconds)
+6. **Schedule intervals**: Must be at least 60000ms (1 minute)
+
+## YAML Best Practices
+
+1. Use meaningful IDs that describe the evaluation
+2. Group related evaluations together
+3. Use tags consistently for categorization
+4. Document complex input schemas with comments
+5. Keep validation criteria specific and measurable
+6. Use anchors and aliases for repeated configurations:
+
+```yaml
+# Define anchor
+defaults: &defaults
+  timeout: 30000
+  retry_policy:
+    max_retries: 2
+
+# Use alias
+evaluations:
+  - id: "eval-001"
+    <<: *defaults  # Inherits timeout and retry_policy
+    name: "Test 1"
+    # ...
+```
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-a11y-001.yaml b/eval-server/evals/action-agent/action-agent-a11y-001.yaml
new file mode 100644
index 00000000000..95265515737
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-a11y-001.yaml
@@ -0,0 +1,46 @@
+# Accessibility action test
+id: "action-agent-a11y-001"
+name: "Click Using ARIA Label"
+description: "Test clicking an element identified primarily by ARIA attributes"
+enabled: true
+
+target:
+  url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click the button with aria-label \"Print Page\""
+  reasoning: "Testing action selection using accessibility attributes"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Used accessibility tree to find elements"
+      - "Correctly identified element by ARIA label"
+      - "Successfully clicked the target button"
+      - "Demonstrated understanding of accessibility attributes"
+      - "No reliance on visual appearance alone"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the Print Page button was successfully clicked"
+        - "Check if any print dialog or print preview appeared"
+        - "Confirm the button showed visual feedback (pressed state)"
+        - "Ensure the action was performed on the correct accessibility-labeled element"
+
+metadata:
+  tags: ["action", "accessibility", "aria", "click", "a11y"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-accordion-001.yaml b/eval-server/evals/action-agent/action-agent-accordion-001.yaml
new file mode 100644
index 00000000000..f2df3430523
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-accordion-001.yaml
@@ -0,0 +1,46 @@
+# Accordion expansion test
+id: "action-agent-accordion-001"
+name: "Expand Accordion Section"
+description: "Test clicking to expand an accordion panel"
+enabled: true
+
+target:
+  url: "https://jqueryui.com/accordion/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click to expand the \"Section 2\" accordion panel"
+  reasoning: "Testing accordion expand/collapse interaction"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the Section 2 accordion header"
+      - "Successfully clicked to expand the section"
+      - "Section 2 content became visible"
+      - "Other sections collapsed appropriately"
+      - "Accordion animation completed smoothly"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify Section 2 is now expanded and content visible"
+        - "Check if other accordion sections collapsed"
+        - "Confirm the expansion animation completed"
+        - "Ensure Section 2 header shows expanded state"
+
+metadata:
+  tags: ["action", "accordion", "expand", "collapse", "ui"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-autocomplete-001.yaml b/eval-server/evals/action-agent/action-agent-autocomplete-001.yaml
new file mode 100644
index 00000000000..c22bfc737c0
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-autocomplete-001.yaml
@@ -0,0 +1,46 @@
+# Autocomplete search test
+id: "action-agent-autocomplete-001"
+name: "Use Autocomplete Search"
+description: "Test typing in autocomplete field and selecting from suggestions"
+enabled: true
+
+target:
+  url: "https://jqueryui.com/autocomplete/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Type \"Java\" in the autocomplete field and select \"JavaScript\" from suggestions"
+  reasoning: "Testing autocomplete/typeahead interaction patterns"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the autocomplete input field"
+      - "Typed \"Java\" to trigger suggestions"
+      - "Autocomplete dropdown appeared with suggestions"
+      - "Selected \"JavaScript\" from the suggestion list"
+      - "Input field shows the selected value"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify \"JavaScript\" appears in the input field"
+        - "Check if autocomplete suggestions appeared"
+        - "Confirm the correct suggestion was selected"
+        - "Ensure dropdown closed after selection"
+
+metadata:
+  tags: ["action", "autocomplete", "typeahead", "search", "suggestions"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-checkbox-001.yaml b/eval-server/evals/action-agent/action-agent-checkbox-001.yaml
new file mode 100644
index 00000000000..b76f3072005
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-checkbox-001.yaml
@@ -0,0 +1,46 @@
+# Checkbox/radio button test
+id: "action-agent-checkbox-001"
+name: "Check \"I have a bike\" Checkbox"
+description: "Test clicking checkbox elements for form options"
+enabled: true
+
+target:
+  url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 45000
+
+input:
+  objective: "Click the checkbox labeled \"I have a bike\" to check it"
+  reasoning: "Testing interaction with checkbox form elements"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Identified the correct checkbox among multiple options"
+      - "Used click action on the checkbox element"
+      - "Checkbox state changed from unchecked to checked"
+      - "Handled the iframe structure if present"
+      - "No errors with form element interaction"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare screenshots to verify the checkbox state changed from unchecked to checked"
+        - "Confirm the \"I have a bike\" checkbox now shows a checkmark"
+        - "Verify the checkbox visual indicator (checkmark) is clearly visible"
+        - "Ensure no other checkboxes were accidentally modified"
+
+metadata:
+  tags: ["action", "checkbox", "form", "w3schools", "input"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-checkbox-002.yaml b/eval-server/evals/action-agent/action-agent-checkbox-002.yaml
new file mode 100644
index 00000000000..0b25fa8195b
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-checkbox-002.yaml
@@ -0,0 +1,47 @@
+# Toggle checkbox test - using HTML form test site
+id: "action-agent-checkbox-002"
+name: "Check Extra Cheese Checkbox"
+description: "Test checking a specific checkbox using the check method"
+enabled: true
+
+target:
+  url: "https://httpbin.org/forms/post"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 45000
+
+input:
+  objective: "Find and check the \"Extra Cheese\" checkbox in the Pizza Toppings section"
+  reasoning: "Testing checkbox interaction functionality using check method"
+  hint: "Look for the Extra Cheese checkbox and use the check method to select it"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the Extra Cheese checkbox in the Pizza Toppings section"
+      - "Used the check method instead of click for better reliability"
+      - "Checkbox became checked (if it wasn't already)"
+      - "No errors occurred during checkbox interaction"
+      - "Form maintained its structure after checkbox selection"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the Extra Cheese checkbox is now checked (shows checkmark)"
+        - "Check that the checkbox shows proper visual feedback for checked state"
+        - "Confirm the form structure remained intact"
+        - "Ensure the checkbox for Extra Cheese was specifically targeted and checked"
+
+metadata:
+  tags: ["action", "checkbox", "check", "form", "httpbin"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-click-001.yaml b/eval-server/evals/action-agent/action-agent-click-001.yaml
new file mode 100644
index 00000000000..e9af6cfdf23
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-click-001.yaml
@@ -0,0 +1,47 @@
+# Basic search interaction test
+id: "action-agent-click-001"
+name: "Search with Text Entry and Click"
+description: "Test entering text in search field and clicking search button"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 90000
+
+input:
+  objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button"
+  reasoning: "Testing multi-step interaction: text input followed by button click"
+  hint: "First fill the search input field, then find and click the search button"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully located the search input field"
+      - "Entered \"DevTools automation\" text in the search box"
+      - "Located the Google Search button after entering text"
+      - "Successfully clicked the search button"
+      - "Search was executed and results page loaded"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify text \"DevTools automation\" was entered in the search field"
+        - "Check if search results page loaded with relevant results"
+        - "Confirm the search was executed (URL changed to results page)"
+        - "Ensure search results are related to \"DevTools automation\""
+
+metadata:
+  tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"]
+  priority: "high"
+  timeout: 90000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-context-001.yaml b/eval-server/evals/action-agent/action-agent-context-001.yaml
new file mode 100644
index 00000000000..61626977f4d
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-context-001.yaml
@@ -0,0 +1,46 @@
+# Right click context menu test
+id: "action-agent-context-001"
+name: "Right Click Context Menu"
+description: "Test right-clicking to open context menu"
+enabled: true
+
+target:
+  url: "https://the-internet.herokuapp.com/context_menu"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Right-click on the context menu area to open the context menu"
+  reasoning: "Testing right-click context menu interaction"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the designated context menu area"
+      - "Performed right-click action correctly"
+      - "Context menu appeared with options"
+      - "Successfully triggered the right-click event"
+      - "Alert or confirmation appeared as expected"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify right-click was performed on correct area"
+        - "Check if context menu or alert appeared"
+        - "Confirm right-click event was properly triggered"
+        - "Ensure the expected response occurred"
+
+metadata:
+  tags: ["action", "context-menu", "right-click", "mouse", "menu"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-datepicker-001.yaml b/eval-server/evals/action-agent/action-agent-datepicker-001.yaml
new file mode 100644
index 00000000000..f4abbf7ac33
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-datepicker-001.yaml
@@ -0,0 +1,46 @@
+# Date picker test
+id: "action-agent-datepicker-001"
+name: "Select Date from Calendar"
+description: "Test clicking date input and selecting a specific date from calendar popup"
+enabled: true
+
+target:
+  url: "https://jqueryui.com/datepicker/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click the date input field and select March 15, 2024 from the calendar picker"
+  reasoning: "Testing interaction with calendar popup widgets"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located and clicked the date input field"
+      - "Calendar popup opened successfully"
+      - "Navigated to correct month/year if needed"
+      - "Selected the specific date (March 15, 2024)"
+      - "Date input field shows the selected date"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the date input field contains the selected date"
+        - "Check if the calendar widget opened and closed properly"
+        - "Confirm the correct date was highlighted and selected"
+        - "Ensure the date format matches expected output"
+
+metadata:
+  tags: ["action", "datepicker", "calendar", "form", "popup"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-daterange-001.yaml b/eval-server/evals/action-agent/action-agent-daterange-001.yaml
new file mode 100644
index 00000000000..4581a472052
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-daterange-001.yaml
@@ -0,0 +1,46 @@
+# Date range picker test
+id: "action-agent-daterange-001"
+name: "Select Date Range"
+description: "Test selecting a date range with start and end dates"
+enabled: true
+
+target:
+  url: "https://www.daterangepicker.com/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Select a date range from February 1, 2024 to February 28, 2024"
+  reasoning: "Testing complex date range selection with start and end dates"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Opened the date range picker interface"
+      - "Selected the start date (February 1, 2024)"
+      - "Selected the end date (February 28, 2024)"
+      - "Date range was properly applied"
+      - "Input field shows the complete date range"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify both start and end dates are displayed in the input"
+        - "Check if the date range picker shows the selected range"
+        - "Confirm the format matches expected date range display"
+        - "Ensure both dates were selected in sequence"
+
+metadata:
+  tags: ["action", "daterange", "date-picker", "form", "complex"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-dropdown-001.yaml b/eval-server/evals/action-agent/action-agent-dropdown-001.yaml
new file mode 100644
index 00000000000..b37b91c3e3f
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-dropdown-001.yaml
@@ -0,0 +1,46 @@
+# Dropdown selection test
+id: "action-agent-dropdown-001"
+name: "Select Dropdown Option"
+description: "Test selecting an option from a dropdown menu"
+enabled: true
+
+target:
+  url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 45000
+
+input:
+  objective: "Select \"Audi\" from the car brands dropdown menu"
+  reasoning: "Testing dropdown selection interaction"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the dropdown/select element"
+      - "Identified the correct option to select"
+      - "Successfully selected the Audi option"
+      - "Dropdown value changed to the selected option"
+      - "Handled select element interaction properly"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare screenshots to verify the dropdown selection changed"
+        - "Confirm \"Audi\" is now displayed as the selected option"
+        - "Check if the dropdown is closed after selection"
+        - "Verify no other form elements were affected by the selection"
+
+metadata:
+  tags: ["action", "dropdown", "select", "form", "w3schools"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-dynamic-001.yaml b/eval-server/evals/action-agent/action-agent-dynamic-001.yaml
new file mode 100644
index 00000000000..a4380f33f3d
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-dynamic-001.yaml
@@ -0,0 +1,46 @@
+# Dynamic content interaction test
+id: "action-agent-dynamic-001"
+name: "Click Dynamic Load Button"
+description: "Test clicking a button that loads dynamic content"
+enabled: true
+
+target:
+  url: "https://the-internet.herokuapp.com/dynamic_loading/1"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 90000
+
+input:
+  objective: "Click the \"Start\" button to trigger dynamic content loading"
+  reasoning: "Testing interaction with dynamically loaded content"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Found and clicked the Start button"
+      - "Handled the dynamic loading process"
+      - "Recognized that content changes after clicking"
+      - "No timing issues with the dynamic content"
+      - "Successfully triggered the loading animation"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare screenshots to verify dynamic content loaded after clicking Start"
+        - "Check if loading animation or spinner was displayed"
+        - "Confirm new content appeared that was previously hidden"
+        - "Verify the Start button state changed or was replaced after clicking"
+
+metadata:
+  tags: ["action", "dynamic", "click", "ajax", "loading"]
+  priority: "high"
+  timeout: 90000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-ecommerce-001.yaml b/eval-server/evals/action-agent/action-agent-ecommerce-001.yaml
new file mode 100644
index 00000000000..503c157d37f
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-ecommerce-001.yaml
@@ -0,0 +1,46 @@
+# E-commerce action test
+id: "action-agent-ecommerce-001"
+name: "Add Product to Cart"
+description: "Test clicking \"Add to Cart\" button on an e-commerce product page"
+enabled: true
+
+target:
+  url: "https://www.homedepot.com/p/Husky-20-Gal-Professional-Duty-Waterproof-Storage-Container-with-Hinged-Lid-in-Red-249160/313799634"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 180000
+
+input:
+  objective: "Click the \"Add to Cart\" button for this storage container"
+  reasoning: "Testing e-commerce interaction with product cart functionality"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the Add to Cart button on the product page"
+      - "Successfully clicked the button"
+      - "Handled any popups or confirmations that appeared"
+      - "Verified the item was added (cart count changed or confirmation shown)"
+      - "Dealt with page dynamics after clicking"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare screenshots to verify the Add to Cart button was clicked"
+        - "Check if cart count indicator increased or shows the item was added"
+        - "Look for any confirmation popup or notification about the item being added"
+        - "Verify the button state changed (e.g., to \"Added to Cart\" or disabled)"
+
+metadata:
+  tags: ["action", "ecommerce", "click", "homedepot", "cart"]
+  priority: "high"
+  timeout: 180000
+  retries: 3
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-error-001.yaml b/eval-server/evals/action-agent/action-agent-error-001.yaml
new file mode 100644
index 00000000000..43c95e6d0ff
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-error-001.yaml
@@ -0,0 +1,47 @@
+# Error recovery test
+id: "action-agent-error-001"
+name: "Handle Missing Element"
+description: "Test agent behavior when target element is not found"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click the \"Sign Up\" button"
+  reasoning: "Testing error handling when element does not exist"
+  hint: "There is no Sign Up button on Google homepage - agent should handle gracefully"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Attempted to find the requested element"
+      - "Recognized that the element does not exist"
+      - "Provided clear error message or explanation"
+      - "Did not crash or produce confusing output"
+      - "Suggested alternatives or explained the issue"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the page remains in a stable state despite the missing element"
+        - "Confirm no error dialogs or broken UI elements appeared"
+        - "Check that the agent handled the missing element gracefully"
+        - "Ensure the page was properly analyzed even though the target was not found"
+
+metadata:
+  tags: ["action", "error-handling", "missing-element", "recovery", "edge-case"]
+  priority: "high"
+  timeout: 60000
+  retries: 1
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-filter-001.yaml b/eval-server/evals/action-agent/action-agent-filter-001.yaml
new file mode 100644
index 00000000000..77829993599
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-filter-001.yaml
@@ -0,0 +1,46 @@
+# Search filter application test
+id: "action-agent-filter-001"
+name: "Apply Search Filters"
+description: "Test applying search filters to modify results"
+enabled: true
+
+target:
+  url: "https://www.w3schools.com/howto/howto_js_filter_lists.asp"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Type \"Anna\" in the search filter to filter the list"
+  reasoning: "Testing search filter application"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the search filter input"
+      - "Typed \"Anna\" in the filter field"
+      - "List items filtered to show only matching results"
+      - "Non-matching items were hidden or removed from view"
+      - "Filter functionality worked as expected"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify search input contains \"Anna\""
+        - "Check if list shows only items containing \"Anna\""
+        - "Confirm non-matching items are not visible"
+        - "Ensure filter functionality reduced the visible list items"
+
+metadata:
+  tags: ["action", "filter", "search", "list", "dynamic"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-form-001.yaml b/eval-server/evals/action-agent/action-agent-form-001.yaml
new file mode 100644
index 00000000000..61d036f683d
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-form-001.yaml
@@ -0,0 +1,46 @@
+# Form fill action test
+id: "action-agent-form-001"
+name: "Fill Search Query"
+description: "Test filling a search input field with specific text"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 45000
+
+input:
+  objective: "Fill the search box with \"Chrome DevTools automation testing\""
+  reasoning: "Testing form input capability with a specific search query"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully identified the search input field"
+      - "Used perform_action with fill method"
+      - "Correctly filled the field with the specified text"
+      - "Verified the field accepted the input"
+      - "No formatting or encoding issues with the text"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare screenshots to confirm text was entered in the search field"
+        - "Verify the exact text \"Chrome DevTools automation testing\" is visible"
+        - "Check if search suggestions or autocomplete dropdown appeared"
+        - "Ensure no input validation errors are shown"
+
+metadata:
+  tags: ["action", "form-fill", "input", "google", "basic"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-hover-001.yaml b/eval-server/evals/action-agent/action-agent-hover-001.yaml
new file mode 100644
index 00000000000..ed98fbf6ef6
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-hover-001.yaml
@@ -0,0 +1,46 @@
+# Hover action test
+id: "action-agent-hover-001"
+name: "Hover to Reveal Caption"
+description: "Test hovering over an element to reveal hidden content"
+enabled: true
+
+target:
+  url: "https://the-internet.herokuapp.com/hovers"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Hover over the first user avatar image to reveal the hidden caption"
+  reasoning: "Testing hover interaction to reveal dynamic content"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the first user avatar image"
+      - "Used appropriate hover action method"
+      - "Successfully triggered the hover state"
+      - "Hidden caption became visible after hover"
+      - "Handled mouse interaction correctly"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare screenshots to verify hover revealed hidden content"
+        - "Check that caption or overlay appeared over the first avatar"
+        - "Confirm the hover state is visually active on the image"
+        - "Verify user information or caption text is now visible"
+
+metadata:
+  tags: ["action", "hover", "mouse", "dynamic", "reveal"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-keyboard-001.yaml b/eval-server/evals/action-agent/action-agent-keyboard-001.yaml
new file mode 100644
index 00000000000..6bfceac0b24
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-keyboard-001.yaml
@@ -0,0 +1,46 @@
+# Keyboard tab navigation test
+id: "action-agent-keyboard-001"
+name: "Keyboard Tab Navigation"
+description: "Test using keyboard navigation to move between elements"
+enabled: true
+
+target:
+  url: "https://www.w3.org/WAI/ARIA/apg/patterns/menubar/examples/menubar-navigation/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Use Tab to move focus into the menubar, arrow keys to navigate between menu items, and Enter to activate"
+  reasoning: "Testing keyboard-only navigation patterns"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully used keyboard navigation"
+      - "Arrow keys moved focus between menu items"
+      - "Focus indicators were visible during navigation"
+      - "Enter key activated the focused menu item"
+      - "Keyboard navigation followed accessibility standards"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify focus indicators are visible on menu items"
+        - "Check if keyboard navigation moved focus correctly"
+        - "Confirm Enter key activated the focused item"
+        - "Ensure accessibility navigation patterns worked"
+
+metadata:
+  tags: ["action", "keyboard", "navigation", "accessibility", "focus"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-login-001.yaml b/eval-server/evals/action-agent/action-agent-login-001.yaml
new file mode 100644
index 00000000000..1b705ce8dee
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-login-001.yaml
@@ -0,0 +1,47 @@
+# Login form test
+id: "action-agent-login-001"
+name: "Fill Login Credentials"
+description: "Test filling username and password fields in a login form"
+enabled: true
+
+target:
+  url: "https://the-internet.herokuapp.com/login"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Fill the username field with \"tomsmith\" and password field with \"SuperSecretPassword!\""
+  reasoning: "Testing form fill with multiple fields including password type"
+  input_data: "<username>tomsmith</username><password>SuperSecretPassword!</password>"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Identified both username and password fields"
+      - "Filled username field with correct value"
+      - "Filled password field with correct value"
+      - "Handled password field type appropriately"
+      - "Used the provided input_data XML format correctly"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the username field shows \"tomsmith\" entered"
+        - "Confirm the password field has dots/asterisks indicating password entry"
+        - "Check that both fields are properly filled before submission"
+        - "Ensure no validation errors are shown for the filled fields"
+
+metadata:
+  tags: ["action", "login", "form-fill", "authentication", "multi-field"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-modal-001.yaml b/eval-server/evals/action-agent/action-agent-modal-001.yaml
new file mode 100644
index 00000000000..1324fee7cf4
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-modal-001.yaml
@@ -0,0 +1,46 @@
+# Modal dialog test
+id: "action-agent-modal-001"
+name: "Open and Close Modal"
+description: "Test opening modal dialog and closing it with X button"
+enabled: true
+
+target:
+  url: "https://getbootstrap.com/docs/5.0/components/modal/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click to open the modal dialog, then close it using the X button"
+  reasoning: "Testing modal dialog interaction patterns"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located and clicked the modal trigger button"
+      - "Modal dialog opened successfully"
+      - "Modal content was visible and accessible"
+      - "Found and clicked the close (X) button"
+      - "Modal closed and page returned to normal state"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify modal opened with visible content"
+        - "Check if modal overlay appeared correctly"
+        - "Confirm modal was closed after clicking X"
+        - "Ensure page background is accessible again"
+
+metadata:
+  tags: ["action", "modal", "dialog", "popup", "overlay"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-multiselect-001.yaml b/eval-server/evals/action-agent/action-agent-multiselect-001.yaml
new file mode 100644
index 00000000000..fed3f78d278
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-multiselect-001.yaml
@@ -0,0 +1,46 @@
+# Multi-select dropdown test
+id: "action-agent-multiselect-001"
+name: "Select Multiple Options"
+description: "Test selecting multiple options from a multi-select dropdown"
+enabled: true
+
+target:
+  url: "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_select_multiple"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Select both \"Volvo\" and \"Audi\" from the multi-select dropdown"
+  reasoning: "Testing multiple selection in select elements"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the multi-select dropdown element"
+      - "Successfully selected Volvo option"
+      - "Successfully selected Audi option"
+      - "Both options remain selected simultaneously"
+      - "Used appropriate multi-select interaction method"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify both Volvo and Audi appear selected"
+        - "Check if both options are highlighted/marked"
+        - "Confirm multi-select functionality worked correctly"
+        - "Ensure no other options were accidentally selected"
+
+metadata:
+  tags: ["action", "multi-select", "dropdown", "form", "multiple"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-multistep-001.yaml b/eval-server/evals/action-agent/action-agent-multistep-001.yaml
new file mode 100644
index 00000000000..31514dde101
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-multistep-001.yaml
@@ -0,0 +1,47 @@
+# Multi-step form test
+id: "action-agent-multistep-001"
+name: "Complete Search and Submit"
+description: "Test filling a search form and then clicking the submit button"
+enabled: true
+
+target:
+  url: "https://www.bing.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Fill the search box with \"automated testing tools\" and then click the search button"
+  reasoning: "Testing multi-step form interaction combining fill and click actions"
+  hint: "This requires two actions: first fill the search field, then click the search button"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Recognized this requires multiple actions"
+      - "First filled the search input correctly"
+      - "Then located and clicked the search button"
+      - "Both actions completed successfully in sequence"
+      - "Search was initiated with the correct query"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the search input contains \"automated testing tools\" text"
+        - "Confirm the search was submitted and results page loaded"
+        - "Check that search results are related to the query"
+        - "Ensure the multi-step action completed fully with both fill and click"
+
+metadata:
+  tags: ["action", "multi-step", "form-fill", "click", "bing", "search"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-nav-001.yaml b/eval-server/evals/action-agent/action-agent-nav-001.yaml
new file mode 100644
index 00000000000..f49a0cf9b89
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-nav-001.yaml
@@ -0,0 +1,46 @@
+# Complex navigation test
+id: "action-agent-nav-001"
+name: "Navigate via Menu Click"
+description: "Test clicking navigation menu items to navigate between pages"
+enabled: true
+
+target:
+  url: "https://www.wikipedia.org"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click on the \"English\" language link to navigate to English Wikipedia"
+  reasoning: "Testing navigation through link clicks on a multilingual site"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Identified the correct language link among many options"
+      - "Successfully clicked the English link"
+      - "Navigation occurred to the English Wikipedia"
+      - "Used appropriate tools to verify navigation success"
+      - "Handled the multilingual page structure correctly"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare screenshots to verify navigation from Wikipedia homepage to English Wikipedia"
+        - "Check if the page language and content changed to English"
+        - "Verify the URL changed to en.wikipedia.org"
+        - "Confirm the English Wikipedia main page is displayed"
+
+metadata:
+  tags: ["action", "navigation", "click", "wikipedia", "multilingual"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-radio-001.yaml b/eval-server/evals/action-agent/action-agent-radio-001.yaml
new file mode 100644
index 00000000000..07d6ef88805
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-radio-001.yaml
@@ -0,0 +1,47 @@
+# Radio button selection test
+id: "action-agent-radio-001"
+name: "Select Radio Button Option"
+description: "Test selecting a specific radio button option using click method"
+enabled: true
+
+target:
+  url: "https://httpbin.org/forms/post"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 45000
+
+input:
+  objective: "Select the \"Medium\" pizza size from the Pizza Size radio button group"
+  reasoning: "Testing radio button selection functionality"
+  hint: "Look for the Medium radio button in the Pizza Size section and click it to select"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the Medium radio button in the Pizza Size section"
+      - "Successfully clicked the Medium radio button"
+      - "Radio button became selected (checked state)"
+      - "Other radio buttons in the same group became unselected"
+      - "Form maintained its structure after radio button selection"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the Medium radio button is now selected (shows filled circle)"
+        - "Check that other pizza size options (Small, Large) are no longer selected"
+        - "Confirm the form structure remained intact"
+        - "Ensure the Medium pizza size radio button was specifically targeted"
+
+metadata:
+  tags: ["action", "radio", "click", "form", "httpbin"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-slider-001.yaml b/eval-server/evals/action-agent/action-agent-slider-001.yaml
new file mode 100644
index 00000000000..c3706587f07
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-slider-001.yaml
@@ -0,0 +1,46 @@
+# Range slider test
+id: "action-agent-slider-001"
+name: "Adjust Range Slider"
+description: "Test moving slider to set a specific value"
+enabled: true
+
+target:
+  url: "https://jqueryui.com/slider/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Move the slider to set the value to 75"
+  reasoning: "Testing slider/range input manipulation"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the slider control element"
+      - "Successfully moved the slider handle"
+      - "Set the slider value to approximately 75"
+      - "Slider position reflects the target value"
+      - "Any associated display shows the correct value"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify slider handle moved to represent value 75"
+        - "Check if value display shows 75 or close to it"
+        - "Confirm slider position visually matches target"
+        - "Ensure slider interaction was smooth and successful"
+
+metadata:
+  tags: ["action", "slider", "range", "form", "drag"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-tableselect-001.yaml b/eval-server/evals/action-agent/action-agent-tableselect-001.yaml
new file mode 100644
index 00000000000..d78e66ca6fb
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-tableselect-001.yaml
@@ -0,0 +1,46 @@
+# Table row selection test
+id: "action-agent-tableselect-001"
+name: "Select Table Row"
+description: "Test clicking to select a table row"
+enabled: true
+
+target:
+  url: "https://datatables.net/examples/api/select_single_row.html"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click on the first row to select it"
+  reasoning: "Testing table row selection patterns"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the first table row"
+      - "Successfully clicked the row"
+      - "Row became highlighted/selected"
+      - "Selection state is visually apparent"
+      - "Only one row is selected at a time"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the first row is now highlighted/selected"
+        - "Check if row selection visual feedback is clear"
+        - "Confirm only the clicked row is selected"
+        - "Ensure row selection styling is properly applied"
+
+metadata:
+  tags: ["action", "table", "select", "row", "highlight"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-tablesort-001.yaml b/eval-server/evals/action-agent/action-agent-tablesort-001.yaml
new file mode 100644
index 00000000000..e3e31764939
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-tablesort-001.yaml
@@ -0,0 +1,46 @@
+# Table column sorting test
+id: "action-agent-tablesort-001"
+name: "Sort Table Column"
+description: "Test clicking table column header to sort data"
+enabled: true
+
+target:
+  url: "https://datatables.net/examples/basic_init/zero_configuration.html"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click on the \"Name\" column header to sort the table by name"
+  reasoning: "Testing table column sorting interaction"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the Name column header"
+      - "Successfully clicked the column header"
+      - "Table data reordered by name alphabetically"
+      - "Sort indicator appeared on the Name column"
+      - "Table sorting completed without errors"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify table rows are now sorted alphabetically by name"
+        - "Check if sort arrow/indicator appears on Name column"
+        - "Confirm the data order changed from before to after"
+        - "Ensure table structure remained intact after sorting"
+
+metadata:
+  tags: ["action", "table", "sort", "column", "data"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-tabs-001.yaml b/eval-server/evals/action-agent/action-agent-tabs-001.yaml
new file mode 100644
index 00000000000..22db60cd572
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-tabs-001.yaml
@@ -0,0 +1,46 @@
+# Tab panel navigation test
+id: "action-agent-tabs-001"
+name: "Navigate Tab Panels"
+description: "Test clicking tab to switch between tab panels"
+enabled: true
+
+target:
+  url: "https://jqueryui.com/tabs/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click on the \"Nunc tincidunt\" tab to switch to that panel"
+  reasoning: "Testing tab panel navigation"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the \"Nunc tincidunt\" tab button"
+      - "Successfully clicked the tab"
+      - "Tab panel content switched to the selected tab"
+      - "Active tab visual state changed appropriately"
+      - "Content area updated to show the new panel"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the \"Nunc tincidunt\" tab is now active/highlighted"
+        - "Check if the content panel changed to show new content"
+        - "Confirm the tab switching animation completed"
+        - "Ensure the correct tab content is visible"
+
+metadata:
+  tags: ["action", "tabs", "navigation", "panels", "ui"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-timepicker-001.yaml b/eval-server/evals/action-agent/action-agent-timepicker-001.yaml
new file mode 100644
index 00000000000..056fbe9c792
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-timepicker-001.yaml
@@ -0,0 +1,46 @@
+# Time picker test
+id: "action-agent-timepicker-001"
+name: "Select Time from Picker"
+description: "Test setting time using time picker controls"
+enabled: true
+
+target:
+  url: "https://timepicker.co/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Set the time to 2:30 PM using the time picker controls"
+  reasoning: "Testing time selection with hour/minute controls"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the time picker interface"
+      - "Set the hour to 2 (14 for 24-hour format)"
+      - "Set the minutes to 30"
+      - "Selected PM or appropriate time format"
+      - "Time input shows 2:30 PM or equivalent"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the time input displays 2:30 PM or 14:30"
+        - "Check if hour and minute were set correctly"
+        - "Confirm AM/PM selection if applicable"
+        - "Ensure the time picker interface was properly used"
+
+metadata:
+  tags: ["action", "timepicker", "time", "form", "clock"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-upload-001.yaml b/eval-server/evals/action-agent/action-agent-upload-001.yaml
new file mode 100644
index 00000000000..518515d61d4
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-upload-001.yaml
@@ -0,0 +1,46 @@
+# File upload test
+id: "action-agent-upload-001"
+name: "Upload File via Input"
+description: "Test clicking file input and uploading a test file"
+enabled: true
+
+target:
+  url: "https://the-internet.herokuapp.com/upload"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Click the file input and upload a test file"
+  reasoning: "Testing file upload interaction through input elements"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the file input element"
+      - "Triggered file selection dialog"
+      - "Selected a file for upload"
+      - "File name appears in the input field"
+      - "Upload process initiated successfully"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify file name appears in the upload input field"
+        - "Check if file selection was successful"
+        - "Confirm upload button is available or file is ready"
+        - "Ensure no upload errors are displayed"
+
+metadata:
+  tags: ["action", "upload", "file", "input", "form"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-video-001.yaml b/eval-server/evals/action-agent/action-agent-video-001.yaml
new file mode 100644
index 00000000000..ba21b28e53c
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-video-001.yaml
@@ -0,0 +1,47 @@
+# Video playback controls test
+id: "action-agent-video-001"
+name: "Control Video Playback"
+description: "Test starting video playback using click + spacebar"
+enabled: true
+
+target:
+  url: "https://www.w3schools.com/html/html5_video.asp"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 90000
+
+input:
+  objective: "Click the video element to focus it, then press spacebar to start playback"
+  reasoning: "Testing video control using standard keyboard interaction (click to focus + spacebar to play)"
+  hint: "First click the Video element to focus it, then use keyboard input to press the spacebar key to start playback"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the Video element in the accessibility tree"
+      - "Successfully clicked the Video element to focus it"
+      - "Used keyboard input to press spacebar"
+      - "Video playback started after spacebar press"
+      - "No errors occurred during the interaction sequence"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify video player is visible on the page"
+        - "Check if playback was started (player controls may show a pause button after)"
+        - "Look for visual indicators that video started playing"
+        - "Ensure no error messages appeared during video interaction"
+
+metadata:
+  tags: ["action", "video", "media", "controls", "playback"]
+  priority: "high"
+  timeout: 90000
+  retries: 3
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/action-agent/action-agent-video-002.yaml b/eval-server/evals/action-agent/action-agent-video-002.yaml
new file mode 100644
index 00000000000..d7188ecd592
--- /dev/null
+++ b/eval-server/evals/action-agent/action-agent-video-002.yaml
@@ -0,0 +1,47 @@
+# Video play button specific targeting test
+id: "action-agent-video-002"
+name: "Click Video Play Button Specifically"
+description: "Test clicking the specific play button (not the video element)"
+enabled: true
+
+target:
+  url: "https://www.w3schools.com/html/html5_video.asp"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Find and click the button that has name=\"play\" (not the Video element itself)"
+  reasoning: "Testing specific targeting of the play button element"
+  hint: "Target the button element with text or label \"play\", do not click the Video element"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Found a button element (not Video element) with \"play\" in the name"
+      - "Successfully clicked the play button specifically"
+      - "Did not click on the Video element itself"
+      - "Play button click was executed correctly"
+      - "Video responded to the button click"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the play button (not video element) was clicked"
+        - "Check if video started playing after button click"
+        - "Confirm the target was the button, not the video container"
+        - "Look for changes in video player state"
+
+metadata:
+  tags: ["action", "video", "button", "specific-targeting"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/research-agent/research-agent-basic-001.yaml b/eval-server/evals/research-agent/research-agent-basic-001.yaml
new file mode 100644
index 00000000000..85743d55c38
--- /dev/null
+++ b/eval-server/evals/research-agent/research-agent-basic-001.yaml
@@ -0,0 +1,39 @@
+# Basic research test - stable topic with clear sources
+id: "research-agent-basic-001"
+name: "Research Chrome DevTools History"
+description: "Research the history and development of Chrome DevTools"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "research_agent"
+timeout: 180000
+
+input:
+  query: "History and development of Chrome DevTools browser developer tools"
+  reasoning: "Testing basic research capabilities on a well-documented technical topic"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0
+    criteria:
+      - "Research covers the origins and early development of Chrome DevTools"
+      - "Information includes key milestones and major feature additions"
+      - "Sources include official documentation or reliable technical sources"
+      - "At least 3-5 different sources were consulted"
+      - "Information is factually accurate and up-to-date"
+      - "Research demonstrates understanding of the topic evolution"
+      - "Handoff to content_writer_agent occurred with comprehensive data"
+
+metadata:
+  tags: ["basic", "technical", "stable", "documentation"]
+  priority: "high"
+  timeout: 180000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/research-agent/research-agent-business-001.yaml b/eval-server/evals/research-agent/research-agent-business-001.yaml
new file mode 100644
index 00000000000..defeed16e6f
--- /dev/null
+++ b/eval-server/evals/research-agent/research-agent-business-001.yaml
@@ -0,0 +1,39 @@
+# Business research test
+id: "research-agent-business-001"
+name: "Research Remote Work Productivity"
+description: "Research remote work impact on productivity and business outcomes"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "research_agent"
+timeout: 240000
+
+input:
+  query: "Remote work productivity statistics impact business outcomes 2024 studies"
+  reasoning: "Testing business research requiring statistical data and multiple perspectives"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Research includes statistical data and survey results"
+      - "Covers multiple perspectives (employee, employer, industry)"
+      - "Sources include business publications, research studies, and reports"
+      - "Information addresses both positive and negative impacts"
+      - "Data is recent and relevant to current work trends"
+      - "Research demonstrates understanding of business implications"
+      - "Statistics and claims are properly sourced"
+
+metadata:
+  tags: ["business", "statistics", "workplace", "comprehensive"]
+  priority: "high"
+  timeout: 240000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/research-agent/research-agent-comparison-001.yaml b/eval-server/evals/research-agent/research-agent-comparison-001.yaml
new file mode 100644
index 00000000000..a433a58d886
--- /dev/null
+++ b/eval-server/evals/research-agent/research-agent-comparison-001.yaml
@@ -0,0 +1,39 @@
+# Comparative research test
+id: "research-agent-comparison-001"
+name: "Compare JavaScript vs TypeScript"
+description: "Research and compare JavaScript and TypeScript for web development"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "research_agent"
+timeout: 200000
+
+input:
+  query: "JavaScript vs TypeScript comparison web development pros cons differences"
+  reasoning: "Testing comparative research requiring balanced analysis of multiple options"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Research covers both JavaScript and TypeScript comprehensively"
+      - "Includes clear comparison points (syntax, features, ecosystem)"
+      - "Presents advantages and disadvantages of each language"
+      - "Sources include technical documentation and developer resources"
+      - "Information is balanced and objective, not biased toward one option"
+      - "Demonstrates understanding of use cases for each language"
+      - "Research data is well-organized for comparative analysis"
+
+metadata:
+  tags: ["comparison", "technical", "programming", "balanced"]
+  priority: "high"
+  timeout: 200000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/research-agent/research-agent-current-001.yaml b/eval-server/evals/research-agent/research-agent-current-001.yaml
new file mode 100644
index 00000000000..198c981c829
--- /dev/null
+++ b/eval-server/evals/research-agent/research-agent-current-001.yaml
@@ -0,0 +1,40 @@
+# Current events research test
+id: "research-agent-current-001"
+name: "Research Latest AI Development Trends"
+description: "Research recent developments in AI and machine learning (last 6 months)"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "research_agent"
+timeout: 240000
+
+input:
+  query: "Latest AI artificial intelligence developments breakthroughs 2024 2025"
+  reasoning: "Testing research on current events and rapidly evolving topics"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    include_url: true
+    criteria:
+      - "Research focuses on recent developments (within last 6 months)"
+      - "Covers multiple aspects of AI development (models, applications, research)"
+      - "Sources are current and from reputable news or research outlets"
+      - "Information includes specific examples or case studies"
+      - "Demonstrates ability to identify current trends vs older information"
+      - "Successfully gathered information from diverse source types"
+      - "Data is properly organized for content writer handoff"
+
+metadata:
+  tags: ["current-events", "ai", "dynamic", "trends"]
+  priority: "high"
+  timeout: 240000
+  retries: 1
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/research-agent/research-agent-edge-001.yaml b/eval-server/evals/research-agent/research-agent-edge-001.yaml
new file mode 100644
index 00000000000..234c832fe97
--- /dev/null
+++ b/eval-server/evals/research-agent/research-agent-edge-001.yaml
@@ -0,0 +1,39 @@
+# No-results edge case test
+id: "research-agent-edge-001"
+name: "Research Obscure Fictional Topic"
+description: "Test handling of queries with very limited or no reliable sources"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "research_agent"
+timeout: 180000
+
+input:
+  query: "quantum bluetooth watermelon encryption algorithm 2024"
+  reasoning: "Testing edge case handling when query yields no meaningful results"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Agent recognizes when query yields limited or unreliable results"
+      - "Demonstrates appropriate search strategy modification"
+      - "Does not fabricate information when sources are unavailable"
+      - "Gracefully handles lack of substantive results"
+      - "Still attempts handoff to content writer with available information"
+      - "Maintains professional approach despite limited data"
+      - "Shows appropriate uncertainty when information is sparse"
+
+metadata:
+  tags: ["edge-case", "no-results", "error-handling", "fictional"]
+  priority: "high"
+  timeout: 180000
+  retries: 1
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/research-agent/research-agent-technical-001.yaml b/eval-server/evals/research-agent/research-agent-technical-001.yaml
new file mode 100644
index 00000000000..c5e25408d53
--- /dev/null
+++ b/eval-server/evals/research-agent/research-agent-technical-001.yaml
@@ -0,0 +1,39 @@
+# Deep technical research test
+id: "research-agent-technical-001"
+name: "Research WebAssembly Performance"
+description: "Deep dive research into WebAssembly performance characteristics and use cases"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "research_agent"
+timeout: 900000
+
+input:
+  query: "WebAssembly WASM performance benchmarks use cases implementation details"
+  reasoning: "Testing deep technical research requiring specialized knowledge synthesis"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Research covers technical details of WebAssembly architecture"
+      - "Includes performance benchmarks and comparison data"
+      - "Discusses practical use cases and implementation scenarios"
+      - "Sources include technical specifications, benchmarks, and expert analysis"
+      - "Information demonstrates deep understanding of the technology"
+      - "Research addresses both benefits and limitations"
+      - "Technical accuracy is maintained throughout"
+
+metadata:
+  tags: ["technical", "deep-dive", "performance", "webassembly"]
+  priority: "high"
+  timeout: 900000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/research-agent/research-agent-tools-001.yaml b/eval-server/evals/research-agent/research-agent-tools-001.yaml
new file mode 100644
index 00000000000..44da108d763
--- /dev/null
+++ b/eval-server/evals/research-agent/research-agent-tools-001.yaml
@@ -0,0 +1,40 @@
+# Tool orchestration test - focuses on how well the agent uses available tools
+id: "research-agent-tools-001"
+name: "Research Python Framework Comparison"
+description: "Research comparing Django vs Flask Python frameworks with focus on tool usage"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "research_agent"
+timeout: 240000
+
+input:
+  query: "Django vs Flask Python web framework comparison features performance"
+  reasoning: "Testing effective orchestration of navigation, extraction, and fetching tools"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Agent effectively used navigate_url to access search engines"
+      - "Schema-based extraction was used to gather structured search results"
+      - "Fetcher tool was used to collect content from multiple URLs"
+      - "Navigation strategy was logical and systematic"
+      - "Tool usage demonstrated purposeful research progression"
+      - "Information from different tools was effectively synthesized"
+      - "At least 3-5 different sources were accessed and processed"
+      - "Final handoff included comprehensive data from all tools"
+
+metadata:
+  tags: ["tool-orchestration", "systematic", "python", "frameworks"]
+  priority: "high"
+  timeout: 240000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/amazon-product-001.yaml b/eval-server/evals/schema-extractor/amazon-product-001.yaml
new file mode 100644
index 00000000000..bfeb975979c
--- /dev/null
+++ b/eval-server/evals/schema-extractor/amazon-product-001.yaml
@@ -0,0 +1,78 @@
+# E-commerce product extraction test
+id: "amazon-product-001"
+name: "Extract Amazon Product Details"
+description: "Extract product information from an Amazon product page"
+enabled: true
+
+target:
+  url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_data"
+timeout: 60000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      product:
+        type: "object"
+        properties:
+          title:
+            type: "string"
+          brand:
+            type: "string"
+          price:
+            type: "object"
+            properties:
+              current:
+                type: "number"
+              currency:
+                type: "string"
+          rating:
+            type: "object"
+            properties:
+              average:
+                type: "number"
+              count:
+                type: "number"
+          images:
+            type: "array"
+            items:
+              type: "string"
+              format: "url"
+          features:
+            type: "array"
+            items:
+              type: "string"
+        required:
+          - "title"
+          - "price"
+      availability:
+        type: "string"
+    required:
+      - "product"
+  instruction: "Extract comprehensive product information including pricing, ratings, and key features"
+  reasoning: "Testing extraction from a dynamic e-commerce page with complex structure"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Product title is accurate and complete"
+      - "Price information is current and properly formatted"
+      - "Rating data includes both average and review count"
+      - "Image URLs are valid and accessible"
+      - "Key product features are captured"
+      - "All URLs are properly resolved (not node IDs)"
+
+metadata:
+  tags: ["ecommerce", "amazon", "product", "dynamic"]
+  priority: "high"
+  timeout: 60000
+  retries: 3
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/bbc-news-001.yaml b/eval-server/evals/schema-extractor/bbc-news-001.yaml
new file mode 100644
index 00000000000..e434d2a874a
--- /dev/null
+++ b/eval-server/evals/schema-extractor/bbc-news-001.yaml
@@ -0,0 +1,69 @@
+# News section headline extraction test
+id: "bbc-news-001"
+name: "Extract BBC News Article"
+description: "Extract headlines and the main story from the BBC News Technology section page"
+enabled: true
+
+target:
+  url: "https://www.bbc.com/news/technology"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_data"
+timeout: 30000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      headlines:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            summary:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+            category:
+              type: "string"
+          required:
+            - "title"
+      mainStory:
+        type: "object"
+        properties:
+          headline:
+            type: "string"
+          summary:
+            type: "string"
+          url:
+            type: "string"
+            format: "url"
+    required:
+      - "headlines"
+  instruction: "Extract the main headlines and featured stories from the BBC Technology news section"
+  reasoning: "Testing extraction from a news aggregation page with multiple articles"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    include_url: true
+    criteria:
+      - "Headlines are current and relevant to technology news"
+      - "Article summaries provide meaningful context"
+      - "URLs link to valid BBC news articles"
+      - "Main story is properly identified"
+      - "All extracted content is in English"
+
+metadata:
+  tags: ["news", "bbc", "aggregation", "dynamic"]
+  priority: "high"
+  timeout: 30000
+  retries: 2
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/bing-search-001.yaml b/eval-server/evals/schema-extractor/bing-search-001.yaml
new file mode 100644
index 00000000000..8488f341b43
--- /dev/null
+++ b/eval-server/evals/schema-extractor/bing-search-001.yaml
@@ -0,0 +1,70 @@
+# Bing Search results extraction test
+id: "bing-search-001"
+name: "Extract Bing Search Results"
+description: "Extract search results from Bing search page"
+enabled: true
+
+target:
+  url: "https://www.bing.com/search?q=web+scraping+best+practices"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_data"
+timeout: 45000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      query:
+        type: "string"
+      searchResults:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+            snippet:
+              type: "string"
+            datePublished:
+              type: "string"
+          required:
+            - "title"
+            - "url"
+            - "snippet"
+      sidebarInfo:
+        type: "object"
+        properties:
+          title:
+            type: "string"
+          description:
+            type: "string"
+          source:
+            type: "string"
+    required:
+      - "searchResults"
+  instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing"
+  reasoning: "Testing extraction from Bing search results with different layout than Google"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Search results match the query intent"
+      - "Results include valid URLs and meaningful snippets"
+      - "Sidebar information is extracted when present"
+      - "No duplicate results in the list"
+
+metadata:
+  tags: ["search", "bing", "serp", "dynamic"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/github-repo-001-streamlined.yaml b/eval-server/evals/schema-extractor/github-repo-001-streamlined.yaml
new file mode 100644
index 00000000000..07532e7d13f
--- /dev/null
+++ b/eval-server/evals/schema-extractor/github-repo-001-streamlined.yaml
@@ -0,0 +1,66 @@
+# Simple structured data test (Streamlined version)
+id: "github-repo-001-streamlined"
+name: "Extract GitHub Repository Info (Streamlined)"
+description: "Extract basic repository information from a GitHub page using streamlined extractor"
+enabled: true
+
+target:
+  url: "https://github.com/microsoft/TypeScript"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 30000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      name:
+        type: "string"
+      description:
+        type: "string"
+      language:
+        type: "string"
+      stars:
+        type: "number"
+      forks:
+        type: "number"
+      topics:
+        type: "array"
+        items:
+          type: "string"
+      readme:
+        type: "object"
+        properties:
+          summary:
+            type: "string"
+    required:
+      - "name"
+      - "description"
+  instruction: "Extract repository metadata and basic statistics"
+  reasoning: "Testing extraction from a well-structured GitHub repository page"
+
+validation:
+  type: "hybrid"
+  snapshot:
+    exclude_paths:
+      - "stars"
+      - "forks"
+    structure_only: false
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Repository name matches the GitHub page"
+      - "Description accurately reflects the project purpose"
+      - "Programming language is correctly identified"
+      - "Topic tags are relevant to the project"
+
+metadata:
+  tags: ["github", "repository", "structured", "streamlined"]
+  priority: "high"
+  timeout: 30000
+  retries: 1
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/github-repo-001.yaml b/eval-server/evals/schema-extractor/github-repo-001.yaml
new file mode 100644
index 00000000000..7a01a14043e
--- /dev/null
+++ b/eval-server/evals/schema-extractor/github-repo-001.yaml
@@ -0,0 +1,66 @@
+# Simple structured data test
+id: "github-repo-001"
+name: "Extract GitHub Repository Info"
+description: "Extract basic repository information from a GitHub page"
+enabled: true
+
+target:
+  url: "https://github.com/microsoft/TypeScript"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_data"
+timeout: 30000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      name:
+        type: "string"
+      description:
+        type: "string"
+      language:
+        type: "string"
+      stars:
+        type: "number"
+      forks:
+        type: "number"
+      topics:
+        type: "array"
+        items:
+          type: "string"
+      readme:
+        type: "object"
+        properties:
+          summary:
+            type: "string"
+    required:
+      - "name"
+      - "description"
+  instruction: "Extract repository metadata and basic statistics"
+  reasoning: "Testing extraction from a well-structured GitHub repository page"
+
+validation:
+  type: "hybrid"
+  snapshot:
+    exclude_paths:
+      - "stars"
+      - "forks"
+    structure_only: false
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Repository name matches the GitHub page"
+      - "Description accurately reflects the project purpose"
+      - "Programming language is correctly identified"
+      - "Topic tags are relevant to the project"
+
+metadata:
+  tags: ["github", "repository", "structured"]
+  priority: "high"
+  timeout: 30000
+  retries: 1
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/google-flights-001.yaml b/eval-server/evals/schema-extractor/google-flights-001.yaml
new file mode 100644
index 00000000000..80da1bb7bb5
--- /dev/null
+++ b/eval-server/evals/schema-extractor/google-flights-001.yaml
@@ -0,0 +1,106 @@
+# Google Flights search extraction test (target URL encodes fixed travel dates 2025-12-24 / 2025-12-31; refresh the URL once they pass)
+id: "google-flights-001"
+name: "Extract Google Flights Search Results"
+description: "Extract flight options from Google Flights search"
+enabled: true
+
+target:
+  url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_data"
+timeout: 60000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      searchCriteria:
+        type: "object"
+        properties:
+          origin:
+            type: "string"
+          destination:
+            type: "string"
+          departureDate:
+            type: "string"
+          returnDate:
+            type: "string"
+          tripType:
+            type: "string"
+          passengers:
+            type: "number"
+      flights:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            airline:
+              type: "string"
+            flightNumber:
+              type: "string"
+            departureTime:
+              type: "string"
+            arrivalTime:
+              type: "string"
+            duration:
+              type: "string"
+            stops:
+              type: "number"
+            price:
+              type: "object"
+              properties:
+                amount:
+                  type: "number"
+                currency:
+                  type: "string"
+            cabin:
+              type: "string"
+            bookingUrl:
+              type: "string"
+              format: "url"
+            legroom:
+              type: "string"
+            amenities:
+              type: "array"
+              items:
+                type: "string"
+          required:
+            - "airline"
+            - "departureTime"
+            - "arrivalTime"
+            - "price"
+      priceInsights:
+        type: "object"
+        properties:
+          trend:
+            type: "string"
+          recommendation:
+            type: "string"
+          averagePrice:
+            type: "number"
+    required:
+      - "flights"
+  instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results"
+  reasoning: "Testing extraction from complex travel search interface with dynamic pricing"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Flight times are in proper format"
+      - "Prices are numeric values with currency"
+      - "Airlines and flight numbers are accurate"
+      - "Stop information is correctly identified"
+      - "Duration is in readable format"
+
+metadata:
+  tags: ["travel", "flights", "google", "booking"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/google-search-001.yaml b/eval-server/evals/schema-extractor/google-search-001.yaml
new file mode 100644
index 00000000000..7e6f0e6a4eb
--- /dev/null
+++ b/eval-server/evals/schema-extractor/google-search-001.yaml
@@ -0,0 +1,76 @@
+# Google Search results extraction test
+id: "google-search-001"
+name: "Extract Google Search Results"
+description: "Extract search results from Google search page"
+enabled: true
+
+target:
+  url: "https://www.google.com/search?q=chrome+devtools+tutorial"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_data"
+timeout: 45000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      query:
+        type: "string"
+      searchResults:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+            snippet:
+              type: "string"
+            domain:
+              type: "string"
+          required:
+            - "title"
+            - "url"
+            - "snippet"
+      featuredSnippet:
+        type: "object"
+        properties:
+          content:
+            type: "string"
+          source:
+            type: "string"
+          url:
+            type: "string"
+            format: "url"
+      relatedSearches:
+        type: "array"
+        items:
+          type: "string"
+    required:
+      - "searchResults"
+  instruction: "Extract the top 10 search results with titles, URLs, and snippets. Also extract featured snippet if present and related searches"
+  reasoning: "Testing extraction from Google search results page with various result types"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Search results are relevant to the query"
+      - "Each result has a valid title, URL, and snippet"
+      - "URLs are properly resolved and not node IDs"
+      - "Related searches are extracted if present"
+      - "Featured snippet is captured when available"
+
+metadata:
+  tags: ["search", "google", "serp", "dynamic"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/homedepot-001.yaml b/eval-server/evals/schema-extractor/homedepot-001.yaml
new file mode 100644
index 00000000000..4e8b835b66d
--- /dev/null
+++ b/eval-server/evals/schema-extractor/homedepot-001.yaml
@@ -0,0 +1,92 @@
+# Home Depot product search extraction test
+id: "homedepot-001"
+name: "Extract Home Depot Product Search"
+description: "Extract product listings from Home Depot search results"
+enabled: true
+
+target:
+  url: "https://www.homedepot.com/s/power%2520drill"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_data"
+timeout: 60000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      searchQuery:
+        type: "string"
+      totalResults:
+        type: "number"
+      products:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            name:
+              type: "string"
+            brand:
+              type: "string"
+            price:
+              type: "number"
+            originalPrice:
+              type: "number"
+            savings:
+              type: "number"
+            rating:
+              type: "number"
+            reviewCount:
+              type: "number"
+            productUrl:
+              type: "string"
+              format: "url"
+            imageUrl:
+              type: "string"
+              format: "url"
+            availability:
+              type: "string"
+            features:
+              type: "array"
+              items:
+                type: "string"
+          required:
+            - "name"
+            - "price"
+            - "productUrl"
+      filters:
+        type: "object"
+        properties:
+          brands:
+            type: "array"
+            items:
+              type: "string"
+          priceRanges:
+            type: "array"
+            items:
+              type: "string"
+    required:
+      - "products"
+  instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability"
+  reasoning: "Testing extraction from e-commerce search results with product cards and filters"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Products are relevant to the search query"
+      - "Prices are numeric values in USD"
+      - "Product URLs link to Home Depot product pages"
+      - "Ratings are on a 5-star scale"
+      - "Key product features are captured"
+
+metadata:
+  tags: ["ecommerce", "homedepot", "products", "search"]
+  priority: "high"
+  timeout: 60000
+  retries: 3
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/macys-001.yaml b/eval-server/evals/schema-extractor/macys-001.yaml
new file mode 100644
index 00000000000..23a4e37dec0
--- /dev/null
+++ b/eval-server/evals/schema-extractor/macys-001.yaml
@@ -0,0 +1,106 @@
+# Macy's product listing extraction test
+id: "macys-001"
+name: "Extract Macy's Product Listings"
+description: "Extract fashion products from Macy's category page"
+enabled: true
+
+target:
+  url: "https://www.macys.com/shop/womens-clothing/womens-dresses"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_data"
+timeout: 60000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      category:
+        type: "string"
+      totalProducts:
+        type: "number"
+      products:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            name:
+              type: "string"
+            brand:
+              type: "string"
+            currentPrice:
+              type: "number"
+            originalPrice:
+              type: "number"
+            discount:
+              type: "string"
+            colors:
+              type: "array"
+              items:
+                type: "string"
+            sizes:
+              type: "array"
+              items:
+                type: "string"
+            rating:
+              type: "number"
+            reviewCount:
+              type: "number"
+            productUrl:
+              type: "string"
+              format: "url"
+            imageUrl:
+              type: "string"
+              format: "url"
+            promotions:
+              type: "array"
+              items:
+                type: "string"
+          required:
+            - "name"
+            - "brand"
+            - "currentPrice"
+      refinements:
+        type: "object"
+        properties:
+          brands:
+            type: "array"
+            items:
+              type: "string"
+          sizes:
+            type: "array"
+            items:
+              type: "string"
+          colors:
+            type: "array"
+            items:
+              type: "string"
+          priceRanges:
+            type: "array"
+            items:
+              type: "string"
+    required:
+      - "products"
+  instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's"
+  reasoning: "Testing extraction from fashion e-commerce with complex product attributes"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Products are from the correct category"
+      - "Prices reflect current and sale prices"
+      - "Color and size options are captured"
+      - "Brand names are accurately extracted"
+      - "Promotional text is included when present"
+
+metadata:
+  tags: ["ecommerce", "macys", "fashion", "products"]
+  priority: "high"
+  timeout: 60000
+  retries: 3
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/schema-extractor/wikipedia-search-001.yaml b/eval-server/evals/schema-extractor/wikipedia-search-001.yaml
new file mode 100644
index 00000000000..ad5f2f43b82
--- /dev/null
+++ b/eval-server/evals/schema-extractor/wikipedia-search-001.yaml
@@ -0,0 +1,77 @@
+# Wikipedia search results extraction test
+id: "wikipedia-search-001"
+name: "Extract Wikipedia Search Results"
+description: "Extract search results from Wikipedia search"
+enabled: true
+
+target:
+  url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_data"
+timeout: 30000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      searchTerm:
+        type: "string"
+      resultCount:
+        type: "number"
+      searchResults:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+            snippet:
+              type: "string"
+            category:
+              type: "string"
+            wordCount:
+              type: "number"
+            lastEdited:
+              type: "string"
+          required:
+            - "title"
+            - "url"
+            - "snippet"
+      suggestedArticles:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+    required:
+      - "searchResults"
+  instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date"
+  reasoning: "Testing extraction from Wikipedia's internal search with rich metadata"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Search results are Wikipedia articles"
+      - "Each result has a valid Wikipedia URL"
+      - "Snippets contain relevant content highlights"
+      - "Metadata like word count is extracted when available"
+
+metadata:
+  tags: ["search", "wikipedia", "encyclopedia"]
+  priority: "high"
+  timeout: 30000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/screenshot-verification/dynamic-content-verification-001.yaml b/eval-server/evals/screenshot-verification/dynamic-content-verification-001.yaml
new file mode 100644
index 00000000000..ce271614e03
--- /dev/null
+++ b/eval-server/evals/screenshot-verification/dynamic-content-verification-001.yaml
@@ -0,0 +1,47 @@
+# Dynamic content visual verification test
+id: "dynamic-content-verification-001"
+name: "Dynamic Content Visual Verification"
+description: "Test visual verification of dynamic content loading using screenshots"
+enabled: true
+
+target:
+  url: "https://the-internet.herokuapp.com/dynamic_loading/1"
+
+tool: "action_agent"
+timeout: 90000
+
+input:
+  objective: "Take a screenshot, click the Start button, wait for content to load, then take another screenshot to verify the dynamic content appeared"
+  reasoning: "Testing visual verification of dynamic content changes using screenshot comparison"
+  hint: "Use take_screenshot before clicking Start, then again after the dynamic content loads"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4o"
+    criteria:
+      - "Initial screenshot captured the page before dynamic loading"
+      - "Start button was successfully clicked"
+      - "Agent waited for dynamic content to fully load"
+      - "Final screenshot shows the revealed dynamic content"
+      - "Visual comparison demonstrates successful content loading verification"
+      - "Screenshots show clear before/after difference in content visibility"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare screenshots to verify dynamic content loading"
+        - "Confirm the first screenshot shows hidden content area"
+        - "Verify the second screenshot shows the revealed 'Hello World!' text"
+        - "Check that the loading animation or process is properly captured"
+
+metadata:
+  tags: ["screenshot", "dynamic-content", "visual-verification", "loading"]
+  priority: "high"
+  timeout: 90000
+  retries: 2
+  flaky: true
\ No newline at end of file
diff --git a/eval-server/evals/screenshot-verification/screenshot-error-handling-001.yaml b/eval-server/evals/screenshot-verification/screenshot-error-handling-001.yaml
new file mode 100644
index 00000000000..4a98da0d81a
--- /dev/null
+++ b/eval-server/evals/screenshot-verification/screenshot-error-handling-001.yaml
@@ -0,0 +1,44 @@
+# Screenshot error handling test
+id: "screenshot-error-handling-001"
+name: "Screenshot Error Handling"
+description: "Test screenshot tool error handling and recovery"
+enabled: true
+
+target:
+  url: "https://httpstat.us/500"
+
+tool: "take_screenshot"
+timeout: 30000
+
+input:
+  fullPage: false
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4o"
+    criteria:
+      - "Screenshot tool handled the error page gracefully"
+      - "Either successfully captured the error page or reported appropriate error"
+      - "No crashes or undefined behavior occurred"
+      - "Tool response is meaningful regardless of page loading issues"
+      - "Error handling demonstrates robustness of screenshot functionality"
+    visual_verification:
+      enabled: true
+      capture_before: false
+      capture_after: true
+      prompts:
+        - "If screenshot was taken, verify it shows the error page content"
+        - "Check that the tool handled the HTTP 500 error appropriately"
+        - "Confirm no blank or corrupted screenshots were produced"
+        - "Ensure error scenarios are handled professionally"
+
+metadata:
+  tags: ["screenshot", "error-handling", "robustness", "edge-case"]
+  priority: "normal"
+  timeout: 30000
+  retries: 1
+  flaky: true
\ No newline at end of file
diff --git a/eval-server/evals/screenshot-verification/screenshot-fullpage-001.yaml b/eval-server/evals/screenshot-verification/screenshot-fullpage-001.yaml
new file mode 100644
index 00000000000..b592f8c6bc0
--- /dev/null
+++ b/eval-server/evals/screenshot-verification/screenshot-fullpage-001.yaml
@@ -0,0 +1,45 @@
+# Full page screenshot verification test
+id: "screenshot-fullpage-001"
+name: "Take Full Page Screenshot"
+description: "Test taking full page screenshot and verify functionality"
+enabled: true
+
+target:
+  url: "https://en.wikipedia.org/wiki/Chrome_DevTools"
+
+tool: "take_screenshot"
+timeout: 45000
+
+input:
+  fullPage: true
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4o"
+    criteria:
+      - "Full page screenshot was successfully captured"
+      - "Data URL contains valid image data"
+      - "Screenshot captures the entire page content including areas below the fold"
+      - "Image size is larger than viewport-only screenshot would be"
+      - "No errors occurred during full page capture"
+      - "Screenshot includes both header and footer content"
+    visual_verification:
+      enabled: true
+      capture_before: false
+      capture_after: true
+      prompts:
+        - "Verify the screenshot shows the complete Wikipedia article page"
+        - "Check that content above and below the fold is captured"
+        - "Confirm the image is taller than a typical viewport"
+        - "Ensure no content is cut off at the bottom"
+
+metadata:
+  tags: ["screenshot", "fullpage", "visual", "verification", "wikipedia"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: false
\ No newline at end of file
diff --git a/eval-server/evals/screenshot-verification/screenshot-viewport-001.yaml b/eval-server/evals/screenshot-verification/screenshot-viewport-001.yaml
new file mode 100644
index 00000000000..54833d68f26
--- /dev/null
+++ b/eval-server/evals/screenshot-verification/screenshot-viewport-001.yaml
@@ -0,0 +1,44 @@
+# Viewport screenshot verification test
+id: "screenshot-viewport-001"
+name: "Take Viewport Screenshot"
+description: "Test taking viewport screenshot and verify functionality"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+
+tool: "take_screenshot"
+timeout: 30000
+
+input:
+  fullPage: false
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4o"
+    criteria:
+      - "Screenshot was successfully captured"
+      - "Data URL is properly formatted and contains image data"
+      - "Screenshot shows the viewport content correctly"
+      - "No errors occurred during screenshot capture"
+      - "Image data length indicates a valid screenshot was taken"
+    visual_verification:
+      enabled: true
+      capture_before: false
+      capture_after: true
+      prompts:
+        - "Verify the screenshot shows the Google homepage"
+        - "Check that the screenshot is not empty or corrupted"
+        - "Confirm the image quality is appropriate for verification"
+        - "Ensure the screenshot captures the current viewport accurately"
+
+metadata:
+  tags: ["screenshot", "viewport", "visual", "verification"]
+  priority: "high"
+  timeout: 30000
+  retries: 2
+  flaky: false
\ No newline at end of file
diff --git a/eval-server/evals/screenshot-verification/visual-comparison-001.yaml b/eval-server/evals/screenshot-verification/visual-comparison-001.yaml
new file mode 100644
index 00000000000..035447993f2
--- /dev/null
+++ b/eval-server/evals/screenshot-verification/visual-comparison-001.yaml
@@ -0,0 +1,47 @@
+# Visual comparison verification test
+id: "visual-comparison-001"
+name: "Visual Comparison Before and After Action"
+description: "Test visual verification by comparing screenshots before and after an action"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+
+tool: "action_agent"
+timeout: 60000
+
+input:
+  objective: "Take a screenshot, then type 'DevTools testing' in the search box, and take another screenshot to compare"
+  reasoning: "Testing visual verification workflow with before/after screenshot comparison"
+  hint: "Use take_screenshot tool before and after performing the search input action"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4o"
+    criteria:
+      - "Initial screenshot was taken before performing any actions"
+      - "Search text was successfully entered into the search field"
+      - "Second screenshot was taken after the text input"
+      - "Visual comparison shows the difference between before and after states"
+      - "Search field contains the entered text in the final screenshot"
+      - "Screenshots demonstrate successful action verification workflow"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Compare the before and after screenshots"
+        - "Verify the search field is empty in the first screenshot"
+        - "Confirm the search field contains 'DevTools testing' in the second screenshot"
+        - "Check that the visual changes accurately reflect the performed action"
+
+metadata:
+  tags: ["screenshot", "visual-comparison", "action-verification", "before-after"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: false
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/amazon-product-001.yaml b/eval-server/evals/streamlined-schema-extractor/amazon-product-001.yaml
new file mode 100644
index 00000000000..b1544549e4e
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/amazon-product-001.yaml
@@ -0,0 +1,78 @@
+# E-commerce product extraction test (Streamlined)
+id: "amazon-product-001"
+name: "Extract Amazon Product Details"
+description: "Extract product information from an Amazon product page"
+enabled: true
+
+target:
+  url: "https://www.amazon.com/Obelisk-Climbing-Rustproof-Trellises-Clematis/dp/B0B4SBY6QD/"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 60000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      product:
+        type: "object"
+        properties:
+          title:
+            type: "string"
+          brand:
+            type: "string"
+          price:
+            type: "object"
+            properties:
+              current:
+                type: "number"
+              currency:
+                type: "string"
+          rating:
+            type: "object"
+            properties:
+              average:
+                type: "number"
+              count:
+                type: "number"
+          images:
+            type: "array"
+            items:
+              type: "string"
+              format: "url"
+          features:
+            type: "array"
+            items:
+              type: "string"
+        required:
+          - "title"
+          - "price"
+      availability:
+        type: "string"
+    required:
+      - "product"
+  instruction: "Extract comprehensive product information including pricing, ratings, and key features"
+  reasoning: "Testing extraction from a dynamic e-commerce page with complex structure"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Product title is accurate and complete"
+      - "Price information is current and properly formatted"
+      - "Rating data includes both average and review count"
+      - "Image URLs are valid and accessible"
+      - "Key product features are captured"
+      - "All URLs are properly resolved (not node IDs)"
+
+metadata:
+  tags: ["ecommerce", "amazon", "product", "dynamic"]
+  priority: "high"
+  timeout: 60000
+  retries: 3
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/bbc-news-001.yaml b/eval-server/evals/streamlined-schema-extractor/bbc-news-001.yaml
new file mode 100644
index 00000000000..31ef2883ecd
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/bbc-news-001.yaml
@@ -0,0 +1,69 @@
+# News article extraction test (Streamlined)
+id: "bbc-news-001"
+name: "Extract BBC News Article"
+description: "Extract headlines and the main story from the BBC News Technology section page"
+enabled: true
+
+target:
+  url: "https://www.bbc.com/news/technology"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 30000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      headlines:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            summary:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+            category:
+              type: "string"
+          required:
+            - "title"
+      mainStory:
+        type: "object"
+        properties:
+          headline:
+            type: "string"
+          summary:
+            type: "string"
+          url:
+            type: "string"
+            format: "url"
+    required:
+      - "headlines"
+  instruction: "Extract the main headlines and featured stories from the BBC Technology news section"
+  reasoning: "Testing extraction from a news aggregation page with multiple articles"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    include_url: true
+    criteria:
+      - "Headlines are current and relevant to technology news"
+      - "Article summaries provide meaningful context"
+      - "URLs link to valid BBC news articles"
+      - "Main story is properly identified"
+      - "All extracted content is in English"
+
+metadata:
+  tags: ["news", "bbc", "aggregation", "dynamic"]
+  priority: "high"
+  timeout: 30000
+  retries: 2
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/bing-search-001.yaml b/eval-server/evals/streamlined-schema-extractor/bing-search-001.yaml
new file mode 100644
index 00000000000..e9f3b6edb33
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/bing-search-001.yaml
@@ -0,0 +1,70 @@
+# Bing Search results extraction test (Streamlined)
+id: "bing-search-001"
+name: "Extract Bing Search Results"
+description: "Extract search results from Bing search page"
+enabled: true
+
+target:
+  url: "https://www.bing.com/search?q=web+scraping+best+practices"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 45000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      query:
+        type: "string"
+      searchResults:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+            snippet:
+              type: "string"
+            datePublished:
+              type: "string"
+          required:
+            - "title"
+            - "url"
+            - "snippet"
+      sidebarInfo:
+        type: "object"
+        properties:
+          title:
+            type: "string"
+          description:
+            type: "string"
+          source:
+            type: "string"
+    required:
+      - "searchResults"
+  instruction: "Extract search results including titles, URLs, snippets, and any sidebar information from Bing"
+  reasoning: "Testing extraction from Bing search results with different layout than Google"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Search results match the query intent"
+      - "Results include valid URLs and meaningful snippets"
+      - "Sidebar information is extracted when present"
+      - "No duplicate results in the list"
+
+metadata:
+  tags: ["search", "bing", "serp", "dynamic"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/github-repo-001.yaml b/eval-server/evals/streamlined-schema-extractor/github-repo-001.yaml
new file mode 100644
index 00000000000..5c496c518f5
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/github-repo-001.yaml
@@ -0,0 +1,66 @@
+# Simple structured data test (Streamlined)
+id: "github-repo-001"
+name: "Extract GitHub Repository Info"
+description: "Extract basic repository information from a GitHub page"
+enabled: true
+
+target:
+  url: "https://github.com/microsoft/TypeScript"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 30000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      name:
+        type: "string"
+      description:
+        type: "string"
+      language:
+        type: "string"
+      stars:
+        type: "number"
+      forks:
+        type: "number"
+      topics:
+        type: "array"
+        items:
+          type: "string"
+      readme:
+        type: "object"
+        properties:
+          summary:
+            type: "string"
+    required:
+      - "name"
+      - "description"
+  instruction: "Extract repository metadata and basic statistics"
+  reasoning: "Testing extraction from a well-structured GitHub repository page"
+
+validation:
+  type: "hybrid"
+  snapshot:
+    exclude_paths:
+      - "stars"
+      - "forks"
+    structure_only: false
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Repository name matches the GitHub page"
+      - "Description accurately reflects the project purpose"
+      - "Programming language is correctly identified"
+      - "Topic tags are relevant to the project"
+
+metadata:
+  tags: ["github", "repository", "structured"]
+  priority: "high"
+  timeout: 30000
+  retries: 1
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/google-flights-001.yaml b/eval-server/evals/streamlined-schema-extractor/google-flights-001.yaml
new file mode 100644
index 00000000000..981ccbd48dc
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/google-flights-001.yaml
@@ -0,0 +1,106 @@
+# Google Flights search extraction test (Streamlined)
+id: "google-flights-001"
+name: "Extract Google Flights Search Results"
+description: "Extract flight options from Google Flights search"
+enabled: true
+
+target:
+  url: "https://www.google.com/travel/flights/search?tfs=CBwQAhojEgoyMDI1LTEyLTI0agwIAhIIL20vMGQ5anJyBwgBEgNTRk8aIxIKMjAyNS0xMi0zMWoHCAESA1NGT3IMCAISCC9tLzBkOWpyQAFIAXABggELCP___________wGYAQE"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 60000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      searchCriteria:
+        type: "object"
+        properties:
+          origin:
+            type: "string"
+          destination:
+            type: "string"
+          departureDate:
+            type: "string"
+          returnDate:
+            type: "string"
+          tripType:
+            type: "string"
+          passengers:
+            type: "number"
+      flights:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            airline:
+              type: "string"
+            flightNumber:
+              type: "string"
+            departureTime:
+              type: "string"
+            arrivalTime:
+              type: "string"
+            duration:
+              type: "string"
+            stops:
+              type: "number"
+            price:
+              type: "object"
+              properties:
+                amount:
+                  type: "number"
+                currency:
+                  type: "string"
+            cabin:
+              type: "string"
+            bookingUrl:
+              type: "string"
+              format: "url"
+            legroom:
+              type: "string"
+            amenities:
+              type: "array"
+              items:
+                type: "string"
+          required:
+            - "airline"
+            - "departureTime"
+            - "arrivalTime"
+            - "price"
+      priceInsights:
+        type: "object"
+        properties:
+          trend:
+            type: "string"
+          recommendation:
+            type: "string"
+          averagePrice:
+            type: "number"
+    required:
+      - "flights"
+  instruction: "Extract flight options including airlines, times, prices, and amenities from Google Flights results"
+  reasoning: "Testing extraction from complex travel search interface with dynamic pricing"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Flight times are in proper format"
+      - "Prices are numeric values with currency"
+      - "Airlines and flight numbers are accurate"
+      - "Stop information is correctly identified"
+      - "Duration is in readable format"
+
+metadata:
+  tags: ["travel", "flights", "google", "booking"]
+  priority: "high"
+  timeout: 60000
+  retries: 2
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/google-search-001.yaml b/eval-server/evals/streamlined-schema-extractor/google-search-001.yaml
new file mode 100644
index 00000000000..c1725d481d6
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/google-search-001.yaml
@@ -0,0 +1,76 @@
+# Google Search results extraction test (Streamlined)
+id: "google-search-001"
+name: "Extract Google Search Results"
+description: "Extract search results from Google search page"
+enabled: true
+
+target:
+  url: "https://www.google.com/search?q=chrome+devtools+tutorial"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 45000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      query:
+        type: "string"
+      searchResults:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+            snippet:
+              type: "string"
+            domain:
+              type: "string"
+          required:
+            - "title"
+            - "url"
+            - "snippet"
+      featuredSnippet:
+        type: "object"
+        properties:
+          content:
+            type: "string"
+          source:
+            type: "string"
+          url:
+            type: "string"
+            format: "url"
+      relatedSearches:
+        type: "array"
+        items:
+          type: "string"
+    required:
+      - "searchResults"
+  instruction: "Extract the top 10 search results with titles, URLs, and snippets. Also extract featured snippet if present and related searches"
+  reasoning: "Testing extraction from Google search results page with various result types"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Search results are relevant to the query"
+      - "Each result has a valid title, URL, and snippet"
+      - "URLs are properly resolved and not node IDs"
+      - "Related searches are extracted if present"
+      - "Featured snippet is captured when available"
+
+metadata:
+  tags: ["search", "google", "serp", "dynamic"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/homedepot-001.yaml b/eval-server/evals/streamlined-schema-extractor/homedepot-001.yaml
new file mode 100644
index 00000000000..1d268488a3b
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/homedepot-001.yaml
@@ -0,0 +1,92 @@
+# Home Depot product search extraction test
+id: "homedepot-001"
+name: "Extract Home Depot Product Search"
+description: "Extract product listings from Home Depot search results"
+enabled: true
+
+target:
+  url: "https://www.homedepot.com/s/power%2520drill"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 60000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      searchQuery:
+        type: "string"
+      totalResults:
+        type: "number"
+      products:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            name:
+              type: "string"
+            brand:
+              type: "string"
+            price:
+              type: "number"
+            originalPrice:
+              type: "number"
+            savings:
+              type: "number"
+            rating:
+              type: "number"
+            reviewCount:
+              type: "number"
+            productUrl:
+              type: "string"
+              format: "url"
+            imageUrl:
+              type: "string"
+              format: "url"
+            availability:
+              type: "string"
+            features:
+              type: "array"
+              items:
+                type: "string"
+          required:
+            - "name"
+            - "price"
+            - "productUrl"
+      filters:
+        type: "object"
+        properties:
+          brands:
+            type: "array"
+            items:
+              type: "string"
+          priceRanges:
+            type: "array"
+            items:
+              type: "string"
+    required:
+      - "products"
+  instruction: "Extract product listings from Home Depot search results including prices, ratings, and availability"
+  reasoning: "Testing extraction from e-commerce search results with product cards and filters"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Products are relevant to the search query"
+      - "Prices are numeric values in USD"
+      - "Product URLs link to Home Depot product pages"
+      - "Ratings are on a 5-star scale"
+      - "Key product features are captured"
+
+metadata:
+  tags: ["ecommerce", "homedepot", "products", "search"]
+  priority: "high"
+  timeout: 60000
+  retries: 3
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/macys-001.yaml b/eval-server/evals/streamlined-schema-extractor/macys-001.yaml
new file mode 100644
index 00000000000..28a2c1056c1
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/macys-001.yaml
@@ -0,0 +1,106 @@
+# Macy's product listing extraction test
+id: "macys-001"
+name: "Extract Macy's Product Listings"
+description: "Extract fashion products from Macy's category page"
+enabled: true
+
+target:
+  url: "https://www.macys.com/shop/womens-clothing/womens-dresses"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 60000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      category:
+        type: "string"
+      totalProducts:
+        type: "number"
+      products:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            name:
+              type: "string"
+            brand:
+              type: "string"
+            currentPrice:
+              type: "number"
+            originalPrice:
+              type: "number"
+            discount:
+              type: "string"
+            colors:
+              type: "array"
+              items:
+                type: "string"
+            sizes:
+              type: "array"
+              items:
+                type: "string"
+            rating:
+              type: "number"
+            reviewCount:
+              type: "number"
+            productUrl:
+              type: "string"
+              format: "url"
+            imageUrl:
+              type: "string"
+              format: "url"
+            promotions:
+              type: "array"
+              items:
+                type: "string"
+          required:
+            - "name"
+            - "brand"
+            - "currentPrice"
+      refinements:
+        type: "object"
+        properties:
+          brands:
+            type: "array"
+            items:
+              type: "string"
+          sizes:
+            type: "array"
+            items:
+              type: "string"
+          colors:
+            type: "array"
+            items:
+              type: "string"
+          priceRanges:
+            type: "array"
+            items:
+              type: "string"
+    required:
+      - "products"
+  instruction: "Extract fashion products including prices, sizes, colors, and promotional offers from Macy's"
+  reasoning: "Testing extraction from fashion e-commerce with complex product attributes"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Products are from the correct category"
+      - "Prices reflect current and sale prices"
+      - "Color and size options are captured"
+      - "Brand names are accurately extracted"
+      - "Promotional text is included when present"
+
+metadata:
+  tags: ["ecommerce", "macys", "fashion", "products"]
+  priority: "high"
+  timeout: 60000
+  retries: 3
+  flaky: true
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/wikipedia-001.yaml b/eval-server/evals/streamlined-schema-extractor/wikipedia-001.yaml
new file mode 100644
index 00000000000..88983bd32c6
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/wikipedia-001.yaml
@@ -0,0 +1,76 @@
+# Wikipedia article extraction test (Streamlined)
+id: "wikipedia-chrome-devtools-001"
+name: "Extract Chrome DevTools Wikipedia Article"
+description: "Extract structured information from the Chrome DevTools Wikipedia page"
+enabled: true
+
+target:
+  url: "https://en.wikipedia.org/wiki/Chrome_DevTools"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 45000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      title:
+        type: "string"
+      summary:
+        type: "string"
+      tableOfContents:
+        type: "array"
+        items:
+          type: "string"
+      infobox:
+        type: "object"
+        properties:
+          developer:
+            type: "string"
+          initialRelease:
+            type: "string"
+          operatingSystem:
+            type: "string"
+          license:
+            type: "string"
+      externalLinks:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            text:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+    required:
+      - "title"
+      - "summary"
+  instruction: "Extract the main article information including title, summary, table of contents, and infobox details"
+  reasoning: "Testing extraction from a stable, well-structured Wikipedia page"
+
+validation:
+  type: "hybrid"
+  snapshot:
+    exclude_paths:
+      - "externalLinks[*].url"
+    structure_only: false
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Article title matches the Wikipedia page title"
+      - "Summary captures the main description of Chrome DevTools"
+      - "Table of contents includes major sections"
+      - "Infobox contains key technical details"
+      - "External links are properly resolved URLs"
+
+metadata:
+  tags: ["wikipedia", "documentation", "stable"]
+  priority: "high"
+  timeout: 45000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/streamlined-schema-extractor/wikipedia-search-001.yaml b/eval-server/evals/streamlined-schema-extractor/wikipedia-search-001.yaml
new file mode 100644
index 00000000000..c432c20d4f0
--- /dev/null
+++ b/eval-server/evals/streamlined-schema-extractor/wikipedia-search-001.yaml
@@ -0,0 +1,77 @@
+# Wikipedia search results extraction test
+id: "wikipedia-search-001"
+name: "Extract Wikipedia Search Results"
+description: "Extract search results from Wikipedia search"
+enabled: true
+
+target:
+  url: "https://en.wikipedia.org/w/index.php?search=artificial+intelligence&title=Special:Search"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "extract_schema_streamlined"
+timeout: 30000
+
+input:
+  schema:
+    type: "object"
+    properties:
+      searchTerm:
+        type: "string"
+      resultCount:
+        type: "number"
+      searchResults:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+            snippet:
+              type: "string"
+            category:
+              type: "string"
+            wordCount:
+              type: "number"
+            lastEdited:
+              type: "string"
+          required:
+            - "title"
+            - "url"
+            - "snippet"
+      suggestedArticles:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            url:
+              type: "string"
+              format: "url"
+    required:
+      - "searchResults"
+  instruction: "Extract Wikipedia search results including article titles, URLs, snippets, and metadata like word count or last edit date"
+  reasoning: "Testing extraction from Wikipedia's internal search with rich metadata"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Search results are Wikipedia articles"
+      - "Each result has a valid Wikipedia URL"
+      - "Snippets contain relevant content highlights"
+      - "Metadata like word count is extracted when available"
+
+metadata:
+  tags: ["search", "wikipedia", "encyclopedia"]
+  priority: "high"
+  timeout: 30000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-booking-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-booking-001.yaml
new file mode 100644
index 00000000000..fab657a67fe
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-booking-001.yaml
@@ -0,0 +1,47 @@
+# Hotel Search Workflow - Web Task Agent
+id: "web-task-agent-booking-001"
+name: "Hotel Search Workflow"
+description: "Test web task agent orchestrating complex multi-step booking search"
+enabled: true
+
+target:
+  url: "https://www.booking.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Search for hotels in San Francisco for 2 adults, check-in March 15, check-out March 17"
+  reasoning: "Customer is looking to book a hotel stay"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully searched for hotels in San Francisco"
+      - "Results show hotels available for March 15-17 dates"
+      - "Guest count of 2 adults is reflected in the search results"
+      - "Returned multiple hotel options with relevant details"
+      - "Each hotel includes essential information (name, price, location)"
+      - "Results are presented in a clear, readable format"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify hotel search results are displayed for San Francisco"
+        - "Check that dates March 15-17 are correctly selected"
+        - "Confirm guest count shows 2 adults"
+        - "Ensure search results show hotels with availability for specified dates"
+
+metadata:
+  tags: ["web-task", "booking", "workflow", "multi-step", "travel", "complex"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-ecommerce-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-ecommerce-001.yaml
new file mode 100644
index 00000000000..b05bab0b54f
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-ecommerce-001.yaml
@@ -0,0 +1,55 @@
+# E-commerce web task evaluation (matches DevTools test case)
+id: "web-task-agent-ecommerce-001"
+name: "E-commerce Product Search"
+description: "Test web task agent handling product search on shopping site"
+enabled: true
+
+target:
+  url: "https://www.amazon.com"
+
+tool: "web_task_agent"
+timeout: 90000
+
+input:
+  task: "Search Amazon for \"wireless headphones\" and find products under $100"
+  reasoning: "Testing e-commerce search workflow with price filtering"
+  context: "User wants to find wireless headphones with specific price constraint"
+  extraction_schema:
+    type: "object"
+    properties:
+      products:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            name:
+              type: "string"
+            price:
+              type: "string"
+            rating:
+              type: "string"
+            url:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "hybrid"
+  llm_judge:
+    model: "gpt-4o"
+    criteria:
+      - "Successfully navigated to product search"
+      - "Applied appropriate filters correctly"
+      - "Extracted product details accurately"
+      - "Provided meaningful comparison of features"
+      - "Stayed within specified price range"
+  snapshot:
+    structure_only: true
+    exclude_paths:
+      - "timestamp"
+      - "sessionId"
+
+metadata:
+  tags: ["web-task", "multi-step", "ecommerce", "search"]
+  priority: "high"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-error-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-error-001.yaml
new file mode 100644
index 00000000000..bb7c2645f00
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-error-001.yaml
@@ -0,0 +1,47 @@
+# Error Recovery Workflow - Web Task Agent
+id: "web-task-agent-error-001"
+name: "Error Recovery Workflow"
+description: "Test web task agent handling action_agent failures and retry logic"
+enabled: true
+
+target:
+  url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Search for \"nonexistent test query 12345\" and handle any issues that arise"
+  reasoning: "Testing graceful handling of an unusual search query that may return no results"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Attempted to search for the unusual query \"nonexistent test query 12345\""
+      - "Either found some results OR provided clear explanation why no results were found"
+      - "Response handles the edge case gracefully without errors"
+      - "If no results found, suggested alternative actions or explanations"
+      - "Maintained professional tone despite unusual request"
+      - "Final output is coherent and helpful to the user"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Check if search was attempted despite unusual query"
+        - "Verify error handling did not break the page interaction"
+        - "Confirm agent attempted to complete the task or provided clear error info"
+        - "Ensure page is still functional after error recovery attempts"
+
+metadata:
+  tags: ["web-task", "error-recovery", "retry", "orchestration", "robustness"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-extract-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-extract-001.yaml
new file mode 100644
index 00000000000..7dda4681661
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-extract-001.yaml
@@ -0,0 +1,62 @@
+# Structured Data Extraction - Web Task Agent
+id: "web-task-agent-extract-001"
+name: "Structured Data Extraction"
+description: "Test web task agent extracting structured data from the Hacker News front page"
+enabled: true
+
+target:
+  url: "https://news.ycombinator.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Extract the top 5 Hacker News stories with their titles, scores, and comment counts"
+  reasoning: "User is looking to understand the top stories on Hacker News"
+  extraction_schema:
+    type: "object"
+    properties:
+      stories:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            score:
+              type: "number"
+            comments:
+              type: "number"
+            url:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully returned exactly 5 Hacker News stories in structured text format"
+      - "Each story is numbered (1., 2., 3., 4., 5.) with title, score, comments, and URL"
+      - "Results are presented in readable text format similar to the example provided"
+      - "Response includes all required fields: title, score, comments count, URL"
+      - "Maintained proper orchestration pattern throughout the extraction process"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Hacker News homepage is loaded and displaying stories"
+        - "Check that top stories are visible with scores and comment counts"
+        - "Confirm story titles and metadata are clearly displayed"
+        - "Ensure page structure allows for data extraction"
+
+metadata:
+  tags: ["web-task", "data-extraction", "structured-data", "hackernews", "schema"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-finance-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-finance-001.yaml
new file mode 100644
index 00000000000..ad873ab9172
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-finance-001.yaml
@@ -0,0 +1,70 @@
+# Stock Information Research - Web Task Agent
+id: "web-task-agent-finance-001"
+name: "Stock Information Research"
+description: "Test extracting stock prices and financial information"
+enabled: true
+
+target:
+  url: "https://finance.yahoo.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Search for Apple (AAPL) stock information and extract current price, market cap, and recent performance"
+  reasoning: "Users need automated financial data collection for investment decisions"
+  extraction_schema:
+    type: "object"
+    properties:
+      stock_info:
+        type: "object"
+        properties:
+          symbol:
+            type: "string"
+          company_name:
+            type: "string"
+          current_price:
+            type: "string"
+          change:
+            type: "string"
+          change_percent:
+            type: "string"
+          market_cap:
+            type: "string"
+          pe_ratio:
+            type: "string"
+          volume:
+            type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully found Apple (AAPL) stock information"
+      - "Current stock price is clearly stated"
+      - "Market cap information is included"
+      - "Price change and percentage change are provided"
+      - "Additional metrics (PE ratio, volume) included when available"
+      - "Financial data is current and presented in readable text format (not JSON)"
+      - "Stock information is well-organized and easy to understand"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Yahoo Finance shows Apple (AAPL) stock page"
+        - "Check that current stock price and change are visible"
+        - "Confirm market cap and trading volume are displayed"
+        - "Ensure financial metrics and charts are shown"
+
+metadata:
+  tags: ["web-task", "finance", "stocks", "yahoo-finance", "investment", "popular"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-flight-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-flight-001.yaml
new file mode 100644
index 00000000000..bf79e2419c6
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-flight-001.yaml
@@ -0,0 +1,47 @@
+# Complex Flight Search - Web Task Agent
+id: "web-task-agent-flight-001"
+name: "Complex Flight Search"
+description: "Test web task agent handling complex flight search with multiple criteria"
+enabled: true
+
+target:
+  url: "https://www.kayak.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Search for round-trip flights from Seattle (SEA) to Tokyo (NRT) departing March 20, returning March 30"
+  reasoning: "Customer is looking for the best flight options"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully found round-trip flights from Seattle (SEA) to Tokyo (NRT)"
+      - "Flight results show March 20 departure date"
+      - "Flight results show March 30 return date"
+      - "Returned multiple flight options with airlines and prices"
+      - "Each flight includes essential details (times, airlines, prices)"
+      - "Results clearly distinguish between outbound and return flights"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify flight search results are displayed"
+        - "Check SEA to NRT route is correctly selected"
+        - "Confirm dates March 20 departure and March 30 return"
+        - "Ensure flight options are showing with prices and airlines"
+
+metadata:
+  tags: ["web-task", "flight", "travel", "multi-step", "kayak", "round-trip"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-food-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-food-001.yaml
new file mode 100644
index 00000000000..8bbf0324bb9
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-food-001.yaml
@@ -0,0 +1,70 @@
+# Restaurant Search and Menu Extraction - Web Task Agent
+id: "web-task-agent-food-001"
+name: "Restaurant Search and Menu Extraction"
+description: "Test searching restaurants and extracting menu information"
+enabled: true
+
+target:
+  url: "https://www.yelp.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Search for \"Italian restaurants near me\" in San Francisco and extract restaurant details"
+  reasoning: "Users want to quickly compare restaurants, menus, and reviews"
+  extraction_schema:
+    type: "object"
+    properties:
+      restaurants:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            name:
+              type: "string"
+            rating:
+              type: "string"
+            price_range:
+              type: "string"
+            cuisine:
+              type: "string"
+            address:
+              type: "string"
+            phone:
+              type: "string"
+            hours:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully found Italian restaurants in San Francisco"
+      - "Each restaurant includes name, rating, and price range"
+      - "Location/address information is provided for each restaurant"
+      - "Contact details (phone/hours) included when available"
+      - "All restaurants listed serve Italian cuisine"
+      - "Results are presented in clear, structured text format (not JSON)"
+      - "Restaurants are numbered or organized clearly for easy comparison"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Yelp search results for Italian restaurants"
+        - "Check that restaurants show ratings and price ranges"
+        - "Confirm location filter shows San Francisco results"
+        - "Ensure restaurant listings include contact information"
+
+metadata:
+  tags: ["web-task", "restaurants", "yelp", "food", "local-search", "popular"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-iframe-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-iframe-001.yaml
new file mode 100644
index 00000000000..fe38d9cffad
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-iframe-001.yaml
@@ -0,0 +1,85 @@
+# ANA Airlines Iframe Content Extraction - Web Task Agent
+id: "web-task-agent-iframe-001"
+name: "ANA Airlines Iframe Content Extraction"
+description: "Test web task agent handling iframe-heavy airline booking sites like ANA Airlines"
+enabled: true
+
+target:
+  url: "https://aswbe.ana.co.jp/webapps/reservation/flight-search?CONNECTION_KIND=SEA&LANG=en&hiddenSearchMode=ROUND_TRIP&departureDate:field=20260320&returnDate:field=20260330&departureAirportCode:field=SEA&arrivalAirportCode:field=NRT&adultCount=1&youngAdultCount=0&childCount=0&infantCount=0&boardingClass=INTY001&searchFlag=1"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Navigate the ANA Airlines flight search page and extract available flight options from Seattle (SEA) to Tokyo Narita (NRT) for March 20-30, 2026. Handle any iframe content and booking interface elements."
+  reasoning: "Testing iframe content extraction and complex airline booking site navigation"
+  extraction_schema:
+    type: "object"
+    properties:
+      flights:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            flight_number:
+              type: "string"
+            airline:
+              type: "string"
+            departure_time:
+              type: "string"
+            arrival_time:
+              type: "string"
+            departure_date:
+              type: "string"
+            arrival_date:
+              type: "string"
+            duration:
+              type: "string"
+            aircraft:
+              type: "string"
+            price:
+              type: "string"
+            cabin_class:
+              type: "string"
+            stops:
+              type: "string"
+      booking_interface_status:
+        type: "string"
+      iframe_content_found:
+        type: "boolean"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully navigated ANA Airlines booking interface"
+      - "Handled iframe content correctly (iframe_content_found should be true if iframes detected)"
+      - "Extracted flight information from ANA flight search results"
+      - "Flight details include ANA flight numbers and accurate route (SEA to NRT)"
+      - "Extracted pricing information in appropriate currency"
+      - "Handled any booking interface elements, popups, or navigation flows"
+      - "Results show flights for the correct dates (March 20-30, 2026)"
+      - "Successfully demonstrated iframe content extraction capabilities"
+      - "Booking interface status indicates successful page interaction"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify ANA Airlines flight search page loaded correctly"
+        - "Check that search parameters show SEA to NRT route"
+        - "Confirm flight results are displayed (may be in iframes)"
+        - "Ensure booking interface elements are functional"
+        - "Verify flight information is accessible and extractable"
+
+metadata:
+  tags: ["web-task", "iframe", "ana-airlines", "complex-booking", "international-flight", "airline-specific"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-jobs-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-jobs-001.yaml
new file mode 100644
index 00000000000..06de5beb368
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-jobs-001.yaml
@@ -0,0 +1,70 @@
+# Job Search Workflow - Web Task Agent
+id: "web-task-agent-jobs-001"
+name: "Job Search Workflow"
+description: "Test web task agent orchestrating job search on LinkedIn"
+enabled: true
+
+target:
+  url: "https://www.linkedin.com/jobs"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Search for \"Software Engineer\" jobs in \"San Francisco\" and extract details for the first 5 results"
+  reasoning: "User wants to find job opportunities in tech industry"
+  extraction_schema:
+    type: "object"
+    properties:
+      jobs:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            company:
+              type: "string"
+            location:
+              type: "string"
+            salary:
+              type: "string"
+            description:
+              type: "string"
+            url:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Either used construct_direct_url for LinkedIn job search OR used traditional form interaction"
+      - "If using direct URL: constructed proper LinkedIn job search URL with keywords and location"
+      - "If using forms: delegated keyword and location input to action_agent"
+      - "Extracted job listings using schema_based_extractor"
+      - "Returned structured job data in readable text format (not JSON)"
+      - "Each job listing includes title, company, location, and other relevant fields"
+      - "Results are numbered or organized clearly for easy reading"
+      - "Demonstrated proper workflow orchestration for job search"
+      - "Never used direct browser interaction tools"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify LinkedIn job search results are displayed"
+        - "Check that search shows Software Engineer jobs in San Francisco"
+        - "Confirm job listings include company names and titles"
+        - "Ensure at least 5 job results are visible"
+
+metadata:
+  tags: ["web-task", "jobs", "linkedin", "search", "career", "popular"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-learning-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-learning-001.yaml
new file mode 100644
index 00000000000..58dec4d06cc
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-learning-001.yaml
@@ -0,0 +1,71 @@
+# Online Course Search - Web Task Agent
+id: "web-task-agent-learning-001"
+name: "Online Course Search"
+description: "Test searching and extracting course information from learning platforms"
+enabled: true
+
+target:
+  url: "https://www.coursera.org"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Search for \"Machine Learning\" courses and extract details for top 5 results"
+  reasoning: "Users want to compare courses across platforms for learning decisions"
+  extraction_schema:
+    type: "object"
+    properties:
+      courses:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            instructor:
+              type: "string"
+            university:
+              type: "string"
+            rating:
+              type: "string"
+            duration:
+              type: "string"
+            price:
+              type: "string"
+            description:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully found Machine Learning courses on Coursera"
+      - "Returned details for top 5 courses as requested"
+      - "Each course includes title, instructor, university, and rating"
+      - "Duration and pricing information included for each course"
+      - "Course descriptions or key topics are provided"
+      - "Results are presented in structured text format (not JSON)"
+      - "Courses are numbered (1-5) and well-organized for easy comparison"
+      - "Each course entry is clearly formatted and readable"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Coursera search results for Machine Learning"
+        - "Check that courses show titles, instructors, and ratings"
+        - "Confirm course details include duration and pricing"
+        - "Ensure search results are relevant to Machine Learning"
+
+metadata:
+  tags: ["web-task", "education", "coursera", "courses", "learning", "popular"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-nav-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-nav-001.yaml
new file mode 100644
index 00000000000..313d1fcdaab
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-nav-001.yaml
@@ -0,0 +1,48 @@
+# Site Navigation Workflow - Web Task Agent
+id: "web-task-agent-nav-001"
+name: "Site Navigation Workflow"
+description: "Test web task agent orchestrating navigation between different sections of a site"
+enabled: true
+
+target:
+  url: "https://www.wikipedia.org"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 90000
+
+input:
+  task: "Navigate to the Wikipedia homepage, search for \"artificial intelligence\", and find information about machine learning"
+  reasoning: "User is looking to explore Wikipedia content through structured navigation"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Orchestrated Wikipedia search via action_agent calls"
+      - "Navigated to artificial intelligence article through action_agent"
+      - "Located machine learning section via action_agent coordination"
+      - "Extracted relevant information about machine learning"
+      - "Demonstrated multi-step navigation workflow"
+      - "Maintained orchestration pattern throughout navigation"
+      - "Provided structured summary of found information"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify navigation reached artificial intelligence Wikipedia page"
+        - "Check that machine learning section or content is visible"
+        - "Confirm successful navigation through multiple page sections"
+        - "Ensure content related to machine learning is displayed"
+
+metadata:
+  tags: ["web-task", "navigation", "multi-step", "wikipedia", "content-exploration"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-news-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-news-001.yaml
new file mode 100644
index 00000000000..412a45ec32d
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-news-001.yaml
@@ -0,0 +1,66 @@
+# News Article Aggregation - Web Task Agent
+id: "web-task-agent-news-001"
+name: "News Article Aggregation"
+description: "Test aggregating news headlines and summaries from news sites"
+enabled: true
+
+target:
+  url: "https://news.ycombinator.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Extract the top 10 Hacker News stories with titles, scores, comment counts, URLs, and the top comment for each"
+  reasoning: "Users want automated news monitoring for research and awareness"
+  extraction_schema:
+    type: "object"
+    properties:
+      articles:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            score:
+              type: "number"
+            comments_count:
+              type: "number"
+            url:
+              type: "string"
+            top_comment:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully extracted 10 Hacker News stories as requested"
+      - "Each story includes title, score, and comment count"
+      - "URLs are provided for each story"
+      - "Stories appear to be from the current top/front page"
+      - "Results are presented in clear, numbered text format (1-10), not JSON"
+      - "All required fields are present and properly formatted in readable text"
+      - "Each story is clearly separated and easy to read"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Hacker News stories are visible with scores"
+        - "Check that story titles and comment counts are shown"
+        - "Confirm top stories section is properly displayed"
+        - "Ensure story metadata is accessible for extraction"
+
+metadata:
+  tags: ["web-task", "news", "hackernews", "aggregation", "popular"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-realestate-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-realestate-001.yaml
new file mode 100644
index 00000000000..9cf2b947a28
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-realestate-001.yaml
@@ -0,0 +1,72 @@
+# Real Estate Property Search - Web Task Agent
+id: "web-task-agent-realestate-001"
+name: "Real Estate Property Search"
+description: "Test property search workflow on real estate platforms"
+enabled: true
+
+target:
+  url: "https://www.zillow.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Search for houses for sale in Austin, Texas under $500k and extract property details"
+  reasoning: "User wants to find affordable housing options in a specific location"
+  extraction_schema:
+    type: "object"
+    properties:
+      properties:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            address:
+              type: "string"
+            price:
+              type: "string"
+            bedrooms:
+              type: "number"
+            bathrooms:
+              type: "number"
+            sqft:
+              type: "string"
+            lot_size:
+              type: "string"
+            year_built:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Orchestrated location search via action_agent"
+      - "Delegated price filter setting to action_agent"
+      - "Coordinated property type selection through action_agent"
+      - "Applied search filters through proper action_agent calls"
+      - "Extracted property listings with schema_based_extractor"
+      - "Returned structured property data in readable text format (not JSON)"
+      - "Each property includes address, price, bedrooms, bathrooms, and other key details"
+      - "Properties are clearly numbered or organized for easy comparison"
+      - "Demonstrated complex real estate search workflow orchestration"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Zillow search results for Austin, Texas properties"
+        - "Check that properties shown are under $500k"
+        - "Confirm property listings show price, beds, baths info"
+        - "Ensure search results match the specified criteria"
+
+metadata:
+  tags: ["web-task", "real-estate", "zillow", "property-search", "popular"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-001.yaml
new file mode 100644
index 00000000000..f90cd8f8526
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-001.yaml
@@ -0,0 +1,63 @@
+# Infinite Scroll Content Loading - Web Task Agent
+id: "web-task-agent-scroll-001"
+name: "Infinite Scroll Content Loading"
+description: "Test web task agent handling infinite scroll pages to load more content"
+enabled: true
+
+target:
+  url: "https://twitter.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Scroll down the Twitter feed to load at least 20 tweets and extract their content"
+  reasoning: "Testing infinite scroll functionality for dynamic content loading"
+  extraction_schema:
+    type: "object"
+    properties:
+      tweets:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            author:
+              type: "string"
+            content:
+              type: "string"
+            likes:
+              type: "string"
+            retweets:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully used scroll_page tool to scroll down the page"
+      - "Loaded additional content through scrolling actions"
+      - "Extracted at least 20 tweets from the feed"
+      - "Each tweet includes author and content information"
+      - "Demonstrated proper handling of dynamically loaded content"
+      - "Results are presented in clear, numbered text format"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify initial Twitter feed is loaded"
+        - "Check that scrolling action loaded additional tweets"
+        - "Confirm at least 20 tweets are visible after scrolling"
+        - "Ensure page scrolled down significantly from initial position"
+
+metadata:
+  tags: ["web-task", "scrolling", "infinite-scroll", "dynamic-content", "twitter"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-002.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-002.yaml
new file mode 100644
index 00000000000..858178e216a
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-002.yaml
@@ -0,0 +1,67 @@
+# Product Review Scrolling - Web Task Agent
+id: "web-task-agent-scroll-002"
+name: "Product Review Scrolling"
+description: "Test scrolling to load more product reviews on e-commerce sites"
+enabled: true
+
+target:
+  url: "https://www.amazon.com/dp/B08N5WRWNW"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Scroll down to the reviews section and load more reviews by scrolling, then extract review details"
+  reasoning: "Users need to see multiple reviews beyond initial visible ones"
+  extraction_schema:
+    type: "object"
+    properties:
+      reviews:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            rating:
+              type: "string"
+            title:
+              type: "string"
+            author:
+              type: "string"
+            date:
+              type: "string"
+            verified:
+              type: "boolean"
+            content:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Used scroll_page tool to navigate to reviews section"
+      - "Scrolled within reviews area to load additional reviews"
+      - "Extracted multiple product reviews with ratings"
+      - "Each review includes rating, author, and content"
+      - "Successfully handled lazy-loaded review content"
+      - "Presented reviews in structured, readable format"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Amazon product page is loaded"
+        - "Check that page scrolled to reviews section"
+        - "Confirm additional reviews loaded after scrolling"
+        - "Ensure review content is fully visible"
+
+metadata:
+  tags: ["web-task", "scrolling", "reviews", "amazon", "e-commerce"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-003.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-003.yaml
new file mode 100644
index 00000000000..c1b3597e642
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-003.yaml
@@ -0,0 +1,63 @@
+# News Article Progressive Loading - Web Task Agent
+id: "web-task-agent-scroll-003"
+name: "News Article Progressive Loading"
+description: "Test scrolling through news sites that load articles progressively"
+enabled: true
+
+target:
+  url: "https://medium.com/topic/technology"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Scroll down to load more technology articles and extract titles and authors for at least 15 articles"
+  reasoning: "Testing progressive content loading on news/blog platforms"
+  extraction_schema:
+    type: "object"
+    properties:
+      articles:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            author:
+              type: "string"
+            reading_time:
+              type: "string"
+            preview:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Used scroll_page tool multiple times to load content"
+      - "Successfully loaded at least 15 articles through scrolling"
+      - "Extracted article titles and author information"
+      - "Handled Medium's progressive loading mechanism"
+      - "Articles are from technology topic as requested"
+      - "Results presented in clear, numbered format"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Medium technology page is loaded"
+        - "Check that initial articles are visible"
+        - "Confirm scrolling loaded additional articles"
+        - "Ensure at least 15 articles are visible after scrolling"
+
+metadata:
+  tags: ["web-task", "scrolling", "progressive-loading", "medium", "articles"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-004.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-004.yaml
new file mode 100644
index 00000000000..96cf5798c3b
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-004.yaml
@@ -0,0 +1,63 @@
+# Search Results Infinite Scroll - Web Task Agent
+id: "web-task-agent-scroll-004"
+name: "Search Results Infinite Scroll"
+description: "Test handling search results that use infinite scroll instead of pagination"
+enabled: true
+
+target:
+  url: "https://www.pinterest.com/search/pins/?q=web%20design"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Search for \"web design\" pins and scroll to load at least 30 results, then extract pin details"
+  reasoning: "Testing infinite scroll on visual search platforms"
+  extraction_schema:
+    type: "object"
+    properties:
+      pins:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            title:
+              type: "string"
+            description:
+              type: "string"
+            saves:
+              type: "string"
+            source:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully performed search for \"web design\" pins"
+      - "Used scroll_page tool to trigger infinite scroll loading"
+      - "Loaded at least 30 pins through scrolling actions"
+      - "Extracted pin titles and metadata"
+      - "Handled Pinterest's masonry layout and lazy loading"
+      - "Results are well-organized and readable"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Pinterest search results for web design"
+        - "Check initial pins are displayed"
+        - "Confirm scrolling loaded many more pins"
+        - "Ensure grid layout shows 30+ pins after scrolling"
+
+metadata:
+  tags: ["web-task", "scrolling", "infinite-scroll", "pinterest", "visual-search"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-scroll-005.yaml b/eval-server/evals/web-task-agent/web-task-agent-scroll-005.yaml
new file mode 100644
index 00000000000..169befe8606
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-scroll-005.yaml
@@ -0,0 +1,75 @@
+# Google Flights Scroll and Show More - Web Task Agent
+id: "web-task-agent-scroll-005"
+name: "Google Flights Scroll and Show More"
+description: "Test scrolling and clicking \"Show more flights\" button on Google Flights to load additional flight options"
+enabled: true
+
+target:
+  url: "https://www.google.com/travel/flights?sca_esv=646eedf97dcc8cf2&source=flun&uitype=cuAA&hl=en&gl=us&curr=USD&tfs=CAEQAhoeEgoyMDI2LTAzLTIwagcIARIDU0VBcgcIARIDTlJUGh4SCjIwMjYtMDMtMzBqBwgBEgNOUlRyBwgBEgNTRUF6aENqUklhVFJJTVVwVlZVOXpNakJCUTJodGVFRkNSeTB0TFMwdExTMHRjR3BpYjI4eE0wRkJRVUZCUjJoc1lsWlZRV2RYUlZsQkVnTmpTMFVhQ3dqUXNnVVFBaG9EVlZORU9EQncwTElG"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Extract the initial flight results, then scroll down and click \"Show more flights\" button to load additional flights. Extract at least 20 total flight options from Seattle to Tokyo."
+  reasoning: "Testing combination of scrolling and button clicking to load more flight results on Google Flights"
+  extraction_schema:
+    type: "object"
+    properties:
+      flights:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            airline:
+              type: "string"
+            departure_time:
+              type: "string"
+            arrival_time:
+              type: "string"
+            duration:
+              type: "string"
+            stops:
+              type: "string"
+            price:
+              type: "string"
+            aircraft:
+              type: "string"
+      total_flights_found:
+        type: "number"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully extracted initial flight results from Google Flights"
+      - "Used scroll_page tool to scroll down the flight results list"
+      - "Located and clicked \"Show more flights\" button using action_agent"
+      - "Loaded additional flight options beyond the initial set"
+      - "Extracted at least 20 total flights from Seattle (SEA) to Tokyo (NRT)"
+      - "Each flight includes airline, times, duration, stops, and price"
+      - "Flights are for the correct dates (March 20-30, 2026)"
+      - "Results are presented in clear, numbered format"
+      - "Successfully combined scrolling and clicking actions to load more content"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Google Flights page shows SEA to NRT flights"
+        - "Check that initial flight results are displayed"
+        - "Confirm scrolling occurred and \"Show more flights\" button was visible"
+        - "Ensure additional flights loaded after clicking the button"
+        - "Verify at least 20 flight options are now visible"
+
+metadata:
+  tags: ["web-task", "scrolling", "google-flights", "click-action", "load-more", "travel"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-search-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-search-001.yaml
new file mode 100644
index 00000000000..b431605b93c
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-search-001.yaml
@@ -0,0 +1,43 @@
+# Basic web task search evaluation (matches DevTools test case)
+id: "web-task-agent-search-001"
+name: "Site-Specific Search Task"
+description: "Test web task agent orchestrating a search workflow on a specific site"
+enabled: true
+
+target:
+  url: "chrome://new-tab-page"
+
+tool: "web_task_agent"
+timeout: 60000
+
+input:
+  task: "Search Google for \"Chrome DevTools automation\" and extract the top 3 search results"
+  reasoning: "Testing basic site-specific search workflow orchestration"
+  context: "Need to demonstrate web_task_agent can coordinate multiple action_agent calls for a complete search workflow"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm-judge"  # NOTE(review): sibling evals spell this "llm_judge" - confirm which value the loader accepts
+  llm_judge:
+    model: "gpt-4o"
+    criteria:
+      - "Successfully returned exactly 3 search results in structured text format"
+      - "Each result is numbered (1., 2., 3.) and contains a title related to \"Chrome DevTools automation\""
+      - "Each result includes a URL in the format \"URL: [link]\""
+      - "Results are presented in a clear, readable text format (not JSON)"
+      - "Response includes a brief summary or conclusion statement"
+    visual_verification:
+      enabled: true
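+      capture_before: true  # NOTE(review): sibling evals use capture_before_action - confirm expected key
+      capture_after: true  # NOTE(review): sibling evals use capture_after_action - confirm expected key
+      prompts:  # NOTE(review): sibling evals use verification_prompts - confirm expected key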
+        - "Verify search was completed and results page is showing"
+        - "Check that search results are related to \"Chrome DevTools automation\""
+        - "Confirm at least 3 search results are visible on the page"
+        - "Ensure the search workflow was completed successfully"
+
+metadata:
+  tags: ["web-task", "orchestration", "search", "workflow", "google", "basic"]
+  priority: "normal"
\ No newline at end of file
diff --git a/eval-server/evals/web-task-agent/web-task-agent-social-001.yaml b/eval-server/evals/web-task-agent/web-task-agent-social-001.yaml
new file mode 100644
index 00000000000..3f913c77ba3
--- /dev/null
+++ b/eval-server/evals/web-task-agent/web-task-agent-social-001.yaml
@@ -0,0 +1,62 @@
+# Social Media Content Extraction - Web Task Agent
+id: "web-task-agent-social-001"
+name: "Social Media Content Extraction"
+description: "Test extracting trending topics from a social media explore page"
+enabled: true
+
+target:
+  url: "https://twitter.com/explore"
+  wait_for: "networkidle"
+  wait_timeout: 5000
+
+tool: "web_task_agent"
+timeout: 180000
+
+input:
+  task: "Extract the top 5 trending topics from Twitter/X explore page"
+  reasoning: "User wants to stay updated on current trends"
+  extraction_schema:
+    type: "object"
+    properties:
+      trends:
+        type: "array"
+        items:
+          type: "object"
+          properties:
+            topic:
+              type: "string"
+            posts_count:
+              type: "string"
+            category:
+              type: "string"
+
+schedule:
+  type: "on_demand"
+
+validation:
+  type: "llm_judge"
+  llm_judge:
+    model: "gpt-4o-mini"
+    temperature: 0.3
+    criteria:
+      - "Successfully accessed Twitter/X explore page and found trending topics"
+      - "Returned exactly 5 trending topics as requested"
+      - "Each topic includes the trend name/hashtag"
+      - "Post counts or metrics are included when available"
+      - "Topics are current/recent trends (not outdated)"
+      - "Results are presented in clear, numbered text format (not JSON)"
+      - "Each trend is properly numbered (1., 2., 3., etc.) for readability"
+    visual_verification:
+      enabled: true
+      capture_before_action: true
+      capture_after_action: true
+      verification_prompts:
+        - "Verify Twitter/X explore page is loaded"
+        - "Check that trending topics section is visible"
+        - "Confirm trending topics show names and post counts"
+        - "Ensure page shows current trending content"
+
+metadata:
+  tags: ["web-task", "social-media", "twitter", "trends", "extraction", "popular"]
+  priority: "high"
+  owner: "devtools-team"
\ No newline at end of file
diff --git a/eval-server/package-lock.json b/eval-server/package-lock.json
new file mode 100644
index 00000000000..494fa5e41b8
--- /dev/null
+++ b/eval-server/package-lock.json
@@ -0,0 +1,829 @@
+{
+  "name": "bo-eval-server",
+  "version": "1.0.0",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "bo-eval-server",
+      "version": "1.0.0",
+      "license": "MIT",
+      "dependencies": {
+        "dotenv": "^16.3.1",
+        "js-yaml": "^4.1.0",
+        "openai": "^4.24.1",
+        "uuid": "^9.0.1",
+        "winston": "^3.11.0",
+        "ws": "^8.16.0"
+      },
+      "devDependencies": {
+        "@types/ws": "^8.5.10"
+      },
+      "engines": {
+        "node": ">=18.0.0"
+      }
+    },
+    "node_modules/@colors/colors": {
+      "version": "1.6.0",
+      "resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.6.0.tgz",
+      "integrity": "sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.1.90"
+      }
+    },
+    "node_modules/@dabh/diagnostics": {
+      "version": "2.0.3",
+      "resolved": "https://registry.npmjs.org/@dabh/diagnostics/-/diagnostics-2.0.3.tgz",
+      "integrity": "sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==",
+      "license": "MIT",
+      "dependencies": {
+        "colorspace": "1.1.x",
+        "enabled": "2.0.x",
+        "kuler": "^2.0.0"
+      }
+    },
+    "node_modules/@types/node": {
+      "version": "24.0.13",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.13.tgz",
+      "integrity": "sha512-Qm9OYVOFHFYg3wJoTSrz80hoec5Lia/dPp84do3X7dZvLikQvM1YpmvTBEdIr/e+U8HTkFjLHLnl78K/qjf+jQ==",
+      "license": "MIT",
+      "dependencies": {
+        "undici-types": "~7.8.0"
+      }
+    },
+    "node_modules/@types/node-fetch": {
+      "version": "2.6.12",
+      "resolved": "https://registry.npmjs.org/@types/node-fetch/-/node-fetch-2.6.12.tgz",
+      "integrity": "sha512-8nneRWKCg3rMtF69nLQJnOYUcbafYeFSjqkw3jCRLsqkWFlHaoQrr5mXmofFGOx3DKn7UfmBMyov8ySvLRVldA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/node": "*",
+        "form-data": "^4.0.0"
+      }
+    },
+    "node_modules/@types/triple-beam": {
+      "version": "1.3.5",
+      "resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.5.tgz",
+      "integrity": "sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==",
+      "license": "MIT"
+    },
+    "node_modules/@types/ws": {
+      "version": "8.18.1",
+      "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz",
+      "integrity": "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@types/node": "*"
+      }
+    },
+    "node_modules/abort-controller": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
+      "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==",
+      "license": "MIT",
+      "dependencies": {
+        "event-target-shim": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=6.5"
+      }
+    },
+    "node_modules/agentkeepalive": {
+      "version": "4.6.0",
+      "resolved": "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.6.0.tgz",
+      "integrity": "sha512-kja8j7PjmncONqaTsB8fQ+wE2mSU2DJ9D4XKoJ5PFWIdRMa6SLSN1ff4mOr4jCbfRSsxR4keIiySJU0N9T5hIQ==",
+      "license": "MIT",
+      "dependencies": {
+        "humanize-ms": "^1.2.1"
+      },
+      "engines": {
+        "node": ">= 8.0.0"
+      }
+    },
+    "node_modules/argparse": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
+      "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
+      "license": "Python-2.0"
+    },
+    "node_modules/async": {
+      "version": "3.2.6",
+      "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz",
+      "integrity": "sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==",
+      "license": "MIT"
+    },
+    "node_modules/asynckit": {
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
+      "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
+      "license": "MIT"
+    },
+    "node_modules/call-bind-apply-helpers": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
+      "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0",
+        "function-bind": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/color": {
+      "version": "3.2.1",
+      "resolved": "https://registry.npmjs.org/color/-/color-3.2.1.tgz",
+      "integrity": "sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==",
+      "license": "MIT",
+      "dependencies": {
+        "color-convert": "^1.9.3",
+        "color-string": "^1.6.0"
+      }
+    },
+    "node_modules/color-convert": {
+      "version": "1.9.3",
+      "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz",
+      "integrity": "sha512-QfAUtd+vFdAtFQcC8CCyYt1fYWxSqAiK2cSD6zDB8N3cpsEBAvRxp9zOGg6G/SHHJYAT88/az/IuDGALsNVbGg==",
+      "license": "MIT",
+      "dependencies": {
+        "color-name": "1.1.3"
+      }
+    },
+    "node_modules/color-name": {
+      "version": "1.1.3",
+      "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz",
+      "integrity": "sha512-72fSenhMw2HZMTVHeCA9KCmpEIbzWiQsjN+BHcBbS9vr1mtt+vJjPdksIBNUmKAW8TFUDPJK5SUU3QhE9NEXDw==",
+      "license": "MIT"
+    },
+    "node_modules/color-string": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz",
+      "integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==",
+      "license": "MIT",
+      "dependencies": {
+        "color-name": "^1.0.0",
+        "simple-swizzle": "^0.2.2"
+      }
+    },
+    "node_modules/colorspace": {
+      "version": "1.1.4",
+      "resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.4.tgz",
+      "integrity": "sha512-BgvKJiuVu1igBUF2kEjRCZXol6wiiGbY5ipL/oVPwm0BL9sIpMIzM8IK7vwuxIIzOXMV3Ey5w+vxhm0rR/TN8w==",
+      "license": "MIT",
+      "dependencies": {
+        "color": "^3.1.3",
+        "text-hex": "1.0.x"
+      }
+    },
+    "node_modules/combined-stream": {
+      "version": "1.0.8",
+      "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
+      "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
+      "license": "MIT",
+      "dependencies": {
+        "delayed-stream": "~1.0.0"
+      },
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
+    "node_modules/delayed-stream": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
+      "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.4.0"
+      }
+    },
+    "node_modules/dotenv": {
+      "version": "16.6.1",
+      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.6.1.tgz",
+      "integrity": "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow==",
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://dotenvx.com"
+      }
+    },
+    "node_modules/dunder-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
+      "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
+      "license": "MIT",
+      "dependencies": {
+        "call-bind-apply-helpers": "^1.0.1",
+        "es-errors": "^1.3.0",
+        "gopd": "^1.2.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/enabled": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/enabled/-/enabled-2.0.0.tgz",
+      "integrity": "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ==",
+      "license": "MIT"
+    },
+    "node_modules/es-define-property": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
+      "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es-errors": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
+      "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es-object-atoms": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
+      "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es-set-tostringtag": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz",
+      "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==",
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0",
+        "get-intrinsic": "^1.2.6",
+        "has-tostringtag": "^1.0.2",
+        "hasown": "^2.0.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/event-target-shim": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz",
+      "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/fecha": {
+      "version": "4.2.3",
+      "resolved": "https://registry.npmjs.org/fecha/-/fecha-4.2.3.tgz",
+      "integrity": "sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw==",
+      "license": "MIT"
+    },
+    "node_modules/fn.name": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/fn.name/-/fn.name-1.1.0.tgz",
+      "integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==",
+      "license": "MIT"
+    },
+    "node_modules/form-data": {
+      "version": "4.0.3",
+      "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.3.tgz",
+      "integrity": "sha512-qsITQPfmvMOSAdeyZ+12I1c+CKSstAFAwu+97zrnWAbIr5u8wfsExUzCesVLC8NgHuRUqNN4Zy6UPWUTRGslcA==",
+      "license": "MIT",
+      "dependencies": {
+        "asynckit": "^0.4.0",
+        "combined-stream": "^1.0.8",
+        "es-set-tostringtag": "^2.1.0",
+        "hasown": "^2.0.2",
+        "mime-types": "^2.1.12"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/form-data-encoder": {
+      "version": "1.7.2",
+      "resolved": "https://registry.npmjs.org/form-data-encoder/-/form-data-encoder-1.7.2.tgz",
+      "integrity": "sha512-qfqtYan3rxrnCk1VYaA4H+Ms9xdpPqvLZa6xmMgFvhO32x7/3J/ExcTd6qpxM0vH2GdMI+poehyBZvqfMTto8A==",
+      "license": "MIT"
+    },
+    "node_modules/formdata-node": {
+      "version": "4.4.1",
+      "resolved": "https://registry.npmjs.org/formdata-node/-/formdata-node-4.4.1.tgz",
+      "integrity": "sha512-0iirZp3uVDjVGt9p49aTaqjk84TrglENEDuqfdlZQ1roC9CWlPk6Avf8EEnZNcAqPonwkG35x4n3ww/1THYAeQ==",
+      "license": "MIT",
+      "dependencies": {
+        "node-domexception": "1.0.0",
+        "web-streams-polyfill": "4.0.0-beta.3"
+      },
+      "engines": {
+        "node": ">= 12.20"
+      }
+    },
+    "node_modules/function-bind": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
+      "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/get-intrinsic": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
+      "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
+      "license": "MIT",
+      "dependencies": {
+        "call-bind-apply-helpers": "^1.0.2",
+        "es-define-property": "^1.0.1",
+        "es-errors": "^1.3.0",
+        "es-object-atoms": "^1.1.1",
+        "function-bind": "^1.1.2",
+        "get-proto": "^1.0.1",
+        "gopd": "^1.2.0",
+        "has-symbols": "^1.1.0",
+        "hasown": "^2.0.2",
+        "math-intrinsics": "^1.1.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/get-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
+      "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
+      "license": "MIT",
+      "dependencies": {
+        "dunder-proto": "^1.0.1",
+        "es-object-atoms": "^1.0.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/gopd": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
+      "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/has-symbols": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
+      "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/has-tostringtag": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz",
+      "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==",
+      "license": "MIT",
+      "dependencies": {
+        "has-symbols": "^1.0.3"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/hasown": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
+      "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
+      "license": "MIT",
+      "dependencies": {
+        "function-bind": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/humanize-ms": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/humanize-ms/-/humanize-ms-1.2.1.tgz",
+      "integrity": "sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==",
+      "license": "MIT",
+      "dependencies": {
+        "ms": "^2.0.0"
+      }
+    },
+    "node_modules/inherits": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
+      "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
+      "license": "ISC"
+    },
+    "node_modules/is-arrayish": {
+      "version": "0.3.2",
+      "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz",
+      "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==",
+      "license": "MIT"
+    },
+    "node_modules/is-stream": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz",
+      "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/js-yaml": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz",
+      "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==",
+      "license": "MIT",
+      "dependencies": {
+        "argparse": "^2.0.1"
+      },
+      "bin": {
+        "js-yaml": "bin/js-yaml.js"
+      }
+    },
+    "node_modules/kuler": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz",
+      "integrity": "sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==",
+      "license": "MIT"
+    },
+    "node_modules/logform": {
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/logform/-/logform-2.7.0.tgz",
+      "integrity": "sha512-TFYA4jnP7PVbmlBIfhlSe+WKxs9dklXMTEGcBCIvLhE/Tn3H6Gk1norupVW7m5Cnd4bLcr08AytbyV/xj7f/kQ==",
+      "license": "MIT",
+      "dependencies": {
+        "@colors/colors": "1.6.0",
+        "@types/triple-beam": "^1.3.2",
+        "fecha": "^4.2.0",
+        "ms": "^2.1.1",
+        "safe-stable-stringify": "^2.3.1",
+        "triple-beam": "^1.3.0"
+      },
+      "engines": {
+        "node": ">= 12.0.0"
+      }
+    },
+    "node_modules/math-intrinsics": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
+      "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/mime-db": {
+      "version": "1.52.0",
+      "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
+      "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/mime-types": {
+      "version": "2.1.35",
+      "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
+      "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
+      "license": "MIT",
+      "dependencies": {
+        "mime-db": "1.52.0"
+      },
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/ms": {
+      "version": "2.1.3",
+      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
+      "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
+      "license": "MIT"
+    },
+    "node_modules/node-domexception": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/node-domexception/-/node-domexception-1.0.0.tgz",
+      "integrity": "sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==",
+      "deprecated": "Use your platform's native DOMException instead",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/jimmywarting"
+        },
+        {
+          "type": "github",
+          "url": "https://paypal.me/jimmywarting"
+        }
+      ],
+      "license": "MIT",
+      "engines": {
+        "node": ">=10.5.0"
+      }
+    },
+    "node_modules/node-fetch": {
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
+      "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
+      "license": "MIT",
+      "dependencies": {
+        "whatwg-url": "^5.0.0"
+      },
+      "engines": {
+        "node": "4.x || >=6.0.0"
+      },
+      "peerDependencies": {
+        "encoding": "^0.1.0"
+      },
+      "peerDependenciesMeta": {
+        "encoding": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/one-time": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/one-time/-/one-time-1.0.0.tgz",
+      "integrity": "sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==",
+      "license": "MIT",
+      "dependencies": {
+        "fn.name": "1.x.x"
+      }
+    },
+    "node_modules/openai": {
+      "version": "4.104.0",
+      "resolved": "https://registry.npmjs.org/openai/-/openai-4.104.0.tgz",
+      "integrity": "sha512-p99EFNsA/yX6UhVO93f5kJsDRLAg+CTA2RBqdHK4RtK8u5IJw32Hyb2dTGKbnnFmnuoBv5r7Z2CURI9sGZpSuA==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@types/node": "^18.11.18",
+        "@types/node-fetch": "^2.6.4",
+        "abort-controller": "^3.0.0",
+        "agentkeepalive": "^4.2.1",
+        "form-data-encoder": "1.7.2",
+        "formdata-node": "^4.3.2",
+        "node-fetch": "^2.6.7"
+      },
+      "bin": {
+        "openai": "bin/cli"
+      },
+      "peerDependencies": {
+        "ws": "^8.18.0",
+        "zod": "^3.23.8"
+      },
+      "peerDependenciesMeta": {
+        "ws": {
+          "optional": true
+        },
+        "zod": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/openai/node_modules/@types/node": {
+      "version": "18.19.118",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-18.19.118.tgz",
+      "integrity": "sha512-hIPK0hSrrcaoAu/gJMzN3QClXE4QdCdFvaenJ0JsjIbExP1JFFVH+RHcBt25c9n8bx5dkIfqKE+uw6BmBns7ug==",
+      "license": "MIT",
+      "dependencies": {
+        "undici-types": "~5.26.4"
+      }
+    },
+    "node_modules/openai/node_modules/undici-types": {
+      "version": "5.26.5",
+      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
+      "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
+      "license": "MIT"
+    },
+    "node_modules/readable-stream": {
+      "version": "3.6.2",
+      "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
+      "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
+      "license": "MIT",
+      "dependencies": {
+        "inherits": "^2.0.3",
+        "string_decoder": "^1.1.1",
+        "util-deprecate": "^1.0.1"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/safe-buffer": {
+      "version": "5.2.1",
+      "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
+      "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/feross"
+        },
+        {
+          "type": "patreon",
+          "url": "https://www.patreon.com/feross"
+        },
+        {
+          "type": "consulting",
+          "url": "https://feross.org/support"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/safe-stable-stringify": {
+      "version": "2.5.0",
+      "resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.5.0.tgz",
+      "integrity": "sha512-b3rppTKm9T+PsVCBEOUR46GWI7fdOs00VKZ1+9c1EWDaDMvjQc6tUwuFyIprgGgTcWoVHSKrU8H31ZHA2e0RHA==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=10"
+      }
+    },
+    "node_modules/simple-swizzle": {
+      "version": "0.2.2",
+      "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz",
+      "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==",
+      "license": "MIT",
+      "dependencies": {
+        "is-arrayish": "^0.3.1"
+      }
+    },
+    "node_modules/stack-trace": {
+      "version": "0.0.10",
+      "resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz",
+      "integrity": "sha512-KGzahc7puUKkzyMt+IqAep+TVNbKP+k2Lmwhub39m1AsTSkaDutx56aDCo+HLDzf/D26BIHTJWNiTG1KAJiQCg==",
+      "license": "MIT",
+      "engines": {
+        "node": "*"
+      }
+    },
+    "node_modules/string_decoder": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
+      "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
+      "license": "MIT",
+      "dependencies": {
+        "safe-buffer": "~5.2.0"
+      }
+    },
+    "node_modules/text-hex": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
+      "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==",
+      "license": "MIT"
+    },
+    "node_modules/tr46": {
+      "version": "0.0.3",
+      "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
+      "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
+      "license": "MIT"
+    },
+    "node_modules/triple-beam": {
+      "version": "1.4.1",
+      "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz",
+      "integrity": "sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 14.0.0"
+      }
+    },
+    "node_modules/undici-types": {
+      "version": "7.8.0",
+      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.8.0.tgz",
+      "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==",
+      "license": "MIT"
+    },
+    "node_modules/util-deprecate": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
+      "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
+      "license": "MIT"
+    },
+    "node_modules/uuid": {
+      "version": "9.0.1",
+      "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
+      "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
+      "funding": [
+        "https://github.com/sponsors/broofa",
+        "https://github.com/sponsors/ctavan"
+      ],
+      "license": "MIT",
+      "bin": {
+        "uuid": "dist/bin/uuid"
+      }
+    },
+    "node_modules/web-streams-polyfill": {
+      "version": "4.0.0-beta.3",
+      "resolved": "https://registry.npmjs.org/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz",
+      "integrity": "sha512-QW95TCTaHmsYfHDybGMwO5IJIM93I/6vTRk+daHTWFPhwh+C8Cg7j7XyKrwrj8Ib6vYXe0ocYNrmzY4xAAN6ug==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 14"
+      }
+    },
+    "node_modules/webidl-conversions": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
+      "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
+      "license": "BSD-2-Clause"
+    },
+    "node_modules/whatwg-url": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
+      "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
+      "license": "MIT",
+      "dependencies": {
+        "tr46": "~0.0.3",
+        "webidl-conversions": "^3.0.0"
+      }
+    },
+    "node_modules/winston": {
+      "version": "3.17.0",
+      "resolved": "https://registry.npmjs.org/winston/-/winston-3.17.0.tgz",
+      "integrity": "sha512-DLiFIXYC5fMPxaRg832S6F5mJYvePtmO5G9v9IgUFPhXm9/GkXarH/TUrBAVzhTCzAj9anE/+GjrgXp/54nOgw==",
+      "license": "MIT",
+      "dependencies": {
+        "@colors/colors": "^1.6.0",
+        "@dabh/diagnostics": "^2.0.2",
+        "async": "^3.2.3",
+        "is-stream": "^2.0.0",
+        "logform": "^2.7.0",
+        "one-time": "^1.0.0",
+        "readable-stream": "^3.4.0",
+        "safe-stable-stringify": "^2.3.1",
+        "stack-trace": "0.0.x",
+        "triple-beam": "^1.3.0",
+        "winston-transport": "^4.9.0"
+      },
+      "engines": {
+        "node": ">= 12.0.0"
+      }
+    },
+    "node_modules/winston-transport": {
+      "version": "4.9.0",
+      "resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.9.0.tgz",
+      "integrity": "sha512-8drMJ4rkgaPo1Me4zD/3WLfI/zPdA9o2IipKODunnGDcuqbHwjsbB79ylv04LCGGzU0xQ6vTznOMpQGaLhhm6A==",
+      "license": "MIT",
+      "dependencies": {
+        "logform": "^2.7.0",
+        "readable-stream": "^3.6.2",
+        "triple-beam": "^1.3.0"
+      },
+      "engines": {
+        "node": ">= 12.0.0"
+      }
+    },
+    "node_modules/ws": {
+      "version": "8.18.3",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
+      "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=10.0.0"
+      },
+      "peerDependencies": {
+        "bufferutil": "^4.0.1",
+        "utf-8-validate": ">=5.0.2"
+      },
+      "peerDependenciesMeta": {
+        "bufferutil": {
+          "optional": true
+        },
+        "utf-8-validate": {
+          "optional": true
+        }
+      }
+    }
+  }
+}
diff --git a/eval-server/package.json b/eval-server/package.json
new file mode 100644
index 00000000000..de89d99b27d
--- /dev/null
+++ b/eval-server/package.json
@@ -0,0 +1,29 @@
+{
+  "name": "bo-eval-server",
+  "version": "1.0.0",
+  "description": "WebSocket server for evaluating LLM agents with LLM-as-a-judge",
+  "main": "src/server.js",
+  "type": "module",
+  "scripts": {
+    "start": "node src/server.js",
+    "dev": "node --watch src/server.js",
+    "cli": "node src/cli.js"
+  },
+  "keywords": ["websocket", "llm", "evaluation", "rpc"],
+  "author": "",
+  "license": "MIT",
+  "dependencies": {
+    "ws": "^8.16.0",
+    "uuid": "^9.0.1",
+    "winston": "^3.11.0",
+    "dotenv": "^16.3.1",
+    "openai": "^4.24.1",
+    "js-yaml": "^4.1.0"
+  },
+  "devDependencies": {
+    "@types/ws": "^8.5.10"
+  },
+  "engines": {
+    "node": ">=18.0.0"
+  }
+}
\ No newline at end of file
diff --git a/eval-server/schemas/client.schema.json b/eval-server/schemas/client.schema.json
new file mode 100644
index 00000000000..5093155b971
--- /dev/null
+++ b/eval-server/schemas/client.schema.json
@@ -0,0 +1,337 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "title": "Client Configuration Schema",
+  "description": "Schema for validating client YAML configuration files",
+  "required": ["client", "settings", "evaluations"],
+  "properties": {
+    "client": {
+      "type": "object",
+      "required": ["id", "name"],
+      "properties": {
+        "id": {
+          "type": "string",
+          "pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
+          "description": "UUID v4 format client identifier"
+        },
+        "name": {
+          "type": "string",
+          "minLength": 1,
+          "maxLength": 100,
+          "description": "Human-readable client name"
+        },
+        "secret_key": {
+          "type": "string",
+          "description": "Optional authentication secret key"
+        },
+        "description": {
+          "type": "string",
+          "description": "Optional client description"
+        }
+      }
+    },
+    "settings": {
+      "type": "object",
+      "properties": {
+        "max_concurrent_evaluations": {
+          "type": "integer",
+          "minimum": 1,
+          "maximum": 10,
+          "default": 3
+        },
+        "default_timeout": {
+          "type": "integer",
+          "minimum": 5000,
+          "maximum": 300000,
+          "default": 30000,
+          "description": "Default timeout in milliseconds"
+        },
+        "retry_policy": {
+          "type": "object",
+          "properties": {
+            "max_retries": {
+              "type": "integer",
+              "minimum": 0,
+              "maximum": 5,
+              "default": 2
+            },
+            "backoff_multiplier": {
+              "type": "number",
+              "minimum": 1,
+              "maximum": 5,
+              "default": 2
+            },
+            "initial_delay": {
+              "type": "integer",
+              "minimum": 100,
+              "maximum": 10000,
+              "default": 1000,
+              "description": "Initial delay in milliseconds"
+            }
+          }
+        }
+      }
+    },
+    "evaluations": {
+      "type": "array",
+      "items": {
+        "$ref": "#/definitions/evaluation"
+      }
+    }
+  },
+  "definitions": {
+    "evaluation": {
+      "type": "object",
+      "required": ["id", "name", "tool", "input"],
+      "properties": {
+        "id": {
+          "type": "string",
+          "pattern": "^[a-zA-Z0-9-_]+$",
+          "minLength": 1,
+          "maxLength": 100,
+          "description": "Unique evaluation identifier"
+        },
+        "name": {
+          "type": "string",
+          "minLength": 1,
+          "maxLength": 200,
+          "description": "Human-readable evaluation name"
+        },
+        "description": {
+          "type": "string",
+          "description": "Optional evaluation description"
+        },
+        "enabled": {
+          "type": "boolean",
+          "default": true,
+          "description": "Whether this evaluation is enabled"
+        },
+        "target": {
+          "type": "object",
+          "properties": {
+            "url": {
+              "type": "string",
+              "format": "uri",
+              "description": "Target URL for the evaluation"
+            },
+            "wait_for": {
+              "type": "string",
+              "enum": ["load", "domcontentloaded", "networkidle"],
+              "default": "networkidle"
+            },
+            "wait_timeout": {
+              "type": "integer",
+              "minimum": 1000,
+              "maximum": 30000,
+              "default": 5000
+            }
+          }
+        },
+        "tool": {
+          "type": "string",
+          "enum": [
+            "extract_schema_data",
+            "extract_schema_streamlined", 
+            "research_agent",
+            "action_agent",
+            "web_task_agent"
+          ],
+          "description": "Tool to execute for this evaluation"
+        },
+        "timeout": {
+          "type": "integer",
+          "minimum": 5000,
+          "maximum": 300000,
+          "description": "Evaluation timeout in milliseconds"
+        },
+        "input": {
+          "type": "object",
+          "description": "Tool-specific input parameters"
+        },
+        "schedule": {
+          "type": "object",
+          "required": ["type"],
+          "properties": {
+            "type": {
+              "type": "string",
+              "enum": ["on_demand", "periodic", "once"]
+            },
+            "interval": {
+              "type": "integer",
+              "minimum": 60000,
+              "description": "Interval in milliseconds for periodic schedules"
+            },
+            "run_at": {
+              "type": "string",
+              "format": "date-time",
+              "description": "ISO timestamp for one-time schedules"
+            }
+          },
+          "if": {
+            "properties": {
+              "type": { "const": "periodic" }
+            }
+          },
+          "then": {
+            "required": ["interval"]
+          },
+          "else": {
+            "if": {
+              "properties": {
+                "type": { "const": "once" }
+              }
+            },
+            "then": {
+              "required": ["run_at"]
+            }
+          }
+        },
+        "validation": {
+          "type": "object",
+          "required": ["type"],
+          "properties": {
+            "type": {
+              "type": "string",
+              "enum": ["llm-judge", "snapshot", "hybrid"]
+            },
+            "llm_judge": {
+              "$ref": "#/definitions/llm_judge_config"
+            },
+            "snapshot": {
+              "$ref": "#/definitions/snapshot_config"
+            },
+            "hybrid": {
+              "type": "object",
+              "properties": {
+                "weight_llm": {
+                  "type": "number",
+                  "minimum": 0,
+                  "maximum": 1
+                },
+                "weight_snapshot": {
+                  "type": "number",
+                  "minimum": 0,
+                  "maximum": 1
+                }
+              }
+            }
+          }
+        },
+        "metadata": {
+          "type": "object",
+          "properties": {
+            "tags": {
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            },
+            "priority": {
+              "type": "string",
+              "enum": ["low", "normal", "high"],
+              "default": "normal"
+            },
+            "owner": {
+              "type": "string",
+              "description": "Responsible team or person"
+            },
+            "created": {
+              "type": "string",
+              "format": "date"
+            },
+            "modified": {
+              "type": "string",
+              "format": "date"
+            }
+          }
+        }
+      }
+    },
+    "llm_judge_config": {
+      "type": "object",
+      "required": ["criteria"],
+      "properties": {
+        "model": {
+          "type": "string",
+          "default": "gpt-4o-mini",
+          "description": "LLM model to use for evaluation"
+        },
+        "temperature": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 2,
+          "default": 0.3
+        },
+        "criteria": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "minItems": 1,
+          "description": "List of evaluation criteria"
+        },
+        "visual_verification": {
+          "type": "object",
+          "properties": {
+            "enabled": {
+              "type": "boolean",
+              "default": false
+            },
+            "capture_before": {
+              "type": "boolean",
+              "default": true
+            },
+            "capture_after": {
+              "type": "boolean",
+              "default": true
+            },
+            "prompts": {
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
+            }
+          }
+        }
+      }
+    },
+    "snapshot_config": {
+      "type": "object",
+      "properties": {
+        "structure_only": {
+          "type": "boolean",
+          "default": false,
+          "description": "Compare only structure, not values"
+        },
+        "exclude_paths": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "JSONPath expressions for fields to exclude"
+        },
+        "sanitizers": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "required": ["path"],
+            "properties": {
+              "path": {
+                "type": "string",
+                "description": "JSONPath to the field"
+              },
+              "pattern": {
+                "type": "string",
+                "description": "Regex pattern to match"
+              },
+              "replacement": {
+                "type": "string",
+                "description": "Replacement string"
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/eval-server/src/api-server.js b/eval-server/src/api-server.js
new file mode 100644
index 00000000000..8011dea31f8
--- /dev/null
+++ b/eval-server/src/api-server.js
@@ -0,0 +1,221 @@
+import http from 'http';
+import url from 'url';
+import { EvaluationServer } from './server.js';
+import logger from './logger.js';
+
+class APIServer {
+  constructor(evaluationServer, port = 8081) {
+    this.evaluationServer = evaluationServer;
+    this.port = port;
+    this.server = null;
+  }
+
+  start() {
+    this.server = http.createServer((req, res) => {
+      // Enable CORS
+      res.setHeader('Access-Control-Allow-Origin', '*');
+      res.setHeader('Access-Control-Allow-Methods', 'GET, POST, OPTIONS');
+      res.setHeader('Access-Control-Allow-Headers', 'Content-Type');
+
+      if (req.method === 'OPTIONS') {
+        res.writeHead(200);
+        res.end();
+        return;
+      }
+
+      this.handleRequest(req, res);
+    });
+
+    this.server.listen(this.port, () => {
+      logger.info(`API server started on http://localhost:${this.port}`);
+    });
+  }
+
+  async handleRequest(req, res) {
+    const parsedUrl = url.parse(req.url, true);
+    const path = parsedUrl.pathname;
+    const method = req.method;
+
+    try {
+      // Get body for POST requests
+      let body = '';
+      if (method === 'POST') {
+        for await (const chunk of req) {
+          body += chunk;
+        }
+      }
+
+      let result;
+      
+      switch (path) {
+        case '/status':
+          result = this.getStatus();
+          break;
+        
+        case '/clients':
+          result = this.getClients();
+          break;
+        
+        case '/clients/:id/evaluations':
+          const clientId = parsedUrl.query.id;
+          result = this.getClientEvaluations(clientId);
+          break;
+        
+        case '/evaluate':
+          if (method !== 'POST') {
+            this.sendError(res, 405, 'Method not allowed');
+            return;
+          }
+          result = await this.triggerEvaluation(JSON.parse(body));
+          break;
+        
+        default:
+          this.sendError(res, 404, 'Not found');
+          return;
+      }
+
+      this.sendResponse(res, 200, result);
+
+    } catch (error) {
+      logger.error('API error:', error);
+      this.sendError(res, 500, error.message);
+    }
+  }
+
+  getStatus() {
+    const status = this.evaluationServer.getStatus();
+    const clients = this.evaluationServer.getClientManager().getAllClients();
+    
+    return {
+      server: status,
+      clients: clients.map(client => ({
+        id: client.id,
+        name: client.name,
+        connected: this.evaluationServer.connectedClients.has(client.id),
+        ready: this.evaluationServer.connectedClients.get(client.id)?.ready || false
+      }))
+    };
+  }
+
+  getClients() {
+    const clients = this.evaluationServer.getClientManager().getAllClients();
+    
+    return clients.map(client => {
+      const evaluations = this.evaluationServer.getClientManager().getClientEvaluations(client.id);
+      const connection = this.evaluationServer.connectedClients.get(client.id);
+      
+      return {
+        id: client.id,
+        name: client.name,
+        description: client.description,
+        connected: !!connection,
+        ready: connection?.ready || false,
+        evaluations: evaluations.map(evaluation => ({
+          id: evaluation.id,
+          name: evaluation.name,
+          tool: evaluation.tool,
+          status: evaluation.status || 'pending',
+          enabled: evaluation.enabled !== false
+        }))
+      };
+    });
+  }
+
+  getClientEvaluations(clientId) {
+    if (!clientId) {
+      throw new Error('Client ID is required');
+    }
+
+    const evaluations = this.evaluationServer.getClientManager().getClientEvaluations(clientId);
+    return {
+      clientId,
+      evaluations: evaluations.map(evaluation => ({
+        id: evaluation.id,
+        name: evaluation.name,
+        description: evaluation.description,
+        tool: evaluation.tool,
+        status: evaluation.status || 'pending',
+        enabled: evaluation.enabled !== false,
+        lastRun: evaluation.lastRun,
+        lastResult: evaluation.lastResult
+      }))
+    };
+  }
+
+  async triggerEvaluation(payload) {
+    const { clientId, evaluationId, runAll = false } = payload;
+
+    if (!clientId) {
+      throw new Error('Client ID is required');
+    }
+
+    // Check if client is connected
+    const connection = this.evaluationServer.connectedClients.get(clientId);
+    if (!connection || !connection.ready) {
+      throw new Error(`Client '${clientId}' is not connected or not ready`);
+    }
+
+    if (runAll) {
+      // Run all evaluations for the client
+      const evaluations = this.evaluationServer.getClientManager().getClientEvaluations(clientId);
+      const results = [];
+
+      for (const evaluation of evaluations) {
+        try {
+          this.evaluationServer.getClientManager().updateEvaluationStatus(clientId, evaluation.id, 'pending');
+          await this.evaluationServer.executeEvaluation(connection, evaluation);
+          results.push({ id: evaluation.id, status: 'completed' });
+        } catch (error) {
+          results.push({ id: evaluation.id, status: 'failed', error: error.message });
+        }
+      }
+
+      return {
+        clientId,
+        type: 'batch',
+        results
+      };
+    } else {
+      // Run specific evaluation
+      if (!evaluationId) {
+        throw new Error('Evaluation ID is required when runAll is false');
+      }
+
+      const evaluation = this.evaluationServer.getClientManager().getClientEvaluations(clientId)
+        .find(e => e.id === evaluationId);
+
+      if (!evaluation) {
+        throw new Error(`Evaluation '${evaluationId}' not found for client '${clientId}'`);
+      }
+
+      this.evaluationServer.getClientManager().updateEvaluationStatus(clientId, evaluationId, 'pending');
+      await this.evaluationServer.executeEvaluation(connection, evaluation);
+
+      return {
+        clientId,
+        evaluationId,
+        type: 'single',
+        status: 'completed'
+      };
+    }
+  }
+
+  sendResponse(res, statusCode, data) {
+    res.writeHead(statusCode, { 'Content-Type': 'application/json' });
+    res.end(JSON.stringify(data, null, 2));
+  }
+
+  sendError(res, statusCode, message) {
+    res.writeHead(statusCode, { 'Content-Type': 'application/json' });
+    res.end(JSON.stringify({ error: message }));
+  }
+
+  stop() {
+    if (this.server) {
+      this.server.close();
+      logger.info('API server stopped');
+    }
+  }
+}
+
+export { APIServer };
\ No newline at end of file
diff --git a/eval-server/src/cli.js b/eval-server/src/cli.js
new file mode 100644
index 00000000000..6ad66dc9a5c
--- /dev/null
+++ b/eval-server/src/cli.js
@@ -0,0 +1,307 @@
+#!/usr/bin/env node
+
+import { EvaluationServer } from './server.js';
+import readline from 'readline';
+
+class EvaluationCLI {
+  constructor() {
+    this.server = new EvaluationServer();
+    this.rl = readline.createInterface({
+      input: process.stdin,
+      output: process.stdout
+    });
+  }
+
+  async start() {
+    console.log('🚀 Starting Evaluation Server CLI');
+    console.log('====================================');
+    
+    // Start the server
+    this.server.start();
+    
+    // Wait a moment for server to start
+    await new Promise(resolve => setTimeout(resolve, 1000));
+    
+    this.showHelp();
+    this.startInteractiveMode();
+  }
+
+  showHelp() {
+    console.log('\\nAvailable commands:');
+    console.log('  status                           - Show server status');
+    console.log('  clients                          - List all clients and their evaluations');
+    console.log('  run <client-id> <evaluation-id>  - Run specific evaluation for a client');
+    console.log('  run-all <client-id>              - Run all evaluations for a client');
+    console.log('  eval <evaluation-id>             - Run specific evaluation on all connected clients');
+    console.log('  eval all                         - Run all pending evaluations on all clients');
+    console.log('  clients-connected                - List connected clients');
+    console.log('  help                             - Show this help');
+    console.log('  quit                             - Exit the CLI');
+    console.log('');
+  }
+
+  startInteractiveMode() {
+    this.rl.question('eval-server> ', (input) => {
+      this.handleCommand(input.trim());
+    });
+  }
+
+  async handleCommand(input) {
+    const [command, ...args] = input.split(' ');
+    
+    try {
+      switch (command) {
+        case 'status':
+          this.showStatus();
+          break;
+        case 'clients':
+          this.listClients();
+          break;
+        case 'run':
+          if (args.length < 2) {
+            console.log('Usage: run <client-id> <evaluation-id>');
+          } else {
+            await this.runSpecificEvaluation(args[0], args[1]);
+          }
+          break;
+        case 'run-all':
+          if (args.length < 1) {
+            console.log('Usage: run-all <client-id>');
+          } else {
+            await this.runAllEvaluations(args[0]);
+          }
+          break;
+        case 'eval':
+          if (args.length === 0) {
+            console.log('Usage: eval <evaluation-id>  OR  eval all');
+          } else {
+            await this.runEvaluation(args.join(' '));
+          }
+          break;
+        case 'clients-connected':
+          this.listConnectedClients();
+          break;
+        case 'help':
+          this.showHelp();
+          break;
+        case 'quit':
+        case 'exit':
+          this.quit();
+          return;
+        case '':
+          break;
+        default:
+          console.log(`Unknown command: ${command}. Type 'help' for available commands.`);
+      }
+    } catch (error) {
+      console.error('Error:', error.message);
+    }
+    
+    this.startInteractiveMode();
+  }
+
+  showStatus() {
+    const status = this.server.getStatus();
+    console.log('\\n📊 Server Status:');
+    console.log(`  Connected clients: ${status.connectedClients}`);
+    console.log(`  Ready clients: ${status.readyClients}`);
+    console.log(`  Active evaluations: ${status.activeEvaluations}`);
+    console.log('');
+  }
+
+  listConnectedClients() {
+    const clients = Array.from(this.server.connectedClients.values());
+    console.log('\\n👥 Connected Clients:');
+    
+    if (clients.length === 0) {
+      console.log('  No clients connected');
+    } else {
+      clients.forEach(client => {
+        console.log(`  ID: ${client.clientId || client.id}`);
+        console.log(`    Connected: ${client.connectedAt}`);
+        console.log(`    Ready: ${client.ready ? 'Yes' : 'No'}`);
+        console.log(`    Registered: ${client.registered ? 'Yes' : 'No'}`);
+        console.log(`    Address: ${client.remoteAddress}`);
+        console.log('');
+      });
+    }
+  }
+
+  async runEvaluation(task) {
+    if (task && task.includes('-')) {
+      console.log(`\\n🔍 Running specific evaluation: "${task}"`);
+    } else if (task === 'all') {
+      console.log(`\\n🔍 Running all pending evaluations`);
+    } else {
+      console.log(`\\n🔍 Running evaluation: "${task}"`);
+    }
+    console.log('=====================================');
+    
+    try {
+      const results = await this.server.evaluateAllClients(task);
+      
+      console.log('\\n📋 Evaluation Results:');
+      results.forEach((result, index) => {
+        console.log(`\\n  Client ${index + 1} (${result.clientId || 'unknown'}):`);
+        
+        if (result.error) {
+          console.log(`    ❌ Error: ${result.error}`);
+        } else {
+          console.log(`    ✅ Success`);
+          if (result.evaluationId) {
+            console.log(`    Evaluation ID: ${result.evaluationId}`);
+          }
+          if (result.duration) {
+            console.log(`    Duration: ${result.duration}ms`);
+          }
+          
+          if (result.judgeEvaluation?.overall_score) {
+            console.log(`    Overall Score: ${result.judgeEvaluation.overall_score}/10`);
+          }
+          
+          if (result.clientResponse) {
+            const preview = result.clientResponse.length > 100 
+              ? result.clientResponse.substring(0, 100) + '...'
+              : result.clientResponse;
+            console.log(`    Response: ${preview}`);
+          }
+        }
+      });
+      
+      console.log('\\n✅ Evaluation completed');
+    } catch (error) {
+      console.log(`\\n❌ Evaluation failed: ${error.message}`);
+    }
+  }
+
+  listClients() {
+    const clients = this.server.getClientManager().getAllClients();
+    console.log('\\n👥 Registered Clients:');
+    
+    if (clients.length === 0) {
+      console.log('  No clients registered');
+      return;
+    }
+    
+    clients.forEach(client => {
+      console.log(`\\n  📋 ${client.name} (${client.id})`);
+      console.log(`     Description: ${client.description || 'N/A'}`);
+      console.log(`     Secret Key: ${client.secretKey ? '***' : 'None'}`);
+      
+      const evaluations = this.server.getClientManager().getClientEvaluations(client.id);
+      console.log(`     Evaluations: ${evaluations.length}`);
+      
+      evaluations.forEach(evaluation => {
+        const status = evaluation.status || 'pending';
+        const statusIcon = status === 'completed' ? '✅' : status === 'running' ? '🔄' : status === 'failed' ? '❌' : '⏳';
+        console.log(`       ${statusIcon} ${evaluation.id}: ${evaluation.name}`);
+      });
+    });
+    console.log('');
+  }
+
+  async runSpecificEvaluation(clientId, evaluationId) {
+    console.log(`\\n🎯 Running evaluation '${evaluationId}' for client '${clientId}'...`);
+    
+    try {
+      // Check if client is connected
+      const connection = this.server.connectedClients.get(clientId);
+      if (!connection || !connection.ready) {
+        console.log(`❌ Client '${clientId}' is not connected or not ready`);
+        return;
+      }
+      
+      // Get the evaluation
+      const evaluation = this.server.getClientManager().getClientEvaluations(clientId)
+        .find(e => e.id === evaluationId);
+      
+      if (!evaluation) {
+        console.log(`❌ Evaluation '${evaluationId}' not found for client '${clientId}'`);
+        return;
+      }
+      
+      // Reset evaluation status to pending
+      this.server.getClientManager().updateEvaluationStatus(clientId, evaluationId, 'pending');
+      
+      // Execute the evaluation
+      await this.server.executeEvaluation(connection, evaluation);
+      
+      console.log(`✅ Evaluation '${evaluationId}' completed successfully`);
+    } catch (error) {
+      console.log(`❌ Evaluation failed: ${error.message}`);
+    }
+  }
+
+  async runAllEvaluations(clientId) {
+    console.log(`\\n🚀 Running all evaluations for client '${clientId}'...`);
+    
+    try {
+      // Check if client is connected
+      const connection = this.server.connectedClients.get(clientId);
+      if (!connection || !connection.ready) {
+        console.log(`❌ Client '${clientId}' is not connected or not ready`);
+        return;
+      }
+      
+      // Get all evaluations for this client
+      const evaluations = this.server.getClientManager().getClientEvaluations(clientId);
+      
+      if (evaluations.length === 0) {
+        console.log(`❌ No evaluations found for client '${clientId}'`);
+        return;
+      }
+      
+      console.log(`Found ${evaluations.length} evaluations to run...`);
+      
+      let completed = 0;
+      let failed = 0;
+      
+      for (const evaluation of evaluations) {
+        console.log(`\\n🔄 Running: ${evaluation.name} (${evaluation.id})`);
+        
+        try {
+          // Reset evaluation status to pending
+          this.server.getClientManager().updateEvaluationStatus(clientId, evaluation.id, 'pending');
+          
+          // Execute the evaluation
+          await this.server.executeEvaluation(connection, evaluation);
+          
+          console.log(`  ✅ Completed: ${evaluation.name}`);
+          completed++;
+        } catch (error) {
+          console.log(`  ❌ Failed: ${evaluation.name} - ${error.message}`);
+          failed++;
+        }
+        
+        // Add a small delay between evaluations
+        await new Promise(resolve => setTimeout(resolve, 2000));
+      }
+      
+      console.log(`\\n📊 Results: ${completed} completed, ${failed} failed`);
+    } catch (error) {
+      console.log(`❌ Batch evaluation failed: ${error.message}`);
+    }
+  }
+
+
+  quit() {
+    console.log('\\n👋 Shutting down...');
+    this.server.stop();
+    this.rl.close();
+    process.exit(0);
+  }
+}
+
+// Start CLI if this file is run directly
+if (import.meta.url === `file://${process.argv[1]}`) {
+  const cli = new EvaluationCLI();
+  
+  process.on('SIGINT', () => {
+    cli.quit();
+  });
+  
+  cli.start().catch(error => {
+    console.error('Failed to start CLI:', error.message);
+    process.exit(1);
+  });
+}
\ No newline at end of file
diff --git a/eval-server/src/client-manager.js b/eval-server/src/client-manager.js
new file mode 100644
index 00000000000..3b1ec3bd6b1
--- /dev/null
+++ b/eval-server/src/client-manager.js
@@ -0,0 +1,310 @@
+import fs from 'fs';
+import path from 'path';
+import yaml from 'js-yaml';
+import { v4 as uuidv4 } from 'uuid';
+import logger from './logger.js';
+
+class ClientManager {
+  constructor(clientsDir = './clients', evalsDir = './evals') {
+    this.clientsDir = path.resolve(clientsDir);
+    this.evalsDir = path.resolve(evalsDir);
+    this.clients = new Map();
+    this.evaluations = new Map(); // clientId -> evaluations array
+    
+    // Ensure directories exist
+    if (!fs.existsSync(this.clientsDir)) {
+      fs.mkdirSync(this.clientsDir, { recursive: true });
+    }
+    if (!fs.existsSync(this.evalsDir)) {
+      fs.mkdirSync(this.evalsDir, { recursive: true });
+    }
+    
+    this.loadAllClients();
+    this.loadAllEvaluations();
+  }
+
+  /**
+   * Load all client YAML files on startup
+   */
+  loadAllClients() {
+    try {
+      const files = fs.readdirSync(this.clientsDir)
+        .filter(f => f.endsWith('.yaml') || f.endsWith('.yml'));
+      
+      for (const file of files) {
+        const clientId = path.basename(file, path.extname(file));
+        try {
+          this.loadClient(clientId);
+        } catch (error) {
+          logger.error(`Failed to load client ${clientId}:`, error);
+        }
+      }
+      
+      logger.info(`Loaded ${this.clients.size} clients`);
+    } catch (error) {
+      logger.error('Failed to load clients:', error);
+    }
+  }
+
+  /**
+   * Load a specific client's YAML configuration
+   */
+  loadClient(clientId) {
+    const yamlPath = path.join(this.clientsDir, `${clientId}.yaml`);
+    
+    if (!fs.existsSync(yamlPath)) {
+      throw new Error(`Client YAML not found: ${yamlPath}`);
+    }
+    
+    const yamlContent = fs.readFileSync(yamlPath, 'utf8');
+    const config = yaml.load(yamlContent);
+    
+    // Validate client configuration
+    if (!config.client || config.client.id !== clientId) {
+      throw new Error(`Invalid client configuration: ID mismatch`);
+    }
+    
+    // Store client info
+    this.clients.set(clientId, {
+      id: config.client.id,
+      name: config.client.name,
+      secretKey: config.client.secret_key,
+      description: config.client.description,
+      settings: config.settings || {},
+      yamlPath
+    });
+    
+    // Note: Evaluations are now loaded separately from the evals directory
+    // Initialize empty evaluations array for this client
+    if (!this.evaluations.has(clientId)) {
+      this.evaluations.set(clientId, []);
+    }
+    
+    logger.info(`Loaded client ${clientId}`);
+    return config;
+  }
+
+  /**
+   * Load all evaluations from the evals directory structure
+   */
+  loadAllEvaluations() {
+    try {
+      // Find all category directories
+      const categories = fs.readdirSync(this.evalsDir)
+        .filter(dir => fs.statSync(path.join(this.evalsDir, dir)).isDirectory());
+      
+      let totalEvaluations = 0;
+      
+      for (const category of categories) {
+        const categoryDir = path.join(this.evalsDir, category);
+        const evalFiles = fs.readdirSync(categoryDir)
+          .filter(f => f.endsWith('.yaml') || f.endsWith('.yml'));
+        
+        for (const file of evalFiles) {
+          try {
+            const evalPath = path.join(categoryDir, file);
+            const yamlContent = fs.readFileSync(evalPath, 'utf8');
+            const evaluation = yaml.load(yamlContent);
+            
+            if (evaluation.enabled !== false) {
+              // Add evaluation to all clients for now
+              // In the future, you might want to have client-specific evaluation assignments
+              for (const [clientId] of this.clients) {
+                const clientEvals = this.evaluations.get(clientId) || [];
+                clientEvals.push({
+                  ...evaluation,
+                  clientId,
+                  status: 'pending',
+                  category,
+                  filePath: evalPath
+                });
+                this.evaluations.set(clientId, clientEvals);
+              }
+              totalEvaluations++;
+            }
+          } catch (error) {
+            logger.error(`Failed to load evaluation ${file}:`, error);
+          }
+        }
+      }
+      
+      // Update the client evaluation counts
+      for (const [clientId] of this.clients) {
+        const evalCount = this.evaluations.get(clientId)?.length || 0;
+        logger.info(`Loaded client ${clientId} with ${evalCount} evaluations`);
+      }
+      
+      logger.info(`Loaded ${totalEvaluations} evaluations from ${categories.length} categories`);
+    } catch (error) {
+      logger.error('Failed to load evaluations:', error);
+    }
+  }
+
+  /**
+   * Register a new client with authentication
+   */
+  registerClient(clientId, secretKey, capabilities, skipSecretValidation = false) {
+    const client = this.clients.get(clientId);
+    
+    if (!client) {
+      throw new Error(`Client ${clientId} not found. Please create a YAML configuration file.`);
+    }
+    
+    // Verify secret key if configured (unless we're skipping validation)
+    if (!skipSecretValidation && client.secretKey && client.secretKey !== secretKey) {
+      throw new Error('Invalid secret key');
+    }
+    
+    // Update client capabilities
+    client.capabilities = capabilities;
+    client.lastRegistered = new Date().toISOString();
+    
+    return {
+      success: true,
+      clientName: client.name,
+      evaluationsCount: this.evaluations.get(clientId)?.length || 0
+    };
+  }
+
+  /**
+   * Get client information
+   */
+  getClient(clientId) {
+    return this.clients.get(clientId);
+  }
+
+  /**
+   * Get evaluations for a client
+   */
+  getClientEvaluations(clientId) {
+    return this.evaluations.get(clientId) || [];
+  }
+
+  /**
+   * Get next pending evaluation for a client
+   */
+  getNextEvaluation(clientId) {
+    const evaluations = this.evaluations.get(clientId) || [];
+    return evaluations.find(e => e.status === 'pending');
+  }
+
+  /**
+   * Update evaluation status
+   */
+  updateEvaluationStatus(clientId, evaluationId, status, result = null) {
+    const evaluations = this.evaluations.get(clientId);
+    if (!evaluations) return;
+    
+    const evaluation = evaluations.find(e => e.id === evaluationId);
+    if (evaluation) {
+      evaluation.status = status;
+      evaluation.lastRun = new Date().toISOString();
+      if (result) {
+        evaluation.lastResult = result;
+      }
+    }
+  }
+
+  /**
+   * Create a new client with default configuration
+   */
+  async createClient(clientName, secretKey = null) {
+    const clientId = uuidv4();
+    return this.createClientWithId(clientId, clientName, secretKey);
+  }
+
+  /**
+   * Create a new client with a specific ID
+   */
+  async createClientWithId(clientId, clientName, secretKey = null) {
+    const yamlPath = path.join(this.clientsDir, `${clientId}.yaml`);
+    
+    // Create simplified client configuration (evaluations come from evals directory)
+    const defaultConfig = {
+      client: {
+        id: clientId,
+        name: clientName,
+        secret_key: secretKey,
+        description: `Auto-generated DevTools evaluation client`
+      },
+      settings: {
+        max_concurrent_evaluations: 3,
+        default_timeout: 45000,
+        retry_policy: {
+          max_retries: 2,
+          backoff_multiplier: 2,
+          initial_delay: 1000
+        }
+      }
+    };
+    
+    // Write YAML file
+    const yamlContent = yaml.dump(defaultConfig, { indent: 2 });
+    fs.writeFileSync(yamlPath, yamlContent);
+    
+    // Load the new client
+    this.loadClient(clientId);
+    
+    // Load evaluations for the new client
+    this.loadAllEvaluations();
+    
+    logger.info(`Created new client: ${clientId}`);
+    return { clientId, yamlPath };
+  }
+
+  /**
+   * Reload a specific client's configuration
+   */
+  reloadClient(clientId) {
+    try {
+      this.loadClient(clientId);
+      logger.info(`Reloaded client: ${clientId}`);
+      return true;
+    } catch (error) {
+      logger.error(`Failed to reload client ${clientId}:`, error);
+      return false;
+    }
+  }
+
+  /**
+   * Get all active clients
+   */
+  getAllClients() {
+    return Array.from(this.clients.values());
+  }
+
+  /**
+   * Validate client exists and is authorized
+   */
+  validateClient(clientId, secretKey = null, skipSecretValidation = false) {
+    const client = this.clients.get(clientId);
+    
+    logger.debug('validateClient', {
+      clientId,
+      clientExists: !!client,
+      hasSecretKey: !!secretKey,
+      skipSecretValidation,
+      clientSecretKey: client ? '[REDACTED]' : 'N/A'
+    });
+    
+    if (!client) {
+      logger.debug('Client not found', { clientId });
+      return { valid: false, reason: 'Client not found' };
+    }
+    
+    // Skip secret key validation if explicitly requested (for new auth flow)
+    if (!skipSecretValidation && secretKey !== null && client.secretKey && client.secretKey !== secretKey) {
+      logger.warn('Secret key mismatch', { 
+        clientId,
+        hasProvidedKey: !!secretKey,
+        hasStoredKey: !!client.secretKey
+      });
+      return { valid: false, reason: 'Invalid secret key' };
+    }
+    
+    logger.debug('Client validation successful', { clientId });
+    return { valid: true };
+  }
+}
+
+export { ClientManager };
\ No newline at end of file
diff --git a/eval-server/src/config.js b/eval-server/src/config.js
new file mode 100644
index 00000000000..0f6c9f3e9b3
--- /dev/null
+++ b/eval-server/src/config.js
@@ -0,0 +1,40 @@
+import { config } from 'dotenv';
+
+config();
+
+export const CONFIG = {
+  server: {
+    port: parseInt(process.env.PORT) || 8080,
+    host: process.env.HOST || 'localhost'
+  },
+  
+  llm: {
+    apiKey: process.env.OPENAI_API_KEY,
+    model: process.env.JUDGE_MODEL || 'gpt-4',
+    temperature: parseFloat(process.env.JUDGE_TEMPERATURE) || 0.1
+  },
+  
+  logging: {
+    level: process.env.LOG_LEVEL || 'info',
+    dir: process.env.LOG_DIR || './logs'
+  },
+  
+  rpc: {
+    timeout: parseInt(process.env.RPC_TIMEOUT) || 30000,
+    maxConcurrentEvaluations: parseInt(process.env.MAX_CONCURRENT_EVALUATIONS) || 10
+  }
+};
+
+export function validateConfig() {
+  const errors = [];
+  
+  if (!CONFIG.llm.apiKey) {
+    errors.push('OPENAI_API_KEY is required');
+  }
+  
+  if (CONFIG.server.port < 1 || CONFIG.server.port > 65535) {
+    errors.push('PORT must be between 1 and 65535');
+  }
+  
+  return errors;
+}
\ No newline at end of file
diff --git a/eval-server/src/evaluator.js b/eval-server/src/evaluator.js
new file mode 100644
index 00000000000..95ac14ab9f1
--- /dev/null
+++ b/eval-server/src/evaluator.js
@@ -0,0 +1,117 @@
+import OpenAI from 'openai';
+import { CONFIG } from './config.js';
+import logger from './logger.js';
+
+export class LLMEvaluator {
+  constructor() {
+    if (!CONFIG.llm.apiKey) {
+      throw new Error('OpenAI API key is required');
+    }
+    
+    this.openai = new OpenAI({
+      apiKey: CONFIG.llm.apiKey
+    });
+  }
+
+  async evaluate(task, agentResponse) {
+    try {
+      const prompt = this.buildEvaluationPrompt(task, agentResponse);
+      
+      const completion = await this.openai.chat.completions.create({
+        model: CONFIG.llm.model,
+        messages: [
+          {
+            role: 'system',
+            content: 'You are an expert evaluator of AI agent responses. Provide objective, detailed evaluations.'
+          },
+          {
+            role: 'user',
+            content: prompt
+          }
+        ],
+        temperature: CONFIG.llm.temperature,
+        max_tokens: 1000
+      });
+
+      const evaluation = completion.choices[0].message.content;
+      const usage = completion.usage;
+
+      logger.info('LLM evaluation completed', {
+        tokens_used: usage.total_tokens,
+        model: CONFIG.llm.model
+      });
+
+      return this.parseEvaluation(evaluation);
+    } catch (error) {
+      logger.error('LLM evaluation failed', { error: error.message });
+      throw error;
+    }
+  }
+
+  buildEvaluationPrompt(task, agentResponse) {
+    return `Please evaluate the following AI agent response to a given task.
+
+TASK:
+${task}
+
+AGENT RESPONSE:
+${agentResponse}
+
+Please evaluate the response on the following criteria and provide a JSON response:
+
+1. **Correctness**: Is the response factually accurate and correct?
+2. **Completeness**: Does the response fully address the task?
+3. **Clarity**: Is the response clear and well-structured?
+4. **Relevance**: Is the response relevant to the task?
+5. **Helpfulness**: How helpful is the response to the user?
+
+Provide your evaluation in the following JSON format:
+{
+  "overall_score": <score from 1-10>,
+  "criteria_scores": {
+    "correctness": <score from 1-10>,
+    "completeness": <score from 1-10>,
+    "clarity": <score from 1-10>,
+    "relevance": <score from 1-10>,
+    "helpfulness": <score from 1-10>
+  },
+  "reasoning": "<detailed explanation of your evaluation>",
+  "strengths": ["<list of strengths>"],
+  "weaknesses": ["<list of weaknesses>"],
+  "suggestions": ["<list of improvement suggestions>"]
+}`;
+  }
+
+  parseEvaluation(evaluationText) {
+    try {
+      // Try to extract JSON from the response
+      const jsonMatch = evaluationText.match(/\{[\s\S]*\}/);
+      if (jsonMatch) {
+        return JSON.parse(jsonMatch[0]);
+      }
+      
+      // If no JSON found, return a structured response with the raw text
+      return {
+        overall_score: null,
+        criteria_scores: {},
+        reasoning: evaluationText,
+        strengths: [],
+        weaknesses: [],
+        suggestions: [],
+        raw_evaluation: evaluationText
+      };
+    } catch (error) {
+      logger.warn('Failed to parse evaluation JSON', { error: error.message });
+      return {
+        overall_score: null,
+        criteria_scores: {},
+        reasoning: evaluationText,
+        strengths: [],
+        weaknesses: [],
+        suggestions: [],
+        raw_evaluation: evaluationText,
+        parse_error: error.message
+      };
+    }
+  }
+}
\ No newline at end of file
diff --git a/eval-server/src/logger.js b/eval-server/src/logger.js
new file mode 100644
index 00000000000..5452cffbb41
--- /dev/null
+++ b/eval-server/src/logger.js
@@ -0,0 +1,102 @@
+import winston from 'winston';
+import { existsSync, mkdirSync } from 'fs';
+import { CONFIG } from './config.js';
+
+// Ensure logs directory exists
+if (!existsSync(CONFIG.logging.dir)) {
+  mkdirSync(CONFIG.logging.dir, { recursive: true });
+}
+
+const logger = winston.createLogger({
+  level: CONFIG.logging.level,
+  format: winston.format.combine(
+    winston.format.timestamp(),
+    winston.format.errors({ stack: true }),
+    winston.format.json()
+  ),
+  defaultMeta: { service: 'bo-eval-server' },
+  transports: [
+    new winston.transports.File({ 
+      filename: `${CONFIG.logging.dir}/error.log`, 
+      level: 'error' 
+    }),
+    new winston.transports.File({ 
+      filename: `${CONFIG.logging.dir}/combined.log` 
+    }),
+    new winston.transports.Console({
+      format: winston.format.combine(
+        winston.format.colorize(),
+        winston.format.simple()
+      )
+    })
+  ]
+});
+
+export function logEvaluation(evaluationData) {
+  const logEntry = {
+    type: 'evaluation',
+    timestamp: new Date().toISOString(),
+    ...evaluationData
+  };
+  
+  // Pretty print evaluation summary to console
+  console.log('\n' + '='.repeat(80));
+  console.log(`📊 EVALUATION COMPLETED: ${evaluationData.name}`);
+  console.log('='.repeat(80));
+  console.log(`🆔 ID: ${evaluationData.evaluationId}`);
+  console.log(`🔧 Tool: ${evaluationData.tool}`);
+  console.log(`⏱️  Duration: ${evaluationData.duration}ms`);
+  console.log(`👤 Client: ${evaluationData.clientId}`);
+  
+  if (evaluationData.response?.output?.output) {
+    console.log(`\n📝 Output:\n${evaluationData.response.output.output}`);
+  }
+  
+  if (evaluationData.validation?.result) {
+    const val = evaluationData.validation.result;
+    console.log(`\n📋 Validation:`);
+    console.log(`   ✅ Passed: ${evaluationData.validation.passed ? 'YES' : 'NO'}`);
+    console.log(`   📊 Overall Score: ${val.overall_score}/10`);
+    if (val.strengths?.length > 0) {
+      console.log(`   💪 Strengths: ${val.strengths.join(', ')}`);
+    }
+    if (val.weaknesses?.length > 0) {
+      console.log(`   ⚠️  Weaknesses: ${val.weaknesses.join(', ')}`);
+    }
+  }
+  
+  console.log('='.repeat(80) + '\n');
+  
+  // Also log structured data for file logs
+  logger.info('Evaluation completed', logEntry);
+  
+  // Also save to dedicated evaluation log
+  const evaluationLogger = winston.createLogger({
+    format: winston.format.json(),
+    transports: [
+      new winston.transports.File({
+        filename: `${CONFIG.logging.dir}/evaluations.jsonl`
+      })
+    ]
+  });
+  
+  evaluationLogger.info(logEntry);
+}
+
+export function logRpcCall(callData) {
+  logger.info('RPC call', {
+    type: 'rpc',
+    timestamp: new Date().toISOString(),
+    ...callData
+  });
+}
+
+export function logConnection(connectionData) {
+  logger.info('Connection event', {
+    type: 'connection',
+    timestamp: new Date().toISOString(),
+    ...connectionData
+  });
+}
+
+export default logger;
\ No newline at end of file
diff --git a/eval-server/src/rpc-client.js b/eval-server/src/rpc-client.js
new file mode 100644
index 00000000000..8de13cac81b
--- /dev/null
+++ b/eval-server/src/rpc-client.js
@@ -0,0 +1,122 @@
+import { v4 as uuidv4 } from 'uuid';
+import { CONFIG } from './config.js';
+import { logRpcCall } from './logger.js';
+
+export class RpcClient {
+  constructor() {
+    this.pendingRequests = new Map();
+  }
+
+  async callMethod(ws, method, params, timeout = CONFIG.rpc.timeout) {
+    return new Promise((resolve, reject) => {
+      const id = uuidv4();
+      const request = {
+        jsonrpc: '2.0',
+        method,
+        params,
+        id
+      };
+
+      // Set up timeout
+      const timeoutId = setTimeout(() => {
+        this.pendingRequests.delete(id);
+        logRpcCall({
+          id,
+          method,
+          params,
+          status: 'timeout',
+          error: 'Request timeout'
+        });
+        reject(new Error(`RPC call timeout after ${timeout}ms`));
+      }, timeout);
+
+      // Store the request for correlation
+      this.pendingRequests.set(id, {
+        resolve,
+        reject,
+        timeoutId,
+        method,
+        params,
+        timestamp: Date.now()
+      });
+
+      // Send the request
+      try {
+        ws.send(JSON.stringify(request));
+        logRpcCall({
+          id,
+          method,
+          params,
+          status: 'sent'
+        });
+      } catch (error) {
+        this.pendingRequests.delete(id);
+        clearTimeout(timeoutId);
+        logRpcCall({
+          id,
+          method,
+          params,
+          status: 'error',
+          error: error.message
+        });
+        reject(error);
+      }
+    });
+  }
+
+  handleResponse(message) {
+    try {
+      const response = JSON.parse(message);
+      
+      // Check if it's a valid JSON-RPC response
+      if (response.jsonrpc !== '2.0' || !response.id) {
+        return false;
+      }
+
+      const pendingRequest = this.pendingRequests.get(response.id);
+      if (!pendingRequest) {
+        return false;
+      }
+
+      // Clean up
+      this.pendingRequests.delete(response.id);
+      clearTimeout(pendingRequest.timeoutId);
+
+      // Handle response
+      if (response.error) {
+        logRpcCall({
+          id: response.id,
+          method: pendingRequest.method,
+          params: pendingRequest.params,
+          status: 'error',
+          error: response.error,
+          duration: Date.now() - pendingRequest.timestamp
+        });
+        pendingRequest.reject(new Error(response.error.message || 'RPC error'));
+      } else {
+        logRpcCall({
+          id: response.id,
+          method: pendingRequest.method,
+          params: pendingRequest.params,
+          status: 'success',
+          result: response.result,
+          duration: Date.now() - pendingRequest.timestamp
+        });
+        pendingRequest.resolve(response.result);
+      }
+
+      return true;
+    } catch (error) {
+      return false;
+    }
+  }
+
+  cleanup() {
+    // Cleanup any pending requests
+    for (const [id, request] of this.pendingRequests) {
+      clearTimeout(request.timeoutId);
+      request.reject(new Error('Connection closed'));
+    }
+    this.pendingRequests.clear();
+  }
+}
\ No newline at end of file
diff --git a/eval-server/src/server.js b/eval-server/src/server.js
new file mode 100644
index 00000000000..f15f269242c
--- /dev/null
+++ b/eval-server/src/server.js
@@ -0,0 +1,635 @@
+import { WebSocketServer } from 'ws';
+import { v4 as uuidv4 } from 'uuid';
+import { CONFIG, validateConfig } from './config.js';
+import { RpcClient } from './rpc-client.js';
+import { LLMEvaluator } from './evaluator.js';
+import { logConnection, logEvaluation } from './logger.js';
+import logger from './logger.js';
+import { ClientManager } from './client-manager.js';
+import { APIServer } from './api-server.js';
+
+class EvaluationServer {
+  constructor() {
+    this.connectedClients = new Map();
+    this.rpcClient = new RpcClient();
+    this.evaluator = new LLMEvaluator();
+    this.evaluationQueue = [];
+    this.activeEvaluations = 0;
+    this.clientManager = new ClientManager('./clients', './evals');
+    this.apiServer = new APIServer(this);
+  }
+
+  start() {
+    // Validate configuration
+    const configErrors = validateConfig();
+    if (configErrors.length > 0) {
+      logger.error('Configuration errors:', configErrors);
+      process.exit(1);
+    }
+
+    // Create WebSocket server
+    this.wss = new WebSocketServer({
+      port: CONFIG.server.port,
+      host: CONFIG.server.host
+    });
+
+    this.wss.on('connection', this.handleConnection.bind(this));
+    this.wss.on('error', (error) => {
+      logger.error('WebSocket server error', { error: error.message });
+    });
+
+    logger.info(`Evaluation server started on ws://${CONFIG.server.host}:${CONFIG.server.port}`);
+    
+    // Start API server
+    this.apiServer.start();
+    
+    this.startEvaluationProcessor();
+  }
+
+  handleConnection(ws, request) {
+    const connectionId = uuidv4(); // Temporary ID until registration
+    const connection = {
+      id: connectionId,
+      ws,
+      rpcClient: new RpcClient(),
+      connectedAt: new Date().toISOString(),
+      remoteAddress: request.socket.remoteAddress,
+      registered: false,
+      clientId: null
+    };
+
+    // Store temporarily with connection ID
+    this.connectedClients.set(connectionId, connection);
+    
+    logConnection({
+      event: 'connected',
+      connectionId,
+      remoteAddress: connection.remoteAddress,
+      totalConnections: this.connectedClients.size
+    });
+
+    ws.on('message', (message) => {
+      this.handleMessage(connection, message);
+    });
+
+    ws.on('close', () => {
+      this.handleDisconnection(connection);
+    });
+
+    ws.on('error', (error) => {
+      logger.error('WebSocket connection error', {
+        connectionId: connection.id,
+        clientId: connection.clientId,
+        error: error.message
+      });
+    });
+
+    // Send welcome message
+    this.sendMessage(ws, {
+      type: 'welcome',
+      serverId: 'server-001',
+      version: '1.0.0',
+      timestamp: new Date().toISOString()
+    });
+  }
+
+  async handleMessage(connection, message) {
+    try {
+      // Parse message first
+      const data = JSON.parse(message);
+      
+      // Try to handle as RPC response first
+      if (data.jsonrpc === '2.0' && (data.result || data.error) && data.id) {
+        if (connection.rpcClient.handleResponse(message)) {
+          return;
+        }
+        // If RPC client couldn't handle it, log but don't treat as unknown
+        logger.debug('RPC response could not be handled', {
+          connectionId: connection.id,
+          clientId: connection.clientId,
+          id: data.id
+        });
+        return;
+      }
+
+      // Handle other message types
+      switch (data.type) {
+        case 'register':
+          await this.handleRegistration(connection, data);
+          break;
+        case 'ping':
+          this.sendMessage(connection.ws, { 
+            type: 'pong',
+            timestamp: new Date().toISOString()
+          });
+          break;
+        case 'ready':
+          if (!connection.registered) {
+            logger.warn('Received ready signal from unregistered client', {
+              connectionId: connection.id
+            });
+            return;
+          }
+          connection.ready = true;
+          logger.info('Client ready for evaluations', { 
+            clientId: connection.clientId 
+          });
+          // Don't automatically start evaluations - wait for manual trigger
+          // this.processClientEvaluations(connection.clientId);
+          break;
+        case 'status':
+          this.handleStatusUpdate(connection, data);
+          break;
+        case 'auth_verify':
+          this.handleAuthVerification(connection, data);
+          break;
+        default:
+          logger.warn('Unknown message type', { 
+            connectionId: connection.id,
+            clientId: connection.clientId, 
+            type: data.type,
+            messageKeys: Object.keys(data)
+          });
+      }
+    } catch (error) {
+      logger.warn('Failed to parse message', {
+        connectionId: connection.id,
+        error: error.message,
+        messageLength: message.length
+      });
+    }
+  }
+
+  async handleRegistration(connection, data) {
+    try {
+      const { clientId, secretKey, capabilities } = data;
+      
+      logger.info('Registration attempt', { 
+        clientId, 
+        hasSecretKey: !!secretKey,
+        secretKey: secretKey ? '[REDACTED]' : 'none'
+      });
+      
+      // Check if client exists (don't validate secret key yet - that happens later)
+      const validation = this.clientManager.validateClient(clientId, null, true);
+      if (!validation.valid) {
+        if (validation.reason === 'Client not found') {
+          // Auto-create new client configuration
+          try {
+            logger.info('Auto-creating new client configuration', { clientId });
+            await this.clientManager.createClientWithId(clientId, `DevTools Client ${clientId.substring(0, 8)}`, 'hello');
+            
+            // Send rejection for first-time registration to allow server to set secret key
+            this.sendMessage(connection.ws, {
+              type: 'registration_ack',
+              clientId,
+              status: 'rejected',
+              reason: 'New client created. Please reconnect to complete registration.',
+              newClient: true
+            });
+            logger.info('New client configuration created, requesting reconnection', { clientId });
+            return;
+          } catch (error) {
+            this.sendMessage(connection.ws, {
+              type: 'registration_ack',
+              clientId,
+              status: 'rejected',
+              reason: `Failed to create client configuration: ${error.message}`
+            });
+            logger.error('Failed to auto-create client', { clientId, error: error.message });
+            return;
+          }
+        } else {
+          this.sendMessage(connection.ws, {
+            type: 'registration_ack',
+            clientId,
+            status: 'rejected',
+            reason: validation.reason
+          });
+          logger.warn('Client registration rejected', {
+            clientId,
+            reason: validation.reason
+          });
+          return;
+        }
+      }
+      
+      // Get client info including the server's secret key for this client
+      const client = this.clientManager.getClient(clientId);
+      if (!client) {
+        this.sendMessage(connection.ws, {
+          type: 'registration_ack',
+          clientId,
+          status: 'rejected',
+          reason: 'Client configuration not found'
+        });
+        return;
+      }
+      
+      // Send server's secret key to client for verification
+      this.sendMessage(connection.ws, {
+        type: 'registration_ack',
+        clientId,
+        status: 'auth_required',
+        serverSecretKey: client.secretKey || '',
+        message: 'Please verify secret key'
+      });
+      
+      // Store connection info but don't register yet
+      connection.clientId = clientId;
+      connection.capabilities = capabilities;
+      connection.awaitingAuth = true;
+      
+      logger.info('Client registered successfully', {
+        clientId,
+        capabilities: capabilities?.tools?.join(', ')
+      });
+      
+    } catch (error) {
+      logger.error('Registration error', { error: error.message });
+      this.sendMessage(connection.ws, {
+        type: 'registration_ack',
+        clientId: data.clientId,
+        status: 'rejected',
+        reason: error.message
+      });
+    }
+  }
+
+  handleStatusUpdate(connection, data) {
+    if (!connection.registered) return;
+    
+    const { evaluationId, status, progress, message } = data;
+    
+    logger.info('Evaluation status update', {
+      clientId: connection.clientId,
+      evaluationId,
+      status,
+      progress,
+      message
+    });
+    
+    // Update evaluation status in client manager
+    this.clientManager.updateEvaluationStatus(
+      connection.clientId,
+      evaluationId,
+      status
+    );
+  }
+
+  handleAuthVerification(connection, data) {
+    if (!connection.awaitingAuth) {
+      logger.warn('Received auth verification from non-awaiting connection', {
+        connectionId: connection.id,
+        clientId: connection.clientId
+      });
+      return;
+    }
+
+    const { clientId, verified } = data;
+    
+    if (verified) {
+      // Authentication successful - complete registration (skip secret validation since already verified)
+      const result = this.clientManager.registerClient(clientId, '', connection.capabilities, true);
+      
+      connection.registered = true;
+      connection.awaitingAuth = false;
+      
+      // Move connection to use clientId as key
+      this.connectedClients.delete(connection.id);
+      this.connectedClients.set(clientId, connection);
+      
+      // Send final acknowledgment
+      this.sendMessage(connection.ws, {
+        type: 'registration_ack',
+        clientId,
+        status: 'accepted',
+        message: result.clientName ? `Welcome ${result.clientName}` : 'Client authenticated successfully',
+        evaluationsCount: result.evaluationsCount
+      });
+      
+      logger.info('Client authenticated and registered', { clientId });
+    } else {
+      // Authentication failed
+      this.sendMessage(connection.ws, {
+        type: 'registration_ack',
+        clientId,
+        status: 'rejected',
+        reason: 'Secret key verification failed'
+      });
+      
+      logger.warn('Client authentication failed', { clientId });
+      connection.ws.close(1008, 'Authentication failed');
+    }
+  }
+
+  handleDisconnection(connection) {
+    connection.rpcClient.cleanup();
+    
+    // Remove by connection ID or client ID
+    if (connection.registered && connection.clientId) {
+      this.connectedClients.delete(connection.clientId);
+    } else {
+      this.connectedClients.delete(connection.id);
+    }
+    
+    logConnection({
+      event: 'disconnected',
+      connectionId: connection.id,
+      clientId: connection.clientId,
+      totalConnections: this.connectedClients.size
+    });
+  }
+
+  sendMessage(ws, data) {
+    if (ws.readyState === ws.OPEN) {
+      ws.send(JSON.stringify(data));
+    }
+  }
+
+  async processClientEvaluations(clientId) {
+    const client = this.connectedClients.get(clientId);
+    if (!client || !client.ready) return;
+    
+    // Get next pending evaluation for this client
+    const evaluation = this.clientManager.getNextEvaluation(clientId);
+    if (!evaluation) {
+      logger.info('No pending evaluations for client', { clientId });
+      return;
+    }
+    
+    // Execute the evaluation
+    try {
+      await this.executeEvaluation(client, evaluation);
+      
+      // Process next evaluation after a delay
+      setTimeout(() => {
+        this.processClientEvaluations(clientId);
+      }, 1000);
+    } catch (error) {
+      logger.error('Failed to execute evaluation', {
+        clientId,
+        evaluationId: evaluation.id,
+        error: error.message
+      });
+    }
+  }
+
+  async executeEvaluation(client, evaluation) {
+    const startTime = Date.now();
+    const rpcId = `rpc-${Date.now()}-${Math.random().toString(36).substring(2, 9)}`;
+    
+    try {
+      logger.info('Starting evaluation', { 
+        clientId: client.clientId,
+        evaluationId: evaluation.id,
+        tool: evaluation.tool
+      });
+      
+      // Update status to running
+      this.clientManager.updateEvaluationStatus(
+        client.clientId,
+        evaluation.id,
+        'running'
+      );
+      
+      // Prepare RPC request
+      const rpcRequest = {
+        jsonrpc: '2.0',
+        method: 'evaluate',
+        params: {
+          evaluationId: evaluation.id,
+          name: evaluation.name,
+          url: evaluation.target?.url || evaluation.url,
+          tool: evaluation.tool,
+          input: evaluation.input,
+          timeout: evaluation.timeout || 30000,
+          metadata: {
+            tags: evaluation.metadata?.tags || [],
+            retries: evaluation.settings?.retry_policy?.max_retries || 0
+          }
+        },
+        id: rpcId
+      };
+      
+      // Send RPC request with proper timeout
+      const response = await client.rpcClient.callMethod(
+        client.ws,
+        'evaluate',
+        rpcRequest.params,
+        evaluation.timeout || 45000
+      );
+      
+      logger.info('Evaluation response received', {
+        clientId: client.clientId,
+        evaluationId: evaluation.id,
+        executionTime: response.executionTime
+      });
+      
+      // Validate response based on YAML configuration
+      let validationResult = null;
+      if (evaluation.validation) {
+        validationResult = await this.validateResponse(
+          response,
+          evaluation
+        );
+      }
+      
+      // Update evaluation status
+      this.clientManager.updateEvaluationStatus(
+        client.clientId,
+        evaluation.id,
+        'completed',
+        {
+          response,
+          validation: validationResult,
+          duration: Date.now() - startTime
+        }
+      );
+      
+      // Log evaluation
+      logEvaluation({
+        evaluationId: evaluation.id,
+        clientId: client.clientId,
+        name: evaluation.name,
+        tool: evaluation.tool,
+        response,
+        validation: validationResult,
+        timestamp: new Date().toISOString(),
+        duration: Date.now() - startTime
+      });
+      
+    } catch (error) {
+      logger.error('Evaluation failed', {
+        clientId: client.clientId,
+        evaluationId: evaluation.id,
+        error: error.message
+      });
+      
+      // Update status to failed
+      this.clientManager.updateEvaluationStatus(
+        client.clientId,
+        evaluation.id,
+        'failed',
+        {
+          error: error.message,
+          duration: Date.now() - startTime
+        }
+      );
+      
+      throw error;
+    }
+  }
+
+  async validateResponse(response, evaluation) {
+    const validation = evaluation.validation;
+    
+    if (validation.type === 'llm-judge' || validation.type === 'hybrid') {
+      const llmConfig = validation.llm_judge || validation.llm_judge;
+      
+      // Prepare prompt with criteria
+      const criteria = llmConfig.criteria || [];
+      const task = `${evaluation.name} - ${evaluation.description || ''}`;
+      
+      // Use LLM evaluator
+      const judgeResult = await this.evaluator.evaluate(
+        task,
+        JSON.stringify(response.output || response),
+        {
+          criteria,
+          model: llmConfig.model
+        }
+      );
+      
+      return {
+        type: 'llm-judge',
+        result: judgeResult,
+        passed: judgeResult.score >= 0.7 // Configurable threshold
+      };
+    }
+    
+    // Add other validation types as needed
+    return null;
+  }
+
+  async evaluateAllClients(task) {
+    const readyClients = Array.from(this.connectedClients.values())
+      .filter(client => client.ready);
+
+    if (readyClients.length === 0) {
+      throw new Error('No ready clients available');
+    }
+
+    logger.info(`Starting evaluation for ${readyClients.length} clients`, { task });
+
+    // If task looks like an evaluation ID, run that specific evaluation
+    if (task && task.includes('-')) {
+      const evaluationPromises = readyClients.map(async (client) => {
+        try {
+          // Find the specific evaluation by ID
+          const evaluation = this.clientManager.getClientEvaluations(client.clientId)
+            .find(e => e.id === task);
+          
+          if (!evaluation) {
+            logger.warn(`Evaluation '${task}' not found for client ${client.clientId}`);
+            return {
+              error: `Evaluation '${task}' not found`,
+              clientId: client.clientId
+            };
+          }
+
+          // Reset evaluation status to pending
+          this.clientManager.updateEvaluationStatus(client.clientId, evaluation.id, 'pending');
+          
+          // Execute the specific evaluation
+          await this.executeEvaluation(client, evaluation);
+          
+          return {
+            success: true,
+            clientId: client.clientId,
+            evaluationId: evaluation.id
+          };
+        } catch (error) {
+          return {
+            error: error.message,
+            clientId: client.clientId
+          };
+        }
+      });
+
+      const results = await Promise.all(evaluationPromises);
+      
+      logger.info('Specific evaluation completed', {
+        evaluationId: task,
+        totalClients: readyClients.length,
+        successfulEvaluations: results.filter(r => !r.error).length,
+        failedEvaluations: results.filter(r => r.error).length
+      });
+
+      return results;
+    }
+
+    // Otherwise, process all pending evaluations (original behavior)
+    const evaluationPromises = readyClients.map(client => 
+      this.processClientEvaluations(client.clientId).catch(error => ({
+        error: error.message,
+        clientId: client.clientId
+      }))
+    );
+
+    const results = await Promise.all(evaluationPromises);
+    
+    logger.info('Batch evaluation completed', {
+      totalClients: readyClients.length,
+      successfulEvaluations: results.filter(r => !r.error).length,
+      failedEvaluations: results.filter(r => r.error).length
+    });
+
+    return results;
+  }
+
+  startEvaluationProcessor() {
+    // This method can be extended to process evaluation queues
+    // For now, it's a placeholder for future batch processing functionality
+    logger.info('Evaluation processor started');
+  }
+
+  getStatus() {
+    return {
+      connectedClients: this.connectedClients.size,
+      readyClients: Array.from(this.connectedClients.values())
+        .filter(client => client.ready).length,
+      activeEvaluations: this.activeEvaluations
+    };
+  }
+
+  getClientManager() {
+    return this.clientManager;
+  }
+
+  stop() {
+    if (this.wss) {
+      this.wss.close();
+      logger.info('Evaluation server stopped');
+    }
+    
+    if (this.apiServer) {
+      this.apiServer.stop();
+    }
+  }
+}
+
+// Start the server if this file is run directly
+if (import.meta.url === `file://${process.argv[1]}`) {
+  const server = new EvaluationServer();
+  
+  process.on('SIGINT', () => {
+    logger.info('Received SIGINT, shutting down gracefully');
+    server.stop();
+    process.exit(0);
+  });
+
+  server.start();
+}
+
+export { EvaluationServer };
\ No newline at end of file
diff --git a/eval-server/templates/default-client.yaml b/eval-server/templates/default-client.yaml
new file mode 100644
index 00000000000..a74e0defebf
--- /dev/null
+++ b/eval-server/templates/default-client.yaml
@@ -0,0 +1,58 @@
+# Default client configuration template
+# This file is used as a template when creating new clients
+
+client:
+  id: "{CLIENT_ID}"
+  name: "{CLIENT_NAME}"
+  secret_key: "{SECRET_KEY}"  # Optional
+  description: "Auto-generated client configuration"
+
+settings:
+  max_concurrent_evaluations: 3
+  default_timeout: 30000
+  retry_policy:
+    max_retries: 2
+    backoff_multiplier: 2
+    initial_delay: 1000
+
+evaluations:
+  # Example evaluation - disabled by default
+  - id: "example-schema-extraction"
+    name: "Example Schema Extraction"
+    description: "A sample evaluation for schema extraction"
+    enabled: false
+    
+    target:
+      url: "https://example.com"
+      wait_for: "networkidle"
+      wait_timeout: 5000
+    
+    tool: "extract_schema_data"
+    timeout: 30000
+    
+    input:
+      schema:
+        type: "object"
+        properties:
+          title:
+            type: "string"
+            description: "Page title"
+          content:
+            type: "string"
+            description: "Main content"
+    
+    schedule:
+      type: "on_demand"
+    
+    validation:
+      type: "llm-judge"
+      llm_judge:
+        model: "gpt-4o-mini"
+        temperature: 0.3
+        criteria:
+          - "Title should be extracted correctly"
+          - "Content should be meaningful and not empty"
+    
+    metadata:
+      tags: ["example", "schema-extraction"]
+      priority: "normal"
\ No newline at end of file
diff --git a/front_end/panels/ai_chat/BUILD.gn b/front_end/panels/ai_chat/BUILD.gn
index 9abdfea6200..2443386510d 100644
--- a/front_end/panels/ai_chat/BUILD.gn
+++ b/front_end/panels/ai_chat/BUILD.gn
@@ -83,6 +83,10 @@ devtools_module("ai_chat") {
     "common/log.ts",
     "common/context.ts",
     "common/page.ts",
+    "common/WebSocketRPCClient.ts",
+    "common/EvaluationConfig.ts",
+    "evaluation/EvaluationProtocol.ts",
+    "evaluation/EvaluationAgent.ts",
     "tracing/TracingProvider.ts",
     "tracing/LangfuseProvider.ts",
     "tracing/TracingConfig.ts",
@@ -174,6 +178,10 @@ _ai_chat_sources = [
     "common/log.ts",
     "common/context.ts",
     "common/page.ts",
+    "common/WebSocketRPCClient.ts",
+    "common/EvaluationConfig.ts",
+    "evaluation/EvaluationProtocol.ts",
+    "evaluation/EvaluationAgent.ts",
     "tracing/TracingProvider.ts",
     "tracing/LangfuseProvider.ts",
     "tracing/TracingConfig.ts",
diff --git a/front_end/panels/ai_chat/LLM/LLMErrorHandler.ts b/front_end/panels/ai_chat/LLM/LLMErrorHandler.ts
index e45b4bb6e1f..5b9d926be9b 100644
--- a/front_end/panels/ai_chat/LLM/LLMErrorHandler.ts
+++ b/front_end/panels/ai_chat/LLM/LLMErrorHandler.ts
@@ -35,7 +35,7 @@ const ERROR_SPECIFIC_RETRY_CONFIGS: ErrorRetryConfig = {
     baseDelayMs: 60000, // 60 seconds for rate limits
     maxDelayMs: 300000, // Max 5 minutes
     backoffMultiplier: 1, // No exponential backoff for rate limits
-    jitterMs: 5000, // Small jitter to avoid thundering herd
+    jitterMs: 1000, // Small jitter to avoid thundering herd
   },
   
   [ErrorType.NETWORK_ERROR]: {
@@ -228,10 +228,7 @@ export class LLMRetryManager {
         const result = await operation();
         
         if (attempt > 1 && this.config.enableLogging) {
-          logger.info(`Operation succeeded on attempt ${attempt}`, {
-            context: options.context,
-            totalTime: Date.now() - startTime,
-          });
+          logger.info(`Operation succeeded on attempt ${attempt}${options.context ? ` (context: ${options.context})` : ''} - total time: ${Date.now() - startTime}ms`);
         }
         
         return result;
@@ -240,11 +237,7 @@ export class LLMRetryManager {
         const errorType = LLMErrorClassifier.classifyError(lastError);
         
         if (this.config.enableLogging) {
-          logger.error(`Operation failed on attempt ${attempt}:`, {
-            error: lastError.message,
-            errorType,
-            context: options.context,
-          });
+          logger.error(`Operation failed on attempt ${attempt}: ${lastError instanceof Error ? lastError.message : String(lastError)} (type: ${errorType}${options.context ? `, context: ${options.context}` : ''})`);
         }
 
         // Check if we should retry this error type
diff --git a/front_end/panels/ai_chat/LLM/OpenAIProvider.ts b/front_end/panels/ai_chat/LLM/OpenAIProvider.ts
index f0e4f881160..9bd558b157a 100644
--- a/front_end/panels/ai_chat/LLM/OpenAIProvider.ts
+++ b/front_end/panels/ai_chat/LLM/OpenAIProvider.ts
@@ -301,7 +301,7 @@ export class OpenAIProvider extends LLMBaseProvider {
 
       if (!response.ok) {
         const errorData = await response.json();
-        logger.error('OpenAI API error:', errorData);
+        logger.error('OpenAI API error:', JSON.stringify(errorData));
         const error = new Error(`OpenAI API error: ${response.statusText} - ${errorData?.error?.message || 'Unknown error'}`);
         
         // Create tracing observation for API errors
@@ -319,7 +319,7 @@ export class OpenAIProvider extends LLMBaseProvider {
 
       return data;
     } catch (error) {
-      logger.error('OpenAI API request failed:', error);
+      logger.error('OpenAI API request failed:', error instanceof Error ? error.message : String(error));
       
       // Create tracing observation for network/fetch errors
       if (error instanceof Error) {
diff --git a/front_end/panels/ai_chat/common/EvaluationConfig.ts b/front_end/panels/ai_chat/common/EvaluationConfig.ts
new file mode 100644
index 00000000000..8c1e7e7aada
--- /dev/null
+++ b/front_end/panels/ai_chat/common/EvaluationConfig.ts
@@ -0,0 +1,258 @@
+// Copyright 2025 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+import { createLogger } from '../core/Logger.js';
+import { WebSocketRPCClient } from './WebSocketRPCClient.js';
+import { createAndConnectEvaluationAgent, getEvaluationAgent, disconnectEvaluationAgent } from '../evaluation/EvaluationAgent.js';
+
+const logger = createLogger('EvaluationConfig');
+
+// User-facing strings for evaluation configuration (plain English literals,
+// not yet routed through an i18n system).
+// TODO: Move to proper i18n system when evaluation UI is added to SettingsDialog
+// Note: `testingConnection` and `connectionFailed` are not referenced within
+// this file.
+const EvaluationStrings = {
+  testingConnection: 'Testing connection...',
+  connectionSuccessful: 'Connection successful',
+  connectionFailed: 'Connection failed',
+  unknownConnectionError: 'Unknown connection error',
+  evaluationNotEnabled: 'Evaluation is not enabled',
+  clientIdNotAvailable: 'Client ID not available',
+};
+
+/**
+ * Settings that control the connection to the evaluation service.
+ */
+export interface EvaluationConfiguration {
+  /** Whether evaluation is turned on; connecting is refused when false. */
+  enabled: boolean;
+  /** WebSocket URL of the evaluation server (the store defaults to ws://localhost:8080). */
+  endpoint: string;
+  /** Optional secret key passed along when connecting. */
+  secretKey?: string;
+  /** Identifier of this client; the store generates one when it is missing. */
+  clientId?: string;
+}
+
+/**
+ * Singleton holding the evaluation configuration.
+ *
+ * The configuration is loaded from and persisted to `localStorage`, a client
+ * ID is generated when none exists, and connect/disconnect requests are
+ * forwarded to the evaluation agent.
+ */
+class EvaluationConfigStore {
+  private static instance: EvaluationConfigStore;
+  private config: EvaluationConfiguration = {
+    enabled: false,
+    endpoint: 'ws://localhost:8080',
+    secretKey: '',
+    clientId: ''
+  };
+  // Never assigned a client within this class; kept so setConfig() can tear
+  // down a client should one be stored here.
+  private rpcClient: WebSocketRPCClient | null = null;
+
+  private constructor() {
+    this.loadFromLocalStorage();
+    this.ensureClientId();
+  }
+
+  /** Returns the shared store, creating it on first use. */
+  static getInstance(): EvaluationConfigStore {
+    if (!EvaluationConfigStore.instance) {
+      EvaluationConfigStore.instance = new EvaluationConfigStore();
+    }
+    return EvaluationConfigStore.instance;
+  }
+
+  /**
+   * Reads the persisted configuration. Missing entries fall back to the
+   * defaults; a storage failure is logged and leaves the current config as is.
+   */
+  private loadFromLocalStorage(): void {
+    try {
+      const enabled = localStorage.getItem('ai_chat_evaluation_enabled') === 'true';
+      const endpoint = localStorage.getItem('ai_chat_evaluation_endpoint') || 'ws://localhost:8080';
+      const secretKey = localStorage.getItem('ai_chat_evaluation_secret_key') || '';
+      const clientId = localStorage.getItem('ai_chat_evaluation_client_id') || '';
+
+      this.config = {
+        enabled,
+        endpoint,
+        secretKey,
+        clientId
+      };
+
+      logger.info('Loaded evaluation config from localStorage');
+    } catch (error) {
+      logger.warn('Failed to load evaluation config from localStorage:', error);
+    }
+  }
+
+  /** Returns a shallow copy of the current configuration. */
+  getConfig(): EvaluationConfiguration {
+    return { ...this.config };
+  }
+
+  /**
+   * Replaces the configuration and persists it to `localStorage`.
+   *
+   * @param newConfig - New settings; when `clientId` is empty the existing
+   *   client ID is kept (or a new one is generated).
+   */
+  setConfig(newConfig: EvaluationConfiguration): void {
+    // Preserve existing client ID if new config doesn't have one
+    const preservedClientId = newConfig.clientId || this.config.clientId;
+
+    this.config = { ...newConfig, clientId: preservedClientId };
+
+    // Ensure we have a client ID (generate if needed)
+    this.ensureClientId();
+
+    logger.info('Evaluation configuration updated', {
+      enabled: this.config.enabled,
+      endpoint: this.config.endpoint,
+      clientId: this.config.clientId
+    });
+
+    // Save to localStorage for persistence
+    try {
+      localStorage.setItem('ai_chat_evaluation_enabled', String(this.config.enabled));
+      localStorage.setItem('ai_chat_evaluation_endpoint', this.config.endpoint);
+      localStorage.setItem('ai_chat_evaluation_secret_key', this.config.secretKey || '');
+      localStorage.setItem('ai_chat_evaluation_client_id', this.config.clientId || '');
+    } catch (error) {
+      logger.warn('Failed to save evaluation config to localStorage:', error);
+    }
+
+    // Disconnect existing client if configuration changed
+    if (this.rpcClient) {
+      this.rpcClient.disconnect();
+      this.rpcClient = null;
+    }
+  }
+
+  /** Generates and persists a client ID when the config has none. */
+  private ensureClientId(): void {
+    if (!this.config.clientId) {
+      // Generate a unique client ID for this installation
+      const clientId = this.generateUUID();
+      this.config.clientId = clientId;
+
+      try {
+        localStorage.setItem('ai_chat_evaluation_client_id', clientId);
+        logger.info('Generated and saved new client ID:', clientId);
+      } catch (error) {
+        logger.warn('Failed to save client ID to localStorage:', error);
+      }
+    } else {
+      logger.debug('Using existing client ID:', this.config.clientId);
+    }
+  }
+
+  /** Builds a UUID-v4-shaped string from `Math.random()` (not cryptographically strong). */
+  private generateUUID(): string {
+    return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
+      const r = Math.random() * 16 | 0;
+      // 'x' takes any hex digit; 'y' is forced into the 8..b range.
+      const v = c === 'x' ? r : (r & 0x3 | 0x8);
+      return v.toString(16);
+    });
+  }
+
+  /** Whether evaluation is enabled in the current configuration. */
+  isEnabled(): boolean {
+    return this.config.enabled;
+  }
+
+  /**
+   * Connects the evaluation agent using the current configuration. Returns
+   * immediately when an agent is already connected.
+   *
+   * @throws Error when evaluation is disabled or no client ID is available;
+   *   errors from creating/connecting the agent propagate.
+   */
+  async connect(): Promise<void> {
+    if (!this.config.enabled) {
+      throw new Error(EvaluationStrings.evaluationNotEnabled);
+    }
+
+    // Ensure client ID exists
+    this.ensureClientId();
+
+    if (!this.config.clientId) {
+      throw new Error(EvaluationStrings.clientIdNotAvailable);
+    }
+
+    // Check if already connected
+    const existingAgent = getEvaluationAgent();
+    if (existingAgent && existingAgent.isConnected()) {
+      logger.info('Already connected to evaluation service');
+      return;
+    }
+
+    // Create and connect evaluation agent
+    await createAndConnectEvaluationAgent(
+      this.config.clientId,
+      this.config.endpoint,
+      this.config.secretKey
+    );
+
+    logger.info('Connected to evaluation service with client ID:', this.config.clientId);
+  }
+
+  /** Disconnects the evaluation agent. */
+  disconnect(): void {
+    disconnectEvaluationAgent();
+    logger.info('Disconnected from evaluation service');
+  }
+
+  /** Returns the configured client ID, if any. */
+  getClientId(): string | undefined {
+    return this.config.clientId;
+  }
+
+  /** Whether an evaluation agent exists and reports being connected. */
+  isConnected(): boolean {
+    const agent = getEvaluationAgent();
+    return agent ? agent.isConnected() : false;
+  }
+
+  /**
+   * Opens a short-lived WebSocket connection to the configured endpoint to
+   * check that it is reachable.
+   *
+   * @returns `success: true` when the socket opened (a failing `ping` call is
+   *   tolerated), otherwise `success: false` with the error message.
+   */
+  async testConnection(): Promise<{ success: boolean; message: string }> {
+    let client: WebSocketRPCClient | null = null;
+    try {
+      client = new WebSocketRPCClient({
+        endpoint: this.config.endpoint,
+        secretKey: this.config.secretKey,
+        connectionTimeout: 5000,
+        // A one-off probe must not start reconnecting in the background.
+        reconnectAttempts: 0
+      });
+
+      await client.connect();
+
+      // Try to make a ping call to test the connection
+      try {
+        await client.call('ping', {}, 5000);
+      } catch {
+        // Ping might not be implemented, that's okay
+        logger.debug('Ping method not available, connection still valid');
+      }
+
+      return { success: true, message: EvaluationStrings.connectionSuccessful };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : EvaluationStrings.unknownConnectionError;
+      logger.error('Connection test failed:', error);
+      return { success: false, message };
+    } finally {
+      // Always release the probe socket and timers, including on failure.
+      client?.disconnect();
+    }
+  }
+}
+
+/** Returns a copy of the current evaluation configuration. */
+export function getEvaluationConfig(): EvaluationConfiguration {
+  const store = EvaluationConfigStore.getInstance();
+  return store.getConfig();
+}
+
+/** Replaces the evaluation configuration with `config`. */
+export function setEvaluationConfig(config: EvaluationConfiguration): void {
+  const store = EvaluationConfigStore.getInstance();
+  store.setConfig(config);
+}
+
+/** Whether evaluation is enabled in the current configuration. */
+export function isEvaluationEnabled(): boolean {
+  const store = EvaluationConfigStore.getInstance();
+  return store.isEnabled();
+}
+
+/** Connects to the evaluation service; rejects when connecting fails. */
+export async function connectToEvaluationService(): Promise<void> {
+  const store = EvaluationConfigStore.getInstance();
+  return store.connect();
+}
+
+/** Disconnects from the evaluation service. */
+export function disconnectFromEvaluationService(): void {
+  const store = EvaluationConfigStore.getInstance();
+  store.disconnect();
+}
+
+/** Returns the client ID used for the evaluation service, if any. */
+export function getEvaluationClientId(): string | undefined {
+  const store = EvaluationConfigStore.getInstance();
+  return store.getClientId();
+}
+
+/** Whether the evaluation agent is currently connected. */
+export function isEvaluationConnected(): boolean {
+  const store = EvaluationConfigStore.getInstance();
+  return store.isConnected();
+}
+
+/** Probes the configured endpoint and reports whether it is reachable. */
+export async function testEvaluationConnection(): Promise<{ success: boolean; message: string }> {
+  const store = EvaluationConfigStore.getInstance();
+  return store.testConnection();
+}
+
+// Console access: declare the helpers that get attached to `window`.
+declare global {
+  interface Window {
+    getEvaluationConfig?: typeof getEvaluationConfig;
+    setEvaluationConfig?: typeof setEvaluationConfig;
+    isEvaluationEnabled?: typeof isEvaluationEnabled;
+    connectToEvaluationService?: typeof connectToEvaluationService;
+    disconnectFromEvaluationService?: typeof disconnectFromEvaluationService;
+  }
+}
+
+// Attach the helpers whenever a global `window` exists. This runs
+// unconditionally at module load, not only in development builds.
+if (typeof window !== 'undefined') {
+  Object.assign(window, {
+    getEvaluationConfig,
+    setEvaluationConfig,
+    isEvaluationEnabled,
+    connectToEvaluationService,
+    disconnectFromEvaluationService,
+  });
+}
\ No newline at end of file
diff --git a/front_end/panels/ai_chat/common/WebSocketRPCClient.ts b/front_end/panels/ai_chat/common/WebSocketRPCClient.ts
new file mode 100644
index 00000000000..197e2d6a3c9
--- /dev/null
+++ b/front_end/panels/ai_chat/common/WebSocketRPCClient.ts
@@ -0,0 +1,290 @@
+// Copyright 2025 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+import { createLogger } from '../core/Logger.js';
+
+const logger = createLogger('WebSocketRPCClient');
+
+/** Request message sent to the server by `WebSocketRPCClient.call`. */
+export interface RPCRequest {
+  /** Identifier used to match the response to this request. */
+  id: string;
+  /** Name of the remote method to invoke. */
+  method: string;
+  /** Optional method arguments. */
+  params?: any;
+}
+
+/**
+ * Response to an `RPCRequest`. A message counts as a response when it has an
+ * `id` and a `result` or `error` property.
+ */
+export interface RPCResponse {
+  /** Identifier of the request this response answers. */
+  id: string;
+  /** Value the pending call resolves with when `error` is absent. */
+  result?: any;
+  /** When present, the pending call is rejected with this message and code. */
+  error?: {
+    code: number;
+    message: string;
+    data?: any;
+  };
+}
+
+/** Construction options for `WebSocketRPCClient`. */
+export interface WebSocketRPCClientOptions {
+  /** WebSocket URL to connect to. */
+  endpoint: string;
+  /** Optional secret key; stored by the client but not sent by it. */
+  secretKey?: string;
+  /** Maximum number of automatic reconnect attempts (default 3). */
+  reconnectAttempts?: number;
+  /** Base reconnect delay in ms, doubled on each attempt (default 1000). */
+  reconnectDelay?: number;
+  /** Time in ms to wait for the socket to open (default 5000). */
+  connectionTimeout?: number;
+}
+
+/**
+ * WebSocket client with request/response (RPC) matching, a small event
+ * emitter and automatic reconnection with exponential backoff.
+ *
+ * Events emitted: 'connected', 'disconnected', 'error', 'message' (any
+ * incoming message that is not an RPC response) and 'reconnect_failed'.
+ */
+export class WebSocketRPCClient {
+  private websocket: WebSocket | null = null;
+  private endpoint: string;
+  private secretKey?: string;
+  private reconnectAttempts: number;
+  private reconnectDelay: number;
+  private connectionTimeout: number;
+  private currentReconnectAttempt = 0;
+  private reconnectTimeoutId: number | null = null;
+  private pendingRequests = new Map<string, {
+    resolve: (value: any) => void;
+    reject: (error: Error) => void;
+    timeout: number;
+  }>();
+  private isConnecting = false;
+  private isConnected = false;
+  private eventListeners = new Map<string, Set<Function>>();
+
+  constructor(options: WebSocketRPCClientOptions) {
+    this.endpoint = options.endpoint;
+    this.secretKey = options.secretKey;
+    this.reconnectAttempts = options.reconnectAttempts ?? 3;
+    this.reconnectDelay = options.reconnectDelay ?? 1000;
+    this.connectionTimeout = options.connectionTimeout ?? 5000;
+  }
+
+  /**
+   * Opens the WebSocket connection.
+   *
+   * Resolves once the socket is open. Rejects when the socket reports an
+   * error or closes before opening, when construction of the socket throws,
+   * or when `connectionTimeout` elapses. A call made while already connecting
+   * or connected logs a warning and resolves immediately.
+   */
+  public async connect(): Promise<void> {
+    if (this.isConnecting || this.isConnected) {
+      logger.warn('Already connecting or connected');
+      return;
+    }
+
+    this.isConnecting = true;
+
+    return new Promise((resolve, reject) => {
+      let socket: WebSocket;
+      try {
+        socket = new WebSocket(this.endpoint);
+      } catch (error) {
+        this.isConnecting = false;
+        reject(error);
+        return;
+      }
+      this.websocket = socket;
+
+      const connectionTimer = setTimeout(() => {
+        // Abandon the attempt: detach the handlers and close the socket so a
+        // late 'open' cannot flip this client into the connected state.
+        socket.onopen = null;
+        socket.onmessage = null;
+        socket.onclose = null;
+        socket.onerror = null;
+        socket.close();
+        if (this.websocket === socket) {
+          this.websocket = null;
+          this.isConnecting = false;
+        }
+        reject(new Error('Connection timeout'));
+      }, this.connectionTimeout);
+
+      socket.onopen = () => {
+        clearTimeout(connectionTimer);
+        this.isConnecting = false;
+        this.isConnected = true;
+        this.currentReconnectAttempt = 0;
+        logger.info('WebSocket connected', { endpoint: this.endpoint, readyState: socket.readyState });
+        this.emit('connected');
+
+        // Note: Authentication is handled via the register message in the evaluation protocol
+
+        resolve();
+      };
+
+      socket.onmessage = (event) => {
+        logger.debug('Received WebSocket message:', event.data);
+        this.handleMessage(event);
+      };
+
+      socket.onclose = (event) => {
+        clearTimeout(connectionTimer);
+        if (this.websocket === socket) {
+          this.isConnecting = false;
+        }
+        this.isConnected = false;
+        logger.warn('WebSocket connection closed', {
+          code: event.code,
+          reason: event.reason,
+          wasClean: event.wasClean,
+          endpoint: this.endpoint
+        });
+        this.emit('disconnected');
+
+        // Settles connect() if the socket closed before it ever opened;
+        // a no-op when the promise has already been resolved or rejected.
+        reject(new Error('WebSocket closed before the connection was established'));
+
+        if (!event.wasClean && this.currentReconnectAttempt < this.reconnectAttempts) {
+          this.scheduleReconnect();
+        }
+      };
+
+      socket.onerror = (error) => {
+        clearTimeout(connectionTimer);
+        this.isConnecting = false;
+        const errorDetails = {
+          type: error.type,
+          readyState: socket.readyState,
+          url: this.endpoint,
+          timestamp: new Date().toISOString(),
+          message: 'WebSocket connection error'
+        };
+        logger.error('WebSocket error:', JSON.stringify(errorDetails));
+        this.emit('error', errorDetails);
+
+        // Reject unconditionally: if the connection had already opened the
+        // promise is settled and this call has no effect. Checking
+        // `isConnecting` here would never pass because it was just reset.
+        reject(new Error('WebSocket connection failed'));
+      };
+    });
+  }
+
+  /**
+   * Closes the connection, cancels any scheduled reconnect and rejects all
+   * pending RPC calls with 'Connection closed'.
+   */
+  public disconnect(): void {
+    if (this.reconnectTimeoutId) {
+      clearTimeout(this.reconnectTimeoutId);
+      this.reconnectTimeoutId = null;
+    }
+
+    if (this.websocket) {
+      this.websocket.close(1000, 'Manual disconnect');
+      this.websocket = null;
+    }
+
+    this.isConnected = false;
+    this.isConnecting = false;
+    this.currentReconnectAttempt = 0;
+
+    // Reject all pending requests
+    for (const request of this.pendingRequests.values()) {
+      clearTimeout(request.timeout);
+      request.reject(new Error('Connection closed'));
+    }
+    this.pendingRequests.clear();
+  }
+
+  /**
+   * Sends an RPC request and waits for the matching response.
+   *
+   * @param method - Remote method name.
+   * @param params - Optional method arguments.
+   * @param timeout - Time in ms to wait for the response (default 30000).
+   * @returns The `result` of the response.
+   * @throws Error when not connected, when sending fails, when the response
+   *   carries an `error`, or when the timeout elapses.
+   */
+  public async call(method: string, params?: any, timeout = 30000): Promise<any> {
+    const socket = this.websocket;
+    if (!this.isConnected || !socket) {
+      throw new Error('WebSocket not connected');
+    }
+
+    const id = this.generateRequestId();
+    const request: RPCRequest = { id, method, params };
+
+    return new Promise((resolve, reject) => {
+      const timeoutId = setTimeout(() => {
+        this.pendingRequests.delete(id);
+        reject(new Error(`RPC call timeout: ${method}`));
+      }, timeout);
+
+      this.pendingRequests.set(id, {
+        resolve,
+        reject,
+        timeout: timeoutId,
+      });
+
+      try {
+        socket.send(JSON.stringify(request));
+        logger.debug('Sent RPC request', { method, id });
+      } catch (error) {
+        this.pendingRequests.delete(id);
+        clearTimeout(timeoutId);
+        reject(error);
+      }
+    });
+  }
+
+  /**
+   * Sends an arbitrary message as JSON without expecting a response.
+   *
+   * @throws Error when not connected.
+   */
+  public send(message: any): void {
+    if (!this.isConnected || !this.websocket) {
+      throw new Error('WebSocket not connected');
+    }
+
+    this.websocket.send(JSON.stringify(message));
+    logger.debug('Sent message:', message);
+  }
+
+  /** Whether the client is connected and the socket is in the OPEN state. */
+  public isConnectionReady(): boolean {
+    return this.isConnected && this.websocket?.readyState === WebSocket.OPEN;
+  }
+
+  /** Registers `callback` for `event`. */
+  public on(event: string, callback: Function): void {
+    let listeners = this.eventListeners.get(event);
+    if (!listeners) {
+      listeners = new Set();
+      this.eventListeners.set(event, listeners);
+    }
+    listeners.add(callback);
+  }
+
+  /** Removes a previously registered `callback` for `event`. */
+  public off(event: string, callback: Function): void {
+    this.eventListeners.get(event)?.delete(callback);
+  }
+
+  /** Invokes every listener of `event`; listener exceptions are logged, not propagated. */
+  private emit(event: string, data?: any): void {
+    const listeners = this.eventListeners.get(event);
+    if (listeners) {
+      for (const callback of listeners) {
+        try {
+          callback(data);
+        } catch (error) {
+          logger.error('Error in event listener:', error);
+        }
+      }
+    }
+  }
+
+  /**
+   * Parses an incoming message. RPC responses settle the matching pending
+   * call; everything else is re-emitted as a 'message' event. Unparseable
+   * data is logged and dropped.
+   */
+  private handleMessage(event: MessageEvent): void {
+    try {
+      const message = JSON.parse(event.data);
+      logger.debug('Received WebSocket message', { type: message.type, id: message.id });
+
+      // Check if this is an RPC response (has id and either result or error).
+      // hasOwnProperty is taken from Object.prototype so that a payload
+      // defining its own `hasOwnProperty` key cannot break the check.
+      const hasOwn = (key: string): boolean => Object.prototype.hasOwnProperty.call(message, key);
+      if (message.id && (hasOwn('result') || hasOwn('error'))) {
+        const response: RPCResponse = message;
+
+        const pendingRequest = this.pendingRequests.get(response.id);
+        if (!pendingRequest) {
+          logger.warn('Received response for unknown request ID:', response.id);
+          return;
+        }
+
+        this.pendingRequests.delete(response.id);
+        clearTimeout(pendingRequest.timeout);
+
+        if (response.error) {
+          pendingRequest.reject(new Error(`RPC Error: ${response.error.message} (Code: ${response.error.code})`));
+        } else {
+          pendingRequest.resolve(response.result);
+        }
+      } else {
+        // This is a general WebSocket message (like welcome, evaluation requests, etc.)
+        this.emit('message', message);
+      }
+
+    } catch (error) {
+      logger.error('Failed to parse WebSocket message:', error);
+    }
+  }
+
+  /**
+   * Schedules the next reconnect attempt with exponential backoff
+   * (`reconnectDelay * 2^(attempt - 1)`). Does nothing when an attempt is
+   * already scheduled. Emits 'reconnect_failed' once the attempts are used up.
+   */
+  private scheduleReconnect(): void {
+    if (this.reconnectTimeoutId) {
+      return;
+    }
+
+    this.currentReconnectAttempt++;
+    const delay = this.reconnectDelay * Math.pow(2, this.currentReconnectAttempt - 1);
+
+    logger.info(`Scheduling reconnect attempt ${this.currentReconnectAttempt}/${this.reconnectAttempts} in ${delay}ms`);
+
+    this.reconnectTimeoutId = setTimeout(() => {
+      this.reconnectTimeoutId = null;
+      this.connect().catch((error) => {
+        logger.error('Reconnect failed:', error);
+        if (this.currentReconnectAttempt < this.reconnectAttempts) {
+          this.scheduleReconnect();
+        } else {
+          logger.error('Max reconnect attempts reached');
+          this.emit('reconnect_failed');
+        }
+      });
+    }, delay);
+  }
+
+  /** Builds a request ID from the current time and a random suffix. */
+  private generateRequestId(): string {
+    return `rpc_${Date.now()}_${Math.random().toString(36).substring(2, 9)}`;
+  }
+}
\ No newline at end of file
diff --git a/front_end/panels/ai_chat/evaluation/EvaluationAgent.ts b/front_end/panels/ai_chat/evaluation/EvaluationAgent.ts
new file mode 100644
index 00000000000..768fbf0cceb
--- /dev/null
+++ b/front_end/panels/ai_chat/evaluation/EvaluationAgent.ts
@@ -0,0 +1,674 @@
+// Copyright 2025 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+import { WebSocketRPCClient } from '../common/WebSocketRPCClient.js';
+import { getEvaluationConfig, getEvaluationClientId } from '../common/EvaluationConfig.js';
+import { ToolRegistry } from '../agent_framework/ConfigurableAgentTool.js';
+import { AgentService } from '../core/AgentService.js';
+import { createLogger } from '../core/Logger.js';
+import { createTracingProvider, withTracingContext, isTracingEnabled, getTracingConfig } from '../tracing/TracingConfig.js';
+import type { TracingProvider, TracingContext } from '../tracing/TracingProvider.js';
+import {
+  RegisterMessage,
+  ReadyMessage,
+  StatusMessage,
+  WelcomeMessage,
+  RegistrationAckMessage,
+  AuthVerifyMessage,
+  EvaluationRequest,
+  EvaluationSuccessResponse,
+  EvaluationErrorResponse,
+  ErrorCodes,
+  isWelcomeMessage,
+  isRegistrationAckMessage,
+  isEvaluationRequest,
+  isPongMessage,
+  createRegisterMessage,
+  createReadyMessage,
+  createAuthVerifyMessage,
+  createStatusMessage,
+  createSuccessResponse,
+  createErrorResponse
+} from './EvaluationProtocol.js';
+
+const logger = createLogger('EvaluationAgent');
+
+export interface EvaluationAgentOptions { // constructor options for EvaluationAgent
+  clientId: string; // identifier sent to the server in the register message
+  endpoint: string; // passed to WebSocketRPCClient as its endpoint
+  secretKey?: string; // optional; forwarded to WebSocketRPCClient
+}
+
+export class EvaluationAgent {
+  private client: WebSocketRPCClient | null = null;
+  private clientId: string;
+  private endpoint: string;
+  private secretKey?: string;
+  private registered = false;
+  private ready = false;
+  private activeEvaluations = new Map<string, any>();
+  private heartbeatInterval: number | null = null;
+  private authPromise: Promise<void> | null = null;
+  private authResolve: ((value?: void) => void) | null = null;
+  private authReject: ((reason?: any) => void) | null = null;
+  private tracingProvider: TracingProvider;
+
+  constructor(options: EvaluationAgentOptions) {
+    const { clientId, endpoint, secretKey } = options;
+    this.clientId = clientId;
+    this.endpoint = endpoint;
+    this.secretKey = secretKey;
+    this.tracingProvider = createTracingProvider();
+
+    // Record which tracing backend was selected so evaluation traces can be diagnosed later.
+    logger.info('EvaluationAgent created with tracing provider', {
+      clientId, providerType: this.tracingProvider.constructor.name,
+      tracingEnabled: isTracingEnabled(), tracingConfig: getTracingConfig()
+    });
+  }
+
+  /**
+   * Opens the WebSocket and resolves once the server accepts the registration; no-op when
+   * already registered. Rejects if the socket cannot open or the registration is rejected.
+   */
+  public async connect(): Promise<void> {
+    if (this.client && this.client.isConnectionReady() && this.registered) {
+      logger.warn('Already connected and authenticated');
+      return;
+    }
+    // A previous, not-yet-registered client may still hold a socket and its event handlers
+    // (which call back into this agent); release it before replacing it.
+    if (this.client) {
+      this.disconnect();
+    }
+    logger.info('Connecting to evaluation server', {
+      endpoint: this.endpoint,
+      clientId: this.clientId
+    });
+    // Settled by handleRegistrationAck / handleAuthRequest.
+    this.authPromise = new Promise((resolve, reject) => {
+      this.authResolve = resolve;
+      this.authReject = reject;
+    });
+    this.client = new WebSocketRPCClient({
+      endpoint: this.endpoint,
+      secretKey: this.secretKey,
+      reconnectAttempts: 5,
+      reconnectDelay: 2000
+    });
+    this.setupEventHandlers();
+    await this.client.connect();
+    await this.authPromise;
+  }
+
+  /**
+   * Stops the heartbeat, closes the socket, resets state and rejects a still-pending connect().
+   */
+  public disconnect(): void {
+    this.stopHeartbeat();
+    if (this.client) {
+      this.client.disconnect();
+      this.client = null;
+    }
+    this.registered = false;
+    this.ready = false;
+    this.activeEvaluations.clear();
+    this.authPromise?.catch(() => undefined); // the waiter may already be gone; avoid an unhandled rejection
+    this.authReject?.(new Error('Disconnected before registration completed'));
+    this.authResolve = this.authReject = this.authPromise = null;
+    logger.info('Disconnected from evaluation server');
+  }
+
+  public isConnected(): boolean { // true only when the socket is ready AND registration was accepted
+    return Boolean(this.client?.isConnectionReady() && this.registered);
+  }
+
+  public isRegistered(): boolean { // set when the server acks registration with status 'accepted'; cleared on disconnect
+    return this.registered;
+  }
+
+  public isReady(): boolean { // set once the ready message has been sent; cleared on disconnect
+    return this.ready;
+  }
+
+  // Subscribes to the RPC client's lifecycle events; does nothing when no client exists.
+  private setupEventHandlers(): void {
+    const client = this.client;
+    if (!client) return;
+    client.on('connected', () => {
+      logger.info('WebSocket connected, waiting for welcome message');
+    });
+    // Losing the socket invalidates registration, so drop the flags and stop pinging.
+    client.on('disconnected', () => {
+      logger.info('WebSocket disconnected');
+      this.registered = false;
+      this.ready = false;
+      this.stopHeartbeat();
+    });
+    client.on('message', (data: any) => {
+      this.handleMessage(data);
+    });
+    client.on('error', (error: any) => {
+      const detail = typeof error === 'object' ? JSON.stringify(error) : error;
+      logger.error('WebSocket error:', detail);
+    });
+  }
+
+  // Dispatches an incoming message by type; handler failures are logged, never rethrown.
+  private async handleMessage(message: any): Promise<void> {
+    try {
+      if (isWelcomeMessage(message)) {
+        logger.info('Received welcome message from server', { serverId: message.serverId, version: message.version });
+        await this.register();
+        return;
+      }
+      if (isRegistrationAckMessage(message)) {
+        this.handleRegistrationAck(message);
+        return;
+      }
+      if (isEvaluationRequest(message)) {
+        await this.handleEvaluationRequest(message);
+        return;
+      }
+      if (isPongMessage(message)) {
+        logger.debug('Received pong');
+        return;
+      }
+      logger.warn('Unknown message type:', message);
+    } catch (error) {
+      logger.error('Error handling message:', error instanceof Error ? error.message : String(error));
+    }
+  }
+
+  // Announces this client and its capabilities to the server; no-op without a client.
+  private async register(): Promise<void> {
+    const client = this.client;
+    if (!client) {
+      return;
+    }
+
+    // No tool names are advertised at the moment, so the list is sent empty.
+    const tools: string[] = [];
+    const capabilities = { tools, maxConcurrency: 3, version: '1.0.0' };
+
+    // The secret key is deliberately omitted: the server sends its own key and the
+    // client verifies it (see the 'auth_required' registration status).
+    const registerMessage = createRegisterMessage(this.clientId, capabilities);
+
+    logger.info('Registering with server', {
+      clientId: this.clientId,
+      tools: tools.join(', ')
+    });
+
+    client.send(registerMessage);
+  }
+
+  /**
+   * Handles the registration verdict: accepted → ready, auth_required → key check, else rejected.
+   */
+  private handleRegistrationAck(message: RegistrationAckMessage): void {
+    if (message.status === 'accepted') {
+      logger.info('Registration accepted', {
+        evaluationsCount: message.evaluationsCount
+      });
+      this.registered = true;
+      this.sendReady();
+      this.startHeartbeat();
+      if (this.authResolve) {
+        this.authResolve();
+        this.authResolve = null;
+        this.authReject = null;
+      }
+      return;
+    }
+    if (message.status === 'auth_required') {
+      logger.info('Server requesting authentication verification');
+      // handleAuthRequest is async; surface its failures instead of leaving a floating promise.
+      this.handleAuthRequest(message).catch(error => logger.error('Authentication handling failed:', error));
+      return;
+    }
+    if (message.newClient) {
+      logger.info('New client created, will retry connection', {
+        reason: message.reason
+      });
+      // The server has just created this client's config; register again after a short delay.
+      setTimeout(() => {
+        if (this.client) {
+          this.register().catch(error => logger.error('Registration retry failed:', error));
+        }
+      }, 1000);
+      return;
+    }
+    logger.error('Registration rejected', {
+      reason: message.reason
+    });
+    if (this.authReject) {
+      this.authReject(new Error(`Registration rejected: ${message.reason}`));
+      this.authResolve = null;
+      this.authReject = null;
+    }
+    this.disconnect();
+  }
+
+  /**
+   * Verifies the secret key the server presented against the locally configured one and
+   * reports the outcome via an auth_verify message. Every failure path rejects the pending
+   * connect() promise so callers are never left waiting.
+   */
+  private async handleAuthRequest(message: RegistrationAckMessage): Promise<void> {
+    const failAuth = (reason: string): void => {
+      this.authReject?.(new Error(reason));
+      this.authResolve = null;
+      this.authReject = null;
+    };
+    if (!message.serverSecretKey) {
+      logger.error('Server did not provide secret key for verification');
+      failAuth('Server did not provide secret key for verification');
+      this.disconnect();
+      return;
+    }
+    // Get the client's configured secret key from EvaluationConfig
+    const clientSecretKey = getEvaluationConfig().secretKey || '';
+    // Verify if the server's secret key matches the client's configured key
+    const verified = clientSecretKey === message.serverSecretKey;
+    logger.info('Verifying secret key', {
+      hasClientKey: !!clientSecretKey,
+      hasServerKey: !!message.serverSecretKey,
+      verified
+    });
+    // Send verification response
+    this.client?.send(createAuthVerifyMessage(message.clientId, verified));
+    if (!verified) {
+      logger.error('Secret key verification failed - keys do not match');
+      // Reject auth promise immediately since we know auth will fail
+      failAuth('Secret key verification failed - keys do not match');
+    }
+  }
+
+  // Tells the server this client can take evaluations; only sent once registered.
+  private sendReady(): void {
+    if (!(this.client && this.registered)) {
+      return;
+    }
+    this.client.send(createReadyMessage());
+    this.ready = true;
+    logger.info('Sent ready signal to server');
+  }
+
+  private async handleEvaluationRequest(request: EvaluationRequest): Promise<void> { // runs one evaluation request and replies with a JSON-RPC success or error response
+    const { params, id } = request;
+    const startTime = Date.now();
+
+    logger.info('Received evaluation request', {
+      evaluationId: params.evaluationId,
+      tool: params.tool,
+      url: params.url
+    });
+
+    // Track active evaluation
+    this.activeEvaluations.set(params.evaluationId, {
+      startTime,
+      tool: params.tool
+    });
+
+    // Create a trace for this evaluation
+    const traceId = `eval-${params.evaluationId}-${Date.now()}`;
+    const sessionId = `eval-session-${Date.now()}`;
+    const tracingContext: TracingContext = { 
+      traceId, 
+      sessionId,
+      parentObservationId: undefined 
+    };
+    
+    try {
+      // Initialize tracing provider if not already done
+      await this.tracingProvider.initialize();
+      
+      // Create session for this evaluation
+      await this.tracingProvider.createSession(sessionId, {
+        type: 'evaluation',
+        source: 'evaluation-server',
+        evaluationId: params.evaluationId
+      });
+      
+      // Create root trace for the evaluation
+      await this.tracingProvider.createTrace(
+        traceId,
+        sessionId,
+        `Evaluation: ${params.tool}`,
+        params.input,
+        {
+          evaluationId: params.evaluationId,
+          tool: params.tool,
+          url: params.url,
+          source: 'evaluation-server'
+        },
+        'evaluation-agent',
+        ['evaluation', params.tool]
+      );
+      
+      logger.info('Trace created successfully for evaluation', {
+        traceId,
+        sessionId,
+        evaluationId: params.evaluationId
+      });
+    } catch (error) { // tracing is best-effort: the failure is logged and the evaluation proceeds
+      logger.warn('Failed to create trace:', error);
+    }
+
+    try {
+      // Send status update
+      this.sendStatus(params.evaluationId, 'running', 0.1, 'Starting evaluation...');
+
+      // Get the tool from registry
+      const tool = ToolRegistry.getRegisteredTool(params.tool);
+      if (!tool) {
+        throw new Error(`Tool not found: ${params.tool}`);
+      }
+
+      // Navigate to URL if needed
+      if (params.url) {
+        this.sendStatus(params.evaluationId, 'running', 0.2, 'Navigating to URL...');
+        
+        try {
+          // Use the correct navigate_url tool from registry
+          const navigateUrlTool = ToolRegistry.getRegisteredTool('navigate_url');
+          if (navigateUrlTool) {
+            logger.info('Navigating to URL using navigate_url tool', { url: params.url });
+            const navigationResult = await this.executeToolWithTimeout(
+              navigateUrlTool,
+              { 
+                url: params.url,
+                reasoning: `Navigate to ${params.url} for evaluation ${params.evaluationId}`
+              },
+              15000, // 15 second timeout for navigation
+              tracingContext,
+              'navigate_url'
+            );
+            logger.info('Navigation result', { navigationResult });
+            this.sendStatus(params.evaluationId, 'running', 0.3, 'Navigation completed successfully');
+          } else {
+            // Fallback: try action_agent for navigation
+            const actionTool = ToolRegistry.getRegisteredTool('action_agent');
+            if (actionTool) {
+              logger.info('Navigating to URL using action_agent fallback', { url: params.url });
+              const navigationResult = await this.executeToolWithTimeout(
+                actionTool,
+                { 
+                  task: `Navigate to ${params.url}`,
+                  reasoning: 'Navigation required for evaluation'
+                },
+                15000, // 15 second timeout for navigation
+                tracingContext,
+                'action_agent'
+              );
+              logger.info('Action agent navigation result', { navigationResult });
+              this.sendStatus(params.evaluationId, 'running', 0.3, 'Navigation completed via action agent');
+            } else {
+              logger.error('No navigation tools available in registry');
+              this.sendStatus(params.evaluationId, 'running', 0.3, 'ERROR: No navigation tools available');
+              throw new Error('Navigation failed: No navigation tools available'); // caught by the navigation catch just below, so the evaluation still continues
+            }
+          }
+        } catch (error) {
+          logger.error('Navigation failed', { url: params.url, error: error instanceof Error ? error.message : error });
+          this.sendStatus(params.evaluationId, 'running', 0.3, `Navigation failed: ${error instanceof Error ? error.message : 'Unknown error'} - continuing with current page`);
+          // Continue with evaluation even if navigation fails, but log the issue prominently
+        }
+      }
+
+      // Execute the tool
+      this.sendStatus(params.evaluationId, 'running', 0.5, `Executing ${params.tool}...`);
+      
+      const toolResult = await this.executeToolWithTimeout(
+        tool,
+        params.input,
+        params.timeout || 30000, // falls back to 30s when timeout is missing or 0
+        tracingContext,
+        params.tool
+      );
+
+      const executionTime = Date.now() - startTime; // includes trace setup and navigation, not only the tool call
+
+      // Send JSON-RPC success response
+      const rpcResponse = createSuccessResponse(
+        id,
+        toolResult,
+        executionTime,
+        [{
+          tool: params.tool,
+          timestamp: new Date().toISOString(),
+          duration: executionTime,
+          status: 'success'
+        }],
+        {
+          url: params.url,
+          evaluationId: params.evaluationId
+        }
+      );
+
+      if (this.client) {
+        this.client.send(rpcResponse);
+      }
+
+      this.sendStatus(params.evaluationId, 'completed', 1.0, 'Evaluation completed successfully');
+
+      // Update trace with success
+      try {
+        await this.tracingProvider.finalizeTrace(traceId, {
+          output: toolResult,
+          statusMessage: 'completed',
+          metadata: {
+            executionTime,
+            evaluationId: params.evaluationId
+          }
+        });
+      } catch (error) {
+        logger.warn('Failed to update trace:', error);
+      }
+
+      logger.info('Evaluation completed successfully', {
+        evaluationId: params.evaluationId,
+        executionTime
+      });
+
+    } catch (error) {
+      const executionTime = Date.now() - startTime;
+      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
+
+      logger.error(`Evaluation failed: ${errorMessage} (evaluationId: ${params.evaluationId})`);
+
+      // Send JSON-RPC error response
+      const rpcResponse = createErrorResponse(
+        id,
+        ErrorCodes.TOOL_EXECUTION_ERROR, // used for every failure of the try block above, including 'Tool not found' and timeouts
+        'Tool execution failed',
+        {
+          tool: params.tool,
+          error: errorMessage,
+          url: params.url,
+          timestamp: new Date().toISOString()
+        }
+      );
+
+      if (this.client) {
+        this.client.send(rpcResponse);
+      }
+
+      this.sendStatus(params.evaluationId, 'failed', 1.0, errorMessage);
+
+      // Update trace with error
+      try {
+        await this.tracingProvider.finalizeTrace(traceId, {
+          error: errorMessage,
+          statusMessage: 'failed',
+          metadata: {
+            executionTime,
+            evaluationId: params.evaluationId
+          }
+        });
+      } catch (updateError) {
+        logger.warn('Failed to update trace with error:', updateError);
+      }
+
+    } finally {
+      this.activeEvaluations.delete(params.evaluationId); // stop tracking on both the success and the failure path
+    }
+  }
+
+  /**
+   * Runs `tool.execute(input)`, rejecting with a timeout error when it has not settled within
+   * `timeout` ms (the tool itself is not cancelled). With a tracing context the run is recorded
+   * as a span on that trace; span bookkeeping failures are only logged.
+   */
+  private async executeToolWithTimeout(
+    tool: any,
+    input: any,
+    timeout: number,
+    tracingContext?: TracingContext,
+    toolName?: string
+  ): Promise<any> {
+    const spanId = `tool-exec-${Date.now()}-${Math.random().toString(36).substring(2, 11)}`;
+    const startTime = new Date();
+
+    if (tracingContext) {
+      try {
+        await this.tracingProvider.createObservation({
+          id: spanId,
+          name: `Tool: ${toolName || 'unknown'}`,
+          type: 'span',
+          startTime,
+          input,
+          metadata: { tool: toolName, timeout }
+        }, tracingContext.traceId);
+      } catch (error) {
+        logger.warn('Failed to create tool execution span:', error);
+      }
+    }
+
+    // Records the end of the span; fire-and-forget so tracing never delays the caller.
+    const closeSpan = (update: { output?: any; error?: string }, failureLog: string): void => {
+      if (tracingContext) {
+        this.tracingProvider.updateObservation(spanId, { endTime: new Date(), ...update })
+          .catch(err => logger.warn(failureLog, err));
+      }
+    };
+
+    return new Promise((resolve, reject) => {
+      let settled = false;
+      // First outcome wins: a result arriving after the timeout must not overwrite the span's timeout.
+      const finish = (settle: () => void): void => {
+        clearTimeout(timer);
+        if (!settled) {
+          settled = true;
+          settle();
+        }
+      };
+      const timer = setTimeout(() => finish(() => {
+        const timeoutMessage = `Tool execution timeout after ${timeout}ms`;
+        reject(new Error(timeoutMessage));
+        closeSpan({ error: timeoutMessage }, 'Failed to update span with timeout:');
+      }), timeout);
+
+      // Normalise a synchronous throw / non-promise return from execute() so the timer is always cleared.
+      let execution: Promise<any>;
+      try {
+        execution = Promise.resolve(
+          tracingContext ? withTracingContext(tracingContext, () => tool.execute(input)) : tool.execute(input)
+        );
+      } catch (error) {
+        execution = Promise.reject(error);
+      }
+      execution.then(
+        (result: any) => finish(() => {
+          resolve(result);
+          closeSpan({ output: result }, 'Failed to update span with result:');
+        }),
+        (error: unknown) => finish(() => {
+          reject(error);
+          closeSpan({ error: error instanceof Error ? error.message : String(error) }, 'Failed to update span with error:');
+        })
+      );
+    });
+  }
+
+  // Sends a progress update; dropped (not queued) while there is no client or it is not ready.
+  private sendStatus(
+    evaluationId: string,
+    status: 'running' | 'completed' | 'failed',
+    progress?: number,
+    message?: string
+  ): void {
+    if (this.client && this.ready) {
+      const update = createStatusMessage(
+        evaluationId,
+        status,
+        progress,
+        message
+      );
+      this.client.send(update);
+    }
+  }
+
+  // Starts (at most once) a 30-second ping loop; pings are skipped while not ready.
+  private startHeartbeat(): void {
+    if (this.heartbeatInterval) return;
+
+    const sendPing = (): void => {
+      if (!this.client || !this.ready) return;
+      const timestamp = new Date().toISOString();
+      this.client.send({ type: 'ping', timestamp });
+    };
+
+    this.heartbeatInterval = setInterval(sendPing, 30000); // Send ping every 30 seconds
+  }
+
+  // Cancels the ping loop if one is running.
+  private stopHeartbeat(): void {
+    if (!this.heartbeatInterval) return;
+    clearInterval(this.heartbeatInterval);
+    this.heartbeatInterval = null;
+  }
+
+  public getActiveEvaluationsCount(): number { // number of evaluation requests currently being handled
+    return this.activeEvaluations.size;
+  }
+
+  public getActiveEvaluations(): string[] { // evaluationIds of the requests currently being handled
+    return [...this.activeEvaluations.keys()];
+  }
+}
+
+// Global instance management
+let evaluationAgent: EvaluationAgent | null = null;
+
+export function getEvaluationAgent(): EvaluationAgent | null { // current module-level agent, or null when none exists
+  return evaluationAgent;
+}
+
+/**
+ * Replaces the module-level agent with a new one and resolves once it is connected and
+ * registered. Any previously created agent is disconnected first.
+ */
+export async function createAndConnectEvaluationAgent(
+  clientId: string,
+  endpoint: string,
+  secretKey?: string
+): Promise<EvaluationAgent> {
+  evaluationAgent?.disconnect();
+
+  const agent = new EvaluationAgent({ clientId, endpoint, secretKey });
+  evaluationAgent = agent;
+  await agent.connect();
+  // Return the local reference: the module-level variable may have been cleared or replaced
+  // (disconnectEvaluationAgent / a concurrent call) while connect() was pending.
+  return agent;
+}
+
+// Disconnects and forgets the module-level agent, if there is one.
+export function disconnectEvaluationAgent(): void {
+  if (!evaluationAgent) return;
+  evaluationAgent.disconnect();
+  evaluationAgent = null;
+}
\ No newline at end of file
diff --git a/front_end/panels/ai_chat/evaluation/EvaluationProtocol.ts b/front_end/panels/ai_chat/evaluation/EvaluationProtocol.ts
new file mode 100644
index 00000000000..c21b24e43e8
--- /dev/null
+++ b/front_end/panels/ai_chat/evaluation/EvaluationProtocol.ts
@@ -0,0 +1,247 @@
+// Copyright 2025 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+/**
+ * Protocol definitions for WebSocket evaluation communication
+ */
+
+// Client → Server Messages
+
+export interface RegisterMessage { // client's self-introduction; built by createRegisterMessage
+  type: 'register';
+  clientId: string;
+  secretKey?: string; // optional
+  capabilities: ClientCapabilities;
+}
+
+export interface ClientCapabilities { // carried inside RegisterMessage
+  tools: string[]; // presumably names of tools the client can run — confirm against the server
+  maxConcurrency: number;
+  version: string;
+}
+
+export interface ReadyMessage { // built by createReadyMessage
+  type: 'ready';
+  timestamp: string; // ISO-8601 string (Date#toISOString) when built by createReadyMessage
+}
+
+export interface StatusMessage { // progress report for one evaluation; built by createStatusMessage
+  type: 'status';
+  evaluationId: string;
+  status: 'running' | 'completed' | 'failed';
+  progress?: number;  // 0-1
+  message?: string; // optional free-text detail
+}
+
+export interface PingMessage { // presumably the heartbeat counterpart of PongMessage — confirm against the server
+  type: 'ping';
+  timestamp: string;
+}
+
+// Server → Client Messages
+
+export interface WelcomeMessage { // recognised by isWelcomeMessage via type === 'welcome'
+  type: 'welcome';
+  serverId: string;
+  version: string;
+  timestamp: string;
+}
+
+export interface RegistrationAckMessage { // recognised by isRegistrationAckMessage via type === 'registration_ack'
+  type: 'registration_ack';
+  clientId: string;
+  status: 'accepted' | 'rejected' | 'auth_required';
+  message?: string;
+  evaluationsCount?: number;
+  reason?: string;  // Only present if rejected
+  serverSecretKey?: string;  // Present when status is 'auth_required'
+  newClient?: boolean;  // Present when a new client was created
+}
+
+export interface AuthVerifyMessage { // NOTE(review): appears to be sent by the client, despite sitting under the server → client heading — confirm
+  type: 'auth_verify';
+  clientId: string;
+  verified: boolean; // outcome of the secret-key comparison
+}
+
+export interface PongMessage { // presumably the server's reply to a PingMessage — confirm against the server
+  type: 'pong';
+  timestamp: string;
+}
+
+// JSON-RPC Messages
+
+export interface EvaluationRequest { // JSON-RPC 2.0 request with method 'evaluate'
+  jsonrpc: '2.0';
+  method: 'evaluate';
+  params: EvaluationParams;
+  id: string; // request id; the response factories below take the id to answer
+}
+
+export interface EvaluationParams { // payload of an 'evaluate' request
+  evaluationId: string;
+  name: string;
+  url: string; // presumably the page the evaluation targets — confirm against the server
+  tool: string; // presumably the name of the tool to run — confirm against the server
+  input: any; // tool-specific input
+  timeout: number; // presumably milliseconds — confirm against the server
+  metadata: {
+    tags: string[];
+    retries: number;
+    priority?: 'low' | 'normal' | 'high';
+  };
+}
+
+export interface EvaluationSuccessResponse { // JSON-RPC 2.0 success response; built by createSuccessResponse
+  jsonrpc: '2.0';
+  result: {
+    status: 'success';
+    output: any; // tool-specific result
+    executionTime: number; // presumably milliseconds — confirm against the server
+    toolCalls?: ToolCall[];
+    metadata?: Record<string, any>;
+  };
+  id: string; // id of the request being answered
+}
+
+export interface ToolCall { // one entry of EvaluationSuccessResponse.result.toolCalls
+  tool: string;
+  timestamp: string;
+  duration: number; // presumably milliseconds — confirm
+  status: 'success' | 'failed';
+  error?: string; // presumably set only when status is 'failed' — confirm
+}
+
+export interface EvaluationErrorResponse { // JSON-RPC 2.0 error response; built by createErrorResponse
+  jsonrpc: '2.0';
+  error: {
+    code: number; // see ErrorCodes
+    message: string;
+    data?: { // optional structured details about the failure
+      tool: string;
+      error: string;
+      url?: string;
+      timestamp: string;
+      stackTrace?: string;
+    };
+  };
+  id: string; // id of the request being answered
+}
+
+// Error codes
+export const ErrorCodes = { // numeric `error.code` values for JSON-RPC error responses
+  PARSE_ERROR: -32700, // this and the next four are the codes pre-defined by the JSON-RPC 2.0 specification
+  INVALID_REQUEST: -32600,
+  METHOD_NOT_FOUND: -32601,
+  INVALID_PARAMS: -32602,
+  INTERNAL_ERROR: -32603,
+  
+  // Custom error codes (inside JSON-RPC 2.0's implementation-defined range -32000 to -32099)
+  TOOL_EXECUTION_ERROR: -32000,
+  TIMEOUT_ERROR: -32001,
+  AUTHENTICATION_ERROR: -32002,
+  RATE_LIMIT_EXCEEDED: -32003,
+  INVALID_TOOL: -32004,
+  RESOURCE_ERROR: -32005
+} as const;
+
+// Type guards
+
+export function isWelcomeMessage(msg: any): msg is WelcomeMessage { // checks only the `type` tag, not the other fields
+  return msg != null && msg.type === 'welcome';
+}
+
+export function isRegistrationAckMessage(msg: any): msg is RegistrationAckMessage { // checks only the `type` tag
+  return msg != null && msg.type === 'registration_ack';
+}
+
+export function isEvaluationRequest(msg: any): msg is EvaluationRequest { // checks `jsonrpc` and `method` only; `params`/`id` are not validated
+  return msg != null && msg.jsonrpc === '2.0' && msg.method === 'evaluate';
+}
+
+export function isPongMessage(msg: any): msg is PongMessage { // checks only the `type` tag
+  return msg != null && msg.type === 'pong';
+}
+
+// Helper functions
+
+// Builds the register message a client sends to introduce itself.
+// `secretKey` is left undefined when the caller does not supply one.
+export function createRegisterMessage(
+  clientId: string,
+  capabilities: ClientCapabilities,
+  secretKey?: string
+): RegisterMessage {
+  const message: RegisterMessage = {
+    type: 'register', clientId, secretKey, capabilities
+  };
+  return message;
+}
+
+// Builds a ready message stamped with the current time as an ISO-8601 string.
+export function createReadyMessage(): ReadyMessage {
+  const timestamp = new Date().toISOString();
+
+  return { type: 'ready', timestamp };
+}
+
+// Builds the message that reports the outcome of a secret-key verification.
+export function createAuthVerifyMessage(clientId: string, verified: boolean): AuthVerifyMessage {
+  const message: AuthVerifyMessage = {
+    type: 'auth_verify', clientId, verified
+  };
+  return message;
+}
+
+// Builds a status update for one evaluation; `progress` (0-1) and `message` are optional
+// and are left undefined when omitted.
+export function createStatusMessage(
+  evaluationId: string,
+  status: 'running' | 'completed' | 'failed',
+  progress?: number,
+  message?: string
+): StatusMessage {
+  const update: StatusMessage = {
+    type: 'status', evaluationId, status,
+    progress, message
+  };
+  return update;
+}
+
+/**
+ * Wraps a tool result in a JSON-RPC 2.0 success response for request `id` (status 'success').
+ */
+export function createSuccessResponse(
+  id: string,
+  output: any,
+  executionTime: number,
+  toolCalls?: ToolCall[],
+  metadata?: Record<string, any>
+): EvaluationSuccessResponse {
+  const result: EvaluationSuccessResponse['result'] = {
+    status: 'success',
+    output,
+    executionTime,
+    toolCalls,
+    metadata
+  };
+  return { jsonrpc: '2.0', result, id };
+}
+
+/**
+ * Builds a JSON-RPC 2.0 error response for request `id`; `data` carries optional details.
+ */
+export function createErrorResponse(
+  id: string,
+  code: number,
+  message: string,
+  data?: any
+): EvaluationErrorResponse {
+  const error: EvaluationErrorResponse['error'] = {
+    code,
+    message,
+    data
+  };
+  return { jsonrpc: '2.0', error, id };
+}
\ No newline at end of file
diff --git a/front_end/panels/ai_chat/evaluation/framework/GenericToolEvaluator.ts b/front_end/panels/ai_chat/evaluation/framework/GenericToolEvaluator.ts
index 157a8b9d581..723fab6889f 100644
--- a/front_end/panels/ai_chat/evaluation/framework/GenericToolEvaluator.ts
+++ b/front_end/panels/ai_chat/evaluation/framework/GenericToolEvaluator.ts
@@ -9,6 +9,8 @@ import { createLogger } from '../../core/Logger.js';
 import { SanitizationUtils } from '../utils/SanitizationUtils.js';
 import { ErrorHandlingUtils } from '../utils/ErrorHandlingUtils.js';
 import type { ToolExecutionResult } from '../utils/EvaluationTypes.js';
+import { createTracingProvider } from '../../tracing/TracingConfig.js';
+import type { TracingProvider, TracingContext } from '../../tracing/TracingProvider.js';
 
 const logger = createLogger('GenericToolEvaluator');
 
@@ -29,17 +31,19 @@ export class GenericToolEvaluator {
   private navigateTool: NavigateURLTool;
   private config: EvaluationConfig;
   private hooks?: TestExecutionHooks;
+  private tracingProvider: TracingProvider;
 
   constructor(config: EvaluationConfig, hooks?: TestExecutionHooks) {
     this.config = config;
     this.navigateTool = new NavigateURLTool();
     this.hooks = hooks;
+    this.tracingProvider = createTracingProvider();
   }
 
   /**
    * Run a test case for any tool
    */
-  async runTest(testCase: TestCase, tool: Tool): Promise<TestResult> {
+  async runTest(testCase: TestCase, tool: Tool, tracingContext?: TracingContext): Promise<TestResult> {
     const startTime = Date.now();
 
     // Use withErrorHandling wrapper for better error management
@@ -56,7 +60,44 @@ export class GenericToolEvaluator {
             await this.hooks.beforeNavigation(testCase);
           }
           
+          // Create navigation span
+          const navSpanId = `nav-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+          const navStartTime = new Date();
+          
+          if (tracingContext) {
+            try {
+              await this.tracingProvider.createObservation({
+                id: navSpanId,
+                name: 'Navigation',
+                type: 'span',
+                startTime: navStartTime,
+                input: { url: testCase.url },
+                metadata: {
+                  phase: 'navigation',
+                  url: testCase.url,
+                  testId: testCase.id || testCase.name
+                }
+              }, tracingContext.traceId);
+            } catch (error) {
+              logger.warn('Failed to create navigation span:', error);
+            }
+          }
+          
           const navResult = await this.navigateTool.execute({ url: testCase.url, reasoning: `Navigate to ${testCase.url} for test case ${testCase.name}` });
+          
+          // Update navigation span
+          if (tracingContext) {
+            try {
+              await this.tracingProvider.updateObservation(navSpanId, {
+                endTime: new Date(),
+                output: navResult,
+                error: (navResult && typeof navResult === 'object' && 'error' in navResult) ? String(navResult.error) : undefined
+              });
+            } catch (error) {
+              logger.warn('Failed to update navigation span:', error);
+            }
+          }
+          
           if ('error' in navResult) {
             throw new Error(`Navigation failed: ${navResult.error}`);
           }
@@ -71,6 +112,28 @@ export class GenericToolEvaluator {
         }
 
         // 2. Execute the tool with the input - wrapped with error handling
+        const toolSpanId = `tool-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+        const toolStartTime = new Date();
+        
+        if (tracingContext) {
+          try {
+            await this.tracingProvider.createObservation({
+              id: toolSpanId,
+              name: `Tool Execution: ${testCase.tool}`,
+              type: 'span',
+              startTime: toolStartTime,
+              input: testCase.input,
+              metadata: {
+                phase: 'tool-execution',
+                tool: testCase.tool,
+                testId: testCase.id || testCase.name
+              }
+            }, tracingContext.traceId);
+          } catch (error) {
+            logger.warn('Failed to create tool execution span:', error);
+          }
+        }
+        
         const toolResult = await ErrorHandlingUtils.withErrorHandling(
           async () => {
             return await tool.execute(testCase.input);
@@ -79,6 +142,19 @@ export class GenericToolEvaluator {
           logger,
           `GenericToolEvaluator.toolExecution:${testCase.tool}`
         );
+        
+        // Update tool execution span
+        if (tracingContext) {
+          try {
+            await this.tracingProvider.updateObservation(toolSpanId, {
+              endTime: new Date(),
+              output: toolResult,
+              error: (toolResult && typeof toolResult === 'object' && 'error' in toolResult) ? String(toolResult.error) : undefined
+            });
+          } catch (error) {
+            logger.warn('Failed to update tool execution span:', error);
+          }
+        }
 
 
         // Call afterToolExecution hook
@@ -151,7 +227,7 @@ export class GenericToolEvaluator {
   /**
    * Run a test with retry logic
    */
-  private async runTestWithRetries(testCase: TestCase, tool: Tool): Promise<TestResult> {
+  private async runTestWithRetries(testCase: TestCase, tool: Tool, tracingContext?: TracingContext): Promise<TestResult> {
     const maxRetries = testCase.metadata?.retries || this.config.retries || 1;
     let lastResult: TestResult | null = null;
     let lastError: unknown = null;
@@ -162,7 +238,7 @@ export class GenericToolEvaluator {
         await new Promise(resolve => setTimeout(resolve, 2000 * attempt)); // Exponential backoff
       }
 
-      lastResult = await this.runTest(testCase, tool);
+      lastResult = await this.runTest(testCase, tool, tracingContext);
       
       // Only retry on errors, not on test failures
       if (lastResult.status !== 'error') {
diff --git a/front_end/panels/ai_chat/evaluation/runner/EvaluationRunner.ts b/front_end/panels/ai_chat/evaluation/runner/EvaluationRunner.ts
index adb3f7853fc..b55507b809b 100644
--- a/front_end/panels/ai_chat/evaluation/runner/EvaluationRunner.ts
+++ b/front_end/panels/ai_chat/evaluation/runner/EvaluationRunner.ts
@@ -9,6 +9,8 @@ import { ToolRegistry } from '../../agent_framework/ConfigurableAgentTool.js';
 import type { EvaluationConfig, TestResult, TestCase } from '../framework/types.js';
 import { createLogger } from '../../core/Logger.js';
 import { TIMING_CONSTANTS } from '../../core/Constants.js';
+import { createTracingProvider, isTracingEnabled, getTracingConfig } from '../../tracing/TracingConfig.js';
+import type { TracingProvider, TracingContext } from '../../tracing/TracingProvider.js';
 
 const logger = createLogger('EvaluationRunner');
 
@@ -19,6 +21,8 @@ export class EvaluationRunner {
   private evaluator: GenericToolEvaluator;
   private llmEvaluator: LLMEvaluator;
   private config: EvaluationConfig;
+  private tracingProvider: TracingProvider;
+  private sessionId: string;
 
   constructor(judgeModel?: string) {
     // Get API key from AgentService
@@ -46,36 +50,162 @@ export class EvaluationRunner {
 
     this.evaluator = new GenericToolEvaluator(this.config);
     this.llmEvaluator = new LLMEvaluator(this.config.evaluationApiKey, this.config.evaluationModel);
+    
+    // Initialize tracing
+    this.tracingProvider = createTracingProvider();
+    this.sessionId = `evaluation-session-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+    
+    logger.info('EvaluationRunner created with tracing provider', {
+      sessionId: this.sessionId,
+      providerType: this.tracingProvider.constructor.name,
+      tracingEnabled: isTracingEnabled(),
+      tracingConfig: getTracingConfig()
+    });
+    
+    // Initialize tracing provider
+    this.initializeTracing();
+  }
+  
+  /** Initializes the tracing provider and registers this runner's session; provider failures are logged as warnings, not thrown. */
+  private async initializeTracing(): Promise<void> {
+    // Nothing to set up when tracing is switched off.
+    if (!isTracingEnabled()) {
+      logger.info('Tracing disabled, skipping initialization');
+      return;
+    }
+    try {
+      logger.info('Initializing tracing for evaluation runner', {
+        sessionId: this.sessionId,
+        providerType: this.tracingProvider.constructor.name
+      });
+      await this.tracingProvider.initialize();
+      await this.tracingProvider.createSession(this.sessionId, {
+        type: 'evaluation',
+        runner: 'EvaluationRunner',
+        timestamp: new Date().toISOString()
+      });
+      logger.info('Tracing initialized successfully for evaluation runner');
+    } catch (error) {
+      logger.warn('Failed to initialize tracing for evaluation:', error);
+    }
   }
 
   /**
    * Run a single test case
    */
   async runSingleTest(testCase: TestCase<any>): Promise<TestResult> {
+    const traceId = `eval-${testCase.id || testCase.name}-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+    const startTime = new Date();
 
     logger.debug(`[EvaluationRunner] Running test: ${testCase.name}`);
     logger.debug(`[EvaluationRunner] URL: ${testCase.url}`);
     logger.debug(`[EvaluationRunner] Tool: ${testCase.tool}`);
 
+    // Create tracing context
+    const tracingContext: TracingContext = {
+      sessionId: this.sessionId,
+      traceId,
+      parentObservationId: undefined
+    };
+
+    // Create trace for this evaluation
+    if (isTracingEnabled()) {
+      try {
+        logger.info('Creating trace for evaluation', {
+          traceId,
+          sessionId: this.sessionId,
+          testName: testCase.name,
+          tool: testCase.tool,
+          providerType: this.tracingProvider.constructor.name
+        });
+        
+        await this.tracingProvider.createTrace(
+          traceId,
+          this.sessionId,
+          `Evaluation: ${testCase.name}`,
+          {
+            testCase: {
+              id: testCase.id,
+              name: testCase.name,
+              tool: testCase.tool,
+              url: testCase.url,
+              description: testCase.description
+            }
+          },
+          {
+            type: 'evaluation',
+            tool: testCase.tool,
+            url: testCase.url,
+            testId: testCase.id || testCase.name
+          },
+          'evaluation-runner',
+          ['evaluation', testCase.tool, 'test']
+        );
+        
+        logger.info('Trace created successfully');
+      } catch (error) {
+        logger.error('Failed to create trace for evaluation:', error);
+      }
+    } else {
+      logger.info('Tracing disabled, skipping trace creation');
+    }
+
     // Get the tool instance from ToolRegistry based on what the test specifies
     const tool = ToolRegistry.getRegisteredTool(testCase.tool);
     if (!tool) {
       throw new Error(`Tool "${testCase.tool}" not found in ToolRegistry. Ensure it is properly registered.`);
     }
 
-    const result = await this.evaluator.runTest(testCase, tool as any);
+    const result = await this.evaluator.runTest(testCase, tool as any, tracingContext);
     
     // Add LLM evaluation if test passed
     if (result.status === 'passed' && result.output && testCase.validation.type !== 'snapshot') {
       logger.debug(`[EvaluationRunner] Adding LLM evaluation...`);
       
+      // Create span for LLM evaluation
+      const llmSpanId = `llm-judge-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+      const llmStartTime = new Date();
+      
       try {
+        if (isTracingEnabled()) {
+          await this.tracingProvider.createObservation({
+            id: llmSpanId,
+            name: 'LLM Judge Evaluation',
+            type: 'generation',
+            startTime: llmStartTime,
+            input: {
+              output: result.output,
+              testCase: testCase.name,
+              validation: testCase.validation
+            },
+            model: this.config.evaluationModel,
+            metadata: {
+              tool: testCase.tool,
+              testId: testCase.id || testCase.name,
+              phase: 'llm-evaluation'
+            }
+          }, traceId);
+        }
+        
         const llmJudgment = await this.llmEvaluator.evaluate(
           result.output,
           testCase,
           testCase.validation
         );
         
+        // Update LLM evaluation span with result
+        if (isTracingEnabled()) {
+          await this.tracingProvider.updateObservation(llmSpanId, {
+            endTime: new Date(),
+            output: llmJudgment,
+            metadata: {
+              score: llmJudgment.score,
+              passed: llmJudgment.passed,
+              explanation: llmJudgment.explanation
+            }
+          });
+        }
+        
         if (result.validation) {
           result.validation.llmJudge = llmJudgment;
           result.validation.passed = result.validation.passed && llmJudgment.passed;
@@ -83,6 +213,31 @@ export class EvaluationRunner {
         }
       } catch (error) {
         console.warn('[EvaluationRunner] LLM evaluation failed:', error);
+        // Update span with error
+        if (isTracingEnabled()) {
+          try {
+            await this.tracingProvider.updateObservation(llmSpanId, {
+              endTime: new Date(),
+              error: error instanceof Error ? error.message : String(error)
+            });
+          } catch (tracingError) {
+            logger.warn('Failed to update LLM evaluation span with error:', tracingError);
+          }
+        }
+      }
+    }
+
+    // Finalize the trace
+    if (isTracingEnabled()) {
+      try {
+        await this.tracingProvider.finalizeTrace(traceId, {
+          status: result.status,
+          output: result.output,
+          duration: Date.now() - startTime.getTime(),
+          validation: result.validation
+        });
+      } catch (error) {
+        logger.warn('Failed to finalize trace:', error);
       }
     }
 
diff --git a/front_end/panels/ai_chat/evaluation/runner/VisionAgentEvaluationRunner.ts b/front_end/panels/ai_chat/evaluation/runner/VisionAgentEvaluationRunner.ts
index fb9e11cc357..a76be741fc6 100644
--- a/front_end/panels/ai_chat/evaluation/runner/VisionAgentEvaluationRunner.ts
+++ b/front_end/panels/ai_chat/evaluation/runner/VisionAgentEvaluationRunner.ts
@@ -11,6 +11,8 @@ import type { EvaluationConfig, TestResult, TestCase, ValidationConfig } from '.
 import type { ScreenshotData } from '../utils/EvaluationTypes.js';
 import { createLogger } from '../../core/Logger.js';
 import { TIMING_CONSTANTS } from '../../core/Constants.js';
+import { createTracingProvider, isTracingEnabled } from '../../tracing/TracingConfig.js';
+import type { TracingProvider, TracingContext } from '../../tracing/TracingProvider.js';
 
 const logger = createLogger('VisionAgentEvaluationRunner');
 
@@ -43,6 +45,7 @@ export class VisionAgentEvaluationRunner {
   private screenshotTool: TakeScreenshotTool;
   private config: EvaluationConfig;
   private globalVisionEnabled: boolean;
+  private tracingProvider: TracingProvider;
 
   constructor(visionEnabled: boolean = false, judgeModel?: string) {
     // Get API key from AgentService
@@ -71,6 +74,7 @@ export class VisionAgentEvaluationRunner {
     this.llmEvaluator = new LLMEvaluator(this.config.evaluationApiKey, this.config.evaluationModel);
     this.screenshotTool = new TakeScreenshotTool();
     this.globalVisionEnabled = visionEnabled;
+    this.tracingProvider = createTracingProvider();
   }
 
   /**
@@ -96,6 +100,44 @@ export class VisionAgentEvaluationRunner {
     let beforeScreenshot: ScreenshotData | undefined;
     let afterScreenshot: ScreenshotData | undefined;
     
+    // Create a trace for this test
+    const traceId = `test-${testCase.id}-${Date.now()}`;
+    const tracingContext: TracingContext = { 
+      traceId, 
+      sessionId: `vision-session-${Date.now()}`,
+      parentObservationId: undefined 
+    };
+    
+    try {
+      // Create a root trace for the test
+      if (isTracingEnabled()) {
+        await this.tracingProvider.initialize();
+        await this.tracingProvider.createSession(tracingContext.sessionId, {
+          type: 'vision-evaluation',
+          source: 'ui-dialog'
+        });
+        
+        await this.tracingProvider.createTrace(
+          traceId,
+          tracingContext.sessionId,
+          `Vision Agent Evaluation: ${testCase.name}`,
+          testCase.input,
+          {
+            testId: testCase.id,
+            testName: testCase.name,
+            agent: toolName,
+            visionEnabled: shouldUseVision,
+            url: testCase.url,
+            tags: testCase.metadata?.tags || []
+          },
+          'vision-agent-runner',
+          ['evaluation', 'vision', toolName]
+        );
+      }
+    } catch (error) {
+      logger.warn('Failed to create trace:', error);
+    }
+    
     try {
       // Always create hooks for screenshot capture in VisionAgentEvaluationRunner
       const visualConfig = testCase.validation.llmJudge?.visualVerification;
@@ -138,8 +180,8 @@ export class VisionAgentEvaluationRunner {
       // Always use evaluator with hooks in VisionAgentEvaluationRunner
       const evaluator = new GenericToolEvaluator(this.config, testHooks);
       
-      // Execute the agent action
-      const agentResult = await evaluator.runTest(testCase, agent as any);
+      // Execute the agent action with tracing context
+      const agentResult = await evaluator.runTest(testCase, agent as any, tracingContext);
 
       // Perform evaluation based on vision mode
       if (agentResult.status === 'passed' && agentResult.output && testCase.validation.type === 'llm-judge') {
@@ -212,10 +254,45 @@ export class VisionAgentEvaluationRunner {
         }
       };
 
+      // Update trace with final result
+      try {
+        if (isTracingEnabled()) {
+          await this.tracingProvider.finalizeTrace(traceId, {
+            output: agentResult,
+            statusMessage: agentResult.status,
+            metadata: {
+              ...(agentResult.validation?.llmJudge ? {
+                llmScore: agentResult.validation.llmJudge.score,
+                llmConfidence: agentResult.validation.llmJudge.confidence,
+                llmExplanation: agentResult.validation.llmJudge.explanation
+              } : {}),
+              toolsUsed: agentResult.output?.toolUsageStats?.toolsList || [],
+              toolCallCount: agentResult.output?.toolUsageStats?.totalCalls || 0,
+              duration: agentResult.duration
+            }
+          });
+        }
+      } catch (error) {
+        logger.warn('Failed to update trace:', error);
+      }
+
       return agentResult;
 
     } catch (error) {
       logger.error(`❌ Test failed with error:`, error);
+      
+      // Update trace with error
+      try {
+        if (isTracingEnabled()) {
+          await this.tracingProvider.finalizeTrace(traceId, {
+            error: error instanceof Error ? error.message : String(error),
+            statusMessage: 'error'
+          });
+        }
+      } catch (updateError) {
+        logger.warn('Failed to update trace with error:', updateError);
+      }
+      
       return {
         testId: testCase.id,
         status: 'error',
diff --git a/front_end/panels/ai_chat/evaluation/utils/ErrorHandlingUtils.ts b/front_end/panels/ai_chat/evaluation/utils/ErrorHandlingUtils.ts
index 8a45a2a403d..965f8885111 100644
--- a/front_end/panels/ai_chat/evaluation/utils/ErrorHandlingUtils.ts
+++ b/front_end/panels/ai_chat/evaluation/utils/ErrorHandlingUtils.ts
@@ -59,7 +59,7 @@ export class ErrorHandlingUtils {
     try {
       return await operation();
     } catch (error) {
-      logger.error(`[${context}] Operation failed:`, error);
+      logger.error(`[${context}] Operation failed:`, error instanceof Error ? error.message : String(error));
       return errorBuilder(error);
     }
   }
diff --git a/front_end/panels/ai_chat/tools/StreamlinedSchemaExtractorTool.ts b/front_end/panels/ai_chat/tools/StreamlinedSchemaExtractorTool.ts
index 3d6a78e9781..ae0dba4b870 100644
--- a/front_end/panels/ai_chat/tools/StreamlinedSchemaExtractorTool.ts
+++ b/front_end/panels/ai_chat/tools/StreamlinedSchemaExtractorTool.ts
@@ -108,7 +108,7 @@ export class StreamlinedSchemaExtractorTool implements Tool<StreamlinedSchemaExt
       };
 
     } catch (error) {
-      logger.error('Execution Error:', error);
+      logger.error('Execution Error:', error instanceof Error ? error.message : String(error));
       return {
         success: false,
         error: error instanceof Error ? error.message : String(error),
@@ -289,7 +289,7 @@ IMPORTANT: Only extract data that you can see in the accessibility tree above. D
           // Add delay before next attempt to prevent overloading the LLM
           await new Promise(resolve => setTimeout(resolve, this.RETRY_DELAY_MS));
         } else {
-          logger.error(`JSON extraction failed after ${attempt} attempts:`, error);
+          logger.error(`JSON extraction failed after ${attempt} attempts:`, error instanceof Error ? error.message : String(error));
           throw new Error(`Data extraction failed after ${attempt} attempts: ${error instanceof Error ? error.message : String(error)}`);
         }
       }
@@ -408,7 +408,7 @@ CRITICAL: Only use nodeIds that you can actually see in the accessibility tree a
       
       return result;
     } catch (error) {
-      logger.error(`Error in URL retry attempt ${attemptNumber}:`, error);
+      logger.error(`Error in URL retry attempt ${attemptNumber}:`, error instanceof Error ? error.message : String(error));
       return null;
     }
   }
diff --git a/front_end/panels/ai_chat/ui/SettingsDialog.ts b/front_end/panels/ai_chat/ui/SettingsDialog.ts
index cb670328784..3c1157eb22e 100644
--- a/front_end/panels/ai_chat/ui/SettingsDialog.ts
+++ b/front_end/panels/ai_chat/ui/SettingsDialog.ts
@@ -7,6 +7,7 @@ import * as UI from '../../../ui/legacy/legacy.js';
 import { LLMClient } from '../LLM/LLMClient.js';
 import { createLogger } from '../core/Logger.js';
 import { getTracingConfig, setTracingConfig, isTracingEnabled } from '../tracing/TracingConfig.js';
+import { getEvaluationConfig, setEvaluationConfig, isEvaluationEnabled, testEvaluationConnection, connectToEvaluationService, getEvaluationClientId, isEvaluationConnected } from '../common/EvaluationConfig.js';
 
 const logger = createLogger('SettingsDialog');
 
@@ -312,6 +313,42 @@ const UIStrings = {
    *@description Test tracing button
    */
   testTracing: 'Test Connection',
+  /**
+   *@description Evaluation section title
+   */
+  evaluationSection: 'Evaluation Configuration',
+  /**
+   *@description Evaluation enabled label
+   */
+  evaluationEnabled: 'Enable Evaluation',
+  /**
+   *@description Evaluation enabled hint
+   */
+  evaluationEnabledHint: 'Enable evaluation service connection for AI Chat interactions',
+  /**
+   *@description Evaluation endpoint label
+   */
+  evaluationEndpoint: 'Evaluation Endpoint',
+  /**
+   *@description Evaluation endpoint hint
+   */
+  evaluationEndpointHint: 'WebSocket endpoint for the evaluation service (e.g., ws://localhost:8080)',
+  /**
+   *@description Evaluation secret key label
+   */
+  evaluationSecretKey: 'Evaluation Secret Key',
+  /**
+   *@description Evaluation secret key hint
+   */
+  evaluationSecretKeyHint: 'Secret key for authentication with the evaluation service (optional)',
+  /**
+   *@description Connect to evaluation button
+   */
+  connectEvaluation: 'Connect',
+  /**
+   *@description Test evaluation button
+   */
+  testEvaluation: 'Test Connection',
 };
 
 const str_ = i18n.i18n.registerUIStrings('panels/ai_chat/ui/SettingsDialog.ts', UIStrings);
@@ -1886,6 +1923,271 @@ export class SettingsDialog {
         }, 5000);
       }
     });
+
+    // Add evaluation configuration section
+    const evaluationSection = document.createElement('div');
+    evaluationSection.className = 'settings-section evaluation-section';
+    contentDiv.appendChild(evaluationSection);
+
+    const evaluationSectionTitle = document.createElement('h3');
+    evaluationSectionTitle.className = 'settings-subtitle';
+    evaluationSectionTitle.textContent = i18nString(UIStrings.evaluationSection);
+    evaluationSection.appendChild(evaluationSectionTitle);
+
+    // Get current evaluation configuration
+    const currentEvaluationConfig = getEvaluationConfig();
+
+    // Evaluation enabled checkbox
+    const evaluationEnabledContainer = document.createElement('div');
+    evaluationEnabledContainer.className = 'evaluation-enabled-container';
+    evaluationSection.appendChild(evaluationEnabledContainer);
+
+    const evaluationEnabledCheckbox = document.createElement('input');
+    evaluationEnabledCheckbox.type = 'checkbox';
+    evaluationEnabledCheckbox.id = 'evaluation-enabled';
+    evaluationEnabledCheckbox.className = 'evaluation-checkbox';
+    evaluationEnabledCheckbox.checked = isEvaluationEnabled();
+    evaluationEnabledContainer.appendChild(evaluationEnabledCheckbox);
+
+    const evaluationEnabledLabel = document.createElement('label');
+    evaluationEnabledLabel.htmlFor = 'evaluation-enabled';
+    evaluationEnabledLabel.className = 'evaluation-label';
+    evaluationEnabledLabel.textContent = i18nString(UIStrings.evaluationEnabled);
+    evaluationEnabledContainer.appendChild(evaluationEnabledLabel);
+
+    const evaluationEnabledHint = document.createElement('div');
+    evaluationEnabledHint.className = 'settings-hint';
+    evaluationEnabledHint.textContent = i18nString(UIStrings.evaluationEnabledHint);
+    evaluationSection.appendChild(evaluationEnabledHint);
+
+    // Connection status indicator
+    const connectionStatusContainer = document.createElement('div');
+    connectionStatusContainer.className = 'connection-status-container';
+    connectionStatusContainer.style.display = 'flex';
+    connectionStatusContainer.style.alignItems = 'center';
+    connectionStatusContainer.style.gap = '8px';
+    connectionStatusContainer.style.marginTop = '8px';
+    connectionStatusContainer.style.fontSize = '13px';
+    evaluationSection.appendChild(connectionStatusContainer);
+
+    const connectionStatusDot = document.createElement('div');
+    connectionStatusDot.className = 'connection-status-dot';
+    connectionStatusDot.style.width = '8px';
+    connectionStatusDot.style.height = '8px';
+    connectionStatusDot.style.borderRadius = '50%';
+    connectionStatusDot.style.flexShrink = '0';
+    connectionStatusContainer.appendChild(connectionStatusDot);
+
+    const connectionStatusText = document.createElement('span');
+    connectionStatusText.className = 'connection-status-text';
+    connectionStatusContainer.appendChild(connectionStatusText);
+
+    // Repaints the status dot and caption from isEvaluationConnected():
+    // accent green with a "connected" caption when true, the disabled-text
+    // colour with "Not connected" otherwise.
+    const updateConnectionStatus = () => {
+      const connected = isEvaluationConnected();
+      logger.debug('Updating connection status', { isConnected: connected });
+
+      // Dot and caption always share one colour per state.
+      const statusColor = connected ? 'var(--color-accent-green)' : 'var(--color-text-disabled)';
+      const statusCaption = connected ? 'Connected to evaluation server' : 'Not connected';
+
+      const dotStyle = connectionStatusDot.style;
+      dotStyle.backgroundColor = statusColor;
+      connectionStatusText.textContent = statusCaption;
+      connectionStatusText.style.color = statusColor;
+    };
+
+    // Update status initially and when evaluation is enabled/disabled
+    updateConnectionStatus();
+    
+    // Set up periodic status updates every 2 seconds
+    const statusUpdateInterval = setInterval(updateConnectionStatus, 2000);
+
+    // Evaluation configuration container (shown when enabled)
+    const evaluationConfigContainer = document.createElement('div');
+    evaluationConfigContainer.className = 'evaluation-config-container';
+    evaluationConfigContainer.style.display = evaluationEnabledCheckbox.checked ? 'block' : 'none';
+    evaluationSection.appendChild(evaluationConfigContainer);
+
+    // Client ID display (read-only)
+    const clientIdLabel = document.createElement('div');
+    clientIdLabel.className = 'settings-label';
+    clientIdLabel.textContent = 'Client ID';
+    evaluationConfigContainer.appendChild(clientIdLabel);
+
+    const clientIdHint = document.createElement('div');
+    clientIdHint.className = 'settings-hint';
+    clientIdHint.textContent = 'Unique identifier for this DevTools instance';
+    evaluationConfigContainer.appendChild(clientIdHint);
+
+    const clientIdInput = document.createElement('input');
+    clientIdInput.type = 'text';
+    clientIdInput.className = 'settings-input';
+    clientIdInput.value = currentEvaluationConfig.clientId || 'Auto-generated on first connection';
+    clientIdInput.readOnly = true;
+    clientIdInput.style.backgroundColor = 'var(--color-background-elevation-1)';
+    clientIdInput.style.cursor = 'default';
+    evaluationConfigContainer.appendChild(clientIdInput);
+
+    // Evaluation endpoint
+    const evaluationEndpointLabel = document.createElement('div');
+    evaluationEndpointLabel.className = 'settings-label';
+    evaluationEndpointLabel.textContent = i18nString(UIStrings.evaluationEndpoint);
+    evaluationConfigContainer.appendChild(evaluationEndpointLabel);
+
+    const evaluationEndpointHint = document.createElement('div');
+    evaluationEndpointHint.className = 'settings-hint';
+    evaluationEndpointHint.textContent = i18nString(UIStrings.evaluationEndpointHint);
+    evaluationConfigContainer.appendChild(evaluationEndpointHint);
+
+    const evaluationEndpointInput = document.createElement('input');
+    evaluationEndpointInput.type = 'text';
+    evaluationEndpointInput.className = 'settings-input';
+    evaluationEndpointInput.placeholder = 'ws://localhost:8080';
+    evaluationEndpointInput.value = currentEvaluationConfig.endpoint || 'ws://localhost:8080';
+    evaluationConfigContainer.appendChild(evaluationEndpointInput);
+
+    // Evaluation secret key
+    const evaluationSecretKeyLabel = document.createElement('div');
+    evaluationSecretKeyLabel.className = 'settings-label';
+    evaluationSecretKeyLabel.textContent = i18nString(UIStrings.evaluationSecretKey);
+    evaluationConfigContainer.appendChild(evaluationSecretKeyLabel);
+
+    const evaluationSecretKeyHint = document.createElement('div');
+    evaluationSecretKeyHint.className = 'settings-hint';
+    evaluationSecretKeyHint.textContent = i18nString(UIStrings.evaluationSecretKeyHint);
+    evaluationConfigContainer.appendChild(evaluationSecretKeyHint);
+
+    const evaluationSecretKeyInput = document.createElement('input');
+    evaluationSecretKeyInput.type = 'password';
+    evaluationSecretKeyInput.className = 'settings-input';
+    evaluationSecretKeyInput.placeholder = 'Optional secret key';
+    evaluationSecretKeyInput.value = currentEvaluationConfig.secretKey || '';
+    evaluationConfigContainer.appendChild(evaluationSecretKeyInput);
+
+    // Connect and Test buttons container
+    const evaluationButtonsContainer = document.createElement('div');
+    evaluationButtonsContainer.className = 'evaluation-buttons-container';
+    evaluationConfigContainer.appendChild(evaluationButtonsContainer);
+
+    const connectEvaluationButton = document.createElement('button');
+    connectEvaluationButton.className = 'settings-button connect-button';
+    connectEvaluationButton.textContent = i18nString(UIStrings.connectEvaluation);
+    evaluationButtonsContainer.appendChild(connectEvaluationButton);
+
+    const testEvaluationButton = document.createElement('button');
+    testEvaluationButton.className = 'settings-button test-button';
+    testEvaluationButton.textContent = i18nString(UIStrings.testEvaluation);
+    evaluationButtonsContainer.appendChild(testEvaluationButton);
+
+    // Test status message
+    const testEvaluationStatus = document.createElement('div');
+    testEvaluationStatus.className = 'settings-status';
+    testEvaluationStatus.style.display = 'none';
+    evaluationConfigContainer.appendChild(testEvaluationStatus);
+
+    // Toggle evaluation config visibility
+    evaluationEnabledCheckbox.addEventListener('change', () => {
+      evaluationConfigContainer.style.display = evaluationEnabledCheckbox.checked ? 'block' : 'none';
+    });
+
+    // Test button: passes the entered endpoint/secret to the config, probes the service, shows the outcome for 5 s
+    testEvaluationButton.addEventListener('click', async () => {
+      testEvaluationButton.disabled = true;
+      testEvaluationStatus.style.display = 'block';
+      testEvaluationStatus.textContent = 'Testing connection...';
+      testEvaluationStatus.style.backgroundColor = 'var(--color-background-elevation-1)';
+      testEvaluationStatus.style.color = 'var(--color-text-primary)';
+
+      try {
+        const endpoint = evaluationEndpointInput.value.trim();
+        const secretKey = evaluationSecretKeyInput.value.trim();
+
+        if (!endpoint) {
+          throw new Error('Endpoint is required for testing');
+        }
+
+        // NOTE: this sets the entered values with enabled: true; nothing in this handler reverts them afterwards
+        setEvaluationConfig({
+          enabled: true,
+          endpoint,
+          secretKey
+        });
+
+        const result = await testEvaluationConnection();
+        if (result.success) {
+          testEvaluationStatus.textContent = '✓ Connection successful';
+          testEvaluationStatus.style.backgroundColor = 'var(--color-accent-green-background)';
+          testEvaluationStatus.style.color = 'var(--color-accent-green)';
+        } else {
+          throw new Error(result.message);
+        }
+      } catch (error) {
+        // An Error built from an empty or undefined result.message has message '' — fall back to generic text
+        testEvaluationStatus.textContent = `✗ ${error instanceof Error && error.message ? error.message : 'Connection failed'}`;
+        testEvaluationStatus.style.backgroundColor = 'var(--color-accent-red-background)';
+        testEvaluationStatus.style.color = 'var(--color-accent-red)';
+      } finally {
+        testEvaluationButton.disabled = false;
+        setTimeout(() => {
+          testEvaluationStatus.style.display = 'none';
+        }, 5000);
+      }
+    });
+
+    // Connect button: applies the entered endpoint/secret, connects, then refreshes the client ID and status
+    connectEvaluationButton.addEventListener('click', async () => {
+      connectEvaluationButton.disabled = true;
+      testEvaluationStatus.style.display = 'block';
+      testEvaluationStatus.textContent = 'Connecting...';
+      testEvaluationStatus.style.backgroundColor = 'var(--color-background-elevation-1)';
+      testEvaluationStatus.style.color = 'var(--color-text-primary)';
+
+      try {
+        const endpoint = evaluationEndpointInput.value.trim();
+        const secretKey = evaluationSecretKeyInput.value.trim();
+
+        if (!endpoint) {
+          throw new Error('Endpoint is required for connection');
+        }
+
+        // Update config and connect
+        setEvaluationConfig({
+          enabled: true,
+          endpoint,
+          secretKey
+        });
+
+        await connectToEvaluationService();
+
+        // Update client ID display after connection
+        const clientId = getEvaluationClientId();
+        if (clientId) {
+          clientIdInput.value = clientId;
+        }
+
+        testEvaluationStatus.textContent = '✓ Connected successfully';
+        testEvaluationStatus.style.backgroundColor = 'var(--color-accent-green-background)';
+        testEvaluationStatus.style.color = 'var(--color-accent-green)';
+
+        // Update connection status indicator with a small delay to ensure connection is established
+        setTimeout(updateConnectionStatus, 500);
+      } catch (error) {
+        // An Error with an empty message would render a bare "✗ " — fall back to generic text
+        testEvaluationStatus.textContent = `✗ ${error instanceof Error && error.message ? error.message : 'Connection failed'}`;
+        testEvaluationStatus.style.backgroundColor = 'var(--color-accent-red-background)';
+        testEvaluationStatus.style.color = 'var(--color-accent-red)';
+        // Update connection status indicator
+        updateConnectionStatus();
+      } finally {
+        connectEvaluationButton.disabled = false;
+        setTimeout(() => {
+          testEvaluationStatus.style.display = 'none';
+        }, 5000);
+      }
+    });
     
     // Add disclaimer section
     const disclaimerSection = document.createElement('div');
@@ -2067,6 +2369,13 @@ export class SettingsDialog {
       } else {
         setTracingConfig({ provider: 'disabled' });
       }
+
+      // Save evaluation configuration
+      setEvaluationConfig({
+        enabled: evaluationEnabledCheckbox.checked,
+        endpoint: evaluationEndpointInput.value.trim() || 'ws://localhost:8080',
+        secretKey: evaluationSecretKeyInput.value.trim()
+      });
       
       logger.debug('Settings saved successfully');
       logger.debug('Mini Model:', localStorage.getItem(MINI_MODEL_STORAGE_KEY));
@@ -2447,6 +2756,58 @@ export class SettingsDialog {
         padding-left: 24px;
         border-left: 2px solid var(--color-details-hairline);
       }
+
+      /* Evaluation section styles */
+      .evaluation-section {
+        margin-top: 16px;
+        padding: 16px 20px;
+        border-bottom: 1px solid var(--color-details-hairline);
+      }
+      
+      .evaluation-enabled-container {
+        display: flex;
+        align-items: center;
+        gap: 8px;
+        margin-bottom: 8px;
+      }
+      
+      .evaluation-checkbox {
+        margin: 0;
+      }
+      
+      .evaluation-label {
+        font-weight: 500;
+        color: var(--color-text-primary);
+        cursor: pointer;
+      }
+      
+      .evaluation-config-container {
+        margin-top: 16px;
+        padding-left: 24px;
+        border-left: 2px solid var(--color-details-hairline);
+      }
+
+      .evaluation-buttons-container {
+        display: flex;
+        gap: 8px;
+        margin-top: 16px;
+      }
+
+      .connect-button {
+        background-color: var(--color-accent-blue-background);
+        color: var(--color-accent-blue);
+        border: 1px solid var(--color-accent-blue);
+      }
+
+      .connect-button:hover {
+        background-color: var(--color-accent-blue);
+        color: var(--color-background);
+      }
+
+      .connect-button:disabled {
+        opacity: 0.6;
+        cursor: not-allowed;
+      }
     `;
     dialog.contentElement.appendChild(styleElement);