tysonthomas9 · tysonthomas9 · Jul 18, 2025 · Jul 13, 2025 · Jul 13, 2025 · Jul 14, 2025
diff --git a/eval-server/CLAUDE.md b/eval-server/CLAUDE.md
@@ -69,9 +69,6 @@ src/
 ├── config.js          # Configuration management
 └── cli.js             # Interactive CLI for testing and management
 
-test/
-└── example-agent.js   # Example agent implementation for testing
-
 logs/                  # Log files (created automatically)
 ├── combined.log       # All log events
 ├── error.log          # Error events only

diff --git a/eval-server/README.md b/eval-server/README.md
@@ -20,12 +20,7 @@ A WebSocket-based evaluation server for LLM agents using LLM-as-a-judge methodol
    npm start
    ```
 
-4. **Test with example agent** (in another terminal)
-   ```bash
-   npm test
-   ```
-
-5. **Use interactive CLI** (alternative to step 3)
+4. **Use interactive CLI** (alternative to step 3)
    ```bash
    npm run cli
    ```
@@ -47,10 +42,6 @@ Your agent needs to:
 2. Send a `{"type": "ready"}` message when ready for evaluations
 3. Implement the `Evaluate` RPC method that accepts a string task and returns a string response
 
-## Example Agent
-
-See `test/example-agent.js` for a complete implementation example.
-
 ## For more details
 
 See [CLAUDE.md](./CLAUDE.md) for comprehensive documentation of the architecture and implementation.
diff --git a/eval-server/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml b/eval-server/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml
@@ -0,0 +1,12 @@
+client:  # identity and credentials of one evaluation client; the file name matches client.id
+  id: 1233ae25-9f9e-4f77-924d-865f7d615cef
+  name: DevTools Client 1233ae25
+  secret_key: hello  # NOTE(review): weak secret committed to VCS — rotate it / load from a secret store
+  description: Auto-generated DevTools evaluation client
+settings:  # per-client execution limits
+  max_concurrent_evaluations: 3
+  default_timeout: 45000  # presumably milliseconds — TODO confirm
+  retry_policy:
+    max_retries: 2
+    backoff_multiplier: 2  # presumably scales the delay between successive retries — TODO confirm
+    initial_delay: 1000  # presumably milliseconds — TODO confirm
diff --git a/eval-server/evals/action-agent/action-agent-a11y-001.yaml b/eval-server/evals/action-agent/action-agent-a11y-001.yaml
@@ -0,0 +1,46 @@
+# Accessibility action test: click a button that is identified by its ARIA label.
+id: "action-agent-a11y-001"
+name: "Click Using ARIA Label"
+description: "Test clicking an element identified primarily by ARIA attributes"
+enabled: true
+
+target:  # page this evaluation is pointed at
+  url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/"
+  wait_for: "networkidle"  # presumably the load condition awaited before acting — TODO confirm
+  wait_timeout: 5000  # presumably milliseconds — TODO confirm
+
+tool: "action_agent"
+timeout: 60000  # same value is repeated as metadata.timeout below
+
+input:  # task description for the tool named above
+  objective: "Click the button with aria-label \"Print Page\""
+  reasoning: "Testing action selection using accessibility attributes"
+
+validation:  # graded by an LLM judge using the criteria and before/after captures below
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Used accessibility tree to find elements"
+      - "Correctly identified element by ARIA label"
+      - "Successfully clicked the target button"
+      - "Demonstrated understanding of accessibility attributes"
+      - "No reliance on visual appearance alone"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify the Print Page button was successfully clicked"
+        - "Check if any print dialog or print preview appeared"
+        - "Confirm the button showed visual feedback (pressed state)"
+        - "Ensure the action was performed on the correct accessibility-labeled element"
+
+metadata:  # bookkeeping fields (tags, priority, ownership)
+  tags: ["action", "accessibility", "aria", "click", "a11y"]
+  priority: "high"
+  timeout: 60000  # duplicates the top-level timeout above; keep the two in sync
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
diff --git a/eval-server/evals/action-agent/action-agent-accordion-001.yaml b/eval-server/evals/action-agent/action-agent-accordion-001.yaml
@@ -0,0 +1,46 @@
+# Accordion expansion test: expand the "Section 2" panel of the jQuery UI accordion demo.
+id: "action-agent-accordion-001"
+name: "Expand Accordion Section"
+description: "Test clicking to expand an accordion panel"
+enabled: true
+
+target:  # page this evaluation is pointed at
+  url: "https://jqueryui.com/accordion/"
+  wait_for: "networkidle"  # presumably the load condition awaited before acting — TODO confirm
+  wait_timeout: 5000  # presumably milliseconds — TODO confirm
+
+tool: "action_agent"
+timeout: 60000  # same value is repeated as metadata.timeout below
+
+input:  # task description for the tool named above
+  objective: "Click to expand the \"Section 2\" accordion panel"
+  reasoning: "Testing accordion expand/collapse interaction"
+
+validation:  # graded by an LLM judge using the criteria and before/after captures below
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the Section 2 accordion header"
+      - "Successfully clicked to expand the section"
+      - "Section 2 content became visible"
+      - "Other sections collapsed appropriately"
+      - "Accordion animation completed smoothly"  # NOTE(review): "smoothly" is hard to judge from before/after captures alone
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify Section 2 is now expanded and content visible"
+        - "Check if other accordion sections collapsed"
+        - "Confirm the expansion animation completed"
+        - "Ensure Section 2 header shows expanded state"
+
+metadata:  # bookkeeping fields (tags, priority, ownership)
+  tags: ["action", "accordion", "expand", "collapse", "ui"]
+  priority: "high"
+  timeout: 60000  # duplicates the top-level timeout above; keep the two in sync
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
diff --git a/eval-server/evals/action-agent/action-agent-autocomplete-001.yaml b/eval-server/evals/action-agent/action-agent-autocomplete-001.yaml
@@ -0,0 +1,46 @@
+# Autocomplete search test: type a prefix and pick a suggestion on the jQuery UI autocomplete demo.
+id: "action-agent-autocomplete-001"
+name: "Use Autocomplete Search"
+description: "Test typing in autocomplete field and selecting from suggestions"
+enabled: true
+
+target:  # page this evaluation is pointed at
+  url: "https://jqueryui.com/autocomplete/"
+  wait_for: "networkidle"  # presumably the load condition awaited before acting — TODO confirm
+  wait_timeout: 5000  # presumably milliseconds — TODO confirm
+
+tool: "action_agent"
+timeout: 60000  # same value is repeated as metadata.timeout below
+
+input:  # task description for the tool named above
+  objective: "Type \"Java\" in the autocomplete field and select \"JavaScript\" from suggestions"
+  reasoning: "Testing autocomplete/typeahead interaction patterns"
+
+validation:  # graded by an LLM judge using the criteria and before/after captures below
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the autocomplete input field"
+      - "Typed \"Java\" to trigger suggestions"
+      - "Autocomplete dropdown appeared with suggestions"
+      - "Selected \"JavaScript\" from the suggestion list"
+      - "Input field shows the selected value"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify \"JavaScript\" appears in the input field"
+        - "Check if autocomplete suggestions appeared"
+        - "Confirm the correct suggestion was selected"
+        - "Ensure dropdown closed after selection"
+
+metadata:  # bookkeeping fields (tags, priority, ownership)
+  tags: ["action", "autocomplete", "typeahead", "search", "suggestions"]
+  priority: "high"
+  timeout: 60000  # duplicates the top-level timeout above; keep the two in sync
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
diff --git a/eval-server/evals/action-agent/action-agent-checkbox-001.yaml b/eval-server/evals/action-agent/action-agent-checkbox-001.yaml
@@ -6,6 +6,8 @@ enabled: true
 
 target:
   url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox"
+  wait_for: "networkidle"
+  wait_timeout: 5000
 
 tool: "action_agent"
 timeout: 45000
@@ -14,13 +16,11 @@ input:
   objective: "Click the checkbox labeled \"I have a bike\" to check it"
   reasoning: "Testing interaction with checkbox form elements"
 
-schedule:
-  type: "on_demand"
-
 validation:
   type: "llm-judge"
   llm_judge:
-    model: "gpt-4o"
+    model: "gpt-4.1-mini"
+    temperature: 0.3
     criteria:
       - "Identified the correct checkbox among multiple options"
       - "Used click action on the checkbox element"
@@ -39,4 +39,8 @@ validation:
 
 metadata:
   tags: ["action", "checkbox", "form", "w3schools", "input"]
-  priority: "normal"
+  priority: "high"
+  timeout: 45000  # kept equal to this eval's top-level `timeout` (45000), as in the sibling eval files
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
diff --git a/eval-server/evals/action-agent/action-agent-checkbox-002.yaml b/eval-server/evals/action-agent/action-agent-checkbox-002.yaml
@@ -6,6 +6,8 @@ enabled: true
 
 target:
   url: "https://httpbin.org/forms/post"
+  wait_for: "networkidle"
+  wait_timeout: 5000
 
 tool: "action_agent"
 timeout: 45000
@@ -15,13 +17,11 @@ input:
   reasoning: "Testing checkbox interaction functionality using check method"
   hint: "Look for the Extra Cheese checkbox and use the check method to select it"
 
-schedule:
-  type: "on_demand"
-
 validation:
   type: "llm-judge"
   llm_judge:
-    model: "gpt-4o"
+    model: "gpt-4.1-mini"
+    temperature: 0.3
     criteria:
       - "Located the Extra Cheese checkbox in the Pizza Toppings section"
       - "Used the check method instead of click for better reliability"
@@ -40,7 +40,8 @@ validation:
 
 metadata:
   tags: ["action", "checkbox", "check", "form", "httpbin"]
-  priority: "normal"
+  priority: "high"
   timeout: 45000
   retries: 2
-  flaky: false
+  flaky: false
+  owner: "devtools-team"
diff --git a/eval-server/evals/action-agent/action-agent-click-001.yaml b/eval-server/evals/action-agent/action-agent-click-001.yaml
@@ -6,22 +6,22 @@ enabled: true
 
 target:
   url: "https://www.google.com"
+  wait_for: "networkidle"
+  wait_timeout: 5000
 
 tool: "action_agent"
-timeout: 60000
+timeout: 90000
 
 input:
   objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button"
   reasoning: "Testing multi-step interaction: text input followed by button click"
   hint: "First fill the search input field, then find and click the search button"
 
-schedule:
-  type: "on_demand"
-
 validation:
   type: "llm-judge"
   llm_judge:
-    model: "gpt-4o"
+    model: "gpt-4.1-mini"
+    temperature: 0.3
     criteria:
       - "Successfully located the search input field"
       - "Entered \"DevTools automation\" text in the search box"
@@ -40,4 +40,8 @@ validation:
 
 metadata:
   tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"]
-  priority: "high"
+  priority: "high"
+  timeout: 90000
+  retries: 2
+  flaky: false
+  owner: "devtools-team"
diff --git a/eval-server/evals/action-agent/action-agent-context-001.yaml b/eval-server/evals/action-agent/action-agent-context-001.yaml
@@ -0,0 +1,46 @@
+# Right click context menu test: right-click the designated area to open its context menu.
+id: "action-agent-context-001"
+name: "Right Click Context Menu"
+description: "Test right-clicking to open context menu"
+enabled: true
+
+target:  # page this evaluation is pointed at
+  url: "https://the-internet.herokuapp.com/context_menu"
+  wait_for: "networkidle"  # presumably the load condition awaited before acting — TODO confirm
+  wait_timeout: 5000  # presumably milliseconds — TODO confirm
+
+tool: "action_agent"
+timeout: 60000  # same value is repeated as metadata.timeout below
+
+input:  # task description for the tool named above
+  objective: "Right-click on the context menu area to open the context menu"
+  reasoning: "Testing right-click context menu interaction"
+
+validation:  # graded by an LLM judge using the criteria and before/after captures below
+  type: "llm-judge"
+  llm_judge:
+    model: "gpt-4.1-mini"
+    temperature: 0.3
+    criteria:
+      - "Located the designated context menu area"
+      - "Performed right-click action correctly"
+      - "Context menu appeared with options"
+      - "Successfully triggered the right-click event"
+      - "Alert or confirmation appeared as expected"
+    visual_verification:
+      enabled: true
+      capture_before: true
+      capture_after: true
+      prompts:
+        - "Verify right-click was performed on correct area"
+        - "Check if context menu or alert appeared"
+        - "Confirm right-click event was properly triggered"
+        - "Ensure the expected response occurred"
+
+metadata:  # bookkeeping fields (tags, priority, ownership)
+  tags: ["action", "context-menu", "right-click", "mouse", "menu"]
+  priority: "high"
+  timeout: 60000  # duplicates the top-level timeout above; keep the two in sync
+  retries: 2
+  flaky: false
+  owner: "devtools-team"