Skip to content

Feat/external eval server #23

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 18, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions eval-server/CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,6 @@ src/
├── config.js # Configuration management
└── cli.js # Interactive CLI for testing and management

test/
└── example-agent.js # Example agent implementation for testing

logs/ # Log files (created automatically)
├── combined.log # All log events
├── error.log # Error events only
Expand Down
11 changes: 1 addition & 10 deletions eval-server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,7 @@ A WebSocket-based evaluation server for LLM agents using LLM-as-a-judge methodol
npm start
```

4. **Test with example agent** (in another terminal)
```bash
npm test
```

5. **Use interactive CLI** (alternative to step 3)
4. **Use interactive CLI** (alternative to step 3)
```bash
npm run cli
```
Expand All @@ -47,10 +42,6 @@ Your agent needs to:
2. Send a `{"type": "ready"}` message when ready for evaluations
3. Implement the `Evaluate` RPC method that accepts a string task and returns a string response

## Example Agent

See `test/example-agent.js` for a complete implementation example.

## For more details

See [CLAUDE.md](./CLAUDE.md) for comprehensive documentation of the architecture and implementation.
12 changes: 12 additions & 0 deletions eval-server/clients/1233ae25-9f9e-4f77-924d-865f7d615cef.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
client:
id: 1233ae25-9f9e-4f77-924d-865f7d615cef
name: DevTools Client 1233ae25
secret_key: hello
description: Auto-generated DevTools evaluation client
settings:
max_concurrent_evaluations: 3
default_timeout: 45000
retry_policy:
max_retries: 2
backoff_multiplier: 2
initial_delay: 1000
46 changes: 46 additions & 0 deletions eval-server/evals/action-agent/action-agent-a11y-001.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Accessibility action test
id: "action-agent-a11y-001"
name: "Click Using ARIA Label"
description: "Test clicking an element identified primarily by ARIA attributes"
enabled: true

target:
url: "https://www.w3.org/WAI/ARIA/apg/patterns/button/examples/button/"
wait_for: "networkidle"
wait_timeout: 5000

tool: "action_agent"
timeout: 60000

input:
objective: "Click the button with aria-label \"Print Page\""
reasoning: "Testing action selection using accessibility attributes"

validation:
type: "llm-judge"
llm_judge:
model: "gpt-4.1-mini"
temperature: 0.3
criteria:
- "Used accessibility tree to find elements"
- "Correctly identified element by ARIA label"
- "Successfully clicked the target button"
- "Demonstrated understanding of accessibility attributes"
- "No reliance on visual appearance alone"
visual_verification:
enabled: true
capture_before: true
capture_after: true
prompts:
- "Verify the Print Page button was successfully clicked"
- "Check if any print dialog or print preview appeared"
- "Confirm the button showed visual feedback (pressed state)"
- "Ensure the action was performed on the correct accessibility-labeled element"

metadata:
tags: ["action", "accessibility", "aria", "click", "a11y"]
priority: "high"
timeout: 60000
retries: 2
flaky: false
owner: "devtools-team"
46 changes: 46 additions & 0 deletions eval-server/evals/action-agent/action-agent-accordion-001.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Accordion expansion test
id: "action-agent-accordion-001"
name: "Expand Accordion Section"
description: "Test clicking to expand an accordion panel"
enabled: true

target:
url: "https://jqueryui.com/accordion/"
wait_for: "networkidle"
wait_timeout: 5000

tool: "action_agent"
timeout: 60000

input:
objective: "Click to expand the \"Section 2\" accordion panel"
reasoning: "Testing accordion expand/collapse interaction"

validation:
type: "llm-judge"
llm_judge:
model: "gpt-4.1-mini"
temperature: 0.3
criteria:
- "Located the Section 2 accordion header"
- "Successfully clicked to expand the section"
- "Section 2 content became visible"
- "Other sections collapsed appropriately"
- "Accordion animation completed smoothly"
visual_verification:
enabled: true
capture_before: true
capture_after: true
prompts:
- "Verify Section 2 is now expanded and content visible"
- "Check if other accordion sections collapsed"
- "Confirm the expansion animation completed"
- "Ensure Section 2 header shows expanded state"

metadata:
tags: ["action", "accordion", "expand", "collapse", "ui"]
priority: "high"
timeout: 60000
retries: 2
flaky: false
owner: "devtools-team"
46 changes: 46 additions & 0 deletions eval-server/evals/action-agent/action-agent-autocomplete-001.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Autocomplete search test
id: "action-agent-autocomplete-001"
name: "Use Autocomplete Search"
description: "Test typing in autocomplete field and selecting from suggestions"
enabled: true

target:
url: "https://jqueryui.com/autocomplete/"
wait_for: "networkidle"
wait_timeout: 5000

tool: "action_agent"
timeout: 60000

input:
objective: "Type \"Java\" in the autocomplete field and select \"JavaScript\" from suggestions"
reasoning: "Testing autocomplete/typeahead interaction patterns"

validation:
type: "llm-judge"
llm_judge:
model: "gpt-4.1-mini"
temperature: 0.3
criteria:
- "Located the autocomplete input field"
- "Typed \"Java\" to trigger suggestions"
- "Autocomplete dropdown appeared with suggestions"
- "Selected \"JavaScript\" from the suggestion list"
- "Input field shows the selected value"
visual_verification:
enabled: true
capture_before: true
capture_after: true
prompts:
- "Verify \"JavaScript\" appears in the input field"
- "Check if autocomplete suggestions appeared"
- "Confirm the correct suggestion was selected"
- "Ensure dropdown closed after selection"

metadata:
tags: ["action", "autocomplete", "typeahead", "search", "suggestions"]
priority: "high"
timeout: 60000
retries: 2
flaky: false
owner: "devtools-team"
14 changes: 9 additions & 5 deletions eval-server/evals/action-agent/action-agent-checkbox-001.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ enabled: true

target:
url: "https://www.w3schools.com/html/tryit.asp?filename=tryhtml_checkbox"
wait_for: "networkidle"
wait_timeout: 5000

tool: "action_agent"
timeout: 45000
Expand All @@ -14,13 +16,11 @@ input:
objective: "Click the checkbox labeled \"I have a bike\" to check it"
reasoning: "Testing interaction with checkbox form elements"

schedule:
type: "on_demand"

validation:
type: "llm-judge"
llm_judge:
model: "gpt-4o"
model: "gpt-4.1-mini"
temperature: 0.3
criteria:
- "Identified the correct checkbox among multiple options"
- "Used click action on the checkbox element"
Expand All @@ -39,4 +39,8 @@ validation:

metadata:
tags: ["action", "checkbox", "form", "w3schools", "input"]
priority: "normal"
priority: "high"
timeout: 60000
retries: 2
flaky: false
owner: "devtools-team"
13 changes: 7 additions & 6 deletions eval-server/evals/action-agent/action-agent-checkbox-002.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ enabled: true

target:
url: "https://httpbin.org/forms/post"
wait_for: "networkidle"
wait_timeout: 5000

tool: "action_agent"
timeout: 45000
Expand All @@ -15,13 +17,11 @@ input:
reasoning: "Testing checkbox interaction functionality using check method"
hint: "Look for the Extra Cheese checkbox and use the check method to select it"

schedule:
type: "on_demand"

validation:
type: "llm-judge"
llm_judge:
model: "gpt-4o"
model: "gpt-4.1-mini"
temperature: 0.3
criteria:
- "Located the Extra Cheese checkbox in the Pizza Toppings section"
- "Used the check method instead of click for better reliability"
Expand All @@ -40,7 +40,8 @@ validation:

metadata:
tags: ["action", "checkbox", "check", "form", "httpbin"]
priority: "normal"
priority: "high"
timeout: 45000
retries: 2
flaky: false
flaky: false
owner: "devtools-team"
16 changes: 10 additions & 6 deletions eval-server/evals/action-agent/action-agent-click-001.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,22 @@ enabled: true

target:
url: "https://www.google.com"
wait_for: "networkidle"
wait_timeout: 5000

tool: "action_agent"
timeout: 60000
timeout: 90000

input:
objective: "Type \"DevTools automation\" in the search box and then click the \"Google Search\" button"
reasoning: "Testing multi-step interaction: text input followed by button click"
hint: "First fill the search input field, then find and click the search button"

schedule:
type: "on_demand"

validation:
type: "llm-judge"
llm_judge:
model: "gpt-4o"
model: "gpt-4.1-mini"
temperature: 0.3
criteria:
- "Successfully located the search input field"
- "Entered \"DevTools automation\" text in the search box"
Expand All @@ -40,4 +40,8 @@ validation:

metadata:
tags: ["action", "multi-step", "search", "form-fill", "click", "google", "basic"]
priority: "high"
priority: "high"
timeout: 90000
retries: 2
flaky: false
owner: "devtools-team"
46 changes: 46 additions & 0 deletions eval-server/evals/action-agent/action-agent-context-001.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Right click context menu test
id: "action-agent-context-001"
name: "Right Click Context Menu"
description: "Test right-clicking to open context menu"
enabled: true

target:
url: "https://the-internet.herokuapp.com/context_menu"
wait_for: "networkidle"
wait_timeout: 5000

tool: "action_agent"
timeout: 60000

input:
objective: "Right-click on the context menu area to open the context menu"
reasoning: "Testing right-click context menu interaction"

validation:
type: "llm-judge"
llm_judge:
model: "gpt-4.1-mini"
temperature: 0.3
criteria:
- "Located the designated context menu area"
- "Performed right-click action correctly"
- "Context menu appeared with options"
- "Successfully triggered the right-click event"
- "Alert or confirmation appeared as expected"
visual_verification:
enabled: true
capture_before: true
capture_after: true
prompts:
- "Verify right-click was performed on correct area"
- "Check if context menu or alert appeared"
- "Confirm right-click event was properly triggered"
- "Ensure the expected response occurred"

metadata:
tags: ["action", "context-menu", "right-click", "mouse", "menu"]
priority: "high"
timeout: 60000
retries: 2
flaky: false
owner: "devtools-team"
Loading