OctagonAI · mclenhard · May 16, 2025 · May 18, 2025 · coderabbitai · May 18, 2025
diff --git a/README.md b/README.md
@@ -73,7 +73,6 @@ If you see version numbers for all three, you are ready to proceed with the inst
 
 ## Installation
 
-
 ### Running on Claude Desktop
 
 To configure Octagon MCP for Claude Desktop:
@@ -276,6 +275,14 @@ Research the financial impact of Apple's privacy changes on digital advertising
 2. **Connection Issues**: Make sure the connectivity to the Octagon API is working properly.
 3. **Rate Limiting**: If you encounter rate limiting errors, reduce the frequency of your requests.
 
+## Running Evals
+
+The evals package loads an MCP client that runs the `index.ts` file directly, so there is no need to rebuild between tests. You can set environment variables by prefixing the `npx` command, as shown below. Full documentation is available in the [mcp-evals docs](https://www.mcpevals.io/docs).
+
+```bash
+OPENAI_API_KEY=your-key  npx mcp-eval src/evals/evals.ts src/index.ts
+```
+
 ## Installation
 
 ### Running with npx

diff --git a/package.json b/package.json
@@ -38,7 +38,8 @@
     "@modelcontextprotocol/sdk": "^1.0.0",
     "dotenv": "^16.3.1",
     "openai": "^4.20.1",
-    "zod": "^3.22.4"
+    "zod": "^3.22.4",
+    "mcp-evals": "^1.0.18"
   },
   "devDependencies": {
     "@types/node": "^20.10.0",
@@ -56,4 +57,4 @@
     "url": "https://github.com/OctagonAI/octagon-mcp-server/issues"
   },
   "homepage": "https://docs.octagonagents.com"
-}
+}
diff --git a/src/evals/evals.ts b/src/evals/evals.ts
@@ -0,0 +1,59 @@
+//evals.ts
+
+import { EvalConfig } from 'mcp-evals';
+import { openai } from "@ai-sdk/openai";
+import { grade, EvalFunction } from "mcp-evals";
+
+/**
+ * Eval case for the octagon-sec-agent tool.
+ *
+ * `run` passes a fixed SEC-filings question to `grade` together with the
+ * `openai("gpt-4")` model, then JSON-parses the value `grade` resolves to and
+ * returns it. Any failure from `grade` or `JSON.parse` is logged with this
+ * eval's name and rethrown, so the caller still observes the error.
+ */
+const octagonSecAgentEval: EvalFunction = {
+    name: "octagon-sec-agent Tool Evaluation",
+    description: "Evaluates the SEC filings analysis capabilities of the octagon-sec-agent",
+    run: async () => {
+        try {
+            const result = await grade(openai("gpt-4"), "What was Apple's R&D expense as a percentage of revenue in their latest fiscal year?");
+            return JSON.parse(result);
+        } catch (error) {
+            // Record which eval failed before propagating; the error is not swallowed.
+            console.error("Error in octagonSecAgentEval:", error);
+            throw error;
+        }
+    }
+};
+
+/**
+ * Eval case for the octagon-transcripts-agent.
+ *
+ * `run` hands a fixed earnings-call question to `grade` with the
+ * `openai("gpt-4")` model and returns the JSON-parsed result.
+ */
+const octagonTranscriptsAgentEval: EvalFunction = {
+    name: "octagon-transcripts-agent Evaluation",
+    description: "Evaluates the accuracy and completeness of the octagon-transcripts-agent for analyzing earnings call transcripts",
+    async run() {
+        const prompt = "What did Amazon's CEO say about AWS growth expectations in the latest earnings call?";
+        const graded = await grade(openai("gpt-4"), prompt);
+        return JSON.parse(graded);
+    }
+};
+
+/**
+ * Eval case for the octagon-financials-agent.
+ *
+ * `run` hands a fixed margin-comparison question to `grade` with the
+ * `openai("gpt-4")` model and returns the JSON-parsed result.
+ */
+const octagonFinancialsAgentEval: EvalFunction = {
+    name: "octagon-financials-agent Evaluation",
+    description: "Evaluates the financial analysis and ratio calculation capabilities of the octagon-financials-agent",
+    async run() {
+        const prompt = "Compare the gross margins, operating margins, and net margins of Apple, Microsoft, and Google over the last 3 years and provide insights on which company shows the strongest profitability trends.";
+        const graded = await grade(openai("gpt-4"), prompt);
+        return JSON.parse(graded);
+    }
+};
+
+/**
+ * Eval case for the Octagon Stock Data Agent.
+ *
+ * `run` hands a fixed stock-performance question to `grade` with the
+ * `openai("gpt-4")` model and returns the JSON-parsed result.
+ */
+const octagonStockDataAgentEval: EvalFunction = {
+    name: "Octagon Stock Data Agent Evaluation",
+    description: "Evaluates the performance of the Octagon Stock Data Agent for stock market data and valuation analysis",
+    async run() {
+        const prompt = "Compare Apple's stock performance to the S&P 500 over the last 6 months, including any significant events or catalysts that influenced price movements.";
+        const graded = await grade(openai("gpt-4"), prompt);
+        return JSON.parse(graded);
+    }
+};
+
+/**
+ * Eval case for the octagon-companies-agent.
+ *
+ * `run` hands a fixed company-ranking question to `grade` with the
+ * `openai("gpt-4")` model and returns the JSON-parsed result.
+ */
+const octagonCompaniesAgentEval: EvalFunction = {
+    name: 'octagon-companies-agent Evaluation',
+    description: 'Evaluates the specialized private market intelligence tool for company info lookups and financials',
+    async run() {
+        const prompt = "List the top 5 companies in the AI sector by revenue growth";
+        const graded = await grade(openai("gpt-4"), prompt);
+        return JSON.parse(graded);
+    }
+};
+
+/**
+ * Eval configuration exported as this module's default.
+ *
+ * Pairs the `openai("gpt-4")` model with the five eval definitions declared
+ * in this file.
+ */
+const config: EvalConfig = {
+    model: openai("gpt-4"),
+    evals: [
+        octagonSecAgentEval,
+        octagonTranscriptsAgentEval,
+        octagonFinancialsAgentEval,
+        octagonStockDataAgentEval,
+        octagonCompaniesAgentEval,
+    ],
+};
+
+export default config;
+
+/** Named export listing the five eval definitions, in the same order as `config.evals`. */
+export const evals = [
+    octagonSecAgentEval,
+    octagonTranscriptsAgentEval,
+    octagonFinancialsAgentEval,
+    octagonStockDataAgentEval,
+    octagonCompaniesAgentEval,
+];