diff --git a/docs/extending.md b/docs/extending.md index c8f70af..3f15045 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -24,6 +24,7 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and ### 📋 Implementation Requirements **✅ Essential Requirements:** + - Extend `AgentSystem` base class - Implement `run_agent()` method (abstract method - required) - Include `evaluator` in config during initialization @@ -31,6 +32,7 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and - Register with `AgentSystemRegistry` **💡 Optional but Recommended:** + - Implement `_create_agents()` for tool integration support - Use `self.format_prompt` for benchmark-specific formatting - Handle async execution properly if needed @@ -40,18 +42,19 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and #### Step 1: Create Agent System Class Structure ✅ Langgraph supported -✅ Customizable agent and multi-agent interaction +✅ Customizable agent and multi-agent interaction **📋 Implementation Guide:** - - Inherit from `AgentSystem` base class - - Initialize configuration parameters (num_agents, num_rounds, model_name) - - Set up agent components using `_create_agents()` method - - Extract workers and result extractors from created components - - Validate that required components are available + +- Inherit from `AgentSystem` base class +- Initialize configuration parameters (num_agents, num_rounds, model_name) +- Set up agent components using `_create_agents()` method +- Extract workers and result extractors from created components +- Validate that required components are available **💡 SupervisorMAS Implementation Example (LangGraph Structure):** -``` +```python # mas_arena/agents/supervisor_mas.py def _init_graph_if_needed(self, problem_input: Optional[Any] = None, feedback: Optional[Any] = None): @@ -93,40 +96,45 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and ``` **💡 ChatEval Implementation Example (Basic Structure):** -``` + +```python # mas_arena/agents/chateval.py class ChatEval(AgentSystem): """Multi-agent evaluation system based on iterative debate""" - def __init__(self, name: str = "chateval", config: Dict[str, Any] = None): - super().__init__(name, config) + def __init__(self, name: str = "chateval", config: Optional[Dict[str, Any]] = None): + super().__init__(name, config or {}) self.config = config or {} self.num_agents = self.config.get("num_agents", 3) self.num_rounds = self.config.get("num_rounds", 2) - self.model_name = self.config.get("model_name") or os.getenv("MODEL_NAME", "gpt-4o-mini") - - # Initialize agents and extractor via _create_agents - # self.agents and self.extractor will be set by _create_agents + self.model_name = get_model_name(self.config.get("model_name")) + agent_components = self._create_agents() self.agents = [w for w in agent_components["workers"] if isinstance(w, Agent)] - extractors = [w for w in agent_components["workers"] if isinstance(w, ResultExtractor)] + extractors = [ + w for w in agent_components["workers"] if isinstance(w, ResultExtractor) + ] if not extractors: - raise ValueError("ResultExtractor not found in components created by _create_agents.") + raise ValueError( + "ResultExtractor not found in components created by _create_agents." 
+ ) self.extractor = extractors[0] ``` #### Step 2: Implement Core `run_agent` Method **📋 Implementation Guide:** - - Extract problem text from input dictionary - - Initialize message storage for tracking LLM responses - - Implement multi-round agent interaction logic - - Collect and process agent responses with proper metadata - - Extract final answer using result extractor - - Return formatted result with messages and final answer + +- Extract problem text from input dictionary +- Initialize message storage for tracking LLM responses +- Implement multi-round agent interaction logic +- Collect and process agent responses with proper metadata +- Extract final answer using result extractor +- Return formatted result with messages and final answer **💡 ChatEval Implementation Example (run_agent Core Method):** -``` + +```python # mas_arena/agents/chateval.py async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """Run iterative debate process""" @@ -134,101 +142,103 @@ class ChatEval(AgentSystem): # store all LLM response objects all_messages = [] - agent_histories = [] - + # Store all responses for context building - indexed by [round][agent_index] + round_responses = [] + # iterative discussion process - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] for t in range(self.num_rounds): + # Clear all agents' chat history at the start of each round to prevent accumulation + for agent in self.agents: + agent.chat_history = [] + + # Store responses for this round + current_round_responses = [] + for n, agent in enumerate(self.agents): - # generate response for current agent - context = self._build_context(problem_text, n, t) - response_data = agent.generate_response(context) - + # Rebuild context from scratch using all previous responses + context = self._build_context(problem_text, n, t, round_responses) + response_data = await agent.generate_response(context) + # save response object if "message" in response_data: all_messages.append(response_data["message"]) - - # add response to context of subsequent agents + + # Store this agent's response for current round solution_text = response_data.get("solution", "") - for m in range(n + 1, len(self.agents)): - self.agents[m].chat_history.append({ - "role": "human", - "human": f"{agent_names[n]}'s response: {solution_text}" - }) - - # extract all agent chat histories - agent_histories = [agent.chat_history for agent in self.agents] - - # extract final answer - extractor_result = self.extractor.extract(agent_histories, problem_text) + current_round_responses.append({ + "agent_index": n, + "agent_name": AGENT_NAMES[n], + "response": solution_text + }) + + # Add current round responses to the overall history + round_responses.append(current_round_responses) + + # Generate conversation history from round responses + conversation_history = self._generate_conversation_history(round_responses) + # extract final answer using ResultExtractor + extractor_result = await self.extractor.extract(conversation_history, problem_text) + # add evaluator message if "message" in extractor_result and extractor_result["message"]: all_messages.append(extractor_result["message"]) + return { - "messages": all_messages, # contains all LLM response objects - "final_answer": extractor_result["message"].content + "messages": all_messages, + "conversation_history": conversation_history, + "final_answer": extractor_result["message"].content if extractor_result["message"] else None, } ``` #### Step 3: Implement `_create_agents` 
Method (Tool Integration Support) **📋 Implementation Guide:** - - Create specialized `AgentNode` instances for each role - - Set agent names, models, and system prompts - - Create result extractor with format prompt integration - - Return dictionary with "workers" key containing all components - - Ensure each worker has `.name` and `.llm` attributes for tool binding + +- Create specialized `AgentNode` instances for each role +- Set agent names, models, and system prompts +- Create result extractor with format prompt integration +- Return dictionary with "workers" key containing all components +- Ensure each worker has `.name` and `.llm` attributes for tool binding **💡 ChatEval Implementation Example (_create_agents Tool Integration):** -``` + +```python # mas_arena/agents/chateval.py - def _create_agents(self) -> List[Agent]: + def _create_agents(self) -> Dict[str, List[Any]]: """Create multiple agent instances and result extractor""" - # This method will be patched by ToolIntegrationWrapper if this system is wrapped. - # The wrapper expects a dictionary: {"workers": [worker1, worker2, ...]} - # Each worker should have a .name and .llm attribute. - debate_agents = [] - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] for i in range(self.num_agents): agent = Agent( - agent_id=f"agent_{i+1}", - name=agent_names[i], + agent_id=i+1, + name=AGENT_NAMES[i], model_name=self.model_name, - system_prompt=self._get_agent_prompt(i) + system_prompt=self._get_agent_prompt(i), ) debate_agents.append(agent) - - # Create and assign the extractor here - extractor = ResultExtractor(self.model_name, self.format_prompt) - # self.extractor = extractor # Assign to self if needed elsewhere before run_agent completes, - # but __init__ already handles setting self.extractor. 
- return { - "workers": debate_agents + [extractor] - } + # Create and assign the extractor here + extractor = ResultExtractor(self.model_name, getattr(self, 'format_prompt', '')) + + return {"workers": debate_agents + [extractor]} ``` #### Step 4: Register System with Framework **📋 Implementation Guide:** - - Use `AgentSystemRegistry.register()` to make system available - - Provide system name as string identifier - - Pass class reference (not instance) - - Include default configuration parameters - - These defaults can be overridden during initialization + +- Use `AgentSystemRegistry.register()` to make system available +- Provide system name as string identifier +- Pass class reference (not instance) +- Include default configuration parameters +- These defaults can be overridden during initialization **💡 ChatEval Implementation Example (Registration):** -``` + +```python # mas_arena/agents/chateval.py # register agent system -AgentSystemRegistry.register( - "chateval", - ChatEval, - num_agents=3, - num_rounds=2 -) +AgentSystemRegistry.register("chateval", ChatEval, num_agents=3, num_rounds=2) ``` ### ⚡ Advanced Features @@ -236,66 +246,82 @@ AgentSystemRegistry.register( #### 🎨 Format Prompt Integration **📋 Implementation Guide:** - - Accept `format_prompt` parameter in initialization - - Store format prompt for benchmark-specific requirements - - Use format prompt in result extraction and agent prompts - - Configure timeout and retry settings for robust operation + +- Accept `format_prompt` parameter in initialization +- Store format prompt for benchmark-specific requirements +- Use format prompt in result extraction and agent prompts +- Configure timeout and retry settings for robust operation **💡 ChatEval Implementation Example (Format Prompt Integration):** -``` + +```python # mas_arena/agents/chateval.py - def __init__(self, model_name: str = None, format_prompt: str = ""): - self.model_name = model_name or os.getenv("MODEL_NAME", "gpt-4o-mini") +def get_model_name(config_model_name: Optional[str] = None) -> str: + """Get model name from config or environment variable""" + return config_model_name or os.getenv("MODEL_NAME", DEFAULT_MODEL_NAME) + +def create_llm(model_name: str) -> ChatOpenAI: + """Create ChatOpenAI instance with standardized configuration""" + return ChatOpenAI( + model=model_name, + timeout=60, # Set request timeout to 60 seconds + max_retries=2, # Set maximum retry attempts to 2 + ) + +class ResultExtractor: + def __init__(self, model_name: Optional[str] = None, format_prompt: str = ""): + self.model_name = get_model_name(model_name) self.format_prompt = format_prompt - self.llm = ChatOpenAI( - model=self.model_name, - request_timeout=60, # Set request timeout to 60 seconds - max_retries=2 # Set maximum retry attempts to 2 - ) + self.llm = create_llm(self.model_name) self.name = "result_extractor" ``` #### 🤖 Agent Node Pattern **📋 Implementation Guide:** - - Use dataclass decorator for clean agent definition - - Include required attributes: agent_id, name, model_name, system_prompt - - Initialize chat history as empty list - - Set up LLM instance with timeout and retry configuration - - Ensure compatibility with tool integration framework + +- Use dataclass decorator for clean agent definition +- Include required attributes: agent_id, name, model_name, system_prompt +- Initialize chat history as empty list +- Set up LLM instance with timeout and retry configuration +- Ensure compatibility with tool integration framework **💡 ChatEval Implementation Example (Agent 
Class Definition):** -``` + +```python # mas_arena/agents/chateval.py +from dataclasses import dataclass + @dataclass class Agent: """Represents an LLM agent""" - agent_id: str + + agent_id: int name: str model_name: str system_prompt: str chat_history: List[Dict[str, str]] = None - + def __post_init__(self): - self.chat_history = [] - self.llm = ChatOpenAI( - model=self.model_name, - request_timeout=60, # Set request timeout to 60 seconds - max_retries=2 # Set maximum retry attempts to 2 - ) + if self.chat_history is None: + self.chat_history = [] + self.llm = create_llm(self.model_name) ``` #### 🔄 Usage Metadata Handling **📋 Implementation Guide:** - - For native OpenAI API calls or non-structured output: No manual handling required - - For structured output: Use `self.llm.with_structured_output(schema=AgentResponse, include_raw=True)` - - Usage metadata is automatically handled by the framework - - Focus on implementing the structured output schema instead + +- For native OpenAI API calls or non-structured output: No manual handling required +- For structured output: Use `self.llm.with_structured_output(schema=AgentResponse, include_raw=True)` +- Usage metadata is automatically handled by the framework +- Focus on implementing core agent logic and response processing +- Return standardized response format with message and solution ### 📋 Key Implementation Summary **🔧 Implementation Points:** + - Inherit from `AgentSystem` base class - Implement required `run_agent()` method - Ensure config includes `evaluator` key @@ -316,14 +342,16 @@ Use `AgentSystemRegistry.register()` to register system and provide default conf #### Step 1: Basic Structure and Registration **📋 Implementation Guide:** - - Use `@register_benchmark` decorator to register evaluator - - Define normalization keys mapping for data field standardization - - Inherit from `BaseEvaluator` base class - - Provide comprehensive docstring explaining evaluator purpose - - Set up evaluator name and supported answer formats + +- Use `@register_benchmark` decorator to register evaluator +- Define normalization keys mapping for data field standardization +- Inherit from `BaseEvaluator` base class +- Provide comprehensive docstring explaining evaluator purpose +- Set up evaluator name and supported answer formats **💡 MMLU_pro Implementation Example (Registration and Class Definition):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py @register_benchmark( name="mmlu_pro", @@ -345,14 +373,16 @@ class MMLU_ProEvaluator(BaseEvaluator): #### Step 2: Initialize Configuration **📋 Implementation Guide:** - - Call parent class initialization with name and config - - Set up evaluation-specific weights and parameters - - Configure dataset loading and validation - - Set up logging and error handling - - Define evaluation metrics and scoring methods + +- Call parent class initialization with name and config +- Set up evaluation-specific weights and parameters +- Configure dataset loading and validation +- Set up logging and error handling +- Define evaluation metrics and scoring methods **💡 MMLU_pro Implementation Example (Initialization):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def __init__(self, name="mmlu_pro", config=None): """ @@ -376,15 +406,17 @@ class MMLU_ProEvaluator(BaseEvaluator): #### Step 3: Implement Core Evaluation Method **📋 Implementation Guide:** - - Extract final answer and reference solution from inputs - - Use specialized answer extraction method for response parsing - - Apply scoring 
logic (exact match, numerical comparison, etc.) - - Calculate evaluation metrics and scores - - Return standardized evaluation results dictionary - - Include extracted answer and original final answer + +- Extract final answer and reference solution from inputs +- Use specialized answer extraction method for response parsing +- Apply scoring logic (exact match, numerical comparison, etc.) +- Calculate evaluation metrics and scores +- Return standardized evaluation results dictionary +- Include extracted answer and original final answer **💡 MMLU_pro Implementation Example (evaluate Method):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def evaluate(self, problem: Dict[str, Any], run_result: Dict[str, Any]) -> Dict[str, Any]: """ @@ -424,14 +456,16 @@ class MMLU_ProEvaluator(BaseEvaluator): #### 🔍 Answer Extraction **📋 Implementation Guide:** - - Use regular expressions to extract formatted answers - - Handle multiple answer formats (tags, patterns, raw text) - - Implement fallback strategies for unformatted responses - - Clean and normalize extracted text - - Support flexible answer parsing for different benchmarks + +- Use regular expressions to extract formatted answers +- Handle multiple answer formats (tags, patterns, raw text) +- Implement fallback strategies for unformatted responses +- Clean and normalize extracted text +- Support flexible answer parsing for different benchmarks **💡 MMLU_pro Implementation Example (Answer Extraction):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def extract_answer_from_response(self, response: str) -> str: """ @@ -455,14 +489,16 @@ class MMLU_ProEvaluator(BaseEvaluator): #### ✅ Answer Verification **📋 Implementation Guide:** - - Implement case-insensitive comparison for text answers - - Handle numerical index to letter conversion (1→A, 2→B, etc.) - - Apply normalization and cleaning to both reference and candidate - - Return numerical score (1.0 for match, 0.0 for no match) - - Include error handling for malformed inputs + +- Implement case-insensitive comparison for text answers +- Handle numerical index to letter conversion (1→A, 2→B, etc.) 
+- Apply normalization and cleaning to both reference and candidate +- Return numerical score (1.0 for match, 0.0 for no match) +- Include error handling for malformed inputs **💡 MMLU_pro Implementation Example (Exact Match Verification):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def check_exact_match(self, reference: str, candidate: str) -> float: """ @@ -499,15 +535,17 @@ class MMLU_ProEvaluator(BaseEvaluator): #### 📊 Batch Evaluation **📋 Implementation Guide:** - - Iterate through all problems in the batch - - Extract problem IDs and reference answers for each item - - Apply evaluation logic consistently across all problems - - Collect comprehensive results with metadata - - Log evaluation progress and summary statistics - - Return standardized results format for benchmark runner + +- Iterate through all problems in the batch +- Extract problem IDs and reference answers for each item +- Apply evaluation logic consistently across all problems +- Collect comprehensive results with metadata +- Log evaluation progress and summary statistics +- Return standardized results format for benchmark runner **💡 MMLU_pro Implementation Example (Batch Evaluation):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def batch_evaluate(self, problems: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]: """ @@ -554,6 +592,7 @@ class MMLU_ProEvaluator(BaseEvaluator): ### 💻 Code Evaluation **🔧 Code Evaluator Key Points:** + - Inherit from `BaseCodeEvaluator` base class (not BaseEvaluator) - Implement `check_solution(code, test, entry_point)` method - Implement `extract_code(text)` to extract code from responses @@ -561,6 +600,7 @@ class MMLU_ProEvaluator(BaseEvaluator): - Use isolated environments for code execution **📊 Core Process Flow:** + 1. **Code Extraction** - Extract Python code from agent responses 2. **Environment Isolation** - Create secure execution environment 3. **Test Execution** - Run test cases to verify code correctness @@ -569,6 +609,7 @@ class MMLU_ProEvaluator(BaseEvaluator): ### 📋 Evaluator Implementation Summary **🔧 Core Components:** + - Use `@register_benchmark` decorator for registration - Inherit from `BaseEvaluator` base class - Implement required `evaluate()` method @@ -576,12 +617,14 @@ class MMLU_ProEvaluator(BaseEvaluator): - Optional: Implement answer extraction and verification methods **📊 Evaluation Process:** + 1. **Data Normalization** - Map fields using normalization_keys 2. **Answer Extraction** - Extract final answer from messages 3. **Answer Verification** - Compare predicted vs reference answers 4. 
**Result Return** - Return score, extracted_answer, final_answer fields -> 📄 **Complete Implementation References**: +> 📄 **Complete Implementation References**: +> > - Text Evaluator: [`mas_arena/evaluators/mmlu_pro_evaluator.py`](../mas_arena/evaluators/mmlu_pro_evaluator.py) > - Code Evaluator: [`mas_arena/evaluators/humaneval_evaluator.py`](../mas_arena/evaluators/humaneval_evaluator.py) @@ -611,6 +654,7 @@ class MMLU_ProEvaluator(BaseEvaluator): ### 📋 Implementation Checklist **For MAS Extensions:** + - [ ] ✅ Config includes `evaluator` key - [ ] 📊 Messages have `usage_metadata` for token tracking - [ ] 🏷️ Agents have `name` and `llm` attributes (for tool integration) @@ -619,9 +663,9 @@ class MMLU_ProEvaluator(BaseEvaluator): - [ ] 📋 Proper registration with `AgentSystemRegistry` **For Evaluator Extensions:** + - [ ] 🎯 Used `@register_benchmark` decorator - [ ] ✅ Implemented `evaluate` method - [ ] 🗝️ Proper normalization_keys mapping - [ ] 🛡️ Error handling for malformed inputs - [ ] ⏱️ Timeout handling for long operations - diff --git a/mas_arena/agents/chateval.py b/mas_arena/agents/chateval.py index 2759ff4..bfeac02 100644 --- a/mas_arena/agents/chateval.py +++ b/mas_arena/agents/chateval.py @@ -1,210 +1,184 @@ import os -from typing import Dict, List, Any, TypedDict +from typing import Dict, List, Any, Optional, Union from dataclasses import dataclass from langchain_openai import ChatOpenAI from langchain_core.messages import SystemMessage, HumanMessage, AIMessage from mas_arena.agents.base import AgentSystem, AgentSystemRegistry -# define structured output class, use TypedDict instead of Pydantic -class AgentResponse(TypedDict): - """Structured output for agent responses""" - analysis: str # Problem analysis - solution: str # Solution - confidence: int # Confidence level in the solution, range 1-5 + +# Constants +AGENT_NAMES = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] +DEFAULT_MODEL_NAME = "gpt-4o-mini" + + +def get_model_name(config_model_name: Optional[str] = None) -> str: + """Get model name from config or environment variable""" + return config_model_name or os.getenv("MODEL_NAME", DEFAULT_MODEL_NAME) + +def create_llm(model_name: str) -> ChatOpenAI: + """Create ChatOpenAI instance with standardized configuration""" + return ChatOpenAI( + model=model_name, + timeout=60, # Set request timeout to 60 seconds + max_retries=2, # Set maximum retry attempts to 2 + ) + @dataclass class Agent: """Represents an LLM agent""" - agent_id: str + + agent_id: int name: str model_name: str system_prompt: str - chat_history: List[Dict[str, str]] = None - + chat_history: Optional[List[Dict[str, str]]] + def __post_init__(self): - self.chat_history = [] - self.llm = ChatOpenAI( - model=self.model_name, - request_timeout=60, # Set request timeout to 60 seconds - max_retries=2 # Set maximum retry attempts to 2 - ) - - async def generate_response(self, context: str) -> Any: - """Generate agent response""" - messages = [ + if self.chat_history is None: + self.chat_history = [] + self.llm = create_llm(self.model_name) + + def _update_chat_history(self, context: str, response_content: Any) -> None: + """Update chat history with human input and AI response""" + # Handle case where response_content might be a list or other type + content = response_content if isinstance(response_content, str) else str(response_content) + self.chat_history.append({"role": "human", "human": context}) + self.chat_history.append({"role": "ai", "ai": content}) + + def _build_messages(self, context: str) 
-> List[Union[SystemMessage, HumanMessage, AIMessage]]: + """Build message list for LLM input""" + return [ SystemMessage(content=self.system_prompt), - *[HumanMessage(content=msg["human"]) if msg.get("role") == "human" - else AIMessage(content=msg["ai"]) - for msg in self.chat_history], - HumanMessage(content=context) + *[ + ( + HumanMessage(content=msg["human"]) + if msg.get("role") == "human" + else AIMessage(content=msg["ai"]) + ) + for msg in (self.chat_history or []) + ], + HumanMessage(content=context), ] - - # Use structured output - try: - llm_with_schema = self.llm.with_structured_output(schema=AgentResponse, include_raw=True) - response = await llm_with_schema.ainvoke(messages) - - # Get structured data and raw response - structured_data = response["parsed"] - raw_response = response["raw"] - - - # Ensure structured_data is a dictionary, not an object - if hasattr(structured_data, "dict"): - structured_data = structured_data.dict() - elif hasattr(structured_data, "model_dump"): - structured_data = structured_data.model_dump() - - # Set AI message name - raw_response.name = self.name - - # Update chat history - self.chat_history.append({ - "role": "human", - "human": context - }) - self.chat_history.append({ - "role": "ai", - "ai": raw_response.content - }) - - # Return raw response object - return { - "message": raw_response, - "structured_solution": structured_data, - "solution": raw_response.content - } - - except Exception as e: - print(f"Structured output failed: {str(e)}, falling back to standard output") - - # Fallback to standard output - response = await self.llm.ainvoke(messages) - response.name = self.name - - self.chat_history.append({ - "role": "human", - "human": context - }) - self.chat_history.append({ - "role": "ai", - "ai": response.content - }) - - return { - "message": response, - "solution": response.content - } + + async def generate_response(self, context: str) -> Dict[str, Union[Any, str]]: + """Generate agent response""" + messages = self._build_messages(context) + + # Use standard output directly + response = await self.llm.ainvoke(messages) + response.name = self.name + + self._update_chat_history(context, response.content) + + return {"message": response, "solution": response.content} + class ResultExtractor: """Extract final results from conversation history""" - def __init__(self, model_name: str = None, format_prompt: str = ""): - self.model_name = model_name or os.getenv("MODEL_NAME", "gpt-4o-mini") + + def __init__(self, model_name: Optional[str] = None, format_prompt: str = ""): + self.model_name = get_model_name(model_name) self.format_prompt = format_prompt - self.llm = ChatOpenAI( - model=self.model_name, - request_timeout=60, # Set request timeout to 60 seconds - max_retries=2 # Set maximum retry attempts to 2 - ) + self.llm = create_llm(self.model_name) self.name = "result_extractor" - - async def extract(self, all_histories: List[List[Dict[str, str]]], problem: str) -> Dict[str, Any]: + + async def extract( + self, conversation_history: List[Dict[str, Any]], problem: str + ) -> Dict[str, Any]: """ - Extract final answer from all agents' conversation histories + Extract final answer from chronologically ordered conversation history """ # Select different prompts based on problem type prompt = f"""Original problem: {problem} -Below are the discussion histories of multiple AI agents: +Below are the discussion histories of multiple AI agents in chronological order: -{self._format_histories(all_histories)} 
+{self._format_histories(conversation_history)} Please analyze the above discussions and provide a final answer. Requirements: - Synthesize all agents' viewpoints. - Choose the most reasonable solution/option. {self.format_prompt} """ - + messages = [ - SystemMessage(content="You are a professional result analyzer, responsible for extracting the final answer from discussions of multiple AI agents."), - HumanMessage(content=prompt) + SystemMessage( + content="You are a professional result analyzer, responsible for extracting the final answer from discussions of multiple AI agents." + ), + HumanMessage(content=prompt), ] - + try: response = await self.llm.ainvoke(messages) response.name = "evaluator" - - return { - "message": response - } + + return {"message": response} except Exception as e: print(f"LLM call failed: {str(e)}") - return { - "message": None - } + return {"message": None} - def _format_histories(self, all_histories: List[List[Dict[str, str]]]) -> str: - """Format all conversation histories""" + def _format_histories(self, conversation_history: List[Dict[str, Any]]) -> str: + """Format conversation history in chronological order""" formatted = [] - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] - for i, history in enumerate(all_histories): - formatted.append(f"\n{agent_names[i]}'s discussion:") - for msg in history: - if msg.get("role") == "human": - formatted.append(f"Question: {msg['human']}") - else: - formatted.append(f"Answer: {msg['ai']}") - return "\n".join(formatted) + current_round = None + + for entry in conversation_history: + round_num = entry["round"] + agent_name = entry["agent_name"] + response = entry["response"] + + # Add round header if this is a new round + if current_round != round_num: + formatted.append(f"\n--- Round {round_num} ---") + current_round = round_num + + formatted.append(f"{agent_name}: {response}") + return "\n".join(formatted) + class ChatEval(AgentSystem): """Multi-agent evaluation system based on iterative debate""" - - def __init__(self, name: str = "chateval", config: Dict[str, Any] = None): - super().__init__(name, config) + + def __init__(self, name: str = "chateval", config: Optional[Dict[str, Any]] = None): + super().__init__(name, config or {}) self.config = config or {} self.num_agents = self.config.get("num_agents", 3) self.num_rounds = self.config.get("num_rounds", 2) - self.model_name = self.config.get("model_name") or os.getenv("MODEL_NAME", "gpt-4o-mini") - - # Initialize agents and extractor via _create_agents - # self.agents and self.extractor will be set by _create_agents + self.model_name = get_model_name(self.config.get("model_name")) + agent_components = self._create_agents() self.agents = [w for w in agent_components["workers"] if isinstance(w, Agent)] - extractors = [w for w in agent_components["workers"] if isinstance(w, ResultExtractor)] + extractors = [ + w for w in agent_components["workers"] if isinstance(w, ResultExtractor) + ] if not extractors: - raise ValueError("ResultExtractor not found in components created by _create_agents.") + raise ValueError( + "ResultExtractor not found in components created by _create_agents." + ) self.extractor = extractors[0] - def _create_agents(self) -> List[Agent]: + def _create_agents(self) -> Dict[str, List[Any]]: """Create multiple agent instances and result extractor""" - # This method will be patched by ToolIntegrationWrapper if this system is wrapped. 
- # The wrapper expects a dictionary: {"workers": [worker1, worker2, ...]} - # Each worker should have a .name and .llm attribute. - debate_agents = [] - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] for i in range(self.num_agents): agent = Agent( - agent_id=f"agent_{i+1}", - name=agent_names[i], + agent_id=i+1, + chat_history=[], + name=AGENT_NAMES[i], model_name=self.model_name, - system_prompt=self._get_agent_prompt(i) + system_prompt=self._get_agent_prompt(i), ) debate_agents.append(agent) - - # Create and assign the extractor here - extractor = ResultExtractor(self.model_name, self.format_prompt) - # self.extractor = extractor # Assign to self if needed elsewhere before run_agent completes, - # but __init__ already handles setting self.extractor. - return { - "workers": debate_agents + [extractor] - } + # Create and assign the extractor here + extractor = ResultExtractor(self.model_name, getattr(self, 'format_prompt', '')) + + return {"workers": debate_agents + [extractor]} def _get_agent_prompt(self, agent_index: int) -> str: """Generate specific system prompt for each agent""" - # Set different prompts for three different roles if agent_index == 0: return """You are a Mathematics Expert, focused on solving mathematical problems. You need to: 1. Carefully analyze the key points of mathematical problems @@ -239,78 +213,118 @@ async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: # store all LLM response objects all_messages = [] - agent_histories = [] - + # Store all responses for context building - indexed by [round][agent_index] + round_responses = [] + # iterative discussion process - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] for t in range(self.num_rounds): + # Clear all agents' chat history at the start of each round to prevent accumulation + for agent in self.agents: + agent.chat_history = [] + + # Store responses for this round + current_round_responses = [] + for n, agent in enumerate(self.agents): - # generate response for current agent - context = self._build_context(problem_text, n, t) + # Rebuild context from scratch using all previous responses + context = self._build_context(problem_text, n, t, round_responses) response_data = await agent.generate_response(context) - + # save response object if "message" in response_data: all_messages.append(response_data["message"]) - - # add response to context of subsequent agents + + # Store this agent's response for current round solution_text = response_data.get("solution", "") - for m in range(n + 1, len(self.agents)): - self.agents[m].chat_history.append({ - "role": "human", - "human": f"{agent_names[n]}'s response: {solution_text}" - }) - - # extract all agent chat histories - agent_histories = [agent.chat_history for agent in self.agents] - - # extract final answer - extractor_result = await self.extractor.extract(agent_histories, problem_text) + current_round_responses.append({ + "agent_index": n, + "agent_name": AGENT_NAMES[n], + "response": solution_text + }) + + # Add current round responses to the overall history + round_responses.append(current_round_responses) + + # Generate conversation history from round responses + conversation_history = self._generate_conversation_history(round_responses) + # extract final answer using ResultExtractor + extractor_result = await self.extractor.extract(conversation_history, problem_text) + # add evaluator message if "message" in extractor_result and extractor_result["message"]: 
all_messages.append(extractor_result["message"]) + return { - "messages": all_messages, # contains all LLM response objects - "final_answer": extractor_result["message"].content + "messages": all_messages, + "conversation_history": conversation_history, + "final_answer": extractor_result["message"].content if extractor_result["message"] else None, } - def _build_context(self, problem: str, agent_index: int, round_num: int) -> str: - """Build context for current agent""" - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] - agent_name = agent_names[agent_index] - + def _generate_conversation_history(self, round_responses: List[List[Dict]]) -> List[Dict[str, Any]]: + """Generate conversation history from round responses""" + conversation_history = [] + for round_idx, round_data in enumerate(round_responses): + for response_data in round_data: + conversation_entry = { + "round": round_idx + 1, + "agent_index": response_data["agent_index"], + "agent_name": response_data["agent_name"], + "response": response_data["response"], + } + conversation_history.append(conversation_entry) + return conversation_history + + def _build_context(self, problem: str, agent_index: int, round_num: int, + round_responses: List[List[Dict]]) -> str: + """Build context for current agent with complete history reconstruction""" + agent_name = AGENT_NAMES[agent_index] + problem_statement = f"Original problem: {problem}" - problem_statement += self.format_prompt + # Get format_prompt from self if it exists, otherwise use empty string + format_prompt = getattr(self, 'format_prompt', '') + if format_prompt: + problem_statement += format_prompt + # For the very first agent in the very first round if round_num == 0 and agent_index == 0: return f"Please solve this problem or select the best option based on your expertise:\n{problem_statement}" + + # Build context with all previous discussions + context_parts = [f"Round {round_num + 1}, {agent_name}"] + context_parts.append(problem_statement) - return f"""Round {round_num + 1}, {agent_name} - -{problem_statement} + # Add previous rounds' discussions + if round_responses: + context_parts.append("\nPrevious rounds' discussions:") + for prev_round_idx, prev_round in enumerate(round_responses): + context_parts.append(f"\n--- Round {prev_round_idx + 1} ---") + for response in prev_round: + context_parts.append(f"{response['agent_name']}: {response['response']}") + context_parts.append(""" Please provide your insights based on previous discussions. You can: 1. Agree with and supplement previous viewpoints 2. Propose different solutions or select a different option if applicable 3. Point out potential issues with previous solutions/selected options 4. Provide new ideas or methods 5. Do not overly expand to other problems -If the problem is multiple choice, please indicate your chosen option clearly in your response.""" +If the problem is multiple choice, please indicate your chosen option clearly in your response.""") + + return "\n".join(context_parts) # register agent system -AgentSystemRegistry.register( - "chateval", - ChatEval, - num_agents=3, - num_rounds=2 -) +AgentSystemRegistry.register("chateval", ChatEval, num_agents=3, num_rounds=2) if __name__ == "__main__": - # test - problem = { - "problem": "A positive integer, its square root is 452, find this positive integer." 
- } - agent = ChatEval(name="chateval", config={"num_agents": 3, "num_rounds": 2}) - result = agent.run_agent(problem) - print(result) + import asyncio + + async def test(): + problem = { + "problem": "A positive integer, its square root is 452, find this positive integer." + } + agent = ChatEval(name="chateval", config={"num_agents": 3, "num_rounds": 1}) + result = await agent.run_agent(problem) + print(result) + + asyncio.run(test())
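
To complement the ChatEval walkthrough above, the sketch below condenses the extension checklist into a minimal single-agent system. It is illustrative only: `EchoMAS`, its prompt, and the bare registration call are assumptions, while the `AgentSystem`/`AgentSystemRegistry` usage, the `{"messages": [...], "final_answer": ...}` return shape, and the timeout/retry settings follow the patterns shown above in `chateval.py`.

```python
# Illustrative sketch only -- not part of this diff. Class name, prompt, and the
# single-call flow are assumptions; the base-class and registry usage mirror chateval.py.
import os
from typing import Any, Dict, Optional

from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI

from mas_arena.agents.base import AgentSystem, AgentSystemRegistry


class EchoMAS(AgentSystem):
    """Single-agent baseline: one LLM call, raw message returned for token tracking."""

    def __init__(self, name: str = "echo_mas", config: Optional[Dict[str, Any]] = None):
        super().__init__(name, config or {})
        self.config = config or {}
        self.model_name = self.config.get("model_name") or os.getenv("MODEL_NAME", "gpt-4o-mini")
        # Same timeout/retry settings the guide recommends for robust operation.
        self.llm = ChatOpenAI(model=self.model_name, timeout=60, max_retries=2)

    async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        problem_text = problem["problem"]
        # Benchmark-specific formatting instructions, if the runner attached any.
        format_prompt = getattr(self, "format_prompt", "")
        messages = [
            SystemMessage(content="You are a careful problem solver."),
            HumanMessage(content=f"{problem_text}\n{format_prompt}"),
        ]
        response = await self.llm.ainvoke(messages)
        # Returning the raw message object lets the framework read usage_metadata.
        return {"messages": [response], "final_answer": response.content}


AgentSystemRegistry.register("echo_mas", EchoMAS)
```

As with ChatEval, the config passed at initialization should still include the `evaluator` key listed under the essential requirements.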