diff --git a/docs/extending.md b/docs/extending.md index c8f70af..3f15045 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -24,6 +24,7 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and ### 📋 Implementation Requirements **✅ Essential Requirements:** + - Extend `AgentSystem` base class - Implement `run_agent()` method (abstract method - required) - Include `evaluator` in config during initialization @@ -31,6 +32,7 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and - Register with `AgentSystemRegistry` **💡 Optional but Recommended:** + - Implement `_create_agents()` for tool integration support - Use `self.format_prompt` for benchmark-specific formatting - Handle async execution properly if needed @@ -40,18 +42,19 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and #### Step 1: Create Agent System Class Structure ✅ Langgraph supported -✅ Customizable agent and multi-agent interaction +✅ Customizable agent and multi-agent interaction **📋 Implementation Guide:** - - Inherit from `AgentSystem` base class - - Initialize configuration parameters (num_agents, num_rounds, model_name) - - Set up agent components using `_create_agents()` method - - Extract workers and result extractors from created components - - Validate that required components are available + +- Inherit from `AgentSystem` base class +- Initialize configuration parameters (num_agents, num_rounds, model_name) +- Set up agent components using `_create_agents()` method +- Extract workers and result extractors from created components +- Validate that required components are available **💡 SupervisorMAS Implementation Example (LangGraph Structure):** -``` +```python # mas_arena/agents/supervisor_mas.py def _init_graph_if_needed(self, problem_input: Optional[Any] = None, feedback: Optional[Any] = None): @@ -93,40 +96,45 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and ``` **💡 ChatEval Implementation Example (Basic Structure):** -``` + +```python # mas_arena/agents/chateval.py class ChatEval(AgentSystem): """Multi-agent evaluation system based on iterative debate""" - def __init__(self, name: str = "chateval", config: Dict[str, Any] = None): - super().__init__(name, config) + def __init__(self, name: str = "chateval", config: Optional[Dict[str, Any]] = None): + super().__init__(name, config or {}) self.config = config or {} self.num_agents = self.config.get("num_agents", 3) self.num_rounds = self.config.get("num_rounds", 2) - self.model_name = self.config.get("model_name") or os.getenv("MODEL_NAME", "gpt-4o-mini") - - # Initialize agents and extractor via _create_agents - # self.agents and self.extractor will be set by _create_agents + self.model_name = get_model_name(self.config.get("model_name")) + agent_components = self._create_agents() self.agents = [w for w in agent_components["workers"] if isinstance(w, Agent)] - extractors = [w for w in agent_components["workers"] if isinstance(w, ResultExtractor)] + extractors = [ + w for w in agent_components["workers"] if isinstance(w, ResultExtractor) + ] if not extractors: - raise ValueError("ResultExtractor not found in components created by _create_agents.") + raise ValueError( + "ResultExtractor not found in components created by _create_agents." 
+ ) self.extractor = extractors[0] ``` #### Step 2: Implement Core `run_agent` Method **📋 Implementation Guide:** - - Extract problem text from input dictionary - - Initialize message storage for tracking LLM responses - - Implement multi-round agent interaction logic - - Collect and process agent responses with proper metadata - - Extract final answer using result extractor - - Return formatted result with messages and final answer + +- Extract problem text from input dictionary +- Initialize message storage for tracking LLM responses +- Implement multi-round agent interaction logic +- Collect and process agent responses with proper metadata +- Extract final answer using result extractor +- Return formatted result with messages and final answer **💡 ChatEval Implementation Example (run_agent Core Method):** -``` + +```python # mas_arena/agents/chateval.py async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """Run iterative debate process""" @@ -134,101 +142,103 @@ class ChatEval(AgentSystem): # store all LLM response objects all_messages = [] - agent_histories = [] - + # Store all responses for context building - indexed by [round][agent_index] + round_responses = [] + # iterative discussion process - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] for t in range(self.num_rounds): + # Clear all agents' chat history at the start of each round to prevent accumulation + for agent in self.agents: + agent.chat_history = [] + + # Store responses for this round + current_round_responses = [] + for n, agent in enumerate(self.agents): - # generate response for current agent - context = self._build_context(problem_text, n, t) - response_data = agent.generate_response(context) - + # Rebuild context from scratch using all previous responses + context = self._build_context(problem_text, n, t, round_responses) + response_data = await agent.generate_response(context) + # save response object if "message" in response_data: all_messages.append(response_data["message"]) - - # add response to context of subsequent agents + + # Store this agent's response for current round solution_text = response_data.get("solution", "") - for m in range(n + 1, len(self.agents)): - self.agents[m].chat_history.append({ - "role": "human", - "human": f"{agent_names[n]}'s response: {solution_text}" - }) - - # extract all agent chat histories - agent_histories = [agent.chat_history for agent in self.agents] - - # extract final answer - extractor_result = self.extractor.extract(agent_histories, problem_text) + current_round_responses.append({ + "agent_index": n, + "agent_name": AGENT_NAMES[n], + "response": solution_text + }) + + # Add current round responses to the overall history + round_responses.append(current_round_responses) + + # Generate conversation history from round responses + conversation_history = self._generate_conversation_history(round_responses) + # extract final answer using ResultExtractor + extractor_result = await self.extractor.extract(conversation_history, problem_text) + # add evaluator message if "message" in extractor_result and extractor_result["message"]: all_messages.append(extractor_result["message"]) + return { - "messages": all_messages, # contains all LLM response objects - "final_answer": extractor_result["message"].content + "messages": all_messages, + "conversation_history": conversation_history, + "final_answer": extractor_result["message"].content if extractor_result["message"] else None, } ``` #### Step 3: Implement `_create_agents` 
Method (Tool Integration Support) **📋 Implementation Guide:** - - Create specialized `AgentNode` instances for each role - - Set agent names, models, and system prompts - - Create result extractor with format prompt integration - - Return dictionary with "workers" key containing all components - - Ensure each worker has `.name` and `.llm` attributes for tool binding + +- Create specialized `AgentNode` instances for each role +- Set agent names, models, and system prompts +- Create result extractor with format prompt integration +- Return dictionary with "workers" key containing all components +- Ensure each worker has `.name` and `.llm` attributes for tool binding **💡 ChatEval Implementation Example (_create_agents Tool Integration):** -``` + +```python # mas_arena/agents/chateval.py - def _create_agents(self) -> List[Agent]: + def _create_agents(self) -> Dict[str, List[Any]]: """Create multiple agent instances and result extractor""" - # This method will be patched by ToolIntegrationWrapper if this system is wrapped. - # The wrapper expects a dictionary: {"workers": [worker1, worker2, ...]} - # Each worker should have a .name and .llm attribute. - debate_agents = [] - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] for i in range(self.num_agents): agent = Agent( - agent_id=f"agent_{i+1}", - name=agent_names[i], + agent_id=i+1, + name=AGENT_NAMES[i], model_name=self.model_name, - system_prompt=self._get_agent_prompt(i) + system_prompt=self._get_agent_prompt(i), ) debate_agents.append(agent) - - # Create and assign the extractor here - extractor = ResultExtractor(self.model_name, self.format_prompt) - # self.extractor = extractor # Assign to self if needed elsewhere before run_agent completes, - # but __init__ already handles setting self.extractor. 
- return { - "workers": debate_agents + [extractor] - } + # Create and assign the extractor here + extractor = ResultExtractor(self.model_name, getattr(self, 'format_prompt', '')) + + return {"workers": debate_agents + [extractor]} ``` #### Step 4: Register System with Framework **📋 Implementation Guide:** - - Use `AgentSystemRegistry.register()` to make system available - - Provide system name as string identifier - - Pass class reference (not instance) - - Include default configuration parameters - - These defaults can be overridden during initialization + +- Use `AgentSystemRegistry.register()` to make system available +- Provide system name as string identifier +- Pass class reference (not instance) +- Include default configuration parameters +- These defaults can be overridden during initialization **💡 ChatEval Implementation Example (Registration):** -``` + +```python # mas_arena/agents/chateval.py # register agent system -AgentSystemRegistry.register( - "chateval", - ChatEval, - num_agents=3, - num_rounds=2 -) +AgentSystemRegistry.register("chateval", ChatEval, num_agents=3, num_rounds=2) ``` ### ⚡ Advanced Features @@ -236,66 +246,82 @@ AgentSystemRegistry.register( #### 🎨 Format Prompt Integration **📋 Implementation Guide:** - - Accept `format_prompt` parameter in initialization - - Store format prompt for benchmark-specific requirements - - Use format prompt in result extraction and agent prompts - - Configure timeout and retry settings for robust operation + +- Accept `format_prompt` parameter in initialization +- Store format prompt for benchmark-specific requirements +- Use format prompt in result extraction and agent prompts +- Configure timeout and retry settings for robust operation **💡 ChatEval Implementation Example (Format Prompt Integration):** -``` + +```python # mas_arena/agents/chateval.py - def __init__(self, model_name: str = None, format_prompt: str = ""): - self.model_name = model_name or os.getenv("MODEL_NAME", "gpt-4o-mini") +def get_model_name(config_model_name: Optional[str] = None) -> str: + """Get model name from config or environment variable""" + return config_model_name or os.getenv("MODEL_NAME", DEFAULT_MODEL_NAME) + +def create_llm(model_name: str) -> ChatOpenAI: + """Create ChatOpenAI instance with standardized configuration""" + return ChatOpenAI( + model=model_name, + timeout=60, # Set request timeout to 60 seconds + max_retries=2, # Set maximum retry attempts to 2 + ) + +class ResultExtractor: + def __init__(self, model_name: Optional[str] = None, format_prompt: str = ""): + self.model_name = get_model_name(model_name) self.format_prompt = format_prompt - self.llm = ChatOpenAI( - model=self.model_name, - request_timeout=60, # Set request timeout to 60 seconds - max_retries=2 # Set maximum retry attempts to 2 - ) + self.llm = create_llm(self.model_name) self.name = "result_extractor" ``` #### 🤖 Agent Node Pattern **📋 Implementation Guide:** - - Use dataclass decorator for clean agent definition - - Include required attributes: agent_id, name, model_name, system_prompt - - Initialize chat history as empty list - - Set up LLM instance with timeout and retry configuration - - Ensure compatibility with tool integration framework + +- Use dataclass decorator for clean agent definition +- Include required attributes: agent_id, name, model_name, system_prompt +- Initialize chat history as empty list +- Set up LLM instance with timeout and retry configuration +- Ensure compatibility with tool integration framework **💡 ChatEval Implementation Example (Agent 
Class Definition):** -``` + +```python # mas_arena/agents/chateval.py +from dataclasses import dataclass + @dataclass class Agent: """Represents an LLM agent""" - agent_id: str + + agent_id: int name: str model_name: str system_prompt: str chat_history: List[Dict[str, str]] = None - + def __post_init__(self): - self.chat_history = [] - self.llm = ChatOpenAI( - model=self.model_name, - request_timeout=60, # Set request timeout to 60 seconds - max_retries=2 # Set maximum retry attempts to 2 - ) + if self.chat_history is None: + self.chat_history = [] + self.llm = create_llm(self.model_name) ``` #### 🔄 Usage Metadata Handling **📋 Implementation Guide:** - - For native OpenAI API calls or non-structured output: No manual handling required - - For structured output: Use `self.llm.with_structured_output(schema=AgentResponse, include_raw=True)` - - Usage metadata is automatically handled by the framework - - Focus on implementing the structured output schema instead + +- For native OpenAI API calls or non-structured output: No manual handling required +- For structured output: Use `self.llm.with_structured_output(schema=AgentResponse, include_raw=True)` +- Usage metadata is automatically handled by the framework +- Focus on implementing core agent logic and response processing +- Return standardized response format with message and solution ### 📋 Key Implementation Summary **🔧 Implementation Points:** + - Inherit from `AgentSystem` base class - Implement required `run_agent()` method - Ensure config includes `evaluator` key @@ -316,14 +342,16 @@ Use `AgentSystemRegistry.register()` to register system and provide default conf #### Step 1: Basic Structure and Registration **📋 Implementation Guide:** - - Use `@register_benchmark` decorator to register evaluator - - Define normalization keys mapping for data field standardization - - Inherit from `BaseEvaluator` base class - - Provide comprehensive docstring explaining evaluator purpose - - Set up evaluator name and supported answer formats + +- Use `@register_benchmark` decorator to register evaluator +- Define normalization keys mapping for data field standardization +- Inherit from `BaseEvaluator` base class +- Provide comprehensive docstring explaining evaluator purpose +- Set up evaluator name and supported answer formats **💡 MMLU_pro Implementation Example (Registration and Class Definition):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py @register_benchmark( name="mmlu_pro", @@ -345,14 +373,16 @@ class MMLU_ProEvaluator(BaseEvaluator): #### Step 2: Initialize Configuration **📋 Implementation Guide:** - - Call parent class initialization with name and config - - Set up evaluation-specific weights and parameters - - Configure dataset loading and validation - - Set up logging and error handling - - Define evaluation metrics and scoring methods + +- Call parent class initialization with name and config +- Set up evaluation-specific weights and parameters +- Configure dataset loading and validation +- Set up logging and error handling +- Define evaluation metrics and scoring methods **💡 MMLU_pro Implementation Example (Initialization):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def __init__(self, name="mmlu_pro", config=None): """ @@ -376,15 +406,17 @@ class MMLU_ProEvaluator(BaseEvaluator): #### Step 3: Implement Core Evaluation Method **📋 Implementation Guide:** - - Extract final answer and reference solution from inputs - - Use specialized answer extraction method for response parsing - - Apply scoring 
logic (exact match, numerical comparison, etc.) - - Calculate evaluation metrics and scores - - Return standardized evaluation results dictionary - - Include extracted answer and original final answer + +- Extract final answer and reference solution from inputs +- Use specialized answer extraction method for response parsing +- Apply scoring logic (exact match, numerical comparison, etc.) +- Calculate evaluation metrics and scores +- Return standardized evaluation results dictionary +- Include extracted answer and original final answer **💡 MMLU_pro Implementation Example (evaluate Method):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def evaluate(self, problem: Dict[str, Any], run_result: Dict[str, Any]) -> Dict[str, Any]: """ @@ -424,14 +456,16 @@ class MMLU_ProEvaluator(BaseEvaluator): #### 🔍 Answer Extraction **📋 Implementation Guide:** - - Use regular expressions to extract formatted answers - - Handle multiple answer formats (tags, patterns, raw text) - - Implement fallback strategies for unformatted responses - - Clean and normalize extracted text - - Support flexible answer parsing for different benchmarks + +- Use regular expressions to extract formatted answers +- Handle multiple answer formats (tags, patterns, raw text) +- Implement fallback strategies for unformatted responses +- Clean and normalize extracted text +- Support flexible answer parsing for different benchmarks **💡 MMLU_pro Implementation Example (Answer Extraction):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def extract_answer_from_response(self, response: str) -> str: """ @@ -455,14 +489,16 @@ class MMLU_ProEvaluator(BaseEvaluator): #### ✅ Answer Verification **📋 Implementation Guide:** - - Implement case-insensitive comparison for text answers - - Handle numerical index to letter conversion (1→A, 2→B, etc.) - - Apply normalization and cleaning to both reference and candidate - - Return numerical score (1.0 for match, 0.0 for no match) - - Include error handling for malformed inputs + +- Implement case-insensitive comparison for text answers +- Handle numerical index to letter conversion (1→A, 2→B, etc.) 
+- Apply normalization and cleaning to both reference and candidate +- Return numerical score (1.0 for match, 0.0 for no match) +- Include error handling for malformed inputs **💡 MMLU_pro Implementation Example (Exact Match Verification):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def check_exact_match(self, reference: str, candidate: str) -> float: """ @@ -499,15 +535,17 @@ class MMLU_ProEvaluator(BaseEvaluator): #### 📊 Batch Evaluation **📋 Implementation Guide:** - - Iterate through all problems in the batch - - Extract problem IDs and reference answers for each item - - Apply evaluation logic consistently across all problems - - Collect comprehensive results with metadata - - Log evaluation progress and summary statistics - - Return standardized results format for benchmark runner + +- Iterate through all problems in the batch +- Extract problem IDs and reference answers for each item +- Apply evaluation logic consistently across all problems +- Collect comprehensive results with metadata +- Log evaluation progress and summary statistics +- Return standardized results format for benchmark runner **💡 MMLU_pro Implementation Example (Batch Evaluation):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def batch_evaluate(self, problems: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]: """ @@ -554,6 +592,7 @@ class MMLU_ProEvaluator(BaseEvaluator): ### 💻 Code Evaluation **🔧 Code Evaluator Key Points:** + - Inherit from `BaseCodeEvaluator` base class (not BaseEvaluator) - Implement `check_solution(code, test, entry_point)` method - Implement `extract_code(text)` to extract code from responses @@ -561,6 +600,7 @@ class MMLU_ProEvaluator(BaseEvaluator): - Use isolated environments for code execution **📊 Core Process Flow:** + 1. **Code Extraction** - Extract Python code from agent responses 2. **Environment Isolation** - Create secure execution environment 3. **Test Execution** - Run test cases to verify code correctness @@ -569,6 +609,7 @@ class MMLU_ProEvaluator(BaseEvaluator): ### 📋 Evaluator Implementation Summary **🔧 Core Components:** + - Use `@register_benchmark` decorator for registration - Inherit from `BaseEvaluator` base class - Implement required `evaluate()` method @@ -576,12 +617,14 @@ class MMLU_ProEvaluator(BaseEvaluator): - Optional: Implement answer extraction and verification methods **📊 Evaluation Process:** + 1. **Data Normalization** - Map fields using normalization_keys 2. **Answer Extraction** - Extract final answer from messages 3. **Answer Verification** - Compare predicted vs reference answers 4. 
**Result Return** - Return score, extracted_answer, final_answer fields -> 📄 **Complete Implementation References**: +> 📄 **Complete Implementation References**: +> > - Text Evaluator: [`mas_arena/evaluators/mmlu_pro_evaluator.py`](../mas_arena/evaluators/mmlu_pro_evaluator.py) > - Code Evaluator: [`mas_arena/evaluators/humaneval_evaluator.py`](../mas_arena/evaluators/humaneval_evaluator.py) @@ -611,6 +654,7 @@ class MMLU_ProEvaluator(BaseEvaluator): ### 📋 Implementation Checklist **For MAS Extensions:** + - [ ] ✅ Config includes `evaluator` key - [ ] 📊 Messages have `usage_metadata` for token tracking - [ ] 🏷️ Agents have `name` and `llm` attributes (for tool integration) @@ -619,9 +663,9 @@ class MMLU_ProEvaluator(BaseEvaluator): - [ ] 📋 Proper registration with `AgentSystemRegistry` **For Evaluator Extensions:** + - [ ] 🎯 Used `@register_benchmark` decorator - [ ] ✅ Implemented `evaluate` method - [ ] 🗝️ Proper normalization_keys mapping - [ ] 🛡️ Error handling for malformed inputs - [ ] ⏱️ Timeout handling for long operations - diff --git a/mas_arena/agents/chateval.py b/mas_arena/agents/chateval.py index 2759ff4..bfeac02 100644 --- a/mas_arena/agents/chateval.py +++ b/mas_arena/agents/chateval.py @@ -1,210 +1,184 @@ import os -from typing import Dict, List, Any, TypedDict +from typing import Dict, List, Any, Optional, Union from dataclasses import dataclass from langchain_openai import ChatOpenAI from langchain_core.messages import SystemMessage, HumanMessage, AIMessage from mas_arena.agents.base import AgentSystem, AgentSystemRegistry -# define structured output class, use TypedDict instead of Pydantic -class AgentResponse(TypedDict): - """Structured output for agent responses""" - analysis: str # Problem analysis - solution: str # Solution - confidence: int # Confidence level in the solution, range 1-5 + +# Constants +AGENT_NAMES = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] +DEFAULT_MODEL_NAME = "gpt-4o-mini" + + +def get_model_name(config_model_name: Optional[str] = None) -> str: + """Get model name from config or environment variable""" + return config_model_name or os.getenv("MODEL_NAME", DEFAULT_MODEL_NAME) + +def create_llm(model_name: str) -> ChatOpenAI: + """Create ChatOpenAI instance with standardized configuration""" + return ChatOpenAI( + model=model_name, + timeout=60, # Set request timeout to 60 seconds + max_retries=2, # Set maximum retry attempts to 2 + ) + @dataclass class Agent: """Represents an LLM agent""" - agent_id: str + + agent_id: int name: str model_name: str system_prompt: str - chat_history: List[Dict[str, str]] = None - + chat_history: Optional[List[Dict[str, str]]] + def __post_init__(self): - self.chat_history = [] - self.llm = ChatOpenAI( - model=self.model_name, - request_timeout=60, # Set request timeout to 60 seconds - max_retries=2 # Set maximum retry attempts to 2 - ) - - async def generate_response(self, context: str) -> Any: - """Generate agent response""" - messages = [ + if self.chat_history is None: + self.chat_history = [] + self.llm = create_llm(self.model_name) + + def _update_chat_history(self, context: str, response_content: Any) -> None: + """Update chat history with human input and AI response""" + # Handle case where response_content might be a list or other type + content = response_content if isinstance(response_content, str) else str(response_content) + self.chat_history.append({"role": "human", "human": context}) + self.chat_history.append({"role": "ai", "ai": content}) + + def _build_messages(self, context: str) 
-> List[Union[SystemMessage, HumanMessage, AIMessage]]: + """Build message list for LLM input""" + return [ SystemMessage(content=self.system_prompt), - *[HumanMessage(content=msg["human"]) if msg.get("role") == "human" - else AIMessage(content=msg["ai"]) - for msg in self.chat_history], - HumanMessage(content=context) + *[ + ( + HumanMessage(content=msg["human"]) + if msg.get("role") == "human" + else AIMessage(content=msg["ai"]) + ) + for msg in (self.chat_history or []) + ], + HumanMessage(content=context), ] - - # Use structured output - try: - llm_with_schema = self.llm.with_structured_output(schema=AgentResponse, include_raw=True) - response = await llm_with_schema.ainvoke(messages) - - # Get structured data and raw response - structured_data = response["parsed"] - raw_response = response["raw"] - - - # Ensure structured_data is a dictionary, not an object - if hasattr(structured_data, "dict"): - structured_data = structured_data.dict() - elif hasattr(structured_data, "model_dump"): - structured_data = structured_data.model_dump() - - # Set AI message name - raw_response.name = self.name - - # Update chat history - self.chat_history.append({ - "role": "human", - "human": context - }) - self.chat_history.append({ - "role": "ai", - "ai": raw_response.content - }) - - # Return raw response object - return { - "message": raw_response, - "structured_solution": structured_data, - "solution": raw_response.content - } - - except Exception as e: - print(f"Structured output failed: {str(e)}, falling back to standard output") - - # Fallback to standard output - response = await self.llm.ainvoke(messages) - response.name = self.name - - self.chat_history.append({ - "role": "human", - "human": context - }) - self.chat_history.append({ - "role": "ai", - "ai": response.content - }) - - return { - "message": response, - "solution": response.content - } + + async def generate_response(self, context: str) -> Dict[str, Union[Any, str]]: + """Generate agent response""" + messages = self._build_messages(context) + + # Use standard output directly + response = await self.llm.ainvoke(messages) + response.name = self.name + + self._update_chat_history(context, response.content) + + return {"message": response, "solution": response.content} + class ResultExtractor: """Extract final results from conversation history""" - def __init__(self, model_name: str = None, format_prompt: str = ""): - self.model_name = model_name or os.getenv("MODEL_NAME", "gpt-4o-mini") + + def __init__(self, model_name: Optional[str] = None, format_prompt: str = ""): + self.model_name = get_model_name(model_name) self.format_prompt = format_prompt - self.llm = ChatOpenAI( - model=self.model_name, - request_timeout=60, # Set request timeout to 60 seconds - max_retries=2 # Set maximum retry attempts to 2 - ) + self.llm = create_llm(self.model_name) self.name = "result_extractor" - - async def extract(self, all_histories: List[List[Dict[str, str]]], problem: str) -> Dict[str, Any]: + + async def extract( + self, conversation_history: List[Dict[str, Any]], problem: str + ) -> Dict[str, Any]: """ - Extract final answer from all agents' conversation histories + Extract final answer from chronologically ordered conversation history """ # Select different prompts based on problem type prompt = f"""Original problem: {problem} -Below are the discussion histories of multiple AI agents: +Below are the discussion histories of multiple AI agents in chronological order: -{self._format_histories(all_histories)} 
+{self._format_histories(conversation_history)} Please analyze the above discussions and provide a final answer. Requirements: - Synthesize all agents' viewpoints. - Choose the most reasonable solution/option. {self.format_prompt} """ - + messages = [ - SystemMessage(content="You are a professional result analyzer, responsible for extracting the final answer from discussions of multiple AI agents."), - HumanMessage(content=prompt) + SystemMessage( + content="You are a professional result analyzer, responsible for extracting the final answer from discussions of multiple AI agents." + ), + HumanMessage(content=prompt), ] - + try: response = await self.llm.ainvoke(messages) response.name = "evaluator" - - return { - "message": response - } + + return {"message": response} except Exception as e: print(f"LLM call failed: {str(e)}") - return { - "message": None - } + return {"message": None} - def _format_histories(self, all_histories: List[List[Dict[str, str]]]) -> str: - """Format all conversation histories""" + def _format_histories(self, conversation_history: List[Dict[str, Any]]) -> str: + """Format conversation history in chronological order""" formatted = [] - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] - for i, history in enumerate(all_histories): - formatted.append(f"\n{agent_names[i]}'s discussion:") - for msg in history: - if msg.get("role") == "human": - formatted.append(f"Question: {msg['human']}") - else: - formatted.append(f"Answer: {msg['ai']}") - return "\n".join(formatted) + current_round = None + + for entry in conversation_history: + round_num = entry["round"] + agent_name = entry["agent_name"] + response = entry["response"] + + # Add round header if this is a new round + if current_round != round_num: + formatted.append(f"\n--- Round {round_num} ---") + current_round = round_num + + formatted.append(f"{agent_name}: {response}") + return "\n".join(formatted) + class ChatEval(AgentSystem): """Multi-agent evaluation system based on iterative debate""" - - def __init__(self, name: str = "chateval", config: Dict[str, Any] = None): - super().__init__(name, config) + + def __init__(self, name: str = "chateval", config: Optional[Dict[str, Any]] = None): + super().__init__(name, config or {}) self.config = config or {} self.num_agents = self.config.get("num_agents", 3) self.num_rounds = self.config.get("num_rounds", 2) - self.model_name = self.config.get("model_name") or os.getenv("MODEL_NAME", "gpt-4o-mini") - - # Initialize agents and extractor via _create_agents - # self.agents and self.extractor will be set by _create_agents + self.model_name = get_model_name(self.config.get("model_name")) + agent_components = self._create_agents() self.agents = [w for w in agent_components["workers"] if isinstance(w, Agent)] - extractors = [w for w in agent_components["workers"] if isinstance(w, ResultExtractor)] + extractors = [ + w for w in agent_components["workers"] if isinstance(w, ResultExtractor) + ] if not extractors: - raise ValueError("ResultExtractor not found in components created by _create_agents.") + raise ValueError( + "ResultExtractor not found in components created by _create_agents." + ) self.extractor = extractors[0] - def _create_agents(self) -> List[Agent]: + def _create_agents(self) -> Dict[str, List[Any]]: """Create multiple agent instances and result extractor""" - # This method will be patched by ToolIntegrationWrapper if this system is wrapped. 
- # The wrapper expects a dictionary: {"workers": [worker1, worker2, ...]} - # Each worker should have a .name and .llm attribute. - debate_agents = [] - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] for i in range(self.num_agents): agent = Agent( - agent_id=f"agent_{i+1}", - name=agent_names[i], + agent_id=i+1, + chat_history=[], + name=AGENT_NAMES[i], model_name=self.model_name, - system_prompt=self._get_agent_prompt(i) + system_prompt=self._get_agent_prompt(i), ) debate_agents.append(agent) - - # Create and assign the extractor here - extractor = ResultExtractor(self.model_name, self.format_prompt) - # self.extractor = extractor # Assign to self if needed elsewhere before run_agent completes, - # but __init__ already handles setting self.extractor. - return { - "workers": debate_agents + [extractor] - } + # Create and assign the extractor here + extractor = ResultExtractor(self.model_name, getattr(self, 'format_prompt', '')) + + return {"workers": debate_agents + [extractor]} def _get_agent_prompt(self, agent_index: int) -> str: """Generate specific system prompt for each agent""" - # Set different prompts for three different roles if agent_index == 0: return """You are a Mathematics Expert, focused on solving mathematical problems. You need to: 1. Carefully analyze the key points of mathematical problems @@ -239,78 +213,118 @@ async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: # store all LLM response objects all_messages = [] - agent_histories = [] - + # Store all responses for context building - indexed by [round][agent_index] + round_responses = [] + # iterative discussion process - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] for t in range(self.num_rounds): + # Clear all agents' chat history at the start of each round to prevent accumulation + for agent in self.agents: + agent.chat_history = [] + + # Store responses for this round + current_round_responses = [] + for n, agent in enumerate(self.agents): - # generate response for current agent - context = self._build_context(problem_text, n, t) + # Rebuild context from scratch using all previous responses + context = self._build_context(problem_text, n, t, round_responses) response_data = await agent.generate_response(context) - + # save response object if "message" in response_data: all_messages.append(response_data["message"]) - - # add response to context of subsequent agents + + # Store this agent's response for current round solution_text = response_data.get("solution", "") - for m in range(n + 1, len(self.agents)): - self.agents[m].chat_history.append({ - "role": "human", - "human": f"{agent_names[n]}'s response: {solution_text}" - }) - - # extract all agent chat histories - agent_histories = [agent.chat_history for agent in self.agents] - - # extract final answer - extractor_result = await self.extractor.extract(agent_histories, problem_text) + current_round_responses.append({ + "agent_index": n, + "agent_name": AGENT_NAMES[n], + "response": solution_text + }) + + # Add current round responses to the overall history + round_responses.append(current_round_responses) + + # Generate conversation history from round responses + conversation_history = self._generate_conversation_history(round_responses) + # extract final answer using ResultExtractor + extractor_result = await self.extractor.extract(conversation_history, problem_text) + # add evaluator message if "message" in extractor_result and extractor_result["message"]: 
all_messages.append(extractor_result["message"]) + return { - "messages": all_messages, # contains all LLM response objects - "final_answer": extractor_result["message"].content + "messages": all_messages, + "conversation_history": conversation_history, + "final_answer": extractor_result["message"].content if extractor_result["message"] else None, } - def _build_context(self, problem: str, agent_index: int, round_num: int) -> str: - """Build context for current agent""" - agent_names = ["Math Expert", "Logic Expert", "Critical Thinking Expert"] - agent_name = agent_names[agent_index] - + def _generate_conversation_history(self, round_responses: List[List[Dict]]) -> List[Dict[str, Any]]: + """Generate conversation history from round responses""" + conversation_history = [] + for round_idx, round_data in enumerate(round_responses): + for response_data in round_data: + conversation_entry = { + "round": round_idx + 1, + "agent_index": response_data["agent_index"], + "agent_name": response_data["agent_name"], + "response": response_data["response"], + } + conversation_history.append(conversation_entry) + return conversation_history + + def _build_context(self, problem: str, agent_index: int, round_num: int, + round_responses: List[List[Dict]]) -> str: + """Build context for current agent with complete history reconstruction""" + agent_name = AGENT_NAMES[agent_index] + problem_statement = f"Original problem: {problem}" - problem_statement += self.format_prompt + # Get format_prompt from self if it exists, otherwise use empty string + format_prompt = getattr(self, 'format_prompt', '') + if format_prompt: + problem_statement += format_prompt + # For the very first agent in the very first round if round_num == 0 and agent_index == 0: return f"Please solve this problem or select the best option based on your expertise:\n{problem_statement}" + + # Build context with all previous discussions + context_parts = [f"Round {round_num + 1}, {agent_name}"] + context_parts.append(problem_statement) - return f"""Round {round_num + 1}, {agent_name} - -{problem_statement} + # Add previous rounds' discussions + if round_responses: + context_parts.append("\nPrevious rounds' discussions:") + for prev_round_idx, prev_round in enumerate(round_responses): + context_parts.append(f"\n--- Round {prev_round_idx + 1} ---") + for response in prev_round: + context_parts.append(f"{response['agent_name']}: {response['response']}") + context_parts.append(""" Please provide your insights based on previous discussions. You can: 1. Agree with and supplement previous viewpoints 2. Propose different solutions or select a different option if applicable 3. Point out potential issues with previous solutions/selected options 4. Provide new ideas or methods 5. Do not overly expand to other problems -If the problem is multiple choice, please indicate your chosen option clearly in your response.""" +If the problem is multiple choice, please indicate your chosen option clearly in your response.""") + + return "\n".join(context_parts) # register agent system -AgentSystemRegistry.register( - "chateval", - ChatEval, - num_agents=3, - num_rounds=2 -) +AgentSystemRegistry.register("chateval", ChatEval, num_agents=3, num_rounds=2) if __name__ == "__main__": - # test - problem = { - "problem": "A positive integer, its square root is 452, find this positive integer." 
- } - agent = ChatEval(name="chateval", config={"num_agents": 3, "num_rounds": 2}) - result = agent.run_agent(problem) - print(result) + import asyncio + + async def test(): + problem = { + "problem": "A positive integer, its square root is 452, find this positive integer." + } + agent = ChatEval(name="chateval", config={"num_agents": 3, "num_rounds": 1}) + result = await agent.run_agent(problem) + print(result) + + asyncio.run(test())
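
To complement the ChatEval walkthrough above, the sketch below condenses the extension checklist into a minimal single-agent system. It is illustrative only: `EchoMAS`, its prompt, and the bare registration call are assumptions, while the `AgentSystem`/`AgentSystemRegistry` usage, the `{"messages": [...], "final_answer": ...}` return shape, and the timeout/retry settings follow the patterns shown above in `chateval.py`.

```python
# Illustrative sketch only -- not part of this diff. Class name, prompt, and the
# single-call flow are assumptions; the base-class and registry usage mirror chateval.py.
import os
from typing import Any, Dict, Optional

from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI

from mas_arena.agents.base import AgentSystem, AgentSystemRegistry


class EchoMAS(AgentSystem):
    """Single-agent baseline: one LLM call, raw message returned for token tracking."""

    def __init__(self, name: str = "echo_mas", config: Optional[Dict[str, Any]] = None):
        super().__init__(name, config or {})
        self.config = config or {}
        self.model_name = self.config.get("model_name") or os.getenv("MODEL_NAME", "gpt-4o-mini")
        # Same timeout/retry settings the guide recommends for robust operation.
        self.llm = ChatOpenAI(model=self.model_name, timeout=60, max_retries=2)

    async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        problem_text = problem["problem"]
        # Benchmark-specific formatting instructions, if the runner attached any.
        format_prompt = getattr(self, "format_prompt", "")
        messages = [
            SystemMessage(content="You are a careful problem solver."),
            HumanMessage(content=f"{problem_text}\n{format_prompt}"),
        ]
        response = await self.llm.ainvoke(messages)
        # Returning the raw message object lets the framework read usage_metadata.
        return {"messages": [response], "final_answer": response.content}


AgentSystemRegistry.register("echo_mas", EchoMAS)
```

As with ChatEval, the config passed at initialization should still include the `evaluator` key listed under the essential requirements.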