feat: Add max_rpm configuration and rate limiting to LLMClient for improved request management

experdot · experdot · commit 8d515cb716bf · 2025-08-12T13:51:48.000+08:00
diff --git a/sources/gc-qa-rag-etl/.config.development.json b/sources/gc-qa-rag-etl/.config.development.json
@@ -7,7 +7,8 @@
     "llm": {
         "api_key": "",
         "api_base": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-        "model_name": "qwen-plus"
+        "model_name": "qwen-plus",
+        "max_rpm": 100
     },
     "embedding": {
         "api_key": ""
diff --git a/sources/gc-qa-rag-etl/.config.production.json b/sources/gc-qa-rag-etl/.config.production.json
@@ -7,7 +7,8 @@
     "llm": {
         "api_key": "",
         "api_base": "https://dashscope.aliyuncs.com/compatible-mode/v1",
-        "model_name": "qwen-plus"
+        "model_name": "qwen-plus",
+        "max_rpm": 100
     },
     "embedding": {
         "api_key": ""
diff --git a/sources/gc-qa-rag-etl/etlapp/common/config.py b/sources/gc-qa-rag-etl/etlapp/common/config.py
@@ -19,6 +19,7 @@ class LlmConfig:
     api_key: str
     api_base: str
     model_name: str
+    max_rpm: int = 100  # 每分钟最大请求数，默认100
 
 
 @dataclass
@@ -65,6 +66,7 @@ def from_environment(cls, environment: str) -> "Config":
                 api_key=config_raw["llm"]["api_key"],
                 api_base=config_raw["llm"]["api_base"],
                 model_name=config_raw["llm"]["model_name"],
+                max_rpm=config_raw["llm"].get("max_rpm", 60),  # 默认60 RPM
             ),
             embedding=EmbeddingConfig(api_key=config_raw["embedding"]["api_key"]),
             vector_db=VectorDbConfig(host=config_raw["vector_db"]["host"]),
diff --git a/sources/gc-qa-rag-etl/etlapp/common/llm.py b/sources/gc-qa-rag-etl/etlapp/common/llm.py
@@ -1,5 +1,6 @@
 from openai import OpenAI
 from etlapp.common.config import app_config
+from etlapp.common.rate_limiter import RateLimiter
 from typing import List, Dict
 
 
@@ -9,6 +10,7 @@ def __init__(
         api_key: str = app_config.llm.api_key,
         api_base: str = app_config.llm.api_base,
         model_name: str = app_config.llm.model_name,
+        max_rpm: int = app_config.llm.max_rpm,
         system_prompt: str = "你是一个乐于解答各种问题的助手。",
         temperature: float = 0.7,
         top_p: float = 0.7,
@@ -18,8 +20,13 @@ def __init__(
         self.system_prompt = system_prompt
         self.temperature = temperature
         self.top_p = top_p
+        # 初始化限流器
+        self.rate_limiter = RateLimiter(max_requests=max_rpm, window_seconds=60)
 
     def _create_completion(self, messages: List[Dict[str, str]]) -> str:
+        # 在发送请求前进行限流
+        self.rate_limiter.wait_and_acquire()
+        
         completion = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
@@ -37,6 +44,23 @@ def chat(self, content: str) -> str:
 
     def chat_with_messages(self, messages: List[Dict[str, str]]) -> str:
         return self._create_completion(messages)
+    
+    def get_rate_limit_status(self) -> dict:
+        """Report the current state of this client's rate limiter.
+
+        Returns:
+            dict: The remaining request count, the reset time, the configured
+                maximum requests and the window length in seconds.
+        """
+        limiter = self.rate_limiter
+
+        # Entries evaluate in order: remaining count is read before reset time.
+        return {
+            "remaining_requests": limiter.get_remaining_requests(),
+            "reset_time": limiter.get_reset_time(),
+            "max_rpm": limiter.max_requests,
+            "window_seconds": limiter.window_seconds,
+        }
 
 
 # Create a default instance
diff --git a/sources/gc-qa-rag-etl/etlapp/common/rate_limiter.py b/sources/gc-qa-rag-etl/etlapp/common/rate_limiter.py
@@ -0,0 +1,97 @@
+import threading
+import time
+from collections import deque
+from typing import Optional
+
+
+class RateLimiter:
+    """Thread-safe sliding-window rate limiter (e.g. requests per minute).
+
+    Timestamps of granted requests are kept in a deque; a permit is granted
+    while fewer than ``max_requests`` of them fall inside the last
+    ``window_seconds`` seconds.
+    """
+
+    def __init__(self, max_requests: int, window_seconds: int = 60):
+        """Create a limiter.
+
+        Args:
+            max_requests: Maximum number of requests allowed per window.
+            window_seconds: Window length in seconds (default 60).
+
+        Raises:
+            ValueError: If ``max_requests`` is less than 1; such a limiter
+                could never grant a permit and blocking callers would hang.
+        """
+        if max_requests < 1:
+            raise ValueError("max_requests must be at least 1")
+        self.max_requests = max_requests
+        self.window_seconds = window_seconds
+        self.requests = deque()
+        self._lock = threading.Lock()
+
+    def _prune(self, now: float) -> None:
+        """Drop timestamps older than the window; caller holds the lock."""
+        while self.requests and now - self.requests[0] > self.window_seconds:
+            self.requests.popleft()
+
+    def acquire(self, timeout: Optional[float] = None) -> bool:
+        """Block until a permit is available or ``timeout`` elapses.
+
+        Args:
+            timeout: Maximum seconds to wait; None waits indefinitely.
+
+        Returns:
+            bool: True if a permit was granted, False on timeout.
+        """
+        # The deadline uses the monotonic clock so wall-clock adjustments
+        # cannot stretch or shorten the caller's timeout.
+        deadline = None if timeout is None else time.monotonic() + timeout
+
+        while True:
+            with self._lock:
+                now = time.time()
+                self._prune(now)
+                if len(self.requests) < self.max_requests:
+                    self.requests.append(now)
+                    return True
+                # Time until the oldest recorded request leaves the window.
+                wait = self.requests[0] + self.window_seconds - now
+
+            # Sleep only as long as needed, capped at 0.1s per poll and
+            # floored at 1ms so an exact-boundary wait cannot busy-spin.
+            wait = min(max(wait, 0.001), 0.1)
+            if deadline is not None:
+                remaining = deadline - time.monotonic()
+                if remaining <= 0:
+                    return False
+                wait = min(wait, remaining)
+            time.sleep(wait)
+
+    def wait_and_acquire(self) -> None:
+        """Block until a permit is granted (no timeout)."""
+        self.acquire(timeout=None)
+
+    def get_remaining_requests(self) -> int:
+        """Return how many more requests fit in the current window.
+
+        Returns:
+            int: Remaining request count, never negative.
+        """
+        with self._lock:
+            self._prune(time.time())
+            return max(0, self.max_requests - len(self.requests))
+
+    def get_reset_time(self) -> Optional[float]:
+        """Return when the next permit becomes available.
+
+        Returns:
+            Optional[float]: ``time.time()``-scale timestamp at which the
+                oldest request expires, or None if a request can be sent
+                immediately.
+        """
+        with self._lock:
+            self._prune(time.time())
+            if len(self.requests) < self.max_requests:
+                return None
+            return self.requests[0] + self.window_seconds