Skip to content

Commit 0f1ccbf

Browse files
committed
Merge pull request 'chore: support gemini-2.5-flash-image-preview' (#473) from gemini-image into development
Reviewed-on: https://git.biggo.com/Funmula/dive-mcp-host/pulls/473
2 parents c898906 + df9a62a commit 0f1ccbf

File tree

4 files changed

+117
-3
lines changed

4 files changed

+117
-3
lines changed

dive_mcp_host/httpd/routers/utils.py

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from uuid import uuid4
1414

1515
from fastapi.responses import StreamingResponse
16+
from langchain_core.language_models import BaseChatModel
1617
from langchain_core.messages import (
1718
AIMessage,
1819
BaseMessage,
@@ -176,6 +177,67 @@ class ImageAndDocuments:
176177
documents: list[str] = field(default_factory=list)
177178

178179

180+
class ContentHandler:
    """Some models will return more than just pure text in content response.

    We need to have a customized handler for those special models.
    """

    def __init__(
        self,
        model: BaseChatModel,
        str_output_parser: StrOutputParser,
    ) -> None:
        """Initialize ContentHandler.

        Args:
            - model: To verify which model it is.
            - str_output_parser: Used for extracting text content from AIMessage.
        """
        self._model = model
        self._str_output_parser = str_output_parser

    def invoke(self, msg: AIMessage) -> str:
        """Extract content from AIMessage.

        Returns the plain-text content; for image-generating models the
        base64 image payloads are appended as markdown image tags.
        """
        result = self._text_content(msg)

        if self._model.name in {"gemini-2.5-flash-image-preview"}:
            # Only append when an image part is actually present; the previous
            # unconditional f-string left a trailing space on text-only replies
            # from image-capable models.
            if image_part := self._gemini_25_image(msg):
                # NOTE(review): parts are joined with a single separator space;
                # if the text block already ends with whitespace the output can
                # contain consecutive spaces — confirm this is acceptable.
                result = f"{result} {image_part}"

        return result

    def _text_content(self, msg: AIMessage) -> str:
        # Delegate plain-text extraction to the shared StrOutputParser.
        return self._str_output_parser.invoke(msg)

    def _gemini_25_image(self, msg: AIMessage) -> str:
        """Gemini will return base64 image content.

        {
            "content": [
                "Here is a cuddly cat wearing a hat! ",
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "data:image/png;base64,XXXXXXXX"
                    }
                }
            ]
        }

        Returns a (possibly empty) string of markdown image tags, one per
        image block found in ``msg.content``.
        """
        result = ""
        # ``msg.content`` may also be a plain str; iterating a str yields
        # characters, which the isinstance(dict) guard below safely skips.
        for content in msg.content:
            if (
                isinstance(content, dict)
                and (image_url := content.get("image_url"))
                and (url := image_url.get("url"))
            ):
                markdown_image_tag = f"![image]({url})"
                result = f"{result} {markdown_image_tag}"

        return result
239+
240+
179241
class ChatProcessor:
180242
"""Chat processor."""
181243

@@ -192,6 +254,9 @@ def __init__(
192254
self.store: StoreManagerProtocol = app.store
193255
self.dive_host: DiveMcpHost = app.dive_host["default"]
194256
self._str_output_parser = StrOutputParser()
257+
self._content_handler = ContentHandler(
258+
self.dive_host.model, self._str_output_parser
259+
)
195260
self.disable_dive_system_prompt = (
196261
app.model_config_manager.full_config.disable_dive_system_prompt
197262
if app.model_config_manager.full_config
@@ -485,7 +550,7 @@ def _prompt_cb(_: Any) -> list[BaseMessage]:
485550
raise RuntimeError("Unreachable")
486551

487552
async def _stream_text_msg(self, message: AIMessage) -> None:
488-
content = self._str_output_parser.invoke(message)
553+
content = self._content_handler.invoke(message)
489554
if content:
490555
await self.stream.write(StreamMessage(type="text", content=content))
491556
if message.response_metadata.get("stop_reason") == "max_tokens":

tests/httpd/test_chat_processor.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55
import pytest
66
import pytest_asyncio
77
from langchain_core.messages import AIMessage, HumanMessage
8+
from langchain_core.output_parsers import StrOutputParser
89

910
from dive_mcp_host.httpd.conf.httpd_service import ServiceManager
1011
from dive_mcp_host.httpd.conf.mcp_servers import Config
1112
from dive_mcp_host.httpd.conf.prompt import PromptKey
12-
from dive_mcp_host.httpd.routers.utils import ChatProcessor
13+
from dive_mcp_host.httpd.routers.utils import ChatProcessor, ContentHandler
1314
from dive_mcp_host.httpd.server import DiveHostAPI
14-
from dive_mcp_host.models.fake import FakeMessageToolModel # noqa: TC001
15+
from dive_mcp_host.models.fake import FakeMessageToolModel, load_model
1516
from tests.httpd.routers.conftest import config_files # noqa: F401
1617

1718

@@ -110,3 +111,24 @@ async def test_generate_title(processor: ChatProcessor):
110111
assert r == "Simple Greeting"
111112
r = await processor._generate_title("Hello, how are you?")
112113
assert r == "Simple Greeting 2"
114+
115+
116+
def test_content_handler_gemini_image():
    """Check if content handler can extract what is needed."""
    fake_model = load_model()
    fake_model.name = "gemini-2.5-flash-image-preview"
    handler = ContentHandler(fake_model, StrOutputParser())

    image_block = {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64,XXXXXXXX"},
    }
    message = AIMessage(content=["Here is a cuddly cat wearing a hat! ", image_block])

    expected = (
        "Here is a cuddly cat wearing a hat! "
        "![image](data:image/png;base64,XXXXXXXX)"
    )
    assert handler.invoke(message) == expected
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

tests/test_providers.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,32 @@ async def test_host_google(echo_tool_stdio_config: dict[str, ServerConfig]) -> N
186186
await _run_the_test(config)
187187

188188

189+
@pytest.mark.asyncio
async def test_host_google_image_gen(
    echo_tool_stdio_config: dict[str, ServerConfig],
) -> None:
    """Test the host context initialization."""
    # Guard clause: bail out early when no credentials are available.
    api_key = environ.get("GOOGLE_API_KEY")
    if not api_key:
        pytest.skip("need environment variable GOOGLE_API_KEY to run this test")

    llm = LLMConfig(
        model="gemini-2.5-flash-image-preview",
        model_provider="google-genai",
        api_key=SecretStr(api_key),
        configuration=LLMConfiguration(
            temperature=0.0,
            top_p=0,
        ),
        tools_in_prompt=True,
        disable_streaming=True,
    )
    config = HostConfig(llm=llm, mcp_servers=echo_tool_stdio_config)

    await _run_the_test(config)
213+
214+
189215
@pytest.mark.asyncio
190216
async def test_bedrock(echo_tool_stdio_config: dict[str, ServerConfig]) -> None:
191217
"""Test the host context initialization."""

0 commit comments

Comments
 (0)