Skip to content

Commit a07d2a7

Browse files
committed
Merge pull request 'feat: support gemini-2.5-flash-image-preview' (#474) from gemini-image into main
Reviewed-on: https://git.biggo.com/Funmula/dive-mcp-host/pulls/474
2 parents adfa0df + 1fb5126 commit a07d2a7

File tree

9 files changed

+170
-54
lines changed

9 files changed

+170
-54
lines changed

dive_mcp_host/host/store/base.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,18 @@ async def get_file(self, file_path: str) -> bytes:
5656
class StoreManagerProtocol(ContextProtocol, Protocol):
5757
"""Protocol for store manager operations."""
5858

59+
async def save_base64_image(self, data: str, extension: str = "png") -> list[str]:
60+
"""Save base64 image.
61+
62+
Args:
63+
data: Image in base64
64+
extension: File extension
65+
66+
Returns:
67+
List of paths / urls
68+
"""
69+
...
70+
5971
async def upload_files(
6072
self, files: list[UploadFile | str]
6173
) -> tuple[list[str], list[str]]:

dive_mcp_host/httpd/_main.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,13 @@ def main() -> None:
4343
if args.cors_origin:
4444
service_config_manager.current_setting.cors_origin = args.cors_origin
4545

46-
service_config_manager.current_setting.logging_config["root"]["level"] = (
47-
args.log_level
48-
)
49-
service_config_manager.current_setting.logging_config["loggers"]["dive_mcp_host"][
50-
"level"
51-
] = args.log_level
46+
if args.log_level:
47+
service_config_manager.current_setting.logging_config["root"]["level"] = (
48+
args.log_level
49+
)
50+
service_config_manager.current_setting.logging_config["loggers"][
51+
"dive_mcp_host"
52+
]["level"] = args.log_level
5253

5354
if args.log_dir:
5455
log_dir = Path(args.log_dir)

dive_mcp_host/httpd/conf/arguments.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,8 @@ class Arguments(BaseModel):
102102
description="Directory to write log files.",
103103
)
104104

105-
log_level: str = Field(
106-
default="INFO",
105+
log_level: str | None = Field(
106+
default=None,
107107
description="Log level to use.",
108108
)
109109

dive_mcp_host/httpd/conf/httpd_service.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ class ServiceConfig(BaseModel):
7272
}
7373
},
7474
"root": {"level": "INFO", "handlers": ["default"]},
75-
"loggers": {"dive_mcp_host": {"level": "DEBUG"}},
75+
"loggers": {"dive_mcp_host": {"level": "INFO"}},
7676
}
7777

7878

dive_mcp_host/httpd/routers/utils.py

Lines changed: 50 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,14 @@
66
from collections.abc import AsyncGenerator, AsyncIterator, Callable, Coroutine
77
from contextlib import AsyncExitStack, suppress
88
from dataclasses import asdict, dataclass, field
9+
from hashlib import md5
910
from itertools import batched
1011
from pathlib import Path
1112
from typing import TYPE_CHECKING, Any, Literal, Self
1213
from urllib.parse import urlparse
1314
from uuid import uuid4
1415

1516
from fastapi.responses import StreamingResponse
16-
from langchain_core.language_models import BaseChatModel
1717
from langchain_core.messages import (
1818
AIMessage,
1919
BaseMessage,
@@ -33,7 +33,7 @@
3333
from dive_mcp_host.host.agents.message_order import FAKE_TOOL_RESPONSE
3434
from dive_mcp_host.host.custom_events import ToolCallProgress
3535
from dive_mcp_host.host.errors import LogBufferNotFoundError
36-
from dive_mcp_host.host.store.base import FileType
36+
from dive_mcp_host.host.store.base import FileType, StoreManagerProtocol
3737
from dive_mcp_host.host.tools.log import LogEvent, LogManager, LogMsg
3838
from dive_mcp_host.host.tools.model_types import ClientState
3939
from dive_mcp_host.httpd.conf.prompt import PromptKey
@@ -58,7 +58,6 @@
5858

5959
if TYPE_CHECKING:
6060
from dive_mcp_host.host.host import DiveMcpHost
61-
from dive_mcp_host.host.store.base import StoreManagerProtocol
6261
from dive_mcp_host.httpd.middlewares.general import DiveUser
6362

6463
title_prompt = """You are a title generator from the user input.
@@ -185,31 +184,54 @@ class ContentHandler:
185184

186185
def __init__(
187186
self,
188-
model: BaseChatModel,
189-
str_output_parser: StrOutputParser,
187+
store: StoreManagerProtocol,
190188
) -> None:
191-
"""Initialize ContentHandler
192-
193-
Args:
194-
- model: To verify which model it is.
195-
- str_output_parser: Used for extracting text content from AIMessage.
196-
"""
197-
self._model = model
198-
self._str_output_parser = str_output_parser
189+
"""Initialize ContentHandler."""
190+
self._store = store
191+
self._str_output_parser = StrOutputParser()
192+
# Cache that contains the md5 hash and file path / urls for the file.
193+
# Prevents duplicate save / uploads.
194+
self._cache: dict[str, list[str]] = {}
199195

200-
def invoke(self, msg: AIMessage) -> str:
201-
"""Extract content from AIMessage."""
196+
async def invoke(self, msg: AIMessage) -> str:
197+
"""Extract various types of content."""
202198
result = self._text_content(msg)
199+
model_name = msg.response_metadata.get("model_name")
203200

204-
if self._model.name in {"gemini-2.5-flash-image-preview"}:
205-
result = f"{result} {self._gemini_25_image(msg)}"
201+
if model_name in {"gemini-2.5-flash-image-preview"}:
202+
result = f"{result} {await self._gemini_25_image(msg)}"
206203

207204
return result
208205

209206
def _text_content(self, msg: AIMessage) -> str:
210207
return self._str_output_parser.invoke(msg)
211208

212-
def _gemini_25_image(self, msg: AIMessage) -> str:
209+
async def _save_with_cache(self, data: str) -> list[str]:
210+
"""Prevents duplicate save and uploads.
211+
212+
Returns:
213+
Saved locations, 'local file path' or 'url'
214+
"""
215+
md5_hash = md5(data.encode(), usedforsecurity=False).hexdigest()
216+
locations = self._cache.get(md5_hash)
217+
if not locations:
218+
locations = await self._store.save_base64_image(data)
219+
self._cache[md5_hash] = locations
220+
return locations
221+
222+
def _retrive_optimal_location(self, locations: list[str]) -> str:
223+
"""Prioritize urls, prevents broken image in case we need to sync
224+
user chat history someday.
225+
""" # noqa: D205
226+
url = locations[0]
227+
for item in locations[1:]:
228+
if self._store.is_url(item):
229+
url = item
230+
if self._store.is_local_file(url):
231+
url = f"file://{url}"
232+
return url
233+
234+
async def _gemini_25_image(self, msg: AIMessage) -> str:
213235
"""Gemini will return base64 image content.
214236
215237
{
@@ -230,10 +252,14 @@ def _gemini_25_image(self, msg: AIMessage) -> str:
230252
if (
231253
isinstance(content, dict)
232254
and (image_url := content.get("image_url"))
233-
and (url := image_url.get("url"))
255+
and (inline_base64 := image_url.get("url"))
234256
):
235-
markdown_image_tag = f"![image]({url})"
236-
result = f"{result} {markdown_image_tag}"
257+
base64_data: str = inline_base64.split(",")[-1]
258+
assert isinstance(base64_data, str), "base64_data must be string"
259+
locations = await self._save_with_cache(base64_data)
260+
url = self._retrive_optimal_location(locations)
261+
image_tag = f"![image]({url})"
262+
result = f"{result} {image_tag}"
237263

238264
return result
239265

@@ -254,9 +280,7 @@ def __init__(
254280
self.store: StoreManagerProtocol = app.store
255281
self.dive_host: DiveMcpHost = app.dive_host["default"]
256282
self._str_output_parser = StrOutputParser()
257-
self._content_handler = ContentHandler(
258-
self.dive_host.model, self._str_output_parser
259-
)
283+
self._content_handler = ContentHandler(self.store)
260284
self.disable_dive_system_prompt = (
261285
app.model_config_manager.full_config.disable_dive_system_prompt
262286
if app.model_config_manager.full_config
@@ -395,7 +419,7 @@ async def handle_chat( # noqa: C901, PLR0912, PLR0915
395419
total_run_time=duration,
396420
)
397421
result = (
398-
self._str_output_parser.invoke(message)
422+
await self._content_handler.invoke(message)
399423
if message.content
400424
else ""
401425
)
@@ -550,7 +574,7 @@ def _prompt_cb(_: Any) -> list[BaseMessage]:
550574
raise RuntimeError("Unreachable")
551575

552576
async def _stream_text_msg(self, message: AIMessage) -> None:
553-
content = self._content_handler.invoke(message)
577+
content = await self._content_handler.invoke(message)
554578
if content:
555579
await self.stream.write(StreamMessage(type="text", content=content))
556580
if message.response_metadata.get("stop_reason") == "max_tokens":

dive_mcp_host/httpd/store/local.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,26 @@ def __init__(self, root_dir: Path = RESOURCE_DIR) -> None:
2222
upload_dir.mkdir(parents=True, exist_ok=True)
2323
self.upload_dir = upload_dir
2424

25+
def save_base64_image(self, base64_str: str, extension: str = "png") -> str:
26+
"""Save base64 image to file.
27+
28+
Args:
29+
base64_str: Image in base64
30+
extension: File extension
31+
32+
Returns:
33+
File path to image file
34+
"""
35+
base64_bytes = BytesIO(base64.b64decode(base64_str))
36+
pil_image = Image.open(base64_bytes)
37+
file_name = f"{self._gen_rand_str()}.{extension}"
38+
file_path = self.upload_dir / file_name
39+
pil_image.save(file_path)
40+
return str(file_path)
41+
42+
def _gen_rand_str(self) -> str:
43+
return f"{int(time.time() * 1000)}-{randint(0, int(1e9))}" # noqa: S311
44+
2545
async def save_file(
2646
self,
2747
file: UploadFile | str,
@@ -35,7 +55,7 @@ async def save_file(
3555

3656
ext = Path(file.filename).suffix
3757

38-
tmp_name = f"{int(time.time() * 1000)}-{randint(0, int(1e9))}{ext}" # noqa: S311
58+
tmp_name = f"{self._gen_rand_str()}{ext}"
3959
tmp_file = self.upload_dir.joinpath(tmp_name)
4060

4161
hash_md5 = md5(usedforsecurity=False)

dive_mcp_host/httpd/store/manager.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,30 @@ async def _run_in_context(self) -> AsyncGenerator[Self, None]:
7676
self._storages.append(store)
7777
yield self
7878

79+
async def save_base64_image(self, data: str, extension: str = "png") -> list[str]:
80+
"""Save base64 image.
81+
82+
Args:
83+
data: Image in base64
84+
extension: File extension
85+
86+
Returns:
87+
List of paths / urls
88+
"""
89+
path = self._local_store.save_base64_image(data, extension)
90+
additional_paths = await self._run_storage_callbacks(path)
91+
return [path, *additional_paths]
92+
93+
async def _run_storage_callbacks(self, file: UploadFile | str) -> list[str]:
94+
tasks: list[asyncio.Task] = []
95+
async with asyncio.TaskGroup() as tg:
96+
for store in self._storages:
97+
tasks.append(tg.create_task(store.save_file(file)))
98+
return [i.result() for i in tasks if i.result()]
99+
79100
async def save_files(
80-
self, files: list[UploadFile | str]
101+
self,
102+
files: list[UploadFile | str],
81103
) -> list[tuple[FileType, list[str]]]:
82104
"""Save files to the stores.
83105
@@ -95,11 +117,7 @@ async def save_files(
95117
continue
96118
paths = [path]
97119
if self._storage_callbacks:
98-
tasks: list[asyncio.Task] = []
99-
async with asyncio.TaskGroup() as tg:
100-
for store in self._storages:
101-
tasks.append(tg.create_task(store.save_file(file)))
102-
additional_paths = [i.result() for i in tasks if i.result()]
120+
additional_paths = await self._run_storage_callbacks(file)
103121
paths.extend(additional_paths)
104122
all_paths.append((FileType.from_file_path(path), paths))
105123
return all_paths

tests/httpd/test_chat_processor.py

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,25 @@
1+
import tempfile
12
import uuid
23
from collections.abc import AsyncGenerator
3-
from typing import Any, cast
4+
from hashlib import md5
5+
from typing import TYPE_CHECKING, Any, cast
6+
from unittest.mock import AsyncMock
47

58
import pytest
69
import pytest_asyncio
710
from langchain_core.messages import AIMessage, HumanMessage
8-
from langchain_core.output_parsers import StrOutputParser
911

1012
from dive_mcp_host.httpd.conf.httpd_service import ServiceManager
1113
from dive_mcp_host.httpd.conf.mcp_servers import Config
1214
from dive_mcp_host.httpd.conf.prompt import PromptKey
1315
from dive_mcp_host.httpd.routers.utils import ChatProcessor, ContentHandler
1416
from dive_mcp_host.httpd.server import DiveHostAPI
15-
from dive_mcp_host.models.fake import FakeMessageToolModel, load_model
17+
from dive_mcp_host.httpd.store.manager import StoreManager
1618
from tests.httpd.routers.conftest import config_files # noqa: F401
1719

20+
if TYPE_CHECKING:
21+
from dive_mcp_host.models.fake import FakeMessageToolModel
22+
1823

1924
@pytest_asyncio.fixture
2025
async def server(config_files) -> AsyncGenerator[DiveHostAPI, None]: # noqa: F811
@@ -113,22 +118,59 @@ async def test_generate_title(processor: ChatProcessor):
113118
assert r == "Simple Greeting 2"
114119

115120

116-
def test_content_handler_gemini_image():
121+
@pytest.mark.asyncio
122+
async def test_content_handler_gemini_image_with_url():
117123
"""Check if content handler can extract what is needed."""
118-
model = load_model()
119-
model.name = "gemini-2.5-flash-image-preview"
120-
content_handler = ContentHandler(model, StrOutputParser())
124+
store = StoreManager()
125+
store.save_base64_image = AsyncMock(
126+
return_value=["/some/path", "http://someurl.com"]
127+
)
128+
content_handler = ContentHandler(store)
121129
message = AIMessage(
122130
content=[
123131
"Here is a cuddly cat wearing a hat! ",
124132
{
125133
"type": "image_url",
126134
"image_url": {"url": "data:image/png;base64,XXXXXXXX"},
127135
},
128-
]
136+
],
137+
response_metadata={"model_name": "gemini-2.5-flash-image-preview"},
129138
)
130-
content = content_handler.invoke(message)
139+
content = await content_handler.invoke(message)
131140
assert (
132-
content
133-
== "Here is a cuddly cat wearing a hat! ![image](data:image/png;base64,XXXXXXXX)" # noqa: E501
141+
content == "Here is a cuddly cat wearing a hat! ![image](http://someurl.com)"
134142
)
143+
144+
# Cache should exist
145+
md5_hash = md5(b"XXXXXXXX", usedforsecurity=False).hexdigest()
146+
assert md5_hash in content_handler._cache
147+
assert content_handler._cache[md5_hash] == ["/some/path", "http://someurl.com"]
148+
149+
150+
@pytest.mark.asyncio
151+
async def test_content_handler_gemini_image_with_local_file():
152+
"""Make sure local file also works."""
153+
with tempfile.NamedTemporaryFile(prefix="dummyfile-") as f:
154+
store = StoreManager()
155+
store.save_base64_image = AsyncMock(return_value=[f.name])
156+
content_handler = ContentHandler(store)
157+
message = AIMessage(
158+
content=[
159+
"Here is a cuddly cat wearing a hat! ",
160+
{
161+
"type": "image_url",
162+
"image_url": {"url": "data:image/png;base64,XXXXXXXX"},
163+
},
164+
],
165+
response_metadata={"model_name": "gemini-2.5-flash-image-preview"},
166+
)
167+
content = await content_handler.invoke(message)
168+
assert (
169+
content
170+
== f"Here is a cuddly cat wearing a hat! ![image](file://{f.name})"
171+
)
172+
173+
# Cache should exist
174+
md5_hash = md5(b"XXXXXXXX", usedforsecurity=False).hexdigest()
175+
assert md5_hash in content_handler._cache
176+
assert content_handler._cache[md5_hash] == [f.name]

tests/httpd/test_content_handler.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments
 (0)