
Commit a877c62

feat: support image input to LLM clients (#653)
* feat: support image as input in ExtractByLlm
* feat: add support for dynamic image MIME type detection
* refactor: update infer initialization in image MIME type detection
* feat: use serde for base64 image handling in Ollama
* feat: polish system prompt to support image input handling
* feat: update argument handling logic
* feat: refactor base64 encoding to use prelude for image handling
* feat: make image captioning support with Ollama integration optional
* feat: add optional envvar for checking Ollama model
* feat: update Ollama model handling to allow dynamic model specification
1 parent 03b472c commit a877c62

File tree

10 files changed: +690 / -545 lines changed

Cargo.lock

Lines changed: 498 additions & 509 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -115,3 +115,5 @@ aws-config = "1.6.2"
 aws-sdk-s3 = "1.85.0"
 aws-sdk-sqs = "1.67.0"
 numpy = "0.25.0"
+infer = "0.19.0"
+serde_with = { version = "3.13.0", features = ["base64"] }

examples/image_search/README.md

Lines changed: 11 additions & 1 deletion
@@ -13,6 +13,7 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
 - CLIP ViT-L/14 - Embeddings Model for images and query
 - Qdrant for Vector Storage
 - FastApi for backend
+- Ollama (Optional) for generating image captions using `gemma3`.
 
 ## Setup
 - Make sure Postgres and Qdrant are running
@@ -21,7 +22,16 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
 export COCOINDEX_DATABASE_URL="postgres://cocoindex:cocoindex@localhost/cocoindex"
 ```
 
-## Run
+## (Optional) Run Ollama
+
+- This enables automatic image captioning
+```
+ollama pull gemma3
+ollama serve
+export OLLAMA_MODEL="gemma3" # Optional, for caption generation
+```
+
+## Run the App
 - Install dependencies:
 ```
 pip install -e .

examples/image_search/main.py

Lines changed: 41 additions & 6 deletions
@@ -15,6 +15,7 @@
 from qdrant_client import QdrantClient
 from transformers import CLIPModel, CLIPProcessor
 
+OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/")
 QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6334/")
 QDRANT_COLLECTION = "ImageSearch"
 CLIP_MODEL_NAME = "openai/clip-vit-large-patch14"
@@ -69,12 +70,39 @@ def image_object_embedding_flow(
     )
     img_embeddings = data_scope.add_collector()
     with data_scope["images"].row() as img:
+        ollama_model_name = os.getenv("OLLAMA_MODEL")
+        if ollama_model_name is not None:
+            # If an Ollama model is specified, generate an image caption
+            img["caption"] = flow_builder.transform(
+                cocoindex.functions.ExtractByLlm(
+                    llm_spec=cocoindex.llm.LlmSpec(
+                        api_type=cocoindex.LlmApiType.OLLAMA, model=ollama_model_name
+                    ),
+                    instruction=(
+                        "Describe the image in one detailed sentence. "
+                        "Name all visible animal species, objects, and the main scene. "
+                        "Be specific about type, color, and notable features. "
+                        "Mention what each animal is doing."
+                    ),
+                    output_type=str,
+                ),
+                image=img["content"],
+            )
         img["embedding"] = img["content"].transform(embed_image)
-        img_embeddings.collect(
-            id=cocoindex.GeneratedField.UUID,
-            filename=img["filename"],
-            embedding=img["embedding"],
-        )
+
+        collect_fields = {
+            "id": cocoindex.GeneratedField.UUID,
+            "filename": img["filename"],
+            "embedding": img["embedding"],
+        }
+
+        if ollama_model_name is not None:
+            print(f"Using Ollama model '{ollama_model_name}' for captioning.")
+            collect_fields["caption"] = img["caption"]
+        else:
+            print(f"No Ollama model '{ollama_model_name}' found — skipping captioning.")
+
+        img_embeddings.collect(**collect_fields)
 
     img_embeddings.export(
         "img_embeddings",
@@ -126,11 +154,18 @@ def search(
         collection_name=QDRANT_COLLECTION,
         query_vector=("embedding", query_embedding),
         limit=limit,
+        with_payload=True,
     )
 
     return {
         "results": [
-            {"filename": result.payload["filename"], "score": result.score}
+            {
+                "filename": result.payload["filename"],
+                "score": result.score,
+                "caption": result.payload.get(
+                    "caption"
+                ),  # Include caption if available
+            }
            for result in search_results
         ]
     }

src/llm/anthropic.rs

Lines changed: 26 additions & 2 deletions
@@ -1,8 +1,10 @@
 use crate::llm::{
-    LlmGenerateRequest, LlmGenerateResponse, LlmGenerationClient, OutputFormat, ToJsonSchemaOptions,
+    LlmGenerateRequest, LlmGenerateResponse, LlmGenerationClient, OutputFormat,
+    ToJsonSchemaOptions, detect_image_mime_type,
 };
 use anyhow::{Context, Result, bail};
 use async_trait::async_trait;
+use base64::prelude::*;
 use json5;
 use serde_json::Value;
 
@@ -36,9 +38,31 @@ impl LlmGenerationClient for Client {
         &self,
         request: LlmGenerateRequest<'req>,
     ) -> Result<LlmGenerateResponse> {
+        let mut user_content_parts: Vec<serde_json::Value> = Vec::new();
+
+        // Add image part if present
+        if let Some(image_bytes) = &request.image {
+            let base64_image = BASE64_STANDARD.encode(image_bytes.as_ref());
+            let mime_type = detect_image_mime_type(image_bytes.as_ref())?;
+            user_content_parts.push(serde_json::json!({
+                "type": "image",
+                "source": {
+                    "type": "base64",
+                    "media_type": mime_type,
+                    "data": base64_image,
+                }
+            }));
+        }
+
+        // Add text part
+        user_content_parts.push(serde_json::json!({
+            "type": "text",
+            "text": request.user_prompt
+        }));
+
         let messages = vec![serde_json::json!({
             "role": "user",
-            "content": request.user_prompt
+            "content": user_content_parts
         })];
 
         let mut payload = serde_json::json!({

src/llm/gemini.rs

Lines changed: 21 additions & 3 deletions
@@ -2,8 +2,9 @@ use crate::prelude::*;
 
 use crate::llm::{
     LlmEmbeddingClient, LlmGenerateRequest, LlmGenerateResponse, LlmGenerationClient, OutputFormat,
-    ToJsonSchemaOptions,
+    ToJsonSchemaOptions, detect_image_mime_type,
 };
+use base64::prelude::*;
 use phf::phf_map;
 use serde_json::Value;
 use urlencoding::encode;
@@ -70,10 +71,27 @@ impl LlmGenerationClient for Client {
         &self,
         request: LlmGenerateRequest<'req>,
     ) -> Result<LlmGenerateResponse> {
-        // Compose the prompt/messages
+        let mut user_parts: Vec<serde_json::Value> = Vec::new();
+
+        // Add text part first
+        user_parts.push(serde_json::json!({ "text": request.user_prompt }));
+
+        // Add image part if present
+        if let Some(image_bytes) = &request.image {
+            let base64_image = BASE64_STANDARD.encode(image_bytes.as_ref());
+            let mime_type = detect_image_mime_type(image_bytes.as_ref())?;
+            user_parts.push(serde_json::json!({
+                "inlineData": {
+                    "mimeType": mime_type,
+                    "data": base64_image
+                }
+            }));
+        }
+
+        // Compose the contents
         let contents = vec![serde_json::json!({
             "role": "user",
-            "parts": [{ "text": request.user_prompt }]
+            "parts": user_parts
         })];
 
         // Prepare payload

src/llm/mod.rs

Lines changed: 12 additions & 0 deletions
@@ -1,9 +1,12 @@
 use crate::prelude::*;
 
 use crate::base::json_schema::ToJsonSchemaOptions;
+use infer::Infer;
 use schemars::schema::SchemaObject;
 use std::borrow::Cow;
 
+static INFER: LazyLock<Infer> = LazyLock::new(Infer::new);
+
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
 pub enum LlmApiType {
     Ollama,
@@ -36,6 +39,7 @@ pub struct LlmGenerateRequest<'a> {
     pub model: &'a str,
     pub system_prompt: Option<Cow<'a, str>>,
     pub user_prompt: Cow<'a, str>,
+    pub image: Option<Cow<'a, [u8]>>,
     pub output_format: Option<OutputFormat<'a>>,
 }
 
@@ -141,3 +145,11 @@ pub fn new_llm_embedding_client(
     };
     Ok(client)
 }
+
+pub fn detect_image_mime_type(bytes: &[u8]) -> Result<&'static str> {
+    let infer = &*INFER;
+    match infer.get(bytes) {
+        Some(info) if info.mime_type().starts_with("image/") => Ok(info.mime_type()),
+        _ => bail!("Unknown or unsupported image format"),
+    }
+}
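Aside: the new helper caches a single `infer::Infer` matcher in a `LazyLock` (presumably re-exported through the crate's prelude) and accepts only `image/*` MIME types. Below is a self-contained sketch of the same detection logic for trying it outside the crate; the sample bytes and `main` function are illustrative only, not part of the commit.

```
use std::sync::LazyLock;

use anyhow::{Result, bail};
use infer::Infer;

// Reuse one matcher instance, mirroring the `static INFER: LazyLock<Infer>` above.
static INFER: LazyLock<Infer> = LazyLock::new(Infer::new);

fn detect_image_mime_type(bytes: &[u8]) -> Result<&'static str> {
    // `infer` sniffs magic bytes, so only the first few bytes of the buffer matter.
    match INFER.get(bytes) {
        Some(info) if info.mime_type().starts_with("image/") => Ok(info.mime_type()),
        _ => bail!("Unknown or unsupported image format"),
    }
}

fn main() -> Result<()> {
    // PNG magic number; any real image file would be detected the same way.
    let png_header = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
    println!("{}", detect_image_mime_type(&png_header)?); // prints "image/png"
    Ok(())
}
```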

src/llm/ollama.rs

Lines changed: 5 additions & 0 deletions
@@ -2,6 +2,7 @@ use crate::prelude::*;
 
 use super::LlmGenerationClient;
 use schemars::schema::SchemaObject;
+use serde_with::{base64::Base64, serde_as};
 
 pub struct Client {
     generate_url: String,
@@ -14,10 +15,13 @@ enum OllamaFormat<'a> {
     JsonSchema(&'a SchemaObject),
 }
 
+#[serde_as]
 #[derive(Debug, Serialize)]
 struct OllamaRequest<'a> {
     pub model: &'a str,
     pub prompt: &'a str,
+    #[serde_as(as = "Option<Vec<Base64>>")]
+    pub images: Option<Vec<&'a [u8]>>,
     pub format: Option<OllamaFormat<'a>>,
     pub system: Option<&'a str>,
     pub stream: Option<bool>,
@@ -52,6 +56,7 @@ impl LlmGenerationClient for Client {
         let req = OllamaRequest {
             model: request.model,
            prompt: request.user_prompt.as_ref(),
+            images: request.image.as_deref().map(|img| vec![img.as_ref()]),
             format: request.output_format.as_ref().map(
                 |super::OutputFormat::JsonSchema { schema, .. }| {
                     OllamaFormat::JsonSchema(schema.as_ref())
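Aside: the `#[serde_as(as = "Option<Vec<Base64>>")]` attribute is what turns the raw byte slices into the base64 strings Ollama's `images` field expects. Below is a self-contained sketch of just that serialization step; the `ImageRequest` struct and sample bytes are illustrative stand-ins, not the crate's internal types.

```
use serde::Serialize;
use serde_with::{base64::Base64, serde_as};

// Illustrative stand-in for the request struct above; only the fields
// relevant to base64 encoding are kept.
#[serde_as]
#[derive(Serialize)]
struct ImageRequest<'a> {
    prompt: &'a str,
    #[serde_as(as = "Option<Vec<Base64>>")]
    images: Option<Vec<&'a [u8]>>,
}

fn main() {
    let png_header: &[u8] = &[0x89, 0x50, 0x4E, 0x47];
    let req = ImageRequest {
        prompt: "Describe the image in one detailed sentence.",
        images: Some(vec![png_header]),
    };
    // Each byte slice is emitted as a base64 string, e.g.
    // {"prompt":"...","images":["iVBORw=="]}
    println!("{}", serde_json::to_string(&req).unwrap());
}
```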

src/llm/openai.rs

Lines changed: 31 additions & 7 deletions
@@ -1,18 +1,21 @@
 use crate::api_bail;
 
-use super::{LlmEmbeddingClient, LlmGenerationClient};
+use super::{LlmEmbeddingClient, LlmGenerationClient, detect_image_mime_type};
 use anyhow::Result;
 use async_openai::{
     Client as OpenAIClient,
     config::OpenAIConfig,
     types::{
-        ChatCompletionRequestMessage, ChatCompletionRequestSystemMessage,
+        ChatCompletionRequestMessage, ChatCompletionRequestMessageContentPartImage,
+        ChatCompletionRequestMessageContentPartText, ChatCompletionRequestSystemMessage,
         ChatCompletionRequestSystemMessageContent, ChatCompletionRequestUserMessage,
-        ChatCompletionRequestUserMessageContent, CreateChatCompletionRequest,
-        CreateEmbeddingRequest, EmbeddingInput, ResponseFormat, ResponseFormatJsonSchema,
+        ChatCompletionRequestUserMessageContent, ChatCompletionRequestUserMessageContentPart,
+        CreateChatCompletionRequest, CreateEmbeddingRequest, EmbeddingInput, ImageDetail,
+        ResponseFormat, ResponseFormatJsonSchema,
     },
 };
 use async_trait::async_trait;
+use base64::prelude::*;
 use phf::phf_map;
 
 static DEFAULT_EMBEDDING_DIMENSIONS: phf::Map<&str, u32> = phf_map! {
@@ -64,11 +67,32 @@ impl LlmGenerationClient for Client {
         }
 
         // Add user message
+        let user_message_content = match request.image {
+            Some(img_bytes) => {
+                let base64_image = BASE64_STANDARD.encode(img_bytes.as_ref());
+                let mime_type = detect_image_mime_type(img_bytes.as_ref())?;
+                let image_url = format!("data:{};base64,{}", mime_type, base64_image);
+                ChatCompletionRequestUserMessageContent::Array(vec![
+                    ChatCompletionRequestUserMessageContentPart::Text(
+                        ChatCompletionRequestMessageContentPartText {
+                            text: request.user_prompt.into_owned(),
+                        },
+                    ),
+                    ChatCompletionRequestUserMessageContentPart::ImageUrl(
+                        ChatCompletionRequestMessageContentPartImage {
+                            image_url: async_openai::types::ImageUrl {
+                                url: image_url,
+                                detail: Some(ImageDetail::Auto),
+                            },
+                        },
+                    ),
+                ])
+            }
+            None => ChatCompletionRequestUserMessageContent::Text(request.user_prompt.into_owned()),
+        };
         messages.push(ChatCompletionRequestMessage::User(
             ChatCompletionRequestUserMessage {
-                content: ChatCompletionRequestUserMessageContent::Text(
-                    request.user_prompt.into_owned(),
-                ),
+                content: user_message_content,
                 ..Default::default()
             },
         ));
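Aside: unlike the Anthropic and Gemini clients, the OpenAI chat API has no dedicated inline-bytes field, so the image is passed as an `image_url` content part whose URL is a base64 data URL, built with the same `format!("data:{};base64,{}", ...)` pattern as above. A minimal sketch of that encoding step in isolation; the `to_data_url` helper and sample bytes are illustrative, not part of the commit.

```
use base64::prelude::*;

// Illustrative helper: wrap raw image bytes as a data URL suitable for an
// `image_url` content part.
fn to_data_url(mime_type: &str, image_bytes: &[u8]) -> String {
    format!("data:{};base64,{}", mime_type, BASE64_STANDARD.encode(image_bytes))
}

fn main() {
    let png_header = [0x89u8, 0x50, 0x4E, 0x47];
    // Prints: data:image/png;base64,iVBORw==
    println!("{}", to_data_url("image/png", &png_header));
}
```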
