Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ on:
branches:
- main
- dev
- feat/vlm
workflow_dispatch:

jobs:
Expand Down
9 changes: 6 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
ARG POETRY_VERSION=1.6.1

FROM nvidia/cuda:12.8.1-devel-ubuntu24.04
FROM vllm/vllm-openai:v0.10.1.1
# Allow statements and log messages to immediately appear in the logs
ENV PYTHONUNBUFFERED True

Expand All @@ -9,7 +9,10 @@ RUN apt-get update && apt-get install -y tzdata
# ENV TZ Asia/Tokyo

RUN apt-get update && \
apt-get install --yes --no-install-recommends curl g++ libopencv-dev python3 python3-pip python3-dev && \
apt-get install --yes --no-install-recommends curl g++ libopencv-dev python3 python3-pip python3-dev fonts-noto-core \
fonts-noto-cjk \
fontconfig \
libgl1 && \
rm -rf /var/lib/apt/lists/*


Expand Down Expand Up @@ -39,7 +42,7 @@ ENV PATH="$APP_HOME/.venv/bin:$PATH"

COPY . ./

RUN /bin/bash -c "mineru-models-download -s huggingface -m pipeline"
RUN /bin/bash -c "mineru-models-download -s huggingface -m all"

# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && python3 -m app.serverless"]
Expand Down
221 changes: 162 additions & 59 deletions app/serverless.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,51 @@
import base64
import os
import time
import asyncio
import tempfile
import copy
import io
import asyncio

import runpod

# New mineru imports
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.enum_class import MakeMode
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json

from pypdf import PdfReader, PdfWriter

# # Ensure MinerU custom HF classes are available and optionally pre-initialize sglang engine
# try:
# # Importing registers custom model types and/or enables custom code for AutoConfig
# from mineru.backend.vlm.hf_predictor import HuggingfacePredictor # noqa: F401
# except Exception:
# pass
# try:
# from mineru.backend.vlm.predictor import get_predictor # noqa: F401
# except Exception:
# pass

def _maybe_init_engine_in_main() -> None:
    """Initialize the VLM engine in the main process if requested via env.

    Per MinerU guidance, the async vLLM engine must be initialized in the
    main process; this avoids scheduler failures when workers spawn without
    prior initialization.

    Reads ``MINERU_BACKEND`` (default ``"pipeline"``) and only acts when it
    is ``"vlm-vllm-async-engine"``. Any failure is swallowed so that engine
    problems surface on the runtime request path instead of crashing import.
    """
    backend_env = os.getenv("MINERU_BACKEND", "pipeline").lower()
    if backend_env != "vlm-vllm-async-engine":
        return
    try:
        # Lazy import inside the try: pulling in the VLM stack can itself
        # fail (missing deps), and that must not crash module import.
        from mineru.backend.vlm.vlm_analyze import ModelSingleton

        ModelSingleton().get_model("vllm-async-engine", None, None)
    except Exception:
        # Defer detailed errors to the runtime path to avoid import-time crashes.
        pass


_maybe_init_engine_in_main()

class TimeoutError(Exception):
    """Raised when PDF processing exceeds the allotted time budget.

    NOTE(review): intentionally shadows the builtin ``TimeoutError`` within
    this module; the handler catches this class by name.
    """
    pass


def _trim_pdf_to_max_pages(pdf_bytes: bytes, max_pages: int) -> bytes:
"""Return a new PDF bytes object with at most the first max_pages pages."""
if max_pages is None or max_pages <= 0:
Expand All @@ -39,8 +65,14 @@ def _trim_pdf_to_max_pages(pdf_bytes: bytes, max_pages: int) -> bytes:

def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enable=True, table_enable=True, max_pages=None):
"""Convert PDF bytes to markdown - returns only the markdown string"""

try:
# Lazy imports to avoid import-time signal handling in non-main threads
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.enum_class import MakeMode
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json

# Optionally limit to first N pages
if max_pages is not None:
try:
Expand All @@ -51,58 +83,143 @@ def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enabl

# Analyze the PDF
infer_results, all_image_lists, all_pdf_docs, lang_list_result, ocr_enabled_list = pipeline_doc_analyze(
[pdf_bytes],
[lang],
parse_method=parse_method,
formula_enable=formula_enable,
table_enable=table_enable
[pdf_bytes], [lang], parse_method=parse_method, formula_enable=formula_enable, table_enable=table_enable
)

# Process results

model_list = infer_results[0]
images_list = all_image_lists[0]
pdf_doc = all_pdf_docs[0]
_lang = lang_list_result[0]
_ocr_enable = ocr_enabled_list[0]

# Create temporary image directory for any image processing

with tempfile.TemporaryDirectory() as temp_dir:
image_writer = FileBasedDataWriter(temp_dir)

# Convert to middle JSON format
middle_json = pipeline_result_to_middle_json(
model_list, images_list, pdf_doc, image_writer,
_lang, _ocr_enable, formula_enable
model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, formula_enable
)

# Generate and return markdown
pdf_info = middle_json["pdf_info"]
return pipeline_union_make(pdf_info, MakeMode.MM_MD, "images")

except Exception as e:
raise Exception(f"Error converting PDF to markdown: {str(e)}")

async def async_convert_to_markdown(pdf_bytes, timeout_seconds=None, **kwargs):
"""Async wrapper with timeout support"""
loop = asyncio.get_running_loop()

if timeout_seconds and timeout_seconds > 0:

def convert_to_markdown_vlm(pdf_bytes, backend="vlm-sglang-engine", server_url=None):
    """Convert PDF bytes to markdown using a MinerU VLM backend.

    Args:
        pdf_bytes: Raw PDF file content.
        backend: VLM backend name. An optional ``"vlm-"`` prefix is stripped
            before handing it to MinerU (e.g. ``"vlm-sglang-engine"`` ->
            ``"sglang-engine"``).
        server_url: Optional URL of a remote VLM server for client-style
            backends; ``None`` for in-process engines.

    Returns:
        The markdown string produced by MinerU's ``union_make``.
    """
    # Lazy imports to avoid import-time signal handling in non-main threads.
    from mineru.data.data_reader_writer import FileBasedDataWriter
    from mineru.utils.enum_class import MakeMode
    from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
    from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make

    # MinerU expects the backend name without the "vlm-" prefix;
    # removeprefix is safer than a hard-coded slice ([4:]).
    normalized_backend = backend.removeprefix("vlm-")
    with tempfile.TemporaryDirectory() as temp_dir:
        image_writer = FileBasedDataWriter(temp_dir)
        middle_json, _ = vlm_doc_analyze(
            pdf_bytes, image_writer=image_writer, backend=normalized_backend, server_url=server_url
        )
        pdf_info = middle_json["pdf_info"]
        return vlm_union_make(pdf_info, MakeMode.MM_MD, "images")


async def _convert_to_markdown_via_aio(
    pdf_bytes: bytes,
    filename: str,
    *,
    lang: str = "en",
    backend: str = "pipeline",
    parse_method: str = "auto",
    formula_enable: bool = True,
    table_enable: bool = True,
    server_url: str | None = None,
    max_pages: int | None = None,
) -> str:
    """Use MinerU's ``aio_do_parse`` to produce markdown and return its content.

    Args:
        pdf_bytes: Raw PDF file content.
        filename: Base name (no extension) used for the output subdirectory
            and the markdown file name.
        lang: Document language hint passed to MinerU.
        backend: MinerU backend ("pipeline" or a "vlm-*" variant).
        parse_method: Pipeline parse method; "vlm" for VLM backends.
        formula_enable / table_enable: Feature toggles forwarded to MinerU.
        server_url: Optional remote VLM server URL.
        max_pages: If given, limit parsing to the first N pages.

    Returns:
        The generated markdown as a string.

    Raises:
        Exception: If ``max_pages`` is not an integer, or the markdown
            output file is missing after parsing.
    """
    # Lazy import to keep module import light.
    from mineru.cli.common import aio_do_parse

    # Map max_pages to MinerU's end_page_id semantics (inclusive end index).
    start_page_id = 0
    end_page_id = None
    if max_pages is not None:
        try:
            max_pages_int = int(max_pages)
        except (TypeError, ValueError):
            raise Exception("Invalid max_pages value; must be an integer")
        if max_pages_int > 0:
            end_page_id = max_pages_int - 1

    with tempfile.TemporaryDirectory() as output_dir:
        await aio_do_parse(
            output_dir=output_dir,
            pdf_file_names=[filename],
            pdf_bytes_list=[pdf_bytes],
            p_lang_list=[lang],
            backend=backend,
            parse_method=parse_method,
            formula_enable=formula_enable,
            table_enable=table_enable,
            server_url=server_url,
            f_draw_layout_bbox=False,
            f_draw_span_bbox=False,
            f_dump_md=True,
            f_dump_middle_json=False,
            f_dump_model_output=False,
            f_dump_orig_pdf=False,
            f_dump_content_list=False,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
        )

        # aio_do_parse writes <output_dir>/<filename>/<subdir>/<filename>.md;
        # pipeline backends use the parse method as subdir, VLM uses "vlm".
        parse_subdir = parse_method if backend.startswith("pipeline") else "vlm"
        parse_dir = os.path.join(output_dir, filename, parse_subdir)
        md_path = os.path.join(parse_dir, f"{filename}.md")
        if not os.path.exists(md_path):
            raise Exception("Markdown output not found after parsing")
        with open(md_path, "r", encoding="utf-8") as f:
            return f.read()


async def convert_to_markdown_dispatch(pdf_bytes, filename=None, **kwargs):
    """Dispatch PDF-to-markdown conversion based on env ``MINERU_BACKEND``.

    Both the pipeline and VLM backends are routed through MinerU's
    ``aio_do_parse`` (via ``_convert_to_markdown_via_aio``) to match the
    official MinerU entrypoints.

    Args:
        pdf_bytes: Raw PDF file content.
        filename: Base name for output files; defaults to "document".
        **kwargs: Recognized keys: ``lang`` (default "en"), ``parse_method``
            (default "auto"), ``formula_enable`` (default True),
            ``table_enable`` (default True), ``max_pages``.

    Environment:
        MINERU_BACKEND: backend selector (default "pipeline").
        MINERU_VLM_SERVER_URL / MINERU_SGLANG_SERVER_URL: optional remote
            VLM server URL (first non-empty wins).

    Returns:
        The generated markdown string.
    """
    backend_env = os.getenv("MINERU_BACKEND", "pipeline").lower()
    server_url = os.getenv("MINERU_VLM_SERVER_URL") or os.getenv("MINERU_SGLANG_SERVER_URL")
    lang = kwargs.get("lang", "en")
    parse_method = kwargs.get("parse_method", "auto")
    formula_enable = kwargs.get("formula_enable", True)
    table_enable = kwargs.get("table_enable", True)
    max_pages = kwargs.get("max_pages")

    if filename is None:
        filename = "document"

    # VLM backends ignore the pipeline parse method; MinerU expects "vlm".
    if backend_env.startswith("vlm"):
        parse_method = "vlm"

    return await _convert_to_markdown_via_aio(
        pdf_bytes,
        filename,
        lang=lang,
        backend=backend_env,
        parse_method=parse_method,
        formula_enable=formula_enable,
        table_enable=table_enable,
        server_url=server_url,
        max_pages=max_pages,
    )



async def handler(event):
"""Main serverless handler - returns only markdown"""
try:
input_data = event.get("input", {})
base64_content = input_data.get("file_content")
filename = input_data.get("filename")

timeout = input_data.get("timeout")
created_at = input_data.get("created_at")
max_pages = input_data.get("max_pages")
Expand All @@ -113,19 +230,6 @@ async def handler(event):
formula_enable = input_data.get("formula_enable", True)
table_enable = input_data.get("table_enable", True)

# Calculate remaining timeout
timeout_seconds = None
if timeout:
timeout_seconds = int(timeout) / 1000
if created_at:
elapsed = time.time() - (created_at / 1000)
if elapsed >= timeout_seconds:
return {"error": "Request timed out before processing", "status": "TIMEOUT"}
timeout_seconds = max(0, timeout_seconds - elapsed)
if timeout_seconds < 1:
return {"error": "Insufficient time remaining", "status": "TIMEOUT"}

# Validate input
if not base64_content or not filename:
return {"error": "Missing file_content or filename", "status": "ERROR"}

Expand All @@ -143,10 +247,10 @@ async def handler(event):

# Process PDF
pdf_bytes = base64.b64decode(base64_content)
md_content = await async_convert_to_markdown(

md_content = await convert_to_markdown_dispatch(
pdf_bytes=pdf_bytes,
timeout_seconds=timeout_seconds,
filename=os.path.splitext(os.path.basename(filename))[0] if filename else "document",
lang=lang,
parse_method=parse_method,
formula_enable=formula_enable,
Expand All @@ -155,11 +259,10 @@ async def handler(event):
)

return {"markdown": md_content, "status": "SUCCESS"}

except TimeoutError as e:
return {"error": str(e), "status": "TIMEOUT"}
except Exception as e:
return {"error": str(e), "status": "ERROR"}

if __name__ == "__main__":
    # Guarded entry point: only start the RunPod serverless loop when this
    # module is executed directly, not when imported by a worker.
    print("Starting RunPod serverless handler...")
    runpod.serverless.start({"handler": handler})
Loading
Loading