Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ on:
branches:
- main
- dev
- feat/vlm
workflow_dispatch:

jobs:
Expand Down
9 changes: 6 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
ARG POETRY_VERSION=1.6.1

FROM nvidia/cuda:12.8.1-devel-ubuntu24.04
FROM vllm/vllm-openai:v0.10.1.1
# Allow statements and log messages to immediately appear in the logs
ENV PYTHONUNBUFFERED True

Expand All @@ -9,7 +9,10 @@ RUN apt-get update && apt-get install -y tzdata
# ENV TZ Asia/Tokyo

RUN apt-get update && \
apt-get install --yes --no-install-recommends curl g++ libopencv-dev python3 python3-pip python3-dev && \
apt-get install --yes --no-install-recommends curl g++ libopencv-dev python3 python3-pip python3-dev fonts-noto-core \
fonts-noto-cjk \
fontconfig \
libgl1 && \
rm -rf /var/lib/apt/lists/*


Expand Down Expand Up @@ -39,7 +42,7 @@ ENV PATH="$APP_HOME/.venv/bin:$PATH"

COPY . ./

RUN /bin/bash -c "mineru-models-download -s huggingface -m pipeline"
RUN /bin/bash -c "mineru-models-download -s huggingface -m all"

# Set the entry point to activate the virtual environment and run the command line tool
ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && python3 -m app.serverless"]
Expand Down
221 changes: 162 additions & 59 deletions app/serverless.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,51 @@
import base64
import os
import time
import asyncio
import tempfile
import copy
import io
import asyncio

import runpod

# New mineru imports
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.enum_class import MakeMode
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json

from pypdf import PdfReader, PdfWriter

# # Ensure MinerU custom HF classes are available and optionally pre-initialize sglang engine
# try:
# # Importing registers custom model types and/or enables custom code for AutoConfig
# from mineru.backend.vlm.hf_predictor import HuggingfacePredictor # noqa: F401
# except Exception:
# pass
# try:
# from mineru.backend.vlm.predictor import get_predictor # noqa: F401
# except Exception:
# pass

def _maybe_init_engine_in_main() -> None:
    """Initialize the VLM engine in the main process if requested via env.

    Per MinerU guidance, the async vLLM engine must be initialized in the
    main process; this avoids scheduler failures when workers spawn without
    prior initialization.

    Reads ``MINERU_BACKEND`` (default ``"pipeline"``) and only acts when it
    is ``"vlm-vllm-async-engine"``. Any failure is swallowed so that engine
    problems surface on the runtime request path instead of crashing import.
    """
    backend_env = os.getenv("MINERU_BACKEND", "pipeline").lower()
    if backend_env != "vlm-vllm-async-engine":
        return
    try:
        # Lazy import inside the try: pulling in the VLM stack can itself
        # fail (missing deps), and that must not crash module import.
        from mineru.backend.vlm.vlm_analyze import ModelSingleton

        ModelSingleton().get_model("vllm-async-engine", None, None)
    except Exception:
        # Defer detailed errors to the runtime path to avoid import-time crashes.
        pass


_maybe_init_engine_in_main()

class TimeoutError(Exception):
    """Raised when PDF processing exceeds the allotted time budget.

    NOTE(review): intentionally shadows the builtin ``TimeoutError`` within
    this module; the handler catches this class by name.
    """
    pass


def _trim_pdf_to_max_pages(pdf_bytes: bytes, max_pages: int) -> bytes:
"""Return a new PDF bytes object with at most the first max_pages pages."""
if max_pages is None or max_pages <= 0:
Expand All @@ -39,8 +65,14 @@ def _trim_pdf_to_max_pages(pdf_bytes: bytes, max_pages: int) -> bytes:

def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enable=True, table_enable=True, max_pages=None):
"""Convert PDF bytes to markdown - returns only the markdown string"""

try:
# Lazy imports to avoid import-time signal handling in non-main threads
from mineru.data.data_reader_writer import FileBasedDataWriter
from mineru.utils.enum_class import MakeMode
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json

# Optionally limit to first N pages
if max_pages is not None:
try:
Expand All @@ -51,58 +83,143 @@ def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enabl

# Analyze the PDF
infer_results, all_image_lists, all_pdf_docs, lang_list_result, ocr_enabled_list = pipeline_doc_analyze(
[pdf_bytes],
[lang],
parse_method=parse_method,
formula_enable=formula_enable,
table_enable=table_enable
[pdf_bytes], [lang], parse_method=parse_method, formula_enable=formula_enable, table_enable=table_enable
)

# Process results

model_list = infer_results[0]
images_list = all_image_lists[0]
pdf_doc = all_pdf_docs[0]
_lang = lang_list_result[0]
_ocr_enable = ocr_enabled_list[0]

# Create temporary image directory for any image processing

with tempfile.TemporaryDirectory() as temp_dir:
image_writer = FileBasedDataWriter(temp_dir)

# Convert to middle JSON format
middle_json = pipeline_result_to_middle_json(
model_list, images_list, pdf_doc, image_writer,
_lang, _ocr_enable, formula_enable
model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, formula_enable
)

# Generate and return markdown
pdf_info = middle_json["pdf_info"]
return pipeline_union_make(pdf_info, MakeMode.MM_MD, "images")

except Exception as e:
raise Exception(f"Error converting PDF to markdown: {str(e)}")

async def async_convert_to_markdown(pdf_bytes, timeout_seconds=None, **kwargs):
"""Async wrapper with timeout support"""
loop = asyncio.get_running_loop()

if timeout_seconds and timeout_seconds > 0:

def convert_to_markdown_vlm(pdf_bytes, backend="vlm-sglang-engine", server_url=None):
    """Convert PDF bytes to markdown using a MinerU VLM backend.

    Args:
        pdf_bytes: Raw PDF file content.
        backend: VLM backend name. An optional ``"vlm-"`` prefix is stripped
            before handing it to MinerU (e.g. ``"vlm-sglang-engine"`` ->
            ``"sglang-engine"``).
        server_url: Optional URL of a remote VLM server for client-style
            backends; ``None`` for in-process engines.

    Returns:
        The markdown string produced by MinerU's ``union_make``.
    """
    # Lazy imports to avoid import-time signal handling in non-main threads.
    from mineru.data.data_reader_writer import FileBasedDataWriter
    from mineru.utils.enum_class import MakeMode
    from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
    from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make

    # MinerU expects the backend name without the "vlm-" prefix;
    # removeprefix is safer than a hard-coded slice ([4:]).
    normalized_backend = backend.removeprefix("vlm-")
    with tempfile.TemporaryDirectory() as temp_dir:
        image_writer = FileBasedDataWriter(temp_dir)
        middle_json, _ = vlm_doc_analyze(
            pdf_bytes, image_writer=image_writer, backend=normalized_backend, server_url=server_url
        )
        pdf_info = middle_json["pdf_info"]
        return vlm_union_make(pdf_info, MakeMode.MM_MD, "images")


async def _convert_to_markdown_via_aio(
    pdf_bytes: bytes,
    filename: str,
    *,
    lang: str = "en",
    backend: str = "pipeline",
    parse_method: str = "auto",
    formula_enable: bool = True,
    table_enable: bool = True,
    server_url: str | None = None,
    max_pages: int | None = None,
) -> str:
    """Use MinerU's ``aio_do_parse`` to produce markdown and return its content.

    Args:
        pdf_bytes: Raw PDF file content.
        filename: Base name (no extension) used for the output subdirectory
            and the markdown file name.
        lang: Document language hint passed to MinerU.
        backend: MinerU backend ("pipeline" or a "vlm-*" variant).
        parse_method: Pipeline parse method; "vlm" for VLM backends.
        formula_enable / table_enable: Feature toggles forwarded to MinerU.
        server_url: Optional remote VLM server URL.
        max_pages: If given, limit parsing to the first N pages.

    Returns:
        The generated markdown as a string.

    Raises:
        Exception: If ``max_pages`` is not an integer, or the markdown
            output file is missing after parsing.
    """
    # Lazy import to keep module import light.
    from mineru.cli.common import aio_do_parse

    # Map max_pages to MinerU's end_page_id semantics (inclusive end index).
    start_page_id = 0
    end_page_id = None
    if max_pages is not None:
        try:
            max_pages_int = int(max_pages)
        except (TypeError, ValueError):
            raise Exception("Invalid max_pages value; must be an integer")
        if max_pages_int > 0:
            end_page_id = max_pages_int - 1

    with tempfile.TemporaryDirectory() as output_dir:
        await aio_do_parse(
            output_dir=output_dir,
            pdf_file_names=[filename],
            pdf_bytes_list=[pdf_bytes],
            p_lang_list=[lang],
            backend=backend,
            parse_method=parse_method,
            formula_enable=formula_enable,
            table_enable=table_enable,
            server_url=server_url,
            f_draw_layout_bbox=False,
            f_draw_span_bbox=False,
            f_dump_md=True,
            f_dump_middle_json=False,
            f_dump_model_output=False,
            f_dump_orig_pdf=False,
            f_dump_content_list=False,
            start_page_id=start_page_id,
            end_page_id=end_page_id,
        )

        # aio_do_parse writes <output_dir>/<filename>/<subdir>/<filename>.md;
        # pipeline backends use the parse method as subdir, VLM uses "vlm".
        parse_subdir = parse_method if backend.startswith("pipeline") else "vlm"
        parse_dir = os.path.join(output_dir, filename, parse_subdir)
        md_path = os.path.join(parse_dir, f"{filename}.md")
        if not os.path.exists(md_path):
            raise Exception("Markdown output not found after parsing")
        with open(md_path, "r", encoding="utf-8") as f:
            return f.read()


async def convert_to_markdown_dispatch(pdf_bytes, filename=None, **kwargs):
    """Dispatch PDF-to-markdown conversion based on env ``MINERU_BACKEND``.

    Both the pipeline and VLM backends are routed through MinerU's
    ``aio_do_parse`` (via ``_convert_to_markdown_via_aio``) to match the
    official MinerU entrypoints.

    Args:
        pdf_bytes: Raw PDF file content.
        filename: Base name for output files; defaults to "document".
        **kwargs: Recognized keys: ``lang`` (default "en"), ``parse_method``
            (default "auto"), ``formula_enable`` (default True),
            ``table_enable`` (default True), ``max_pages``.

    Environment:
        MINERU_BACKEND: backend selector (default "pipeline").
        MINERU_VLM_SERVER_URL / MINERU_SGLANG_SERVER_URL: optional remote
            VLM server URL (first non-empty wins).

    Returns:
        The generated markdown string.
    """
    backend_env = os.getenv("MINERU_BACKEND", "pipeline").lower()
    server_url = os.getenv("MINERU_VLM_SERVER_URL") or os.getenv("MINERU_SGLANG_SERVER_URL")
    lang = kwargs.get("lang", "en")
    parse_method = kwargs.get("parse_method", "auto")
    formula_enable = kwargs.get("formula_enable", True)
    table_enable = kwargs.get("table_enable", True)
    max_pages = kwargs.get("max_pages")

    if filename is None:
        filename = "document"

    # VLM backends ignore the pipeline parse method; MinerU expects "vlm".
    if backend_env.startswith("vlm"):
        parse_method = "vlm"

    return await _convert_to_markdown_via_aio(
        pdf_bytes,
        filename,
        lang=lang,
        backend=backend_env,
        parse_method=parse_method,
        formula_enable=formula_enable,
        table_enable=table_enable,
        server_url=server_url,
        max_pages=max_pages,
    )



async def handler(event):
"""Main serverless handler - returns only markdown"""
try:
input_data = event.get("input", {})
base64_content = input_data.get("file_content")
filename = input_data.get("filename")

timeout = input_data.get("timeout")
created_at = input_data.get("created_at")
max_pages = input_data.get("max_pages")
Expand All @@ -113,19 +230,6 @@ async def handler(event):
formula_enable = input_data.get("formula_enable", True)
table_enable = input_data.get("table_enable", True)

# Calculate remaining timeout
timeout_seconds = None
if timeout:
timeout_seconds = int(timeout) / 1000
if created_at:
elapsed = time.time() - (created_at / 1000)
if elapsed >= timeout_seconds:
return {"error": "Request timed out before processing", "status": "TIMEOUT"}
timeout_seconds = max(0, timeout_seconds - elapsed)
if timeout_seconds < 1:
return {"error": "Insufficient time remaining", "status": "TIMEOUT"}

# Validate input
if not base64_content or not filename:
return {"error": "Missing file_content or filename", "status": "ERROR"}

Expand All @@ -143,10 +247,10 @@ async def handler(event):

# Process PDF
pdf_bytes = base64.b64decode(base64_content)
md_content = await async_convert_to_markdown(

md_content = await convert_to_markdown_dispatch(
pdf_bytes=pdf_bytes,
timeout_seconds=timeout_seconds,
filename=os.path.splitext(os.path.basename(filename))[0] if filename else "document",
lang=lang,
parse_method=parse_method,
formula_enable=formula_enable,
Expand All @@ -155,11 +259,10 @@ async def handler(event):
)

return {"markdown": md_content, "status": "SUCCESS"}

except TimeoutError as e:
return {"error": str(e), "status": "TIMEOUT"}
except Exception as e:
return {"error": str(e), "status": "ERROR"}

if __name__ == "__main__":
    # Guarded entry point: only start the RunPod serverless loop when this
    # module is executed directly, not when imported by a worker.
    print("Starting RunPod serverless handler...")
    runpod.serverless.start({"handler": handler})
Loading
Loading