# Provenance (page header captured with the file): author "bharatcoder",
# commit 05291b5 "Update app.py" (verified).
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import base64
from io import BytesIO
import os
# -----------------------------
# Load model and processor once
# -----------------------------
# Single checkpoint id used for both the processor and the model, so the two
# can never be loaded from different repositories by accident.
_MODEL_ID = "ds4sd/SmolDocling-256M-preview"

# Both objects are created once at import time and reused by every request.
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(_MODEL_ID)
# -----------------------------
# Image conversion helper
# -----------------------------
def convert_to_pil(image_input):
"""
Convert base64, dict, or file path to PIL.Image.
Handles:
- "data:image/png;base64,...."
- plain base64
- {"type": "image", "data": "..."}
- file path
"""
# Case 1: dict input (Perplexity/Claude format)
if isinstance(image_input, dict) and "data" in image_input:
image_input = image_input["data"]
# Case 2: base64 string with prefix
if isinstance(image_input, str) and image_input.startswith("data:image"):
base64_str = image_input.split(",", 1)[1]
image_data = base64.b64decode(base64_str)
return Image.open(BytesIO(image_data))
# Case 3: plain base64 string (no prefix)
if isinstance(image_input, str) and "," in image_input and len(image_input) > 100:
try:
image_data = base64.b64decode(image_input)
return Image.open(BytesIO(image_data))
except Exception:
pass
# Case 4: local file path
if isinstance(image_input, str) and os.path.exists(image_input):
return Image.open(image_input)
raise ValueError("Could not convert image input to PIL.Image")
# -----------------------------
# Core function
# -----------------------------
def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
    """
    Generate text for one image with the module-level SmolDocling model.

    Parameters
    ----------
    image : PIL.Image.Image
        The document image handed to the processor.
    prompt_text : str
        Instruction placed after the image in the single user turn.

    Returns
    -------
    str
        The decoded continuation with "<end_of_utterance>" removed and
        surrounding whitespace stripped. Other special tokens are kept,
        because decoding runs with skip_special_tokens=False.
    """
    # One user turn: an image placeholder followed by the instruction text.
    user_turn = {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
    }
    chat_prompt = processor.apply_chat_template(
        [user_turn], add_generation_prompt=True
    )
    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")

    generated_ids = model.generate(**model_inputs, max_new_tokens=1024)

    # Keep only the positions after the prompt length so the decoded text
    # contains just the newly generated part.
    prompt_token_count = model_inputs.input_ids.shape[1]
    continuation_ids = generated_ids[:, prompt_token_count:]

    decoded_batch = processor.batch_decode(
        continuation_ids, skip_special_tokens=False
    )
    first_text = decoded_batch[0]
    return first_text.replace("<end_of_utterance>", "").strip()
# -----------------------------
# Wrapper for MCP schema compatibility
# -----------------------------
def smoldocling_entry(image: str, prompt_text: str) -> str:
    """
    Entry point for the SmolDocling MCP tool.
    Expected input formats:
    - **Base64 string**: "data:image/png;base64,...."
    - **Object** (Perplexity/Claude style): {"type": "image", "data": "data:image/png;base64,..."}
    - **Local file path** (for internal testing)
    Parameters
    ----------
    image : str
        A base64-encoded image string (with or without data: prefix) OR
        a JSON-encoded object containing image data.
    prompt_text : str
        Instruction text for how to process the document (e.g., "Convert this page to docling.")
    Returns
    -------
    str
        Structured or textual content extracted from the image.
    """
    # NOTE(review): this function is registered through gr.api, so the
    # docstring above may be surfaced to MCP clients as the tool description -
    # confirm before rewording it.
    import json  # local import; json is only needed for the probe below

    # Log a bounded preview instead of the whole payload: a base64 image can
    # be megabytes long and would otherwise flood the log on every call.
    image_repr = str(image)
    preview_limit = 80
    if len(image_repr) > preview_limit:
        image_repr = f"{image_repr[:preview_limit]}... ({len(image_repr)} chars)"
    print(f"Received entry: {image_repr} prompt: {prompt_text}")

    # Handle Perplexity-style dicts encoded as JSON strings. This is a
    # best-effort probe: anything that is not a JSON object with a "data" key
    # is passed through unchanged.
    try:
        maybe_json = json.loads(image)
    except (TypeError, ValueError, RecursionError):
        # TypeError: not str/bytes; ValueError: not JSON (JSONDecodeError is
        # a ValueError); RecursionError: pathologically nested input.
        maybe_json = None
    if isinstance(maybe_json, dict) and "data" in maybe_json:
        image = maybe_json

    pil_image = convert_to_pil(image)
    return smoldocling_readimage(pil_image, prompt_text)
# -----------------------------
# Gradio MCP App (Headless)
# -----------------------------
# Text shown on the app's page; kept in a constant so the layout code below
# stays short.
_INTRO_MARKDOWN = """
### 📄 SmolDocling MCP Tool
This is a **headless MCP tool** for document image conversion.
It supports input as:
- Base64-encoded images
- Perplexity/Claude `{"type": "image", "data": "..."}` objects
- Local file paths (for testing)
"""

with gr.Blocks() as demo:
    gr.Markdown(_INTRO_MARKDOWN)
    # Register the entry function as an API endpoint of this Blocks app; there
    # are no interactive input/output components.
    gr.api(smoldocling_entry)

# Start the app with the MCP server enabled. launch() hands back three values;
# only the middle one is used, as the URL reported below.
# NOTE(review): launch() may block until the server stops when run as a plain
# script, in which case the print below only runs at shutdown - confirm.
_, url, _ = demo.launch(mcp_server=True)
print(f"MCP Server running at: {url}")