"""Headless Gradio MCP tool that converts document images to text with SmolDocling."""

import base64
import json
import os
from io import BytesIO

import gradio as gr
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

# -----------------------------
# Configuration
# -----------------------------
MODEL_ID = "ds4sd/SmolDocling-256M-preview"
MAX_NEW_TOKENS = 1024

# Strings at or below this length are never tried as bare base64; this keeps
# short file paths from being mistaken for an encoded image.
_MIN_BASE64_LENGTH = 100

# Maximum number of characters of the image argument echoed to stdout.
_LOG_PREVIEW_CHARS = 80

# NOTE(review): the original code called `result.replace("", "")`, which is a
# no-op; the marker text was presumably lost when the file was mangled.
# `<end_of_utterance>` is assumed to be the end token emitted by this model --
# confirm against the tokenizer's special tokens.
_END_OF_UTTERANCE = "<end_of_utterance>"

# -----------------------------
# Load model and processor once
# -----------------------------
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)


# -----------------------------
# Image conversion helper
# -----------------------------
def convert_to_pil(image_input):
    """Convert a base64 string, image dict, or file path to a PIL image.

    Accepted inputs:
        - ``"data:image/png;base64,...."`` (data URI)
        - plain base64 text (no prefix)
        - ``{"type": "image", "data": "..."}`` (the ``data`` value is then
          interpreted as one of the string forms)
        - a path to an existing local file

    Args:
        image_input: The image in any of the formats listed above.

    Returns:
        The image opened with ``PIL.Image.open``.

    Raises:
        ValueError: If the input matches none of the supported formats, or a
            data URI is malformed or carries invalid base64.
    """
    # Case 1: dict input (Perplexity/Claude format) -- unwrap the payload and
    # let the string cases below deal with it.
    if isinstance(image_input, dict) and "data" in image_input:
        image_input = image_input["data"]

    if isinstance(image_input, str):
        # Case 2: base64 string with a data-URI prefix.
        if image_input.startswith("data:image"):
            _, separator, payload = image_input.partition(",")
            if not separator:
                raise ValueError(
                    "Malformed data URI: missing ',' before the base64 payload"
                )
            return Image.open(BytesIO(base64.b64decode(payload)))

        # Case 3: plain base64 string (no prefix). The base64 alphabet has no
        # comma, so a comma rules this case out.
        if "," not in image_input and len(image_input) > _MIN_BASE64_LENGTH:
            try:
                return Image.open(BytesIO(base64.b64decode(image_input)))
            except (ValueError, OSError):
                # Not decodable as a base64 image (binascii.Error is a
                # ValueError, PIL's UnidentifiedImageError is an OSError);
                # fall through and try the value as a file path.
                pass

        # Case 4: local file path.
        # NOTE(review): this lets a caller open any readable image file on
        # the host -- confirm that is acceptable for an exposed MCP tool.
        if os.path.exists(image_input):
            return Image.open(image_input)

    raise ValueError("Could not convert image input to PIL.Image")


# -----------------------------
# Core function
# -----------------------------
def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
    """Run SmolDocling image-to-text conversion.

    Args:
        image: The document image to convert.
        prompt_text: Instruction passed to the model alongside the image.

    Returns:
        The decoded model output (prompt tokens excluded), with the
        end-of-utterance marker removed and surrounding whitespace stripped.
    """
    messages = [
        {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # The generated sequence begins with the prompt tokens; keep only what
    # comes after them.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]

    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return result.replace(_END_OF_UTTERANCE, "").strip()


# -----------------------------
# Wrapper for MCP schema compatibility
# -----------------------------
def smoldocling_entry(image: str, prompt_text: str) -> str:
    """Entry point for the SmolDocling MCP tool.

    Expected input formats for the image:
    - Base64 string: "data:image/png;base64,...."
    - Object (Perplexity/Claude style), JSON-encoded:
      {"type": "image", "data": "data:image/png;base64,..."}
    - Local file path (for internal testing)

    Args:
        image: A base64-encoded image string (with or without data: prefix) OR a JSON-encoded object containing image data.
        prompt_text: Instruction text for how to process the document (e.g., "Convert this page to docling.")

    Returns:
        Structured or textual content extracted from the image.
    """
    # Log only a short preview: the payload can be megabytes of base64.
    shown = str(image)
    if len(shown) > _LOG_PREVIEW_CHARS:
        shown = f"{shown[:_LOG_PREVIEW_CHARS]}... ({len(shown)} chars)"
    print(f"Received entry: {shown} prompt: {prompt_text}")

    # Handle Perplexity-style dicts encoded as JSON strings. Anything that is
    # not valid JSON (e.g. raw base64 or a path) is passed through unchanged.
    try:
        maybe_json = json.loads(image)
    except (TypeError, ValueError):
        maybe_json = None
    if isinstance(maybe_json, dict) and "data" in maybe_json:
        image = maybe_json

    pil_image = convert_to_pil(image)
    return smoldocling_readimage(pil_image, prompt_text)


# -----------------------------
# Gradio MCP App (Headless)
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ### 📄 SmolDocling MCP Tool
        This is a **headless MCP tool** for document image conversion.

        It supports input as:
        - Base64-encoded images
        - Perplexity/Claude `{"type": "image", "data": "..."}` objects
        - Local file paths (for testing)
        """
    )

    # Expose MCP tool
    gr.api(smoldocling_entry)

# Launch MCP server mode
# NOTE(review): launch() may block until the server stops when run as a plain
# script, in which case the print below only runs at shutdown -- confirm.
_, url, _ = demo.launch(mcp_server=True)
print(f"MCP Server running at: {url}")