my-smoldocling-demo

Sleeping

App Files Files Community

my-smoldocling-demo / app.py

bharatcoder

Update app.py

05291b5 verified 24 days ago

raw

history blame contribute delete

4.31 kB

	import gradio as gr
	from transformers import AutoProcessor, AutoModelForImageTextToText
	from PIL import Image
	import base64
	from io import BytesIO
	import os

	# -----------------------------
	# Load model and processor once
	# -----------------------------
	# The processor and the model are loaded from the same checkpoint, at import
	# time, so the identifier is spelled out once and shared by both calls.
	MODEL_ID = "ds4sd/SmolDocling-256M-preview"
	processor = AutoProcessor.from_pretrained(MODEL_ID)
	model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)

	# -----------------------------
	# Image conversion helper
	# -----------------------------
	def convert_to_pil(image_input):
	    """
	    Convert a base64 payload, a dict wrapping one, or a file path to PIL.Image.

	    Handles:
	    - "data:image/png;base64,...."  (data URL)
	    - plain base64 (no prefix; embedded whitespace/newlines are tolerated)
	    - {"type": "image", "data": "..."}  (the "data" value is then interpreted
	      with the same rules as a bare string)
	    - file path

	    Raises
	    ------
	    ValueError
	        If the input matches none of the supported forms, if a data URL has
	        no "," separating the header from the payload, or if a data URL
	        payload is not decodable base64.
	    """
	    # Case 1: dict input (Perplexity/Claude format) -> unwrap the payload
	    if isinstance(image_input, dict) and "data" in image_input:
	        image_input = image_input["data"]

	    # Every remaining case operates on a string; anything else is unsupported.
	    if not isinstance(image_input, str):
	        raise ValueError("Could not convert image input to PIL.Image")

	    # Case 2: base64 string with data-URL prefix
	    if image_input.startswith("data:image"):
	        _header, separator, payload = image_input.partition(",")
	        if not separator:
	            # Without the comma there is no payload to decode; report it as
	            # bad input instead of leaking an IndexError.
	            raise ValueError("Could not convert image input to PIL.Image")
	        return Image.open(BytesIO(base64.b64decode(payload)))

	    # Case 3: plain base64 string (no prefix).
	    # The base64 alphabet contains no comma, so requiring one would skip every
	    # genuine payload. Strict validation is used instead: text containing
	    # characters outside the alphabet (e.g. the "." of a typical file name)
	    # is rejected and falls through to the file-path case.
	    compact = "".join(image_input.split())  # drop line wrapping, if any
	    decoded = None
	    if compact:
	        try:
	            decoded = base64.b64decode(compact, validate=True)
	        except ValueError:  # binascii.Error is a ValueError subclass
	            decoded = None
	    if decoded:
	        try:
	            return Image.open(BytesIO(decoded))
	        except OSError:
	            # Valid base64 that is not an image (Pillow reports this with an
	            # OSError subclass); the text may still name a file.
	            pass

	    # Case 4: local file path
	    # NOTE(review): if this helper receives untrusted remote input, this lets
	    # a caller make the server open arbitrary local paths - consider
	    # restricting or disabling it outside of testing.
	    if os.path.exists(image_input):
	        return Image.open(image_input)

	    raise ValueError("Could not convert image input to PIL.Image")

	# -----------------------------
	# Core function
	# -----------------------------
	def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
	    """
	    Run SmolDocling image-to-text conversion.

	    A single user turn (one image placeholder followed by ``prompt_text``) is
	    rendered through the processor's chat template, the module-level model
	    generates at most 1024 new tokens, and the decoded text is returned with
	    the "<end_of_utterance>" marker removed and surrounding whitespace
	    stripped.
	    """
	    user_turn = {
	        "role": "user",
	        "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
	    }
	    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
	    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")
	    token_ids = model.generate(**model_inputs, max_new_tokens=1024)

	    # Skip the leading positions that correspond to the input ids
	    # (presumably the echoed prompt), keeping only the continuation.
	    n_input_tokens = model_inputs.input_ids.shape[1]
	    continuation = token_ids[:, n_input_tokens:]

	    # Special tokens are kept while decoding; only the end-of-utterance
	    # marker is removed afterwards.
	    decoded = processor.batch_decode(continuation, skip_special_tokens=False)
	    first_text = decoded[0]
	    return first_text.replace("<end_of_utterance>", "").strip()

	# -----------------------------
	# Wrapper for MCP schema compatibility
	# -----------------------------
	def smoldocling_entry(image: str, prompt_text: str) -> str:
	    """
	    Entry point for the SmolDocling MCP tool.

	    Expected input formats:
	    - Base64 string: "data:image/png;base64,...."
	    - Object (Perplexity/Claude style): {"type": "image", "data": "data:image/png;base64,..."}
	    - Local file path (for internal testing)

	    Parameters
	    ----------
	    image : str
	        A base64-encoded image string (with or without data: prefix) OR
	        a JSON-encoded object containing image data.
	    prompt_text : str
	        Instruction text for how to process the document (e.g., "Convert this page to docling.")

	    Returns
	    -------
	    str
	        Structured or textual content extracted from the image.
	    """
	    import json

	    # Log a bounded preview only: the payload is usually an entire
	    # base64-encoded image, which would otherwise flood the log on each call.
	    shown = str(image)
	    if len(shown) > 80:
	        shown = f"{shown[:80]}... [{len(shown)} chars total]"
	    print(f"Received entry: {shown} prompt: {prompt_text}")

	    # Handle Perplexity-style dicts encoded as JSON strings
	    try:
	        maybe_json = json.loads(image)
	    except (TypeError, ValueError):
	        # TypeError: input is not str/bytes; ValueError covers
	        # json.JSONDecodeError. Either way it is not a JSON object, so the
	        # value is handed on unchanged.
	        maybe_json = None
	    if isinstance(maybe_json, dict) and "data" in maybe_json:
	        image = maybe_json

	    pil_image = convert_to_pil(image)
	    return smoldocling_readimage(pil_image, prompt_text)

	# -----------------------------
	# Gradio MCP App (Headless)
	# -----------------------------
	with gr.Blocks() as demo:
	    gr.Markdown(
	        """
	        ### 📄 SmolDocling MCP Tool
	        This is a headless MCP tool for document image conversion.
	        It supports input as:
	        - Base64-encoded images
	        - Perplexity/Claude `{"type": "image", "data": "..."}` objects
	        - Local file paths (for testing)
	        """
	    )

	    # Expose MCP tool
	    gr.api(smoldocling_entry)

	# Launch MCP server mode.
	# When run as a script, launch() blocks the main thread until the server is
	# stopped, so a print placed after it would only appear at shutdown. Start
	# the server without the thread lock, announce the URL while it is actually
	# running, then block explicitly to keep the process alive.
	_, url, _ = demo.launch(mcp_server=True, prevent_thread_lock=True)
	print(f"MCP Server running at: {url}")
	demo.block_thread()