Spaces:

Instantnewdesign
/

document_extract

Sleeping

App Files Files Community

document_extract / app.py

Instantnewdesign

Update app.py

538f2a9 verified about 2 months ago

raw

history blame contribute delete

2.08 kB

	import gradio as gr
	from mineru_vl_utils.mineru_client import MinerUClient
	from PIL import Image
	import fitz # PyMuPDF
	import os
	import torch

	# Model checkpoint identifier handed to the MinerU client.
	model_path = "opendatalab/MinerU2.5-2509-1.2B"

	# Detect the accelerator. The value is only reported in the log line below;
	# it is not passed to the client constructor.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	print(f"⚡ Loading MinerU on device: {device}")

	# Single shared client instance, created once at import time.
	client = MinerUClient(model_path=model_path, backend="transformers")

	def _upload_path(file):
	    """Return the filesystem path of an upload, or None when there is none.

	    Accepts either a path (str / os.PathLike) or an object exposing the path
	    through a ``name`` attribute (e.g. a temp-file wrapper).
	    """
	    if file is None:
	        return None
	    if isinstance(file, (str, os.PathLike)):
	        return os.fsdecode(file)
	    return getattr(file, "name", None)


	def _load_page_images(path):
	    """Return one PIL image per page of a PDF, or a single image otherwise."""
	    ext = os.path.splitext(path)[-1].lower()
	    if ext != ".pdf":
	        return [Image.open(path)]

	    images = []
	    # The context manager closes the PDF even if rendering a page fails.
	    # Image.frombytes copies the pixel data, so the images outlive the document.
	    with fitz.open(path) as doc:
	        for i, page in enumerate(doc):
	            pix = page.get_pixmap()
	            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
	            images.append(img)
	            print(f"✅ Converted page {i+1} to image")
	    return images


	def _block_text(block):
	    """Return the text carried by an extracted block, or None if it has none.

	    ``text`` is checked first (original behaviour); ``content`` is a fallback.
	    NOTE(review): presumably the client's blocks expose their text under
	    ``content`` (attribute or dict key) - confirm against the client library.
	    """
	    for key in ("text", "content"):
	        value = getattr(block, key, None)
	        if not value and isinstance(block, dict):
	            value = block.get(key)
	        if value:
	            return value
	    return None


	def extract_from_file(file):
	    """Extract text from an uploaded PDF or image.

	    Args:
	        file: The upload, either as a filesystem path (what a
	            ``gr.File(type="filepath")`` input passes) or as an object with
	            a ``name`` attribute holding the path. May be None when nothing
	            was uploaded.

	    Returns:
	        The extracted text, with pages separated by a ``--- PAGE ---`` line.
	        Pages without blocks yield ``[EMPTY PAGE]`` and pages whose blocks
	        carry no text yield ``[NO TEXT FOUND]``. On any failure a string
	        starting with ``Error during extraction:`` is returned instead of
	        raising, so the message is shown in the UI text box.
	    """
	    path = _upload_path(file)
	    if not path:
	        print("❌ ERROR: no file was provided")
	        return "Error during extraction: no file was provided"

	    try:
	        print(f"📄 Processing file: {path}")
	        images = _load_page_images(path)

	        results = []
	        for i, img in enumerate(images):
	            print(f"🔍 Extracting text from page {i+1}")
	            blocks = client.two_step_extract(img)
	            if not blocks:
	                print(f"⚠️ No blocks found on page {i+1}")
	                results.append("[EMPTY PAGE]")
	                continue

	            text_blocks = [
	                text for text in (_block_text(b) for b in blocks) if text
	            ]
	            page_text = "\n".join(text_blocks) if text_blocks else "[NO TEXT FOUND]"
	            results.append(page_text)
	            print(f"✅ Page {i+1} extracted")

	        return "\n\n--- PAGE ---\n\n".join(results)

	    except Exception as e:
	        # UI boundary: report the failure as text rather than crashing the app.
	        print(f"❌ ERROR: {e}")
	        return f"Error during extraction: {str(e)}"

	# Gradio front end: one file upload mapped to one text box.
	upload_input = gr.File(type="filepath", label="Upload PDF or Image")
	text_output = gr.Textbox(label="Extracted Text", lines=20)

	demo = gr.Interface(
	    fn=extract_from_file,
	    inputs=upload_input,
	    outputs=text_output,
	    title="MinerU2.5 Document Extractor",
	    description="Upload a PDF or Image to extract structured text using MinerU2.5.",
	)

	# Start the web server as soon as the module is executed.
	demo.launch()