Spaces:
Sleeping
Sleeping
File size: 2,075 Bytes
1648a50 87cc209 538f2a9 87cc209 538f2a9 1648a50 538f2a9 87cc209 538f2a9 112a8d7 1648a50 112a8d7 538f2a9 87cc209 ce41633 87cc209 112a8d7 87cc209 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
import gradio as gr
from mineru_vl_utils.mineru_client import MinerUClient
from PIL import Image
import fitz # PyMuPDF
import os
import torch
# Build the MinerU client once at import time so every request reuses it.
model_path = "opendatalab/MinerU2.5-2509-1.2B"
# NOTE(review): `device` is only reported in the log line below; it is not
# passed to MinerUClient, so the actual model placement is whatever the
# transformers backend decides -- TODO confirm this is intended.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"⚡ Loading MinerU on device: {device}")
client = MinerUClient(backend="transformers", model_path=model_path)
def _block_text(block):
    """Return the text carried by one extracted block, or "" if it has none."""
    # NOTE(review): MinerU content blocks appear to expose their recognised
    # text as `content`; `text` is still checked first so objects that do
    # provide it behave as before -- verify against the installed version.
    for field in ("text", "content"):
        value = getattr(block, field, None)
        if value is None and isinstance(block, dict):
            value = block.get(field)
        if value:
            return str(value)
    return ""


def _load_page_images(path):
    """Return one PIL image per PDF page, or a single image for other files."""
    if os.path.splitext(path)[-1].lower() != ".pdf":
        return [Image.open(path)]

    images = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(path) as doc:
        for i, page in enumerate(doc):
            # Rendered with get_pixmap() defaults; the buffer is read as RGB.
            pix = page.get_pixmap()
            images.append(
                Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            )
            print(f"✅ Converted page {i+1} to image")
    return images


def extract_from_file(file):
    """Extract text from an uploaded PDF or image with the MinerU client.

    Args:
        file: The upload handed over by Gradio. Either a filesystem path
            (``str`` / ``os.PathLike``) or an object whose ``name``
            attribute holds the path.

    Returns:
        The extracted text, with pages joined by ``"\\n\\n--- PAGE ---\\n\\n"``.
        A page for which the client returns no blocks is reported as
        ``"[EMPTY PAGE]"``; a page whose blocks carry no text is reported as
        ``"[NO TEXT FOUND]"``. On any failure the function does not raise;
        it returns a string starting with ``"Error during extraction: "``.
    """
    try:
        if file is None:
            raise ValueError("no file was provided")

        # Accept both a plain path and a file-like wrapper exposing `.name`.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name
        print(f"📄 Processing file: {path}")

        results = []
        for i, img in enumerate(_load_page_images(path)):
            print(f"🔍 Extracting text from page {i+1}")
            blocks = client.two_step_extract(img)
            if not blocks:
                print(f"⚠️ No blocks found on page {i+1}")
                results.append("[EMPTY PAGE]")
                continue

            texts = [text for text in map(_block_text, blocks) if text]
            results.append("\n".join(texts) if texts else "[NO TEXT FOUND]")
            print(f"✅ Page {i+1} extracted")

        return "\n\n--- PAGE ---\n\n".join(results)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        print(f"❌ ERROR: {e}")
        return f"Error during extraction: {str(e)}"
# Gradio UI: a single file upload wired to extract_from_file, with the
# returned string shown in a 20-line text box.
demo = gr.Interface(
    fn=extract_from_file,
    inputs=gr.File(type="filepath", label="Upload PDF or Image"),
    outputs=gr.Textbox(label="Extracted Text", lines=20),
    title="MinerU2.5 Document Extractor",
    description="Upload a PDF or Image to extract structured text using MinerU2.5.",
)

# Start the web server when the module is executed.
demo.launch()