Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from mineru_vl_utils.mineru_client import MinerUClient | |
| from PIL import Image | |
| import fitz # PyMuPDF pour lire les PDFs | |
| import os | |
# Model identifier handed to the MinerU client below.
model_path = "opendatalab/MinerU2.5-2509-1.2B"

# Single module-level client reused by every extraction request.
# The device is pinned to "cuda": this app requires a GPU.
client = MinerUClient(backend="transformers", model_path=model_path, device="cuda")
def _resolve_upload_path(file):
    """Return the filesystem path of an upload given as a path or a file-like object."""
    # The UI declares gr.File(type="filepath"), which delivers a plain path
    # string; objects carrying a `.name` attribute are still accepted.
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def _block_text(block):
    """Return the string payload of one extracted block, or None if it has none."""
    # `text` is looked up first to keep the original behaviour.
    # NOTE(review): MinerU content blocks presumably expose their payload as
    # `content` (attribute and/or dict key) rather than `text` — confirm
    # against mineru_vl_utils; the fallback is harmless if that is wrong.
    for key in ("text", "content"):
        value = getattr(block, key, None)
        if value is None and isinstance(block, dict):
            value = block.get(key)
        if isinstance(value, str):
            return value
    return None


def extract_from_file(file, progress=gr.Progress()):
    """Extract text from an uploaded PDF or image using the module-level client.

    A PDF is rendered page by page to RGB images; any other file is opened
    directly as a single image. Each image is passed to
    ``client.two_step_extract`` and the textual blocks of a page are joined
    with newlines.

    Args:
        file: Path of the uploaded file (str / PathLike), or an object whose
            ``name`` attribute is that path.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            kept in the signature because Gradio uses it to inject the tracker.

    Returns:
        The text of all pages, separated by ``"\n\n--- PAGE ---\n\n"``.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        # Submitting without a file passes None; surface a readable error
        # instead of an AttributeError traceback.
        raise gr.Error("Please upload a PDF or an image first.")

    progress(0, desc="Analyse du fichier...")

    path = _resolve_upload_path(file)
    ext = os.path.splitext(path)[-1].lower()

    images = []
    if ext == ".pdf":
        # Context manager guarantees the document is closed even if rendering
        # fails; Image.frombytes copies the pixel data, so the images remain
        # valid after the document is closed.
        with fitz.open(path) as doc:
            total_pages = len(doc)
            for page_no, page in enumerate(doc, start=1):
                # alpha=False (the default) yields 3-channel samples, matching "RGB".
                pix = page.get_pixmap(alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
                progress(
                    page_no / total_pages,
                    desc=f"Conversion page {page_no}/{total_pages}",
                )
    else:
        images.append(Image.open(path))

    total_images = len(images)
    results = []
    for i, img in enumerate(images):
        progress(i / total_images, desc=f"Extraction page {i+1}/{total_images}")
        blocks = client.two_step_extract(img)
        page_texts = [t for t in map(_block_text, blocks) if t is not None]
        results.append("\n".join(page_texts))

    progress(1, desc="Extraction terminée ✅")
    return "\n\n--- PAGE ---\n\n".join(results)
# Input: a single uploaded file, delivered to the callback as a filesystem path.
_file_input = gr.File(type="filepath", label="Upload PDF or Image")

# Output: a tall text box holding the extracted text.
_text_output = gr.Textbox(label="Extracted Text", lines=20)

# Wire the extraction callback to the components and start the web UI.
demo = gr.Interface(
    fn=extract_from_file,
    inputs=_file_input,
    outputs=_text_output,
    title="MinerU2.5 Document Extractor",
    description="Upload a PDF or Image to extract structured text using MinerU2.5 with GPU.",
)
demo.launch()