import gradio as gr
from PIL import Image
from mineru_vl_utils import MinerUClient
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Load the MinerU model weights and the matching processor once, at import
# time, so every request served by the UI reuses the same instances.
model_path = "opendatalab/MinerU2.5-2509-1.2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_path, use_fast=True)

# Wrap the in-process transformers model in the MinerU client used below.
client = MinerUClient(backend="transformers", model=model, processor=processor)

def extract_from_image(image):
    """Run MinerU two-step extraction on an image and return its text.

    Args:
        image: A ``PIL.Image.Image``, an array accepted by
            ``Image.fromarray``, or ``None`` (what Gradio passes when the
            form is submitted without an upload).

    Returns:
        The textual payload of every extracted block that has one, joined
        with newlines. An empty string when no image was given or no block
        carries text.
    """
    # Nothing was uploaded: return early instead of crashing in fromarray().
    if image is None:
        return ""

    # Convert array-like input to a PIL image if needed.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    blocks = client.two_step_extract(image)

    texts = []
    for block in blocks:
        # Keep honoring a `.text` attribute when present, then fall back to
        # `.content`.
        # NOTE(review): MinerU content blocks appear to expose their payload
        # as `.content` rather than `.text` -- confirm against the installed
        # mineru_vl_utils version.
        payload = getattr(block, "text", None)
        if payload is None:
            payload = getattr(block, "content", None)
        # Blocks without a payload (None) would make str.join raise.
        if payload is not None:
            texts.append(str(payload))

    return "\n".join(texts)

# Gradio UI: a single image input wired to the extractor, plain-text output.
demo = gr.Interface(
    extract_from_image,
    gr.Image(type="pil"),
    "text",
    title="MinerU2.5 - Document Extract",
    description="Upload an image or PDF page and extract structured text with MinerU2.5",
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()