File size: 2,075 Bytes
1648a50
87cc209
 
538f2a9
87cc209
538f2a9
1648a50
538f2a9
87cc209
538f2a9
 
112a8d7
1648a50
112a8d7
538f2a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87cc209
 
 
ce41633
87cc209
 
112a8d7
87cc209
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
from mineru_vl_utils.mineru_client import MinerUClient
from PIL import Image
import fitz  # PyMuPDF
import os
import torch

# --- Model / client initialisation (runs once at import time) ---

# Model identifier handed to the MinerU client below.
model_path = "opendatalab/MinerU2.5-2509-1.2B"

# Detect whether torch can see a CUDA device. The result is only reported in
# the log line below; it is not forwarded to MinerUClient.
# NOTE(review): confirm the client picks its own device, otherwise this
# message can be misleading.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"⚑ Loading MinerU on device: {device}")

# Single module-level client reused by every extraction request.
client = MinerUClient(backend="transformers", model_path=model_path)

def _resolve_upload_path(file):
    """Return the filesystem path of an uploaded file, or None if there is none."""
    if file is None:
        return None
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    # Fall back to a file-like wrapper exposing its path as `.name`, which is
    # the shape this function originally assumed.
    return getattr(file, "name", None)


def _load_page_images(path):
    """Return PIL images for *path*: one per PDF page, or the image file itself."""
    if os.path.splitext(path)[-1].lower() != ".pdf":
        return [Image.open(path)]

    pages = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(path) as doc:
        for i, page in enumerate(doc):
            pix = page.get_pixmap()
            # frombytes copies the pixel data, so the images outlive `doc`.
            pages.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
            print(f"✅ Converted page {i+1} to image")
    return pages


def _block_text(block):
    """Return the non-empty string payload of one extracted block, else ""."""
    # NOTE(review): mineru-vl-utils blocks appear to carry their payload under
    # `content` rather than `text` -- confirm against the installed version.
    # `text` is still tried first so the previous behaviour is preserved.
    for key in ("text", "content"):
        value = getattr(block, key, None)
        if value is None and isinstance(block, dict):
            value = block.get(key)
        if isinstance(value, str) and value:
            return value
    return ""


def extract_from_file(file):
    """Extract text from an uploaded PDF or image using the MinerU client.

    A ``.pdf`` upload is rendered page by page with PyMuPDF; any other file is
    opened directly as a single image. Each image is passed to
    ``client.two_step_extract`` and the text of the returned blocks is joined
    with newlines.

    Args:
        file: The value delivered by the Gradio file input: a path string, a
            path-like object, an object exposing the path as ``.name``, or
            ``None`` when nothing was uploaded.

    Returns:
        The per-page texts joined by ``"\\n\\n--- PAGE ---\\n\\n"``. A page
        with no blocks contributes ``"[EMPTY PAGE]"``; a page whose blocks
        carry no text contributes ``"[NO TEXT FOUND]"``. On any failure a
        string starting with ``"Error during extraction: "`` is returned
        instead of raising, so the message is shown in the UI.
    """
    path = _resolve_upload_path(file)
    if not path:
        # Give a clear message instead of leaking an AttributeError text.
        return "Error during extraction: no file was uploaded"

    try:
        print(f"📄 Processing file: {path}")
        images = _load_page_images(path)

        results = []
        for page_no, img in enumerate(images, start=1):
            print(f"🔍 Extracting text from page {page_no}")
            blocks = client.two_step_extract(img)
            if not blocks:
                print(f"⚠️ No blocks found on page {page_no}")
                results.append("[EMPTY PAGE]")
                continue

            texts = [text for text in map(_block_text, blocks) if text]
            results.append("\n".join(texts) if texts else "[NO TEXT FOUND]")
            print(f"✅ Page {page_no} extracted")

        return "\n\n--- PAGE ---\n\n".join(results)

    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the app.
        print(f"❌ ERROR: {e}")
        return f"Error during extraction: {str(e)}"

# --- Gradio UI wiring ---

# Upload widget; configured to hand the handler a file path.
_upload_input = gr.File(type="filepath", label="Upload PDF or Image")

# Multi-line text area that displays whatever extract_from_file returns.
_text_output = gr.Textbox(label="Extracted Text", lines=20)

# Single-function interface: one file in, one block of text out.
demo = gr.Interface(
    fn=extract_from_file,
    inputs=_upload_input,
    outputs=_text_output,
    title="MinerU2.5 Document Extractor",
    description="Upload a PDF or Image to extract structured text using MinerU2.5.",
)

# Start the web server as soon as the script is executed.
demo.launch()