File size: 3,429 Bytes
c8d8d3a
393a4a4
c8d8d3a
 
 
 
 
 
3d8e7a8
393a4a4
3d8e7a8
c8d8d3a
bb6244a
3d8e7a8
bb6244a
 
393a4a4
 
3d8e7a8
c8d8d3a
3d8e7a8
 
 
c8d8d3a
3d8e7a8
c8d8d3a
 
3d8e7a8
c8d8d3a
 
27b7a93
3d8e7a8
c8d8d3a
3d8e7a8
 
bb6244a
3d8e7a8
 
 
c8d8d3a
3d8e7a8
 
 
 
 
 
 
 
393a4a4
3d8e7a8
 
 
 
 
 
 
 
 
c8d8d3a
 
 
3d8e7a8
 
 
 
 
 
 
c8d8d3a
 
3d8e7a8
 
 
 
 
 
 
 
bb6244a
393a4a4
3d8e7a8
 
 
 
 
 
 
 
 
 
c8d8d3a
3d8e7a8
 
 
bb6244a
3d8e7a8
c8d8d3a
3d8e7a8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gradio as gr
import spaces
from paddleocr import PaddleOCR
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
import os

# --- Configuration ---
# Directory where the recognized-text result files are written.
# It is created at import time (no error if it already exists).
OUTPUT_DIR = "output_results" 
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Model loader (intended to be called from inside the GPU container) ---
def load_gpu_model():
    """Construct and return a GPU-enabled PaddleOCR engine.

    The engine is created with angle classification enabled and the ``ch``
    language setting. Progress messages are printed before and after
    construction.

    Returns:
        The newly created ``PaddleOCR`` instance.
    """
    print("正在Docker容器中加载PaddleOCR GPU模型...")
    # Keyword set noted by the original author as compatible with
    # PaddleOCR 2.7.3.
    engine_options = {"use_angle_cls": True, "lang": "ch", "use_gpu": True}
    engine = PaddleOCR(**engine_options)
    print("GPU模型加载成功。")
    return engine

# --- Core processing function invoked by Gradio ---
def _resolve_upload_path(pdf_file):
    """Return the filesystem path of an upload given as a path string or as an object with ``.name``."""
    return pdf_file if isinstance(pdf_file, str) else pdf_file.name


def _render_page_rgb(page, dpi=200):
    """Rasterize one PDF page at ``dpi`` and return it as an RGB numpy array."""
    pix = page.get_pixmap(dpi=dpi)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return np.array(img)


def _recognized_lines(ocr, page_image):
    """Run OCR on a single page image and return the recognized text of each line."""
    result = ocr.ocr(page_image, cls=True)
    # NOTE(review): with PaddleOCR 2.7.x a single-image call is expected to
    # return a one-element list whose entry is None when nothing was found;
    # confirm if the pinned PaddleOCR version changes.
    page_result = result[0] if result else None
    if not page_result:
        return []
    # Each entry is indexed as [box, (text, score)]; keep only the text.
    return [line[1][0] for line in page_result]


@spaces.GPU
def process_pdf_max_speed(pdf_file, progress=gr.Progress(track_tqdm=True)):
    """OCR every page of an uploaded PDF on the GPU and save the text.

    Pages are rendered at 200 DPI and recognized one at a time: handing
    ``PaddleOCR.ocr`` a Python list of images while detection is enabled is
    rejected by PaddleOCR, so each page is submitted as its own array.
    The progress bar advances once per page.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input. May be
            ``None`` (nothing uploaded), a path string, or an object whose
            ``.name`` attribute is the path of the uploaded file.
        progress: Gradio progress tracker used to report per-page progress.

    Returns:
        A ``(text, path)`` tuple. On success ``text`` is the recognized text
        of all pages (each prefixed by a ``--- Page N ---`` header) and
        ``path`` is the UTF-8 result file written under ``OUTPUT_DIR``.
        When no file was uploaded, or when any exception is raised during
        processing, ``text`` is a message for the user and ``path`` is
        ``None``.
    """
    if pdf_file is None:
        return "请先上传一个PDF文件。", None

    try:
        pdf_path = _resolve_upload_path(pdf_file)

        # Load the model inside the GPU session.
        ocr = load_gpu_model()

        page_sections = []
        doc = fitz.open(pdf_path)
        try:
            for page_index in progress.tqdm(range(len(doc)), desc="🚀 批处理中..."):
                page_image = _render_page_rgb(doc.load_page(page_index))
                page_texts = _recognized_lines(ocr, page_image)
                page_sections.append(
                    f"--- Page {page_index + 1} ---\n" + "\n".join(page_texts)
                )
        finally:
            # Release the document even when rendering or OCR fails midway.
            doc.close()

        final_text = "\n\n".join(page_sections)
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_filename = os.path.join(OUTPUT_DIR, f"{base_name}_result.txt")
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(final_text)

        print(f"处理完成!结果已保存到 {output_filename}")
        return final_text, output_filename

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing
        # the Gradio worker.
        error_message = f"处理过程中发生错误: {str(e)}"
        print(error_message)
        return error_message, None

# --- Build the Gradio interface ---
# Layout: a PDF upload field, a submit button, a textbox showing the
# recognized text, and a read-only file component for downloading the result.
with gr.Blocks(title="极速PDF识别器", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # ✅ 极速PDF识别器 (终极稳定版) ✅
        **速度拉满!实时进度显示,处理期间请勿关闭页面。**
        """
    )
    
    with gr.Row():
        pdf_input = gr.File(label="📄 上传PDF文件", file_types=[".pdf"])
    
    submit_btn = gr.Button("⚡️ 开始极速处理", variant="primary")
    
    result_display = gr.Textbox(label="识别结果", lines=20, show_copy_button=True)
    download_link = gr.File(label="📥 点击此处下载结果文件", interactive=False)

    # The handler returns (text, file path); route them to the textbox and
    # the download component. The first output must be `result_display`:
    # the previous name `display` was never defined and raised NameError
    # while the interface was being built.
    submit_btn.click(
        fn=process_pdf_max_speed,
        inputs=[pdf_input],
        outputs=[result_display, download_link]
    )

# Enable the request queue, then start the app.
demo.queue().launch()