Spaces:

1oscon
/

PaddleOCR

Runtime error

App Files Community

1oscon committed on Oct 22

Commit

3d8e7a8

verified ·

1 Parent(s): d337cf9

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -45

app.py CHANGED Viewed

@@ -4,65 +4,107 @@ import fitz  # PyMuPDF
 from PIL import Image
 import numpy as np
 import os
-# 设置环境变量，防止不必要的日志
-os.environ['KMP_DUPLICATE_LIB_OK']='True'
-# 初始化PaddleOCR，明确指定使用中文模型 (lang='ch')
-print("正在加载PaddleOCR中文模型...")
-ocr = PaddleOCR(use_textline_orientation=True, lang='ch')
-print("模型加载完成。")
-def pdf_ocr_process(pdf_file):
     """
-    接收上传的PDF文件，使用PaddleOCR进行识别，并返回纯文本结果。
     """
     if pdf_file is None:
-        return "请上传一个PDF文件进行识别。"
     try:
-        # Gradio传递的是一个临时文件对象，我们使用它的.name属性获取文件路径
-        doc = fitz.open(pdf_file.name)
-        full_text = []
-        # 遍历PDF的每一页
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-            # 将页面转换为高分辨率的PNG图像，以提高识别准确率
-            pix = page.get_pixmap(dpi=300)
-            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-            # 转换为PaddleOCR需要的Numpy数组格式
-            img_np = np.array(img)
-            # --- 核心修正 ---
-            # 执行OCR识别，移除新版本中已不接受的 'cls' 参数
-            result = ocr.ocr(img_np)
-            # 提取识别出的文本行
-            page_texts = []
-            if result and result[0]: # 确保result不是None或空
-                for line in result[0]:
-                    page_texts.append(line[1][0]) # line[1][0] 是文本内容
-            # 将当页的文本拼接起来
-            full_text.append(f"--- Page {page_num + 1} ---\n" + "\n".join(page_texts))
         doc.close()
-        return "\n\n".join(full_text)
     except Exception as e:
-        return f"处理过程中发生错误: {str(e)}"
-# 创建并启动Gradio界面
-iface = gr.Interface(
-    fn=pdf_ocr_process,
-    inputs=gr.File(label="上传PDF文件", file_types=[".pdf"]),
-    outputs=gr.Textbox(label="中文识别结果 (PaddleOCR)", lines=25, show_copy_button=True),
-    title="免费部署的中文PDF文档识别",
-    description="此应用基于PaddleOCR，为中文识别特别优化。它在CPU上运行，处理速度取决于文档的复杂度和页数。"
-)
-iface.launch()

 from PIL import Image
 import numpy as np
 import os
+import time
+# --- 配置 ---
+OUTPUT_DIR = "output_results" # 保存结果的文件夹
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+# --- 模型加载器 ---
+# 将模型加载封装成函数，确保只在GPU会话中加载
+def load_gpu_model():
+    print("正在加载PaddleOCR GPU模型...")
+    # 核心改动：use_gpu=True, 强制使用GPU
+    ocr_model = PaddleOCR(use_textline_orientation=True, lang='ch', use_gpu=True, show_log=False)
+    print("GPU模型加载完成。")
+    return ocr_model
+# --- Gradio调用的核心处理函数 ---
+# 核心改动：使用@spaces.GPU申请GPU资源
+@spaces.GPU
+def process_pdf_max_speed(pdf_file, progress=gr.Progress(track_tqdm=True)):
     """
+    使用GPU和批处理来极速处理PDF，并实时更新进度条。
     """
     if pdf_file is None:
+        return "请先上传一个PDF文件。", None
     try:
+        # 在GPU会话中加载模型
+        ocr = load_gpu_model()
+        # --- 准备工作 ---
+        doc = fitz.open(pdf_file.name)
+        total_pages = len(doc)
+        batch_size = 4  # 批处理大小，一次性处理4页，可以充分利用GPU
+        full_text_result = []
+        # --- 核心处理循环 ---
+        # gr.Progress(track_tqdm=True) 会自动创建一个漂亮的进度条
+        for i in progress.tqdm(range(0, total_pages, batch_size), desc="🚀 批处理中..."):
+            batch_images = []
+            # 准备一个批次的图片
+            for page_num in range(i, min(i + batch_size, total_pages)):
+                page = doc.load_page(page_num)
+                pix = page.get_pixmap(dpi=200)
+                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                # PaddleOCR可以直接处理Numpy数组
+                batch_images.append(np.array(img))
+            # --- 速度核心：一次性识别一个批次的图片 ---
+            if batch_images:
+                results = ocr.ocr(batch_images)
+                # 整理这个批次的结果
+                for page_index, page_result in enumerate(results):
+                    page_texts = []
+                    current_page_num = i + page_index + 1
+                    if page_result:
+                        for line in page_result:
+                            page_texts.append(line[1][0])
+                    full_text_result.append(f"--- Page {current_page_num} ---\n" + "\n".join(page_texts))
         doc.close()
+        # --- 保存最终结果 ---
+        final_text = "\n\n".join(full_text_result)
+        output_filename = os.path.join(OUTPUT_DIR, f"{os.path.splitext(os.path.basename(pdf_file.name))[0]}_result.txt")
+        with open(output_filename, 'w', encoding='utf-8') as f:
+            f.write(final_text)
+        print(f"处理完成！结果已保存到 {output_filename}")
+        # 返回文本内容和可供下载的文件路径
+        return final_text, output_filename
     except Exception as e:
+        error_message = f"处理过程中发生错误: {str(e)}"
+        print(error_message)
+        return error_message, None
+# --- 构建Gradio界面 ---
+with gr.Blocks(title="极速PDF识别器", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🔥 极速PDF识别器 (GPU加速版) 🔥
+        **速度拉满！实时进度显示，但处理期间请勿关闭页面。**
+        """
+    )
+    with gr.Row():
+        pdf_input = gr.File(label="📄 上传PDF文件", file_types=[".pdf"])
+    submit_btn = gr.Button("⚡️ 开始极速处理", variant="primary")
+    result_display = gr.Textbox(label="识别结果", lines=20, show_copy_button=True)
+    download_link = gr.File(label="📥 点击此处下载结果文件", interactive=False)
+    # 按钮和函数的连接
+    submit_btn.click(
+        fn=process_pdf_max_speed,
+        inputs=[pdf_input],
+        outputs=[result_display, download_link]
+    )
+demo.queue().launch()