Spaces:

1oscon
/

PaddleOCR

Runtime error

App Files Files Community

1oscon committed on Oct 22

Commit

c8d8d3a

verified ·

1 Parent(s): 11b9fab

Create app.py

Browse files

Files changed (1) hide show

app.py +71 -0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import gradio as gr
+from paddleocr import PaddleOCR
+import fitz  # PyMuPDF
+from PIL import Image
+import numpy as np
+import os
+# 设置环境变量，防止一些不必要的日志输出
+os.environ['KMP_DUPLICATE_LIB_OK']='True'
+# 初始化PaddleOCR，强制使用CPU
+# 第一次运行时会自动下载模型，会比较慢，请耐心等待
+print("正在加载PaddleOCR模型...")
+# 同时支持中文和英文识别
+ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False, show_log=False)
+print("模型加载完成。")
+def pdf_ocr_process(pdf_file):
+    """
+    接收上传的PDF文件，使用PaddleOCR进行识别，并返回纯文本结果。
+    """
+    if pdf_file is None:
+        return "请上传一个PDF文件进行识别。"
+    try:
+        # 从上传的文件对象中读取字节流
+        pdf_bytes = pdf_file.read()
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        full_text = []
+        # 遍历PDF的每一页
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+            # 将页面转换为高分辨率的PNG图像
+            pix = page.get_pixmap(dpi=300)
+            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            # PaddleOCR需要一个Numpy数组格式的图像
+            img_np = np.array(img)
+            # 执行OCR识别
+            result = ocr.ocr(img_np, cls=True)
+            # 提取识别出的文本行
+            page_texts = []
+            if result and result[0]: # 确保result不是None或空
+                for line in result[0]:
+                    page_texts.append(line[1][0]) # line[1][0] 是文本内容
+            # 将当页的文本拼接起来
+            full_text.append(f"--- Page {page_num + 1} ---\n" + "\n".join(page_texts))
+        doc.close()
+        return "\n\n".join(full_text)
+    except Exception as e:
+        return f"处理过程中发生错误: {str(e)}"
+# 创建并启动Gradio界面
+iface = gr.Interface(
+    fn=pdf_ocr_process,
+    inputs=gr.File(label="上传PDF文件", file_types=[".pdf"]),
+    outputs=gr.Textbox(label="识别结果 (PaddleOCR)", lines=25, show_copy_button=True),
+    title="免费部署的PDF文档识别 (PaddleOCR)",
+    description="这是一个完全免费的OCR方案，基于PaddleOCR。它在CPU上运行，处理速度取决于文档的复杂度和页数。首次运行或长时间未使用后启动较慢。",
+    examples=[["sample.pdf"]]
+)
+iface.launch()