|
|
import gradio as gr |
|
|
import spaces |
|
|
from paddleocr import PaddleOCR |
|
|
import fitz |
|
|
from PIL import Image |
|
|
import numpy as np |
|
|
import os |
|
|
|
|
|
|
|
|
# Directory that receives the generated "<pdf name>_result.txt" files.
OUTPUT_DIR = "output_results"

# Create it at import time so request handlers can write results without
# having to check for the directory first; exist_ok makes restarts harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
|
|
|
|
|
def load_gpu_model():
    """Create and return a PaddleOCR engine that runs on the GPU.

    The engine is configured for Chinese text (``lang='ch'``) with the text
    angle classifier enabled (``use_angle_cls=True``). Progress messages are
    printed before and after construction.

    Returns:
        The freshly constructed ``PaddleOCR`` instance.
    """
    # NOTE(review): a new engine is built on every call (nothing is cached).
    # Presumably deliberate because the GPU is only attached inside the
    # @spaces.GPU-decorated caller -- confirm before adding a cache.
    print("正在Docker容器中加载PaddleOCR GPU模型...")
    ocr_model = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=True)
    print("GPU模型加载成功。")
    return ocr_model
|
|
|
|
|
|
|
|
@spaces.GPU
def process_pdf_max_speed(pdf_file, progress=gr.Progress(track_tqdm=True)):
    """OCR every page of an uploaded PDF on the GPU and save the text.

    Pages are rendered at 200 DPI in groups of four; the progress bar
    advances once per group. The recognised text of each page is prefixed
    with a ``--- Page N ---`` header, pages are joined with blank lines, and
    the result is written to ``OUTPUT_DIR/<pdf base name>_result.txt``.

    Args:
        pdf_file: The upload handed over by the ``gr.File`` input. Either an
            object exposing the temp-file path as ``.name`` or a plain path
            string is accepted. ``None`` means nothing was uploaded.
        progress: Gradio progress tracker used to report batch progress.

    Returns:
        A ``(text, path)`` tuple. On success ``text`` is the full OCR result
        and ``path`` is the saved ``.txt`` file. When no file was uploaded or
        any error occurs, ``text`` is a user-facing message and ``path`` is
        ``None``.
    """
    if pdf_file is None:
        return "请先上传一个PDF文件。", None

    # Top-level UI boundary: any failure is reported back to the user as a
    # message instead of crashing the event handler.
    try:
        ocr = load_gpu_model()

        # Older Gradio passes a temp-file wrapper (path in .name); newer
        # versions may pass the path itself. Support both.
        pdf_path = getattr(pdf_file, "name", pdf_file)

        batch_size = 4
        full_text_result = []

        # The context manager closes the document even if rendering or OCR
        # raises part-way through.
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)

            for i in progress.tqdm(range(0, total_pages, batch_size), desc="🚀 批处理中..."):
                batch_images = []
                for page_num in range(i, min(i + batch_size, total_pages)):
                    page = doc.load_page(page_num)
                    pix = page.get_pixmap(dpi=200)
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    batch_images.append(np.array(img))

                for page_index, image in enumerate(batch_images):
                    # PaddleOCR refuses a *list* of images while detection is
                    # enabled (it logs an error and exits the process), so
                    # each page image is submitted on its own.
                    result = ocr.ocr(image, cls=True)

                    # NOTE(review): assumes the PaddleOCR >= 2.6 result shape,
                    # i.e. one entry per input image, which is None/empty when
                    # no text was found -- confirm against the pinned version.
                    page_result = result[0] if result else None

                    # Each recognised line is [box, (text, score)].
                    page_texts = [line[1][0] for line in page_result] if page_result else []

                    current_page_num = i + page_index + 1
                    full_text_result.append(
                        f"--- Page {current_page_num} ---\n" + "\n".join(page_texts)
                    )

        final_text = "\n\n".join(full_text_result)

        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_filename = os.path.join(OUTPUT_DIR, f"{base_name}_result.txt")
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(final_text)

        print(f"处理完成!结果已保存到 {output_filename}")
        return final_text, output_filename

    except Exception as e:
        error_message = f"处理过程中发生错误: {str(e)}"
        print(error_message)
        return error_message, None
|
|
|
|
|
|
|
|
# Gradio UI: one PDF upload, a submit button, a text box showing the OCR
# result and a download slot for the saved .txt file.
with gr.Blocks(title="极速PDF识别器", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # ✅ 极速PDF识别器 (终极稳定版) ✅
        **速度拉满!实时进度显示,处理期间请勿关闭页面。**
        """
    )

    with gr.Row():
        pdf_input = gr.File(label="📄 上传PDF文件", file_types=[".pdf"])

    submit_btn = gr.Button("⚡️ 开始极速处理", variant="primary")

    result_display = gr.Textbox(label="识别结果", lines=20, show_copy_button=True)
    download_link = gr.File(label="📥 点击此处下载结果文件", interactive=False)

    # The handler returns (text, file path); they must be routed to the
    # components defined above. The previous wiring referenced an undefined
    # name `display`, which raised NameError while the module was loading.
    submit_btn.click(
        fn=process_pdf_max_speed,
        inputs=[pdf_input],
        outputs=[result_display, download_link]
    )

# Queueing is enabled before launch, as in the original script.
demo.queue().launch()