Spaces:

1oscon
/

deepspeek

Runtime error

File size: 2,960 Bytes

ebb123f
 
 
 
 
 
3d7c669
ebb123f
3d7c669
ebb123f
 
3d7c669
 
ebb123f
 
 
 
3d7c669
ebb123f
 
 
 
 
 
 
3d7c669
ebb123f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3d7c669
 
 
ebb123f
3d7c669
 
ebb123f
3d7c669
ebb123f
 
3d7c669
ebb123f
 
 
 
 
3d7c669
ebb123f
 
 
3d7c669
ebb123f
 
3d7c669
ebb123f
 
 
 
 
3d7c669
 
 
ebb123f

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import fitz  # PyMuPDF
import torch

# Pick the compute device (on a free Space this resolves to 'cpu').
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device: {}".format(device))

# Load the tokenizer and the model.
# The first load downloads the checkpoint, which can take a long time.
# NOTE(review): confirm against the model card that this checkpoint's remote
# code can be loaded through AutoModelForCausalLM -- TODO verify.
print("Loading DeepSeek-OCR model...")
model_path = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
model = model.to(device)
model.eval()  # inference mode: disables dropout and similar training-only layers
print("Model loaded successfully.")

def pdf_to_images(pdf_path):
    """Render every page of a PDF into a list of PIL images.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        A list with one ``PIL.Image`` in "RGB" mode per page, in page order.

    Raises:
        Whatever PyMuPDF or PIL raise while opening or rendering; the
        document is closed before the exception propagates.
    """
    doc = fitz.open(pdf_path)
    try:
        images = []
        for page in doc:
            # 200 dpi is a moderate resolution chosen to limit memory use.
            pix = page.get_pixmap(dpi=200)
            # The pixmap samples are interpreted as packed RGB bytes.
            images.append(
                Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            )
        return images
    finally:
        # Release the document even when rendering a page fails.
        doc.close()

def ocr_process(pdf_file):
    """Run OCR over an uploaded PDF, streaming progress and the final text.

    This is a generator: every yielded string replaces the previous content
    of the output component.

    Args:
        pdf_file: Value delivered by the file input. Either ``None`` (nothing
            uploaded), an object exposing the file path as ``.name``, or the
            path itself as a plain string.

    Yields:
        Status messages while pages are processed, then the recognized text
        of all pages (each prefixed with ``--- Page N ---``). If anything
        fails, a single error message is yielded instead of raising.
    """
    if pdf_file is None:
        # A ``return "<text>"`` inside a generator never reaches the caller,
        # so the hint has to be yielded before stopping.
        yield "请先上传一个PDF文件"
        return

    # Accept both a file-like wrapper (path in ``.name``) and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    prompt = "recognize characters in this image"
    try:
        images = pdf_to_images(pdf_path)
        total = len(images)

        # Tell the user that processing has started.
        yield "PDF处理完成，共 {} 页。开始逐页识别，请耐心等待...".format(total)

        page_texts = []
        for page_no, pil_img in enumerate(images, start=1):
            yield f"正在识别第 {page_no}/{total} 页..."

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": pil_img},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            # NOTE(review): presumably the chat template only renders text;
            # confirm that the PIL image in `messages` actually reaches the
            # model with this tokenizer-only path -- TODO verify.
            text_input = tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            ).to(device)

            outputs = model.generate(text_input, max_new_tokens=2048, do_sample=False)

            # Decode only the tokens generated after the prompt. Splitting the
            # decoded text on the prompt string would truncate the result
            # whenever the page itself contains that phrase.
            new_tokens = outputs[0][text_input.shape[-1]:]
            cleaned_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

            page_texts.append(f"--- Page {page_no} ---\n{cleaned_text}\n\n")

        yield "".join(page_texts)

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        yield f"处理时发生错误: {str(e)}"

# Build the Gradio UI: a PDF upload feeding ocr_process and a text box that
# shows whatever the function yields.
_pdf_upload = gr.File(label="上传PDF文件", file_types=[".pdf"])
_result_box = gr.Textbox(
    label="识别结果 (DeepSeek-OCR)",
    lines=20,
    show_copy_button=True,
)
iface = gr.Interface(
    fn=ocr_process,
    inputs=_pdf_upload,
    outputs=_result_box,
    title="DeepSeek OCR PDF识别 (CPU运行)",
    description="上传PDF文件进行识别。警告：此模型在免费CPU服务器上运行会【极其缓慢】，处理多页或复杂PDF极有可能因超时而失败。",
)

# Start the app.
iface.launch()