1oscon committed on
Commit
3d7c669
·
verified ·
1 Parent(s): 428d0ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -21
app.py CHANGED
@@ -4,14 +4,18 @@ from PIL import Image
4
  import fitz # PyMuPDF
5
  import torch
6
 
7
- # 指定设备
8
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
9
 
10
  # 加载模型和分词器
 
 
11
  model_path = 'deepseek-ai/DeepSeek-OCR'
12
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
13
  model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(device)
14
  model.eval()
 
15
 
16
  def pdf_to_images(pdf_path):
17
  """将PDF文件转换为PIL图像列表"""
@@ -19,7 +23,7 @@ def pdf_to_images(pdf_path):
19
  images = []
20
  for page_num in range(len(doc)):
21
  page = doc.load_page(page_num)
22
- pix = page.get_pixmap()
23
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
24
  images.append(img)
25
  doc.close()
@@ -35,44 +39,38 @@ def ocr_process(pdf_file):
35
  images = pdf_to_images(pdf_path)
36
 
37
  full_text = ""
 
 
 
38
  for i, pil_img in enumerate(images):
39
- # 准备模型输入
 
40
  messages = [
41
- {
42
- "role": "user",
43
- "content": [
44
- {"type": "image", "image": pil_img},
45
- {"type": "text", "text": "Please perform OCR on this image."}
46
- ]
47
- }
48
  ]
49
 
50
- text_input = tokenizer.apply_chat_template(
51
- messages, add_generation_prompt=True, return_tensors="pt"
52
- ).to(device)
53
 
54
- # 生成OCR结果
55
  outputs = model.generate(text_input, max_new_tokens=2048, do_sample=False)
56
  result_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
57
 
58
- # 清理和提取识别出的文本
59
  # 简单的后处理,移除提示词部分
60
- cleaned_text = result_text.split("Please perform OCR on this image.\n")[-1]
61
 
62
  full_text += f"--- Page {i+1} ---\n{cleaned_text}\n\n"
63
 
64
- return full_text
65
 
66
  except Exception as e:
67
- return f"处理时发生错误: {str(e)}"
68
 
69
  # 创建Gradio界面
70
  iface = gr.Interface(
71
  fn=ocr_process,
72
  inputs=gr.File(label="上传PDF文件", file_types=[".pdf"]),
73
- outputs=gr.Textbox(label="识别结果", lines=20),
74
- title="DeepSeek OCR PDF识别",
75
- description="上传一个PDF文件,模型将识别其中的文本内容。免费CPU服务器处理较慢,请耐心等待。"
76
  )
77
 
78
  # 启动应用
 
4
  import fitz # PyMuPDF
5
  import torch
6
 
7
+ # 指定设备 (在免费Space上,这里会自动选择 'cpu')
8
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
9
+ print(f"Using device: {device}")
10
 
11
  # 加载模型和分词器
12
+ # 首次加载会下载模型,可能需要很长时间
13
+ print("Loading DeepSeek-OCR model...")
14
  model_path = 'deepseek-ai/DeepSeek-OCR'
15
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
16
  model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(device)
17
  model.eval()
18
+ print("Model loaded successfully.")
19
 
20
  def pdf_to_images(pdf_path):
21
  """将PDF文件转换为PIL图像列表"""
 
23
  images = []
24
  for page_num in range(len(doc)):
25
  page = doc.load_page(page_num)
26
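+ pix = page.get_pixmap(dpi=200) # Render at 200 dpi (above PyMuPDF's 72 dpi default): sharper input for OCR, but larger images and higher memory use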
27
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
28
  images.append(img)
29
  doc.close()
 
39
  images = pdf_to_images(pdf_path)
40
 
41
  full_text = ""
42
+ # 提示用户进程开始
43
+ yield "PDF处理完成,共 {} 页。开始逐页识别,请耐心等待...".format(len(images))
44
+
45
  for i, pil_img in enumerate(images):
46
+ yield f"正在识别第 {i+1}/{len(images)} 页..."
47
+
48
  messages = [
49
+ {"role": "user", "content": [{"type": "image", "image": pil_img}, {"type": "text", "text": "recognize characters in this image"}]}
 
 
 
 
 
 
50
  ]
51
 
52
+ text_input = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(device)
 
 
53
 
 
54
  outputs = model.generate(text_input, max_new_tokens=2048, do_sample=False)
55
  result_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
56
 
 
57
  # 简单的后处理,移除提示词部分
58
+ cleaned_text = result_text.split("recognize characters in this image")[-1].strip()
59
 
60
  full_text += f"--- Page {i+1} ---\n{cleaned_text}\n\n"
61
 
62
+ yield full_text
63
 
64
  except Exception as e:
65
+ yield f"处理时发生错误: {str(e)}"
66
 
67
  # 创建Gradio界面
68
  iface = gr.Interface(
69
  fn=ocr_process,
70
  inputs=gr.File(label="上传PDF文件", file_types=[".pdf"]),
71
+ outputs=gr.Textbox(label="识别结果 (DeepSeek-OCR)", lines=20, show_copy_button=True),
72
+ title="DeepSeek OCR PDF识别 (CPU运行)",
73
+ description="上传PDF文件进行识别。警告:此模型在免费CPU服务器上运行会【极其缓慢】,处理多页或复杂PDF极有可能因超时而失败。"
74
  )
75
 
76
  # 启动应用