markobinario committed on
Commit
c0c9942
·
verified ·
1 Parent(s): 575870b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +278 -52
app.py CHANGED
@@ -1,17 +1,23 @@
1
  import os
2
  import io
3
  import json
4
- from typing import List, Tuple, Dict, Any
 
5
 
6
  import fitz # PyMuPDF
7
  from PIL import Image
8
  import gradio as gr
 
9
 
 
 
 
 
 
10
 
11
  # Lazy-load the OCR model to reduce startup time and memory
12
  _ocr_model = None
13
 
14
-
15
  def get_ocr_model(lang: str = "en"):
16
  global _ocr_model
17
  if _ocr_model is not None:
@@ -24,8 +30,7 @@ def get_ocr_model(lang: str = "en"):
24
  _ocr_model = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)
25
  return _ocr_model
26
 
27
-
28
- def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 170) -> Image.Image:
29
  page = pdf_doc.load_page(page_index)
30
  zoom = dpi / 72.0 # 72 dpi is PDF default
31
  mat = fitz.Matrix(zoom, zoom)
@@ -33,12 +38,9 @@ def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 170) -
33
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
34
  return img
35
 
36
-
37
  def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str, List[Dict[str, Any]]]:
38
  ocr = get_ocr_model(lang=lang)
39
  # Convert PIL image to numpy array for PaddleOCR
40
- import numpy as np
41
-
42
  img_np = np.array(image)
43
  result = ocr.ocr(img_np, cls=True)
44
 
@@ -58,26 +60,27 @@ def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str,
58
 
59
  return "\n".join(lines), items
60
 
61
-
62
- def extract_text_from_pdf(file_obj, dpi: int = 170, max_pages: int | None = None, lang: str = "en") -> Tuple[str, str]:
63
  """
64
- Returns combined text and a JSON string with per-page OCR results.
65
  """
66
  if file_obj is None:
67
- return "", json.dumps({"pages": []}, ensure_ascii=False)
68
-
69
- # Gradio may pass a path or a tempfile.NamedTemporaryFile-like with .name
70
- pdf_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)
71
- if pdf_path is None or not os.path.exists(pdf_path):
72
- # If bytes were passed, fall back to reading from buffer
73
- file_bytes = file_obj.read() if hasattr(file_obj, "read") else None
74
- if not file_bytes:
75
- return "", json.dumps({"pages": []}, ensure_ascii=False)
76
- pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
77
- else:
78
- pdf_doc = fitz.open(pdf_path)
79
 
 
 
80
  try:
 
 
 
 
 
 
 
 
 
 
 
81
  num_pages = pdf_doc.page_count
82
  if max_pages is not None:
83
  num_pages = min(num_pages, max_pages)
@@ -97,39 +100,262 @@ def extract_text_from_pdf(file_obj, dpi: int = 170, max_pages: int | None = None
97
 
98
  combined_text = "\n\n".join([t for t in all_text_lines if t])
99
  json_payload = json.dumps({"pages": pages_payload}, ensure_ascii=False)
100
-
101
- return combined_text, json_payload
102
- finally:
 
 
 
 
 
 
 
103
  pdf_doc.close()
104
-
105
-
106
- def gradio_predict(pdf_file):
107
- # Always render at a high DPI for accuracy and use English OCR by default
108
- text, _ = extract_text_from_pdf(pdf_file, dpi=300, max_pages=None, lang="en")
109
- return text
110
-
111
-
112
- with gr.Blocks(title="PDF OCR with PaddleOCR + PyMuPDF") as demo:
113
- gr.Markdown("""
114
- # PDF OCR (PaddleOCR + PyMuPDF)
115
-
116
- Upload a PDF to extract text using OCR. The app renders pages with PyMuPDF at a high DPI and uses PaddleOCR for recognition.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  """)
118
-
119
- pdf_input = gr.File(label="PDF", file_types=[".pdf"], file_count="single")
120
- text_output = gr.Textbox(label="Extracted Text", lines=20)
121
-
122
- # Auto-run OCR when a PDF is uploaded
123
- pdf_input.change(fn=gradio_predict, inputs=[pdf_input], outputs=[text_output], api_name="predict")
124
-
125
- # Simple API note
126
- gr.Markdown("""
127
- ## API usage
128
- - Use `gradio_client` to call this Space. Function signature: `gradio_predict(pdf_file)` → `text`.
129
  """)
130
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  if __name__ == "__main__":
133
- # On Spaces, the host/port are managed by the platform. Locally, this runs on 7860 by default.
134
- demo.launch()
 
 
 
 
 
 
135
 
 
1
  import os
2
  import io
3
  import json
4
+ import time
5
+ from typing import List, Tuple, Dict, Any, Optional
6
 
7
  import fitz # PyMuPDF
8
  from PIL import Image
9
  import gradio as gr
10
+ import numpy as np
11
 
12
+ # =========================
13
+ # Config
14
+ # =========================
15
+ LOGO_IMAGE_PATH = './assets/logo.jpg'
16
+ GOOGLE_FONTS_URL = "<link href='https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap' rel='stylesheet'>"
17
 
18
  # Lazy-load the OCR model to reduce startup time and memory
19
  _ocr_model = None
20
 
 
21
  def get_ocr_model(lang: str = "en"):
22
  global _ocr_model
23
  if _ocr_model is not None:
 
30
  _ocr_model = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)
31
  return _ocr_model
32
 
33
+ def pdf_page_to_image(pdf_doc: fitz.Document, page_index: int, dpi: int = 300) -> Image.Image:
 
34
  page = pdf_doc.load_page(page_index)
35
  zoom = dpi / 72.0 # 72 dpi is PDF default
36
  mat = fitz.Matrix(zoom, zoom)
 
38
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
39
  return img
40
 
 
41
  def run_paddle_ocr_on_image(image: Image.Image, lang: str = "en") -> Tuple[str, List[Dict[str, Any]]]:
42
  ocr = get_ocr_model(lang=lang)
43
  # Convert PIL image to numpy array for PaddleOCR
 
 
44
  img_np = np.array(image)
45
  result = ocr.ocr(img_np, cls=True)
46
 
 
60
 
61
  return "\n".join(lines), items
62
 
63
+ def extract_text_from_pdf(file_obj, dpi: int = 300, max_pages: int | None = None, lang: str = "en") -> Tuple[str, str, Dict[str, Any]]:
 
64
  """
65
+ Returns combined text, JSON string with per-page OCR results, and processing stats.
66
  """
67
  if file_obj is None:
68
+ return "", json.dumps({"pages": []}, ensure_ascii=False), {"error": "No file provided"}
 
 
 
 
 
 
 
 
 
 
 
69
 
70
+ start_time = time.time()
71
+
72
  try:
73
+ # Gradio may pass a path or a tempfile.NamedTemporaryFile-like with .name
74
+ pdf_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)
75
+ if pdf_path is None or not os.path.exists(pdf_path):
76
+ # If bytes were passed, fall back to reading from buffer
77
+ file_bytes = file_obj.read() if hasattr(file_obj, "read") else None
78
+ if not file_bytes:
79
+ return "", json.dumps({"pages": []}, ensure_ascii=False), {"error": "Could not read file"}
80
+ pdf_doc = fitz.open(stream=file_bytes, filetype="pdf")
81
+ else:
82
+ pdf_doc = fitz.open(pdf_path)
83
+
84
  num_pages = pdf_doc.page_count
85
  if max_pages is not None:
86
  num_pages = min(num_pages, max_pages)
 
100
 
101
  combined_text = "\n\n".join([t for t in all_text_lines if t])
102
  json_payload = json.dumps({"pages": pages_payload}, ensure_ascii=False)
103
+
104
+ processing_time = time.time() - start_time
105
+ stats = {
106
+ "pages_processed": num_pages,
107
+ "total_pages": pdf_doc.page_count,
108
+ "processing_time": round(processing_time, 2),
109
+ "dpi": dpi,
110
+ "language": lang
111
+ }
112
+
113
  pdf_doc.close()
114
+ return combined_text, json_payload, stats
115
+
116
+ except Exception as e:
117
+ return "", json.dumps({"pages": []}, ensure_ascii=False), {"error": str(e)}
118
+
119
+ def handle_pdf_ocr(pdf_file: str) -> Tuple[str, str, str]:
120
+ """Main handler for PDF OCR processing"""
121
+ if not pdf_file:
122
+ raise gr.Error("Please upload a PDF file first.")
123
+
124
+ try:
125
+ print(f"Processing PDF: {pdf_file}")
126
+ start_time = time.time()
127
+
128
+ text, json_data, stats = extract_text_from_pdf(pdf_file, dpi=300, max_pages=None, lang="en")
129
+
130
+ end_time = time.time()
131
+ duration = end_time - start_time
132
+ print(f"PDF processing completed in {duration:.2f} seconds.")
133
+
134
+ if "error" in stats:
135
+ raise gr.Error(f"Processing failed: {stats['error']}")
136
+
137
+ # Format stats for display
138
+ stats_text = f"""**Processing Statistics:**
139
+ - Pages processed: {stats.get('pages_processed', 0)}/{stats.get('total_pages', 0)}
140
+ - Processing time: {stats.get('processing_time', 0)}s
141
+ - DPI: {stats.get('dpi', 300)}
142
+ - Language: {stats.get('language', 'en')}"""
143
+
144
+ return text, json_data, stats_text
145
+
146
+ except Exception as e:
147
+ error_msg = f"Error processing PDF: {str(e)}"
148
+ print(error_msg)
149
+ raise gr.Error(error_msg)
150
+
151
+ # =========================
152
+ # CSS & UI
153
+ # =========================
154
+ custom_css = """
155
+ /* Global fonts */
156
+ body, .gradio-container {
157
+ font-family: "Inter", "Segoe UI", "Roboto", sans-serif;
158
+ }
159
+
160
+ .app-header {
161
+ text-align: center;
162
+ max-width: 900px;
163
+ margin: 0 auto 20px !important;
164
+ padding: 20px;
165
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
166
+ border-radius: 12px;
167
+ color: white;
168
+ }
169
+
170
+ .app-header h1 {
171
+ margin: 0;
172
+ font-size: 2.5rem;
173
+ font-weight: 700;
174
+ }
175
+
176
+ .app-header p {
177
+ margin: 10px 0 0 0;
178
+ opacity: 0.9;
179
+ font-size: 1.1rem;
180
+ }
181
+
182
+ .gradio-container {
183
+ padding: 20px 0 !important;
184
+ max-width: 1200px;
185
+ margin: 0 auto;
186
+ }
187
+
188
+ .upload-section {
189
+ background: #f8fafc;
190
+ border: 2px dashed #cbd5e1;
191
+ border-radius: 12px;
192
+ padding: 30px;
193
+ text-align: center;
194
+ margin: 20px 0;
195
+ }
196
+
197
+ .upload-section:hover {
198
+ border-color: #667eea;
199
+ background: #f1f5f9;
200
+ }
201
+
202
+ .results-section {
203
+ margin-top: 20px;
204
+ }
205
+
206
+ .stats-box {
207
+ background: #f0f9ff;
208
+ border: 1px solid #0ea5e9;
209
+ border-radius: 8px;
210
+ padding: 15px;
211
+ margin: 10px 0;
212
+ }
213
+
214
+ #text_output {
215
+ min-height: 300px;
216
+ font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
217
+ font-size: 14px;
218
+ line-height: 1.6;
219
+ }
220
+
221
+ #json_output {
222
+ min-height: 200px;
223
+ font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
224
+ font-size: 12px;
225
+ }
226
+
227
+ .process-btn {
228
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
229
+ color: white !important;
230
+ border: none !important;
231
+ padding: 12px 30px !important;
232
+ border-radius: 8px !important;
233
+ font-weight: 600 !important;
234
+ font-size: 16px !important;
235
+ }
236
+
237
+ .process-btn:hover {
238
+ transform: translateY(-2px);
239
+ box-shadow: 0 8px 25px rgba(102, 126, 234, 0.3);
240
+ }
241
+
242
+ .notice {
243
+ background: #fef3c7;
244
+ border: 1px solid #f59e0b;
245
+ border-radius: 8px;
246
+ padding: 15px;
247
+ margin: 20px 0;
248
+ color: #92400e;
249
+ }
250
+
251
+ .api-section {
252
+ background: #f1f5f9;
253
+ border-radius: 8px;
254
+ padding: 20px;
255
+ margin: 20px 0;
256
+ border-left: 4px solid #667eea;
257
+ }
258
+ """
259
+
260
+ with gr.Blocks(head=GOOGLE_FONTS_URL, css=custom_css, theme=gr.themes.Soft()) as demo:
261
+ # Header
262
+ gr.HTML("""
263
+ <div class="app-header">
264
+ <h1>📄 PDF OCR Extractor</h1>
265
+ <p>Extract text from PDF documents using PaddleOCR + PyMuPDF</p>
266
+ </div>
267
  """)
268
+
269
+ # Notice
270
+ gr.HTML("""
271
+ <div class="notice">
272
+ <strong>💡 Tip:</strong> This tool processes PDFs by rendering each page as a high-resolution image (300 DPI) and then applying OCR.
273
+ For best results, use clear, well-scanned PDFs with good contrast.
274
+ </div>
 
 
 
 
275
  """)
276
+
277
+ with gr.Row():
278
+ with gr.Column(scale=1):
279
+ # Upload section
280
+ gr.HTML('<div class="upload-section">')
281
+ pdf_input = gr.File(
282
+ label="📁 Upload PDF File",
283
+ file_types=[".pdf"],
284
+ file_count="single",
285
+ elem_id="pdf_upload"
286
+ )
287
+ gr.HTML('</div>')
288
+
289
+ # Process button
290
+ process_btn = gr.Button(
291
+ "🚀 Extract Text",
292
+ variant="primary",
293
+ elem_classes=["process-btn"],
294
+ scale=2
295
+ )
296
+
297
+ # API section
298
+ gr.HTML("""
299
+ <div class="api-section">
300
+ <h3>🔗 API Usage</h3>
301
+ <p><strong>Endpoint:</strong> <code>/predict</code></p>
302
+ <p><strong>Input:</strong> PDF file</p>
303
+ <p><strong>Output:</strong> Extracted text</p>
304
+ </div>
305
+ """)
306
+
307
+ with gr.Column(scale=2):
308
+ # Results section
309
+ gr.HTML('<div class="results-section">')
310
+
311
+ with gr.Tabs():
312
+ with gr.Tab("📝 Extracted Text"):
313
+ text_output = gr.Textbox(
314
+ label="Extracted Text",
315
+ lines=20,
316
+ elem_id="text_output",
317
+ placeholder="Extracted text will appear here..."
318
+ )
319
+
320
+ with gr.Tab("📊 JSON Data"):
321
+ json_output = gr.Code(
322
+ label="Detailed OCR Results (JSON)",
323
+ language="json",
324
+ elem_id="json_output",
325
+ placeholder="Detailed OCR results will appear here..."
326
+ )
327
+
328
+ with gr.Tab("📈 Statistics"):
329
+ stats_output = gr.Markdown(
330
+ label="Processing Statistics",
331
+ placeholder="Processing statistics will appear here..."
332
+ )
333
+
334
+ gr.HTML('</div>')
335
+
336
+ # Event handlers
337
+ process_btn.click(
338
+ fn=handle_pdf_ocr,
339
+ inputs=[pdf_input],
340
+ outputs=[text_output, json_output, stats_output],
341
+ api_name="predict"
342
+ )
343
+
344
+ # Auto-process on file upload
345
+ pdf_input.change(
346
+ fn=handle_pdf_ocr,
347
+ inputs=[pdf_input],
348
+ outputs=[text_output, json_output, stats_output],
349
+ api_name="predict"
350
+ )
351
 
352
  if __name__ == "__main__":
353
+ port = int(os.getenv("PORT", "7860"))
354
+ demo.queue(max_size=6).launch(
355
+ server_name="0.0.0.0",
356
+ server_port=port,
357
+ share=False
358
+ )
359
+
360
+
361