akhaliq HF Staff committed on
Commit
ebec941
·
verified ·
1 Parent(s): 1a07c5d

Update Gradio app with multiple files

Browse files
Files changed (1) hide show
  1. app.py +54 -33
app.py CHANGED
@@ -50,12 +50,20 @@ def ocr_process(
50
  # Save image with proper format
51
  temp_image_path = os.path.join(temp_dir, "input_image.jpg")
52
  # Convert RGBA to RGB if necessary
53
- if image_input.mode == 'RGBA':
54
  rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
55
- rgb_image.paste(image_input, mask=image_input.split()[3])
56
- rgb_image.save(temp_image_path, 'JPEG')
 
 
 
 
57
  else:
58
- image_input.save(temp_image_path, 'JPEG')
 
 
 
 
59
 
60
  # Set parameters based on preset
61
  presets = {
@@ -74,12 +82,12 @@ def ocr_process(
74
  else:
75
  prompt = "<image>\nFree OCR. "
76
 
77
- # Run inference
78
  result = model.infer(
79
  tokenizer,
80
  prompt=prompt,
81
  image_file=temp_image_path,
82
- output_path=temp_dir, # Use temp directory for output
83
  base_size=config["base_size"],
84
  image_size=config["image_size"],
85
  crop_mode=config["crop_mode"],
@@ -91,17 +99,31 @@ def ocr_process(
91
  model.to("cpu")
92
  torch.cuda.empty_cache()
93
 
94
- # Return the result
95
- if result:
96
- return result
 
 
 
 
 
 
 
 
 
97
  else:
98
- return "No text detected in the image. Please try a different preset or ensure the image contains readable text."
 
 
 
 
 
99
 
100
  except Exception as e:
101
  # Ensure model is moved back to CPU on error
102
  model.to("cpu")
103
  torch.cuda.empty_cache()
104
- return f"Error processing image: {str(e)}"
105
 
106
 
107
  # Create Gradio interface
@@ -131,32 +153,32 @@ with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
131
  choices=["ocr", "markdown"],
132
  value="ocr",
133
  label="Task Type",
134
- info="OCR: Extract text | Markdown: Convert document to markdown format",
135
  )
136
 
137
  preset = gr.Radio(
138
- choices=["gundam", "tiny", "small", "base", "large"],
139
  value="gundam",
140
  label="Model Preset",
141
- info="Gundam: Optimized for mixed content | Tiny/Small: Fast | Base/Large: High quality",
142
  )
143
 
144
- with gr.Accordion("Preset Details", open=False):
145
  gr.Markdown("""
146
- - **Gundam**: base_size=1024, image_size=640, crop_mode=True (Recommended)
147
- - **Tiny**: base_size=512, image_size=512, crop_mode=False (Fastest)
148
- - **Small**: base_size=640, image_size=640, crop_mode=False
149
- - **Base**: base_size=1024, image_size=1024, crop_mode=False
150
- - **Large**: base_size=1280, image_size=1280, crop_mode=False (Best quality)
151
  """)
152
 
153
  submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
154
  clear_btn = gr.ClearButton([image_input], value="🗑️ Clear")
155
 
156
  with gr.Column(scale=1):
157
- gr.Markdown("### 📝 Output")
158
  output_text = gr.Textbox(
159
- label="Extracted Text",
160
  lines=15,
161
  max_lines=30,
162
  interactive=False,
@@ -171,24 +193,23 @@ with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
171
  outputs=output_text,
172
  )
173
 
174
- # Examples section
175
- gr.Markdown("### 📚 Examples")
176
  gr.Examples(
177
  examples=[
178
- ["example1.jpg", "ocr", "gundam"],
179
- ["example2.jpg", "markdown", "gundam"],
180
  ],
181
  inputs=[image_input, task_type, preset],
182
- label="Try these examples (upload your own images for testing)",
183
  )
184
 
185
  gr.Markdown("""
186
- ### 💡 Tips
187
- - For general OCR, use the "gundam" preset (optimized balance)
188
- - For high-quality scanned documents, try "base" or "large" presets
189
- - For handwritten text, "large" preset may work better
190
- - Use "markdown" mode for structured documents with formatting
191
- - If processing fails, try a different preset
192
  """)
193
 
194
 
 
50
  # Save image with proper format
51
  temp_image_path = os.path.join(temp_dir, "input_image.jpg")
52
  # Convert RGBA to RGB if necessary
53
+ if image_input.mode in ('RGBA', 'LA', 'P'):
54
  rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
55
+ # Handle different image modes
56
+ if image_input.mode == 'RGBA':
57
+ rgb_image.paste(image_input, mask=image_input.split()[3])
58
+ else:
59
+ rgb_image.paste(image_input)
60
+ rgb_image.save(temp_image_path, 'JPEG', quality=95)
61
  else:
62
+ image_input.save(temp_image_path, 'JPEG', quality=95)
63
+
64
+ # Verify image was saved
65
+ if not os.path.exists(temp_image_path):
66
+ return "Error: Failed to save image for processing."
67
 
68
  # Set parameters based on preset
69
  presets = {
 
82
  else:
83
  prompt = "<image>\nFree OCR. "
84
 
85
+ # Run inference - the model returns the text directly
86
  result = model.infer(
87
  tokenizer,
88
  prompt=prompt,
89
  image_file=temp_image_path,
90
+ output_path=temp_dir,
91
  base_size=config["base_size"],
92
  image_size=config["image_size"],
93
  crop_mode=config["crop_mode"],
 
99
  model.to("cpu")
100
  torch.cuda.empty_cache()
101
 
102
+ # Process the result
103
+ if result is None:
104
+ return "No text could be extracted. The image might be too blurry or contain no readable text."
105
+
106
+ # Handle different result types
107
+ if isinstance(result, str):
108
+ output_text = result.strip()
109
+ elif isinstance(result, (list, tuple)) and len(result) > 0:
110
+ output_text = str(result[0]).strip()
111
+ elif isinstance(result, dict):
112
+ # Try to get text from common keys
113
+ output_text = result.get('text', result.get('output', result.get('result', str(result))))
114
  else:
115
+ output_text = str(result).strip()
116
+
117
+ if not output_text or output_text == "None":
118
+ return "No text detected. Try adjusting the preset or uploading a clearer image."
119
+
120
+ return output_text
121
 
122
  except Exception as e:
123
  # Ensure model is moved back to CPU on error
124
  model.to("cpu")
125
  torch.cuda.empty_cache()
126
+ return f"Error processing image: {str(e)}\n\nPlease try a different preset or check if the image is valid."
127
 
128
 
129
  # Create Gradio interface
 
153
  choices=["ocr", "markdown"],
154
  value="ocr",
155
  label="Task Type",
156
+ info="OCR: Extract plain text | Markdown: Convert to formatted markdown",
157
  )
158
 
159
  preset = gr.Radio(
160
+ choices=["gundam", "base", "large", "small", "tiny"],
161
  value="gundam",
162
  label="Model Preset",
163
+ info="Start with 'gundam' - it's optimized for most documents",
164
  )
165
 
166
+ with gr.Accordion("ℹ️ Preset Details", open=False):
167
  gr.Markdown("""
168
+ - **Gundam** (Recommended): Balanced performance with crop mode
169
+ - **Base**: Standard quality without cropping
170
+ - **Large**: Highest quality for complex documents
171
+ - **Small**: Faster processing, good for simple text
172
+ - **Tiny**: Fastest, suitable for clear printed text
173
  """)
174
 
175
  submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
176
  clear_btn = gr.ClearButton([image_input], value="🗑️ Clear")
177
 
178
  with gr.Column(scale=1):
179
+ gr.Markdown("### 📝 Extracted Text")
180
  output_text = gr.Textbox(
181
+ label="Output",
182
  lines=15,
183
  max_lines=30,
184
  interactive=False,
 
193
  outputs=output_text,
194
  )
195
 
196
+ # Example section with receipt image
197
+ gr.Markdown("### 📚 Example")
198
  gr.Examples(
199
  examples=[
200
+ ["https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/ReceiptSwiss.jpg/800px-ReceiptSwiss.jpg", "ocr", "gundam"],
 
201
  ],
202
  inputs=[image_input, task_type, preset],
203
+ label="Try this receipt example",
204
  )
205
 
206
  gr.Markdown("""
207
+ ### 💡 Tips for Best Results
208
+ - **For receipts**: Use "ocr" mode with "gundam" or "base" preset
209
+ - **For documents with tables**: Use "markdown" mode with "large" preset
210
+ - **If text is not detected**: Try different presets in this order: gundam β†’ base β†’ large
211
+ - **For handwritten text**: Use "large" preset for better accuracy
212
+ - Ensure images are clear and well-lit for optimal results
213
  """)
214
 
215