Spaces:
Running
on
Zero
Running
on
Zero
Update Gradio app with multiple files
Browse files
app.py
CHANGED
|
@@ -41,89 +41,61 @@ def ocr_process(
|
|
| 41 |
if image_input is None:
|
| 42 |
return "Please upload an image first."
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
with
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
rgb_image.paste(image_input, mask=image_input.split()[3])
|
| 58 |
-
else:
|
| 59 |
-
rgb_image.paste(image_input)
|
| 60 |
-
rgb_image.save(temp_image_path, 'JPEG', quality=95)
|
| 61 |
-
else:
|
| 62 |
-
image_input.save(temp_image_path, 'JPEG', quality=95)
|
| 63 |
-
|
| 64 |
-
# Verify image was saved
|
| 65 |
-
if not os.path.exists(temp_image_path):
|
| 66 |
-
return "Error: Failed to save image for processing."
|
| 67 |
-
|
| 68 |
-
# Set parameters based on preset
|
| 69 |
-
presets = {
|
| 70 |
-
"tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
|
| 71 |
-
"small": {"base_size": 640, "image_size": 640, "crop_mode": False},
|
| 72 |
-
"base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
|
| 73 |
-
"large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
|
| 74 |
-
"gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
|
| 75 |
-
}
|
| 76 |
-
|
| 77 |
-
config = presets[preset]
|
| 78 |
-
|
| 79 |
-
# Set prompt based on task type
|
| 80 |
-
if task_type == "markdown":
|
| 81 |
-
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
|
| 82 |
else:
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
# Run inference - the model returns the text directly
|
| 86 |
-
result = model.infer(
|
| 87 |
-
tokenizer,
|
| 88 |
-
prompt=prompt,
|
| 89 |
-
image_file=temp_image_path,
|
| 90 |
-
output_path=temp_dir,
|
| 91 |
-
base_size=config["base_size"],
|
| 92 |
-
image_size=config["image_size"],
|
| 93 |
-
crop_mode=config["crop_mode"],
|
| 94 |
-
save_results=False,
|
| 95 |
-
test_compress=False,
|
| 96 |
-
)
|
| 97 |
-
|
| 98 |
-
# Move model back to CPU to free GPU memory
|
| 99 |
-
model.to("cpu")
|
| 100 |
-
torch.cuda.empty_cache()
|
| 101 |
-
|
| 102 |
-
# Process the result
|
| 103 |
-
if result is None:
|
| 104 |
-
return "No text could be extracted. The image might be too blurry or contain no readable text."
|
| 105 |
-
|
| 106 |
-
# Handle different result types
|
| 107 |
-
if isinstance(result, str):
|
| 108 |
-
output_text = result.strip()
|
| 109 |
-
elif isinstance(result, (list, tuple)) and len(result) > 0:
|
| 110 |
-
output_text = str(result[0]).strip()
|
| 111 |
-
elif isinstance(result, dict):
|
| 112 |
-
# Try to get text from common keys
|
| 113 |
-
output_text = result.get('text', result.get('output', result.get('result', str(result))))
|
| 114 |
else:
|
| 115 |
-
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
-
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
|
| 129 |
# Create Gradio interface
|
|
|
|
| 41 |
if image_input is None:
|
| 42 |
return "Please upload an image first."
|
| 43 |
|
| 44 |
+
# Move model to GPU and set dtype
|
| 45 |
+
model.cuda().to(torch.bfloat16)
|
| 46 |
+
|
| 47 |
+
# Create temp directory for this session
|
| 48 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 49 |
+
# Save image with proper format
|
| 50 |
+
temp_image_path = os.path.join(temp_dir, "input_image.jpg")
|
| 51 |
+
# Convert RGBA to RGB if necessary
|
| 52 |
+
if image_input.mode in ('RGBA', 'LA', 'P'):
|
| 53 |
+
rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
|
| 54 |
+
# Handle different image modes
|
| 55 |
+
if image_input.mode == 'RGBA':
|
| 56 |
+
rgb_image.paste(image_input, mask=image_input.split()[3])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
else:
|
| 58 |
+
rgb_image.paste(image_input)
|
| 59 |
+
rgb_image.save(temp_image_path, 'JPEG', quality=95)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
else:
|
| 61 |
+
image_input.save(temp_image_path, 'JPEG', quality=95)
|
| 62 |
|
| 63 |
+
# Set parameters based on preset
|
| 64 |
+
presets = {
|
| 65 |
+
"tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
|
| 66 |
+
"small": {"base_size": 640, "image_size": 640, "crop_mode": False},
|
| 67 |
+
"base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
|
| 68 |
+
"large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
|
| 69 |
+
"gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
|
| 70 |
+
}
|
| 71 |
|
| 72 |
+
config = presets[preset]
|
| 73 |
|
| 74 |
+
# Set prompt based on task type
|
| 75 |
+
if task_type == "markdown":
|
| 76 |
+
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
|
| 77 |
+
else:
|
| 78 |
+
prompt = "<image>\nFree OCR. "
|
| 79 |
+
|
| 80 |
+
# Run inference - return the result directly
|
| 81 |
+
result = model.infer(
|
| 82 |
+
tokenizer,
|
| 83 |
+
prompt=prompt,
|
| 84 |
+
image_file=temp_image_path,
|
| 85 |
+
output_path=temp_dir,
|
| 86 |
+
base_size=config["base_size"],
|
| 87 |
+
image_size=config["image_size"],
|
| 88 |
+
crop_mode=config["crop_mode"],
|
| 89 |
+
save_results=False,
|
| 90 |
+
test_compress=False,
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
# Move model back to CPU to free GPU memory
|
| 94 |
+
model.to("cpu")
|
| 95 |
+
torch.cuda.empty_cache()
|
| 96 |
+
|
| 97 |
+
# Return the result directly - the model returns the extracted text
|
| 98 |
+
return result
|
| 99 |
|
| 100 |
|
| 101 |
# Create Gradio interface
|