import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os
import spaces
import tempfile
import sys
from io import StringIO
from contextlib import contextmanager

# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
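
# NOTE: flash_attention_2 assumes the flash-attn package is available in the
# environment; without it, dropping the _attn_implementation argument falls
# back to transformers' default attention implementation.
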
@contextmanager
def capture_stdout():
    """Capture stdout to get printed output from the model."""
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        yield sys.stdout
    finally:
        sys.stdout = old_stdout
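
# Illustrative usage: any print() inside the block lands in the buffer rather
# than the console.
#
#     with capture_stdout() as buf:
#         print("hello")
#     buf.getvalue()  # -> "hello\n"
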

# Request a ZeroGPU device for the duration of each call
@spaces.GPU
def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    preset: str = "gundam",
) -> str:
    """
    Process an image and extract text using the DeepSeek-OCR model.

    Args:
        image_input: Input image.
        task_type: "ocr" for plain-text extraction or "markdown" for document conversion.
        preset: Preset configuration for model parameters.

    Returns:
        Extracted text or markdown content.
    """
    if image_input is None:
        return "Please upload an image first."

    # Move model to GPU and set dtype
    model.cuda().to(torch.bfloat16)

    # Create temp directory for this session
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save image with proper format
        temp_image_path = os.path.join(temp_dir, "input_image.jpg")

        # Convert RGBA to RGB if necessary
        if image_input.mode in ('RGBA', 'LA', 'P'):
            rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
            # Handle different image modes
            if image_input.mode == 'RGBA':
                rgb_image.paste(image_input, mask=image_input.split()[3])
            else:
                rgb_image.paste(image_input)
            rgb_image.save(temp_image_path, 'JPEG', quality=95)
        else:
            image_input.save(temp_image_path, 'JPEG', quality=95)
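
        # JPEG cannot store an alpha channel, so transparent inputs are
        # flattened onto a white background before saving.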

        # Set parameters based on preset
        presets = {
            "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
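        # Per the DeepSeek-OCR model card, crop_mode=True ("gundam") pairs a
        # 1024px global view with 640px local crops; the fixed presets feed
        # the model a single resized view.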
        config = presets[preset]

        # Set prompt based on task type
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "
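
        # Both prompt strings follow the DeepSeek-OCR model card:
        # "<|grounding|>" requests layout-grounded document conversion, while
        # "Free OCR." requests plain-text extraction.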

        # Capture stdout while running inference
        captured_output = ""
        with capture_stdout() as output:
            result = model.infer(
                tokenizer,
                prompt=prompt,
                image_file=temp_image_path,
                output_path=temp_dir,
                base_size=config["base_size"],
                image_size=config["image_size"],
                crop_mode=config["crop_mode"],
                save_results=True,
                test_compress=True,
            )
        captured_output = output.getvalue()
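
        # model.infer comes from the model repository's custom code
        # (trust_remote_code) and prints its transcription to stdout while
        # decoding, hence the capture above instead of relying solely on the
        # return value.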

        # Extract the text from captured output
        extracted_text = ""

        # Look for the actual OCR result in the captured output.
        # The model prints the extracted text between certain markers.
        lines = captured_output.split('\n')
        capture_text = False
        text_lines = []
        for line in lines:
            # Start capturing after seeing certain patterns
            if "# " in line or line.strip().startswith("**"):
                capture_text = True
            if capture_text:
                # Stop at long separator lines
                if (line.startswith("====") or line.startswith("---")) and len(line) > 10:
                    if text_lines:  # Only stop if we've captured something
                        break
                # Add non-empty lines that aren't debug output
                elif line.strip() and not line.startswith(
                    ("image size:", "valid image", "output texts", "compression")
                ):
                    text_lines.append(line)

        if text_lines:
            extracted_text = '\n'.join(text_lines)

        # If we didn't get text from stdout, check if the result contains text
        if not extracted_text and result is not None:
            if isinstance(result, str):
                extracted_text = result
            elif isinstance(result, (list, tuple)) and len(result) > 0:
                # Try to extract text from the result
                if isinstance(result[0], str):
                    extracted_text = result[0]
                elif hasattr(result[0], 'text'):
                    extracted_text = result[0].text

        # Clean up any remaining markers from the text
        if extracted_text:
            # Remove any remaining debug output patterns
            clean_lines = []
            for line in extracted_text.split('\n'):
                if not any(pattern in line.lower() for pattern in
                           ['image size:', 'valid image', 'compression ratio',
                            'save results:', 'output texts']):
                    clean_lines.append(line)
            extracted_text = '\n'.join(clean_lines).strip()

    # Move model back to CPU to free GPU memory
    model.to("cpu")
    torch.cuda.empty_cache()

    # Return the extracted text
    return extracted_text if extracted_text else (
        "No text could be extracted from the image. Please try a different "
        "preset or check if the image contains readable text."
    )

# Create Gradio interface
with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>🔍 DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
                height=300,
            )

            gr.Markdown("### ⚙️ Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: Extract plain text | Markdown: Convert to formatted markdown",
            )
            preset = gr.Radio(
                choices=["gundam", "base", "large", "small", "tiny"],
                value="gundam",
                label="Model Preset",
                info="Start with 'gundam' - it's optimized for most documents",
            )

            with gr.Accordion("ℹ️ Preset Details", open=False):
                gr.Markdown("""
                - **Gundam** (Recommended): Balanced performance with crop mode
                - **Base**: Standard quality without cropping
                - **Large**: Highest quality for complex documents
                - **Small**: Faster processing, good for simple text
                - **Tiny**: Fastest, suitable for clear printed text
                """)

            submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
            clear_btn = gr.ClearButton([image_input], value="🗑️ Clear")

        with gr.Column(scale=1):
            gr.Markdown("### 📝 Extracted Text")
            output_text = gr.Textbox(
                label="Output",
                lines=15,
                max_lines=30,
                interactive=False,
                placeholder="Extracted text will appear here...",
                show_copy_button=True,
            )

    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, preset],
        outputs=output_text,
    )

    # Example section with receipt image
    gr.Markdown("### 📋 Example")
    gr.Examples(
        examples=[
            ["https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/ReceiptSwiss.jpg/800px-ReceiptSwiss.jpg", "ocr", "gundam"],
        ],
        inputs=[image_input, task_type, preset],
        label="Try this receipt example",
    )
| gr.Markdown(""" | |
| ### π‘ Tips for Best Results | |
| - **For receipts**: Use "ocr" mode with "gundam" or "base" preset | |
| - **For documents with tables**: Use "markdown" mode with "large" preset | |
| - **If text is not detected**: Try different presets in this order: gundam β base β large | |
| - **For handwritten text**: Use "large" preset for better accuracy | |
| - Ensure images are clear and well-lit for optimal results | |
| """) | |

if __name__ == "__main__":
    demo.launch(share=False)
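
# Note: on Hugging Face Spaces the host and port are configured automatically;
# share=False only affects local runs (no public gradio.live link is created).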