"""DeepSeek-OCR Gradio demo (a Hugging Face Space running on ZeroGPU)."""
import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import os
import spaces
import tempfile
import json
from pathlib import Path

# Set CUDA device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load model and tokenizer once at startup
model_name = "deepseek-ai/DeepSeek-OCR"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name,
    _attn_implementation="flash_attention_2",
    trust_remote_code=True,
    use_safetensors=True,
)
model = model.eval()
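
# Note: this Space targets ZeroGPU, where a GPU is attached only while a
# request is being served. The @spaces.GPU decorator below is the standard
# ZeroGPU hook (and the reason for the `spaces` import): inside the decorated
# function the weights are moved to CUDA, inference runs, and the GPU is
# released again afterwards.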

@spaces.GPU  # allocate a GPU for the duration of each call (ZeroGPU)
def ocr_process(
    image_input: Image.Image,
    task_type: str = "ocr",
    preset: str = "gundam",
) -> str:
    """
    Process an image and extract text using the DeepSeek-OCR model.

    Args:
        image_input: Input image.
        task_type: "ocr" for plain text extraction, "markdown" for document conversion.
        preset: Preset configuration for the model's resolution parameters.

    Returns:
        Extracted text or markdown content.
    """
    if image_input is None:
        return "Please upload an image first."

    # Move the model to the GPU in bfloat16
    model.cuda().to(torch.bfloat16)

    # Create a temp directory for this request
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save the image as JPEG, converting transparent modes to RGB on white first
        temp_image_path = os.path.join(temp_dir, "input_image.jpg")
        if image_input.mode in ("RGBA", "LA", "P"):
            rgb_image = Image.new("RGB", image_input.size, (255, 255, 255))
            if image_input.mode == "RGBA":
                # Use the alpha channel as a mask so transparent areas stay white
                rgb_image.paste(image_input, mask=image_input.split()[3])
            else:
                rgb_image.paste(image_input)
            rgb_image.save(temp_image_path, "JPEG", quality=95)
        else:
            image_input.save(temp_image_path, "JPEG", quality=95)

        # Resolution parameters for each preset
        presets = {
            "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
            "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
            "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
            "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
        }
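        # As I read the DeepSeek-OCR inference API: base_size is the resolution
        # of the global view, image_size the resolution of local crops, and
        # crop_mode=True enables dynamic tiling ("Gundam" mode), which splits
        # large pages into image_size tiles plus one base_size overview. Higher
        # resolutions spend more vision tokens but resolve finer text.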
        config = presets[preset]

        # Set the prompt based on task type
        if task_type == "markdown":
            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
        else:
            prompt = "<image>\nFree OCR. "

        # Run inference; save_results=True writes the outputs into temp_dir
        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=temp_image_path,
            output_path=temp_dir,
            base_size=config["base_size"],
            image_size=config["image_size"],
            crop_mode=config["crop_mode"],
            save_results=True,
            test_compress=True,
        )
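        # test_compress=True is carried over from the reference usage; as far
        # as I can tell it only reports the vision-token compression ratio and
        # does not change the extracted text.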

        # Try to read the saved results, falling back through several formats
        extracted_text = ""

        # First, check for saved JSON results
        json_path = Path(temp_dir) / "input_image_outputs.json"
        if json_path.exists():
            try:
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                # Extract text from the JSON structure
                if isinstance(data, dict):
                    if "text" in data:
                        extracted_text = data["text"]
                    elif "output" in data:
                        extracted_text = data["output"]
                    elif "result" in data:
                        extracted_text = data["result"]
                    else:
                        # Unknown structure: take the first reasonably long string value
                        for value in data.values():
                            if isinstance(value, str) and len(value) > 10:
                                extracted_text = value
                                break
                elif isinstance(data, list) and len(data) > 0:
                    extracted_text = str(data[0])
                else:
                    extracted_text = str(data)
            except Exception as e:
                print(f"Error reading JSON: {e}")

        # If there was no JSON, check for a text file
        if not extracted_text:
            txt_path = Path(temp_dir) / "input_image_outputs.txt"
            if txt_path.exists():
                try:
                    with open(txt_path, "r", encoding="utf-8") as f:
                        extracted_text = f.read()
                except Exception as e:
                    print(f"Error reading text file: {e}")

        # If there is still no text, scan for any other output files
        if not extracted_text:
            output_files = list(Path(temp_dir).glob("*output*"))
            for file_path in output_files:
                if file_path.suffix in [".txt", ".json", ".md"]:
                    try:
                        with open(file_path, "r", encoding="utf-8") as f:
                            content = f.read()
                        if content.strip():
                            extracted_text = content
                            break
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")

        # As a last resort, use the return value of infer() directly
        if not extracted_text and result is not None:
            if isinstance(result, str):
                extracted_text = result
            elif isinstance(result, (list, tuple)) and len(result) > 0:
                extracted_text = str(result[0])
            else:
                extracted_text = str(result)

    # Move the model back to the CPU to free GPU memory
    model.to("cpu")
    torch.cuda.empty_cache()

    return extracted_text if extracted_text else (
        "No text could be extracted from the image. Please try a different "
        "preset or check if the image contains readable text."
    )
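
# Local smoke test (a sketch, not part of the Space UI): assumes a CUDA GPU
# and a hypothetical sample file "sample.jpg" in the working directory:
#
#   from PIL import Image
#   print(ocr_process(Image.open("sample.jpg"), task_type="markdown", preset="base"))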

# Create the Gradio interface
with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>🔍 DeepSeek OCR</h1>
            <p>Extract text and convert documents to markdown using DeepSeek-OCR</p>
            <p>Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">anycoder</a></p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload Image")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
                height=300,
            )

            gr.Markdown("### ⚙️ Settings")
            task_type = gr.Radio(
                choices=["ocr", "markdown"],
                value="ocr",
                label="Task Type",
                info="OCR: extract plain text | Markdown: convert to formatted markdown",
            )
            preset = gr.Radio(
                choices=["gundam", "base", "large", "small", "tiny"],
                value="gundam",
                label="Model Preset",
                info="Start with 'gundam' - it is optimized for most documents",
            )

            with gr.Accordion("ℹ️ Preset Details", open=False):
                gr.Markdown("""
                - **Gundam** (recommended): balanced performance with crop mode
                - **Base**: standard quality without cropping
                - **Large**: highest quality for complex documents
                - **Small**: faster processing, good for simple text
                - **Tiny**: fastest, suitable for clear printed text
                """)

            submit_btn = gr.Button("🔍 Extract Text", variant="primary", size="lg")
            clear_btn = gr.ClearButton([image_input], value="🗑️ Clear")

        with gr.Column(scale=1):
            gr.Markdown("### 📝 Extracted Text")
            output_text = gr.Textbox(
                label="Output",
                lines=15,
                max_lines=30,
                interactive=False,
                placeholder="Extracted text will appear here...",
                show_copy_button=True,
            )

    # Event handlers
    submit_btn.click(
        fn=ocr_process,
        inputs=[image_input, task_type, preset],
        outputs=output_text,
    )

    # Example section with a receipt image
    gr.Markdown("### 📋 Example")
    gr.Examples(
        examples=[
            ["https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/ReceiptSwiss.jpg/800px-ReceiptSwiss.jpg", "ocr", "gundam"],
        ],
        inputs=[image_input, task_type, preset],
        label="Try this receipt example",
    )
| gr.Markdown(""" | |
| ### π‘ Tips for Best Results | |
| - **For receipts**: Use "ocr" mode with "gundam" or "base" preset | |
| - **For documents with tables**: Use "markdown" mode with "large" preset | |
| - **If text is not detected**: Try different presets in this order: gundam β base β large | |
| - **For handwritten text**: Use "large" preset for better accuracy | |
| - Ensure images are clear and well-lit for optimal results | |
| """) | |

if __name__ == "__main__":
    demo.launch(share=False)