import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor, AutoTokenizer

# Load the PaddleOCR-VL model, tokenizer, and processor once at startup.
# trust_remote_code is required because the model ships custom code on the Hub.
model_name = "PaddlePaddle/PaddleOCR-VL"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
if torch.cuda.is_available():
    model = model.cuda()
model.eval()


@spaces.GPU
def ocr_inference(image):
    """Perform OCR on the input image using PaddleOCR-VL."""
    if image is None:
        return "Please upload an image."
    try:
        # Convert to a PIL Image if Gradio hands us a numpy array.
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Prepare the multimodal inputs.
        prompt = "Extract all text from this image."
        inputs = processor(images=image, text=prompt, return_tensors="pt")
        if torch.cuda.is_available():
            # Only move tensors; processor outputs can include non-tensor entries.
            inputs = {
                k: v.cuda() if isinstance(v, torch.Tensor) else v
                for k, v in inputs.items()
            }

        # Run OCR inference.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=512)

        # Decode only the newly generated tokens: causal-LM generate() normally
        # returns the prompt followed by the completion, so slice the prompt off.
        prompt_length = inputs["input_ids"].shape[1]
        result = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return result
    except Exception as e:
        return f"Error during OCR: {e}"


# Create the Gradio interface.
demo = gr.Interface(
    fn=ocr_inference,
    inputs=gr.Image(type="pil", label="Upload Image for OCR"),
    outputs=gr.Textbox(label="Extracted Text"),
    title="PaddleOCR-VL OCR Demo",
    description="Upload an image to extract text using the PaddlePaddle/PaddleOCR-VL model.",
)

if __name__ == "__main__":
    demo.launch()
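
# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the app is running on Gradio's default local
# address http://127.0.0.1:7860, gradio_client is installed, and "sample.png"
# is a placeholder path you supply). gr.Interface exposes the handler at the
# default endpoint "/predict". Run this from a separate process:
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860")
#   text = client.predict(handle_file("sample.png"), api_name="/predict")
#   print(text)
# ---------------------------------------------------------------------------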