Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Commit 
							
							·
						
						ba8ed4c
	
1
								Parent(s):
							
							750426f
								
Refactor app.py to improve OCR functionality and enhance Gradio interface with separate tabs for image and PDF processing; update requirements.txt to include necessary packages.
Browse files- app.py +146 -69
 - requirements.txt +2 -0
 
    	
        app.py
    CHANGED
    
    | 
         @@ -1,18 +1,20 @@ 
     | 
|
| 1 | 
         
             
            import gradio as gr
         
     | 
| 2 | 
         
            -
            from PIL import Image
         
     | 
| 3 | 
         
            -
            from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
         
     | 
| 4 | 
         
            -
            import torch
         
     | 
| 5 | 
         
             
            import spaces
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 6 | 
         | 
| 7 | 
         
             
            model_path = "nanonets/Nanonets-OCR-s"
         
     | 
| 8 | 
         | 
| 9 | 
         
             
            # Load model once at startup
         
     | 
| 10 | 
         
             
            print("Loading Nanonets OCR model...")
         
     | 
| 11 | 
         
             
            model = AutoModelForImageTextToText.from_pretrained(
         
     | 
| 12 | 
         
            -
                model_path, 
     | 
| 13 | 
         
            -
                torch_dtype="auto", 
     | 
| 14 | 
         
            -
                device_map="auto", 
     | 
| 15 | 
         
            -
                attn_implementation="flash_attention_2"
         
     | 
| 16 | 
         
             
            )
         
     | 
| 17 | 
         
             
            model.eval()
         
     | 
| 18 | 
         | 
| 
         @@ -20,40 +22,79 @@ tokenizer = AutoTokenizer.from_pretrained(model_path) 
     | 
|
| 20 | 
         
             
            processor = AutoProcessor.from_pretrained(model_path)
         
     | 
| 21 | 
         
             
            print("Model loaded successfully!")
         
     | 
| 22 | 
         | 
| 
         | 
|
| 23 | 
         
             
            @spaces.GPU()
         
     | 
| 24 | 
         
             
            def ocr_image_gradio(image, max_tokens=4096):
         
     | 
| 25 | 
         
             
                """Process image through Nanonets OCR model for Gradio interface"""
         
     | 
| 26 | 
         
             
                if image is None:
         
     | 
| 27 | 
         
             
                    return "Please upload an image."
         
     | 
| 28 | 
         
            -
             
     | 
| 29 | 
         
            -
                 
     | 
| 30 | 
         
            -
             
     | 
| 31 | 
         
            -
             
     | 
| 32 | 
         
            -
             
     | 
| 33 | 
         
            -
                     
     | 
| 34 | 
         
            -
             
     | 
| 35 | 
         
            -
             
     | 
| 36 | 
         
            -
                     
     | 
| 37 | 
         
            -
             
     | 
| 38 | 
         
            -
                         
     | 
| 
         | 
|
| 39 | 
         
             
                            {"type": "image", "image": image},
         
     | 
| 40 | 
         
             
                            {"type": "text", "text": prompt},
         
     | 
| 41 | 
         
            -
                        ] 
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 42 | 
         
             
                    ]
         
     | 
| 43 | 
         
            -
             
     | 
| 44 | 
         
            -
             
     | 
| 45 | 
         
            -
                     
     | 
| 46 | 
         
            -
             
     | 
| 47 | 
         
            -
             
     | 
| 48 | 
         
            -
             
     | 
| 49 | 
         
            -
             
     | 
| 50 | 
         
            -
             
     | 
| 51 | 
         
            -
             
     | 
| 52 | 
         
            -
             
     | 
| 53 | 
         
            -
             
     | 
| 54 | 
         
            -
                    
         
     | 
| 55 | 
         
            -
             
     | 
| 56 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 57 | 
         | 
| 58 | 
         
             
            # Create Gradio interface
         
     | 
| 59 | 
         
             
            with gr.Blocks(title="Nanonets OCR Demo") as demo:
         
     | 
| 
         @@ -77,47 +118,83 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo: 
     | 
|
| 77 | 
         
             
                    </div>
         
     | 
| 78 | 
         
             
                </div>
         
     | 
| 79 | 
         
             
                """)
         
     | 
| 80 | 
         
            -
             
     | 
| 81 | 
         
            -
                with gr. 
     | 
| 82 | 
         
            -
                     
     | 
| 83 | 
         
            -
             
     | 
| 84 | 
         
            -
             
     | 
| 85 | 
         
            -
                             
     | 
| 86 | 
         
            -
             
     | 
| 87 | 
         
            -
             
     | 
| 88 | 
         
            -
             
     | 
| 89 | 
         
            -
             
     | 
| 90 | 
         
            -
             
     | 
| 91 | 
         
            -
             
     | 
| 92 | 
         
            -
             
     | 
| 93 | 
         
            -
             
     | 
| 94 | 
         
            -
             
     | 
| 95 | 
         
            -
             
     | 
| 96 | 
         
            -
             
     | 
| 97 | 
         
            -
             
     | 
| 98 | 
         
            -
             
     | 
| 99 | 
         
            -
             
     | 
| 100 | 
         
            -
             
     | 
| 101 | 
         
            -
                             
     | 
| 102 | 
         
            -
             
     | 
| 103 | 
         
            -
             
     | 
| 104 | 
         
            -
             
     | 
| 105 | 
         
            -
             
     | 
| 106 | 
         
            -
             
     | 
| 107 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 108 | 
         
             
                    fn=ocr_image_gradio,
         
     | 
| 109 | 
         
            -
                    inputs=[image_input,  
     | 
| 110 | 
         
            -
                    outputs= 
     | 
| 111 | 
         
            -
                    show_progress=True
         
     | 
| 112 | 
         
             
                )
         
     | 
| 113 | 
         
            -
             
     | 
| 114 | 
         
             
                image_input.change(
         
     | 
| 115 | 
         
             
                    fn=ocr_image_gradio,
         
     | 
| 116 | 
         
            -
                    inputs=[image_input,  
     | 
| 117 | 
         
            -
                    outputs= 
     | 
| 118 | 
         
            -
                    show_progress=True
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 119 | 
         
             
                )
         
     | 
| 120 | 
         
            -
             
     | 
| 121 | 
         
             
                # Add model information section
         
     | 
| 122 | 
         
             
                with gr.Accordion("About Nanonets-OCR-s", open=False):
         
     | 
| 123 | 
         
             
                    gr.Markdown("""
         
     | 
| 
         | 
|
| 1 | 
         
             
            import gradio as gr
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 2 | 
         
             
            import spaces
         
     | 
| 3 | 
         
            +
            import torch
         
     | 
| 4 | 
         
            +
            from gradio_pdf import PDF
         
     | 
| 5 | 
         
            +
            from pdf2image import convert_from_path
         
     | 
| 6 | 
         
            +
            from PIL import Image
         
     | 
| 7 | 
         
            +
            from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer
         
     | 
| 8 | 
         | 
| 9 | 
         
             
            model_path = "nanonets/Nanonets-OCR-s"
         
     | 
| 10 | 
         | 
| 11 | 
         
             
            # Load model once at startup
         
     | 
| 12 | 
         
             
            print("Loading Nanonets OCR model...")
         
     | 
| 13 | 
         
             
            model = AutoModelForImageTextToText.from_pretrained(
         
     | 
| 14 | 
         
            +
                model_path,
         
     | 
| 15 | 
         
            +
                torch_dtype="auto",
         
     | 
| 16 | 
         
            +
                device_map="auto",
         
     | 
| 17 | 
         
            +
                attn_implementation="flash_attention_2",
         
     | 
| 18 | 
         
             
            )
         
     | 
| 19 | 
         
             
            model.eval()
         
     | 
| 20 | 
         | 
| 
         | 
|
| 22 | 
         
             
            processor = AutoProcessor.from_pretrained(model_path)
         
     | 
| 23 | 
         
             
            print("Model loaded successfully!")
         
     | 
| 24 | 
         | 
| 25 | 
         
            +
             
     | 
| 26 | 
         
             
            @spaces.GPU()
            def ocr_image_gradio(image, max_tokens=4096):
                """Run the Nanonets OCR model on one image and return the extracted text.

                Args:
                    image: A PIL image, or any object accepted by ``Image.fromarray``.
                        ``None`` means nothing was uploaded.
                    max_tokens: Upper bound on the number of newly generated tokens.

                Returns:
                    The decoded model output for the image, or a short message asking
                    for an upload when ``image`` is ``None``.
                """
                if image is None:
                    return "Please upload an image."

                prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""

                # Anything that is not already a PIL image is converted via fromarray.
                if not isinstance(image, Image.Image):
                    image = Image.fromarray(image)

                messages = [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": image},
                            {"type": "text", "text": prompt},
                        ],
                    },
                ]

                text = processor.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
                inputs = inputs.to(model.device)

                with torch.no_grad():
                    output_ids = model.generate(
                        **inputs,
                        # Coerce to int in case the caller (e.g. a UI slider) supplies a float.
                        max_new_tokens=int(max_tokens),
                        do_sample=False,
                        repetition_penalty=1.25,
                    )

                # Each output row starts with the prompt tokens; slice them off so only
                # the newly generated tokens are decoded. Distinct loop names avoid
                # shadowing ``output_ids`` inside the comprehension.
                generated_ids = [
                    sequence[len(prompt_ids):]
                    for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
                ]

                output_text = processor.batch_decode(
                    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                return output_text[0]
         
     | 
| 71 | 
         
            +
             
     | 
| 72 | 
         
            +
             
     | 
| 73 | 
         
            +
            @spaces.GPU()
            def ocr_pdf_gradio(pdf_path, max_tokens=4096, progress=gr.Progress()):
                """OCR every page of a PDF and return the combined text.

                Args:
                    pdf_path: Path of the PDF to process; ``None`` means nothing was
                        uploaded.
                    max_tokens: Per-page token limit forwarded to ``ocr_image_gradio``.
                    progress: Progress callback used to report conversion and per-page
                        status.

                Returns:
                    The text of all pages, each preceded by a ``--- PAGE n ---`` header,
                    or a short message when no file was given or no pages were found.
                """
                if pdf_path is None:
                    return "Please upload a PDF file."

                # Convert PDF to images
                progress(0, desc="Converting PDF to images...")
                pdf_images = convert_from_path(pdf_path)

                total_pages = len(pdf_images)
                if total_pages == 0:
                    # Nothing to OCR; say so instead of returning an empty string.
                    return "No pages found in the PDF."

                # NOTE(review): ocr_image_gradio is itself wrapped in @spaces.GPU();
                # confirm that calling it from another GPU-decorated function behaves
                # as intended on the target runtime.
                all_text = []
                for page_number, image in enumerate(pdf_images, start=1):
                    # Report the fraction already finished, so the bar does not read
                    # 100% while the last page is still being processed.
                    progress(
                        (page_number - 1) / total_pages,
                        desc=f"Processing page {page_number}/{total_pages}...",
                    )
                    page_text = ocr_image_gradio(image, max_tokens)
                    all_text.append(f"--- PAGE {page_number} ---\n{page_text}\n")

                progress(1.0, desc=f"Processed {total_pages}/{total_pages} pages")

                # Combine results
                return "\n".join(all_text)
         
     | 
| 97 | 
         
            +
             
     | 
| 98 | 
         | 
| 99 | 
         
             
            # Create Gradio interface
         
     | 
| 100 | 
         
             
            with gr.Blocks(title="Nanonets OCR Demo") as demo:
         
     | 
| 
         | 
|
| 118 | 
         
             
                    </div>
         
     | 
| 119 | 
         
             
                </div>
         
     | 
| 120 | 
         
             
                """)
         
     | 
| 121 | 
         
            +
             
     | 
| 122 | 
         
            +
                with gr.Tabs() as tabs:
         
     | 
| 123 | 
         
            +
                    # Image tab
         
     | 
| 124 | 
         
            +
                    with gr.TabItem("Image OCR"):
         
     | 
| 125 | 
         
            +
                        with gr.Row():
         
     | 
| 126 | 
         
            +
                            with gr.Column(scale=1):
         
     | 
| 127 | 
         
            +
                                image_input = gr.Image(
         
     | 
| 128 | 
         
            +
                                    label="Upload Document Image", type="pil", height=400
         
     | 
| 129 | 
         
            +
                                )
         
     | 
| 130 | 
         
            +
                                image_max_tokens = gr.Slider(
         
     | 
| 131 | 
         
            +
                                    minimum=1024,
         
     | 
| 132 | 
         
            +
                                    maximum=8192,
         
     | 
| 133 | 
         
            +
                                    value=4096,
         
     | 
| 134 | 
         
            +
                                    step=512,
         
     | 
| 135 | 
         
            +
                                    label="Max Tokens",
         
     | 
| 136 | 
         
            +
                                    info="Maximum number of tokens to generate",
         
     | 
| 137 | 
         
            +
                                )
         
     | 
| 138 | 
         
            +
                                image_extract_btn = gr.Button(
         
     | 
| 139 | 
         
            +
                                    "Extract Text", variant="primary", size="lg"
         
     | 
| 140 | 
         
            +
                                )
         
     | 
| 141 | 
         
            +
             
     | 
| 142 | 
         
            +
                            with gr.Column(scale=2):
         
     | 
| 143 | 
         
            +
                                image_output_text = gr.Textbox(
         
     | 
| 144 | 
         
            +
                                    label="Extracted Text",
         
     | 
| 145 | 
         
            +
                                    lines=20,
         
     | 
| 146 | 
         
            +
                                    show_copy_button=True,
         
     | 
| 147 | 
         
            +
                                    placeholder="Extracted text will appear here...",
         
     | 
| 148 | 
         
            +
                                )
         
     | 
| 149 | 
         
            +
             
     | 
| 150 | 
         
            +
                    # PDF tab
         
     | 
| 151 | 
         
            +
                    with gr.TabItem("PDF OCR"):
         
     | 
| 152 | 
         
            +
                        with gr.Row():
         
     | 
| 153 | 
         
            +
                            with gr.Column(scale=1):
         
     | 
| 154 | 
         
            +
                                pdf_input = PDF(label="Upload PDF Document", height=400)
         
     | 
| 155 | 
         
            +
                                pdf_max_tokens = gr.Slider(
         
     | 
| 156 | 
         
            +
                                    minimum=1024,
         
     | 
| 157 | 
         
            +
                                    maximum=8192,
         
     | 
| 158 | 
         
            +
                                    value=4096,
         
     | 
| 159 | 
         
            +
                                    step=512,
         
     | 
| 160 | 
         
            +
                                    label="Max Tokens per Page",
         
     | 
| 161 | 
         
            +
                                    info="Maximum number of tokens to generate for each page",
         
     | 
| 162 | 
         
            +
                                )
         
     | 
| 163 | 
         
            +
                                pdf_extract_btn = gr.Button(
         
     | 
| 164 | 
         
            +
                                    "Extract PDF Text", variant="primary", size="lg"
         
     | 
| 165 | 
         
            +
                                )
         
     | 
| 166 | 
         
            +
             
     | 
| 167 | 
         
            +
                            with gr.Column(scale=2):
         
     | 
| 168 | 
         
            +
                                pdf_output_text = gr.Textbox(
         
     | 
| 169 | 
         
            +
                                    label="Extracted Text (All Pages)",
         
     | 
| 170 | 
         
            +
                                    lines=20,
         
     | 
| 171 | 
         
            +
                                    show_copy_button=True,
         
     | 
| 172 | 
         
            +
                                    placeholder="Extracted text will appear here...",
         
     | 
| 173 | 
         
            +
                                )
         
     | 
| 174 | 
         
            +
             
     | 
| 175 | 
         
            +
                # Event handlers for Image tab
         
     | 
| 176 | 
         
            +
                image_extract_btn.click(
         
     | 
| 177 | 
         
             
                    fn=ocr_image_gradio,
         
     | 
| 178 | 
         
            +
                    inputs=[image_input, image_max_tokens],
         
     | 
| 179 | 
         
            +
                    outputs=image_output_text,
         
     | 
| 180 | 
         
            +
                    show_progress=True,
         
     | 
| 181 | 
         
             
                )
         
     | 
| 182 | 
         
            +
             
     | 
| 183 | 
         
             
                image_input.change(
         
     | 
| 184 | 
         
             
                    fn=ocr_image_gradio,
         
     | 
| 185 | 
         
            +
                    inputs=[image_input, image_max_tokens],
         
     | 
| 186 | 
         
            +
                    outputs=image_output_text,
         
     | 
| 187 | 
         
            +
                    show_progress=True,
         
     | 
| 188 | 
         
            +
                )
         
     | 
| 189 | 
         
            +
             
     | 
| 190 | 
         
            +
                # Event handlers for PDF tab
         
     | 
| 191 | 
         
            +
                pdf_extract_btn.click(
         
     | 
| 192 | 
         
            +
                    fn=ocr_pdf_gradio,
         
     | 
| 193 | 
         
            +
                    inputs=[pdf_input, pdf_max_tokens],
         
     | 
| 194 | 
         
            +
                    outputs=pdf_output_text,
         
     | 
| 195 | 
         
            +
                    show_progress=True,
         
     | 
| 196 | 
         
             
                )
         
     | 
| 197 | 
         
            +
             
     | 
| 198 | 
         
             
                # Add model information section
         
     | 
| 199 | 
         
             
                with gr.Accordion("About Nanonets-OCR-s", open=False):
         
     | 
| 200 | 
         
             
                    gr.Markdown("""
         
     | 
    	
        requirements.txt
    CHANGED
    
    | 
         @@ -4,3 +4,5 @@ torchvision 
     | 
|
| 4 | 
         
             
            accelerate
         
     | 
| 5 | 
         
             
            spaces
         
     | 
| 6 | 
         
             
            https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.7-cp310-cp310-linux_x86_64.whl
         
     | 
| 
         | 
|
| 
         | 
| 
         | 
|
| 4 | 
         
             
            accelerate
         
     | 
| 5 | 
         
             
            spaces
         
     | 
| 6 | 
         
             
            https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.0.8/flash_attn-2.7.4.post1+cu126torch2.7-cp310-cp310-linux_x86_64.whl
         
     | 
| 7 | 
         
            +
            pdf2image
         
     | 
| 8 | 
         
            +
            gradio-pdf
         
     |