Spaces:
Runtime error
Runtime error
| from transformers import NougatProcessor, VisionEncoderDecoderModel | |
| import gradio as gr | |
| import torch | |
| from PIL import Image | |
| from pathlib import Path | |
| from pdf2image import convert_from_path | |
| import spaces | |
# Hub checkpoint shared by the processor and the model.
MODEL_ID = "MohamedRashad/arabic-small-nougat"

# Upper bound on the number of new tokens generated per page
# (passed as `max_new_tokens` to `model.generate`).
context_length = 2048

# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the processor and the model once at import time, then move the
# model to the selected device.
processor = NougatProcessor.from_pretrained(MODEL_ID)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
model.to(device)
print(f"Using {device} device")
# NOTE(review): `spaces` is imported at the top of the file but never used in
# the visible code. If this app targets ZeroGPU hardware, this function
# presumably needs the `@spaces.GPU` decorator — confirm against the Space's
# hardware settings.
def extract_text_from_image(image):
    """
    Extract text from PIL image

    Args:
        image (PIL.Image): Input image. ``None`` is accepted and treated as
            "nothing to transcribe" (this is what the UI passes when Submit
            is pressed before an image has been provided).

    Returns:
        str: Extracted text from the image, or an empty string when no image
        was supplied.
    """
    # Return early instead of letting the processor fail on a missing input.
    if image is None:
        return ""

    # prepare the image for the model
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # generate transcription
    outputs = model.generate(
        pixel_values.to(device),
        min_length=1,
        max_new_tokens=context_length,
        # never let the model emit the tokenizer's unknown token
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
    )

    # A single image was processed, so only the first decoded sequence is used.
    page_sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0]
    page_sequence = processor.post_process_generation(page_sequence, fix_markdown=False)
    return page_sequence
# The `gr.Progress()` default is intentional: declaring it in the signature is
# how Gradio is asked to attach a progress tracker to this handler.
def extract_text_from_pdf(pdf_path, progress=gr.Progress()):
    """
    Extract text from PDF

    Args:
        pdf_path (str): Path to the PDF file. ``None`` or an empty path is
            accepted and treated as "nothing to transcribe" (this is what the
            UI passes when Submit is pressed before a file has been provided).
        progress (gr.Progress): Progress bar

    Returns:
        str: Extracted text of every page, joined with newlines, or an empty
        string when no PDF was supplied.
    """
    # Return early instead of letting pdf2image fail on a missing path.
    if not pdf_path:
        return ""

    progress(0, desc="Starting...")

    # Render every page of the PDF to a PIL image.
    images = convert_from_path(pdf_path)

    # Transcribe page by page; `progress.tqdm` advances the progress bar.
    texts = [extract_text_from_image(image) for image in progress.tqdm(images)]
    return "\n".join(texts)
# Markdown blurb rendered at the top of the page.
model_description = """
This is a demo for the Arabic Small Nougat model. It is an end-to-end OCR model that can extract text from images and PDFs.
- The model is trained on the [Khatt dataset](https://huggingface.co/datasets/Fakhraddin/khatt) and custom made dataset.
- The model is a finetune of [facebook/nougat-small](https://huggingface.co/facebook/nougat-small) model.
**Note**: The model is a prototype in my book and may not work well on all types of images and PDFs. **Check the output carefully before using it for any serious work.**
"""

# Sample page that ships next to this script; offered as a clickable example.
example_images = [Image.open(Path(__file__).with_name("book_page.jpeg"))]

# NOTE(review): the nesting of the layout containers below is reconstructed
# from a copy of the file that had lost its indentation — confirm that each
# output pane is meant to sit in the Row next to its input Column.
with gr.Blocks(title="Arabic Small Nougat") as demo:
    gr.HTML("<h1 style='text-align: center'>Arabic End-to-End Structured OCR for textbooks</h1>")
    gr.Markdown(model_description)

    # Tab 1: single image -> markdown
    with gr.Tab("Extract Text from Image"):
        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Input Image", type="pil")
                image_submit_button = gr.Button(value="Submit", variant="primary")
            image_output = gr.Markdown(label="Output Markdown", rtl=True)

        image_submit_button.click(
            fn=extract_text_from_image,
            inputs=[input_image],
            outputs=image_output,
        )
        gr.Examples(
            examples=example_images,
            inputs=[input_image],
            outputs=image_output,
            fn=extract_text_from_image,
            cache_examples=True,
        )

    # Tab 2: whole PDF -> markdown (all pages concatenated)
    with gr.Tab("Extract Text from PDF"):
        with gr.Row():
            with gr.Column():
                pdf = gr.File(label="Input PDF", type="filepath")
                pdf_submit_button = gr.Button(value="Submit", variant="primary")
            pdf_output = gr.Markdown(label="Output Markdown", rtl=True)

        pdf_submit_button.click(
            fn=extract_text_from_pdf,
            inputs=[pdf],
            outputs=pdf_output,
        )

# Enable the request queue, then start the server without a public share link.
demo.queue()
demo.launch(share=False)