Spaces:
Runtime error
Runtime error
| import spaces | |
| import gradio as gr | |
| from pypdf import PdfReader | |
| import ocrmypdf | |
def convert(pdf_file):
    """Extract the text and metadata of a PDF, running OCR first if it has images.

    Args:
        pdf_file: Filesystem path of the PDF to read. A falsy value (``None``
            or an empty string) is treated as "nothing to convert".

    Returns:
        A ``(full_text, metadata)`` tuple. ``full_text`` is the extracted text
        of every page, each preceded by a ``---- Page N ----`` banner where
        ``N`` is the zero-based page index. ``metadata`` is the ``metadata``
        attribute of the reader opened on the file that was passed in (not on
        the OCR output). Returns ``("", None)`` when ``pdf_file`` is falsy.
    """
    import os  # local import: only needed to derive the OCR output path

    # Nothing was provided: return empty outputs instead of failing inside
    # PdfReader.
    if not pdf_file:
        return "", None

    reader = PdfReader(pdf_file)
    # Capture metadata from the original document before the reader is
    # possibly replaced by one opened on the OCR output.
    metadata = reader.metadata

    # Check if there are any images (stop at the first page that has one).
    has_images = any(len(page.images) > 0 for page in reader.pages)

    # If there are images, perform OCR on the document
    if has_images:
        # Swap only the final extension. str.replace would rewrite every
        # ".pdf" occurrence in the path and would do nothing for ".PDF".
        root, _ext = os.path.splitext(pdf_file)
        out_pdf_file = f"{root}_ocr.pdf"
        ocrmypdf.ocr(pdf_file, out_pdf_file, force_ocr=True)
        pdf_file = out_pdf_file
        # Re-open the OCR output so the extraction below reads the new text
        # layer rather than the original pages.
        reader = PdfReader(pdf_file)

    # Extract text
    full_text = "".join(
        f"\n\n---- Page {idx} ----\n\n" + page.extract_text()
        for idx, page in enumerate(reader.pages)
    )
    return full_text, metadata
# Build the web UI: one PDF upload in; the extracted text and the document
# metadata out. The file component hands `convert` a path string.
pdf_input = gr.File(label="Upload PDF", type="filepath")
text_output = gr.Text(label="Markdown")
metadata_output = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=[pdf_input],
    outputs=[text_output, metadata_output],
)

# Start serving as soon as the module is executed.
demo.launch()