# PDF-to-HTML converter (PyMuPDF-based).
# NOTE(review): the original file header was Hugging Face Spaces page residue
# ("Spaces: Sleeping") left over from a copy/paste; replaced with this comment.
import base64
import io
import json
import os
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List

import fitz  # PyMuPDF
import requests
from PIL import Image
@dataclass
class TextBlock:
    """A single positioned span of text extracted from a PDF page.

    Coordinates and sizes are in PDF points, origin at the top-left of the
    page (PyMuPDF bbox convention).

    The ``@dataclass`` decorator is required: the rest of the file constructs
    instances with keyword arguments (``TextBlock(text=..., x=..., ...)``),
    which fails with a bare annotated class.
    """

    text: str           # the span's textual content (stripped)
    x: float            # left edge of the bounding box
    y: float            # top edge of the bounding box
    width: float        # bbox width (x1 - x0)
    height: float       # bbox height (y1 - y0)
    font_size: float    # reported font size in points
    font_name: str      # raw font name from the PDF (e.g. "Arial-BoldMT")
    is_bold: bool = False
    is_italic: bool = False
    block_id: str = ""  # stable identifier, e.g. "p0-b1-l2-s3"
class PDFToHTMLConverter:
    """Convert a PDF file into a single self-contained HTML document.

    Text, images and tables are extracted with PyMuPDF (``fitz``); images are
    inlined as base64 PNG data URIs.  Hugging Face model endpoints are
    configured but not yet wired up (see ``process_pdf``).
    """

    # Unicode math / Greek / fraction characters -> HTML entities.
    # NOTE(review): reconstructed — the extracted source showed mojibake keys
    # with already-rendered entity values; this is the evident original intent.
    _MATH_ENTITIES = {
        '±': '&plusmn;', '×': '&times;', '÷': '&divide;', '∑': '&sum;',
        '∏': '&prod;', '√': '&radic;', '∞': '&infin;', '∫': '&int;',
        '∂': '&part;', 'Δ': '&Delta;', '∇': '&nabla;', '∈': '&isin;',
        '∉': '&notin;', '⊂': '&sub;', '⊃': '&sup;', '⊆': '&sube;',
        '⊇': '&supe;', '∪': '&cup;', '∩': '&cap;', '≤': '&le;',
        '≥': '&ge;', '≠': '&ne;', '≡': '&equiv;', '≈': '&asymp;',
        '∝': '&prop;', '∴': '&there4;',
        'α': '&alpha;', 'β': '&beta;', 'γ': '&gamma;', 'δ': '&delta;',
        'ε': '&epsilon;', 'ζ': '&zeta;', 'η': '&eta;', 'θ': '&theta;',
        'ι': '&iota;', 'κ': '&kappa;', 'λ': '&lambda;', 'μ': '&mu;',
        'ν': '&nu;', 'ξ': '&xi;', 'π': '&pi;', 'ρ': '&rho;', 'σ': '&sigma;',
        'τ': '&tau;', 'υ': '&upsilon;', 'φ': '&phi;', 'χ': '&chi;',
        'ψ': '&psi;', 'ω': '&omega;',
        '½': '&frac12;', '⅓': '&#8531;', '¼': '&frac14;', '⅔': '&#8532;',
        '¾': '&frac34;', '⅛': '&#8539;', '²': '&sup2;', '³': '&sup3;',
        '¹': '&sup1;', '°': '&deg;',
    }

    # Entities whose presence marks a fragment as mathematical text.
    _MATH_MARKERS = ('&alpha;', '&beta;', '&gamma;', '&sum;', '&int;')

    # Substring of the PDF font name -> CSS font-family stack.
    _FONT_FAMILIES = (
        ('arial', "Arial, sans-serif"),
        ('helvetica', "Helvetica, Arial, sans-serif"),
        ('courier', "'Courier New', monospace"),
    )

    def __init__(self, huggingface_token: str = None):
        """Create a converter, optionally configured for HF Inference API.

        Args:
            huggingface_token: Optional API token; when absent no
                Authorization header is prepared.
        """
        self.hf_token = huggingface_token
        # Only include the Authorization header when a token exists; a literal
        # None header value is invalid and rejected by HTTP client libraries.
        self.hf_headers = (
            {"Authorization": f"Bearer {huggingface_token}"}
            if huggingface_token else {}
        )
        # Candidate models for future enhancement passes (not yet used).
        self.models = {
            "document_layout": "microsoft/layoutlm-base-uncased",
            "table_detection": "microsoft/table-transformer-detection",
            "ocr": "microsoft/trocr-base-printed",
            "math_detection": "facebook/detr-resnet-50"
        }
        self.hf_inference_url = "https://api-inference.huggingface.co/models"

    def pdf_to_base64(self, pdf_path: str) -> str:
        """Return the PDF file's bytes encoded as a base64 ASCII string.

        Raises:
            Exception: wrapping any underlying I/O error (chained via
                ``from`` so the original cause is preserved).
        """
        try:
            with open(pdf_path, "rb") as pdf_file:
                return base64.b64encode(pdf_file.read()).decode('utf-8')
        except Exception as e:
            raise Exception(f"Error converting PDF to base64: {str(e)}") from e

    def extract_pdf_content(self, pdf_path: str) -> Dict[str, Any]:
        """Extract text blocks, images and tables from every page of a PDF.

        Args:
            pdf_path: Filesystem path of the PDF to open.

        Returns:
            ``{"pages": [...], "total_pages": int}`` where each page dict
            carries ``page_number``, ``text_blocks``, ``images``, ``tables``,
            ``page_width`` and ``page_height``.

        Raises:
            Exception: wrapping any fatal error (file missing, unopenable or
                empty document).  Per-page failures are logged and replaced
                by an empty A4-sized page entry instead of aborting.
        """
        doc = None
        try:
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"PDF file not found: {pdf_path}")
            doc = fitz.open(pdf_path)
            if doc is None:
                raise RuntimeError("Failed to open PDF document")
            if doc.page_count == 0:
                raise ValueError("PDF document has no pages")
            print(f"π PDF opened successfully: {doc.page_count} pages")
            pages_content = []
            for page_num in range(doc.page_count):
                try:
                    page = doc[page_num]
                    print(f"π Processing page {page_num + 1}/{doc.page_count}")
                    # Prefer the rich "dict" extraction (per-span font info);
                    # fall back to plain block extraction on failure.
                    try:
                        page_dict = page.get_text("dict")
                        text_blocks = self._extract_text_blocks_from_dict(page_dict, page_num)
                    except Exception as e:
                        print(f"β οΈ Dict method failed for page {page_num + 1}, falling back to simple text extraction: {e}")
                        text_blocks = self._extract_text_blocks_simple(page, page_num)
                    images = self._extract_images_safely(page, doc, page_num)
                    tables = self._detect_tables_safely(page)
                    page_rect = page.rect
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": text_blocks,
                        "images": images,
                        "tables": tables,
                        "page_width": page_rect.width,
                        "page_height": page_rect.height
                    })
                except Exception as e:
                    print(f"β Error processing page {page_num + 1}: {e}")
                    # Emit an empty placeholder page (A4 points) so page
                    # numbering stays consistent downstream.
                    pages_content.append({
                        "page_number": page_num + 1,
                        "text_blocks": [],
                        "images": [],
                        "tables": [],
                        "page_width": 595,
                        "page_height": 842
                    })
            return {
                "pages": pages_content,
                "total_pages": doc.page_count
            }
        except Exception as e:
            raise Exception(f"Error extracting PDF content: {str(e)}") from e
        finally:
            if doc is not None:
                try:
                    doc.close()
                    print("β PDF document closed successfully")
                except Exception as e:
                    print(f"β οΈ Error closing PDF document: {e}")

    def _extract_text_blocks_from_dict(self, page_dict: dict, page_num: int) -> List["TextBlock"]:
        """Build TextBlocks from PyMuPDF's ``get_text("dict")`` structure.

        Bold/italic are inferred from the font name and from the span flag
        bits (16 = bold, 2 = italic per PyMuPDF span flags).
        """
        text_blocks = []
        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            if "lines" not in block:
                continue  # image blocks carry no "lines" key
            for line_idx, line in enumerate(block["lines"]):
                for span_idx, span in enumerate(line["spans"]):
                    text_content = span.get("text", "").strip()
                    if not text_content:
                        continue
                    bbox = span["bbox"]
                    font_name = span.get("font", "Arial")
                    flags = span.get("flags", 0)
                    text_blocks.append(TextBlock(
                        text=text_content,
                        x=bbox[0],
                        y=bbox[1],
                        width=bbox[2] - bbox[0],
                        height=bbox[3] - bbox[1],
                        font_size=span.get("size", 12),
                        font_name=font_name,
                        # bool() so the flag-bit test doesn't store raw ints.
                        is_bold=bool("bold" in font_name.lower() or flags & 16),
                        is_italic=bool("italic" in font_name.lower() or flags & 2),
                        block_id=f"p{page_num}-b{block_idx}-l{line_idx}-s{span_idx}"
                    ))
        return text_blocks

    def _extract_text_blocks_simple(self, page, page_num: int) -> List["TextBlock"]:
        """Fallback extraction from ``get_text("blocks")`` tuples.

        Each block tuple is ``(x0, y0, x1, y1, text, block_no, block_type)``;
        multi-line text is split and each line given an estimated y offset.
        Font metadata is unavailable here, so defaults are used.
        """
        text_blocks = []
        try:
            for block_idx, block in enumerate(page.get_text("blocks")):
                if block[6] != 0:
                    continue  # type 0 = text; skip image blocks
                text = block[4].strip()
                if not text:
                    continue
                x0, y0, x1, y1 = block[0], block[1], block[2], block[3]
                lines = text.split('\n')
                # Evenly distribute the block height across its lines.
                line_height = (y1 - y0) / max(len(lines), 1)
                for line_idx, line in enumerate(lines):
                    if line.strip():
                        text_blocks.append(TextBlock(
                            text=line.strip(),
                            x=x0,
                            y=y0 + (line_idx * line_height),
                            width=x1 - x0,
                            height=line_height,
                            font_size=12,
                            font_name="Arial",
                            is_bold=False,
                            is_italic=False,
                            block_id=f"p{page_num}-simple-b{block_idx}-l{line_idx}"
                        ))
        except Exception as e:
            print(f"β οΈ Simple text block extraction failed: {e}")
        return text_blocks

    def _extract_images_safely(self, page, doc, page_num) -> List[Dict]:
        """Extract page images as base64 PNG dicts; failures are logged.

        Images with no placement rect, or with exotic colorspaces
        (``pix.n - pix.alpha >= 4``, e.g. CMYK), are skipped.
        """
        images = []
        try:
            for img_index, img_info in enumerate(page.get_images(full=True)):
                try:
                    xref = img_info[0]
                    img_rects = list(page.get_image_rects(xref))
                    if not img_rects:
                        continue  # image defined but not placed on this page
                    bbox = img_rects[0]
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n - pix.alpha < 4:  # gray or RGB only
                        img_data = pix.tobytes("png")
                        images.append({
                            "index": img_index,
                            "data": base64.b64encode(img_data).decode(),
                            "bbox": (bbox.x0, bbox.y0, bbox.x1, bbox.y1)
                        })
                    pix = None  # drop the pixmap reference promptly
                except Exception as e:
                    print(f"β οΈ Error extracting image {img_index} on page {page_num+1}: {e}")
                    continue
        except Exception as e:
            print(f"β οΈ General error in image extraction for page {page_num+1}: {e}")
        return images

    def _detect_tables_safely(self, page) -> List[Dict]:
        """Detect tables via PyMuPDF's ``find_tables`` and clean their cells.

        Rows that are entirely empty after stripping are dropped; tables that
        end up with no rows are discarded.
        """
        tables = []
        try:
            for tab_index, tab in enumerate(page.find_tables()):
                try:
                    table_data = tab.extract()
                    if not table_data:
                        continue
                    cleaned_data = []
                    for row in table_data:
                        cleaned_row = [str(cell).strip() if cell else "" for cell in row]
                        if any(cleaned_row):
                            cleaned_data.append(cleaned_row)
                    if cleaned_data:
                        tables.append({
                            "bbox": (tab.bbox.x0, tab.bbox.y0, tab.bbox.x1, tab.bbox.y1),
                            "data": cleaned_data
                        })
                except Exception as e:
                    print(f"β οΈ Error extracting table {tab_index}: {e}")
                    continue
        except Exception as e:
            print(f"β οΈ General error in table detection: {e}")
        return tables

    def enhance_math_symbols(self, text: str) -> str:
        """Replace Unicode math/Greek/fraction characters with HTML entities."""
        for symbol, html_entity in self._MATH_ENTITIES.items():
            text = text.replace(symbol, html_entity)
        return text

    @staticmethod
    def _escape_html(text: str) -> str:
        """Escape the HTML-significant characters ``&``, ``<`` and ``>``.

        Must run *before* ``enhance_math_symbols`` so the inserted entities
        are not themselves escaped.  (The extracted original escaped after
        entity insertion and its replace calls were garbled no-ops.)
        """
        return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    def _css_font_family(self, font_name: str) -> str:
        """Map a raw PDF font name to a CSS font-family stack."""
        lowered = font_name.lower()
        for needle, family in self._FONT_FAMILIES:
            if needle in lowered:
                return family
        return "'Times New Roman', Times, serif"

    def _render_text_block(self, block, inline: bool = False) -> str:
        """Render one TextBlock as an HTML fragment ('' for blank text).

        Args:
            block: A TextBlock (or any object with the same attributes).
            inline: Render as an inline ``<span>`` (for same-line groups)
                instead of a block-level ``<div>``.
        """
        if not block.text.strip():
            return ""
        # Escape first, then upgrade math characters to entities.
        enhanced_text = self.enhance_math_symbols(self._escape_html(block.text))
        css_classes = ["text-block"]
        if inline:
            css_classes.append("inline")
        if block.is_bold:
            css_classes.append("bold")
        if block.is_italic:
            css_classes.append("italic")
        if any(marker in enhanced_text for marker in self._MATH_MARKERS):
            css_classes.append("math-symbol")
        font_family = self._css_font_family(block.font_name)
        # Slightly shrink PDF point sizes for screen; never below 10px.
        font_size = max(block.font_size * 0.9, 10)
        tag = "span" if inline else "div"
        return (f'''
                    <{tag} class="{' '.join(css_classes)}" style="font-size: {font_size}px; font-family: {font_family};">
                        {enhanced_text}
                    </{tag}>''')

    def _group_overlapping_text(self, text_blocks) -> List[List["TextBlock"]]:
        """Group text blocks whose vertical extents overlap (same visual line).

        Returns groups in top-to-bottom order; the caller sorts each group by
        x before rendering its members inline.
        """
        groups: List[list] = []
        for block in sorted(text_blocks, key=lambda b: (b.y, b.x)):
            if groups:
                anchor = groups[-1][0]
                # Same line when the [y, y+height) intervals intersect.
                if block.y < anchor.y + anchor.height and anchor.y < block.y + block.height:
                    groups[-1].append(block)
                    continue
            groups.append([block])
        return groups

    def _generate_html_table(self, table_data: List[List[str]], header_rows: int = 1) -> str:
        """Render cleaned table rows as an HTML ``<table>`` fragment.

        Args:
            table_data: Rows of already-stripped cell strings.
            header_rows: Leading rows rendered with ``<th>`` cells.
        """
        rows_html = []
        for row_idx, row in enumerate(table_data):
            cell_tag = "th" if row_idx < header_rows else "td"
            cells = "".join(
                f"<{cell_tag}>{self._escape_html(str(cell))}</{cell_tag}>"
                for cell in row
            )
            rows_html.append(f"                        <tr>{cells}</tr>")
        return ('''
                <div class="table-container">
                    <table class="table">
''' + "\n".join(rows_html) + '''
                    </table>
                </div>''')

    def convert_to_html(self, pdf_content: Dict[str, Any], output_path: str = None) -> str:
        """Render extracted PDF content as one standalone HTML document.

        Args:
            pdf_content: Mapping produced by :meth:`extract_pdf_content`.
            output_path: Optional path; when given the HTML is also written
                there (directories are created as needed; write errors are
                logged, not raised).

        Returns:
            The complete HTML document as a string.
        """
        html_content = []
        html_content.append("""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>PDF Document</title>
    <style>
        * {
            box-sizing: border-box;
            margin: 0;
            padding: 0;
        }
        body {
            font-family: 'Times New Roman', Times, serif;
            background-color: #f5f5f5;
            padding: 20px;
            line-height: 1.2;
            color: #000000;
        }
        .document-container {
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            box-shadow: 0 4px 12px rgba(0,0,0,0.1);
            border: 1px solid #ddd;
        }
        .page-wrapper {
            background-color: white;
            margin: 0;
            padding: 40px;
            border-bottom: 2px solid #000;
            position: relative;
            min-height: 800px;
            page-break-after: always;
            overflow: visible;
        }
        .page-header {
            background-color: #f8f8f8;
            padding: 10px 15px;
            margin: -40px -40px 30px -40px;
            border-bottom: 2px solid #000;
            font-weight: bold;
            color: #000;
            font-size: 14px;
            text-align: center;
        }
        .content-layer {
            position: relative;
            width: 100%;
            height: 100%;
        }
        .text-content {
            position: relative;
            z-index: 10;
            line-height: 1.4;
        }
        .text-block {
            margin-bottom: 8px;
            font-family: 'Times New Roman', Times, serif;
            color: #000;
            word-wrap: break-word;
            overflow-wrap: break-word;
        }
        .text-block.inline {
            display: inline;
            margin-bottom: 0;
            margin-right: 5px;
        }
        .text-group {
            margin-bottom: 12px;
            line-height: 1.3;
        }
        .bold {
            font-weight: bold;
        }
        .italic {
            font-style: italic;
        }
        .table-container {
            margin: 20px 0;
            background-color: white;
            overflow: auto;
            z-index: 20;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .table {
            width: 100%;
            border-collapse: collapse;
            border: 2px solid #000;
            font-family: 'Times New Roman', Times, serif;
            font-size: 12px;
            color: #000;
            background-color: white;
            margin: 0;
        }
        .table td, .table th {
            border: 1px solid #000;
            padding: 8px 12px;
            text-align: left;
            vertical-align: top;
            background-color: white;
            font-family: 'Times New Roman', Times, serif;
            word-wrap: break-word;
            min-width: 60px;
        }
        .table th {
            background-color: #f0f0f0;
            font-weight: bold;
            text-align: center;
        }
        .table tr:nth-child(even) td {
            background-color: #f9f9f9;
        }
        .table tr:hover td {
            background-color: #f0f0f0;
        }
        .image-container {
            margin: 15px 0;
            border: 1px solid #ccc;
            background-color: white;
            text-align: center;
            overflow: hidden;
            z-index: 5;
        }
        .image {
            max-width: 100%;
            height: auto;
            display: block;
            margin: 0 auto;
        }
        .math-symbol {
            font-family: 'Times New Roman', serif;
        }
        .document-info {
            background-color: #f8f8f8;
            padding: 15px;
            border: 1px solid #ccc;
            margin-bottom: 20px;
            text-align: center;
            font-family: 'Times New Roman', Times, serif;
        }
        @media print {
            body {
                background-color: white;
                padding: 0;
            }
            .page-wrapper {
                border: none;
                box-shadow: none;
                margin: 0;
                page-break-after: always;
            }
            .document-info {
                display: none;
            }
            .table {
                border: 2px solid #000 !important;
            }
            .table td, .table th {
                border: 1px solid #000 !important;
            }
        }
    </style>
</head>
<body>
    <div class="document-container">""")
        html_content.append(f"""
        <div class="document-info">
            <h1>PDF Document Conversion</h1>
            <p><strong>Total Pages:</strong> {pdf_content.get('total_pages', 'Unknown')}</p>
            <p><strong>Converted on:</strong> {self._get_current_timestamp()}</p>
        </div>""")
        for page in pdf_content["pages"]:
            # Clamp to at least A4 point dimensions for the header readout.
            page_width = max(page["page_width"], 595)
            page_height = max(page["page_height"], 842)
            html_content.append(f"""
        <div class="page-wrapper">
            <div class="page-header">
                Page {page["page_number"]} ({page_width:.0f}Γ{page_height:.0f}px) - Tables: {len(page["tables"])}, Images: {len(page["images"])}, Text Blocks: {len(page["text_blocks"])}
            </div>
            <div class="content-layer">""")
            # Images first so the text layer (higher z-index) stacks above.
            for img in page["images"]:
                html_content.append(f"""
                <div class="image-container">
                    <img class="image" src="data:image/png;base64,{img['data']}"
                         alt="Page {page['page_number']} Image {img['index']}">
                </div>""")
            for table_idx, table in enumerate(page["tables"]):
                print(f"π Generating HTML for table {table_idx} (source: {table.get('source', 'unknown')})")
                html_content.append(self._generate_html_table(
                    table["data"],
                    header_rows=table.get("header_rows", 1)
                ))
            # Text content: single blocks become divs, same-line groups
            # become inline spans inside one .text-group wrapper.
            html_content.append('                <div class="text-content">')
            for group in self._group_overlapping_text(page["text_blocks"]):
                if len(group) == 1:
                    fragment = self._render_text_block(group[0], inline=False)
                    if fragment:
                        html_content.append(fragment)
                else:
                    group.sort(key=lambda b: b.x)
                    html_content.append('                    <div class="text-group">')
                    for block in group:
                        fragment = self._render_text_block(block, inline=True)
                        if fragment:
                            html_content.append(fragment)
                    html_content.append('                    </div>')
            html_content.append("""                </div>
            </div>
        </div>""")
        html_content.append("    </div>")
        html_content.append("""
</body>
</html>""")
        final_html = "\n".join(html_content)
        if output_path:
            try:
                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(final_html)
                print(f"β HTML saved to: {output_path}")
            except Exception as e:
                print(f"β οΈ Error saving HTML to {output_path}: {e}")
        return final_html

    def _get_current_timestamp(self) -> str:
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def process_pdf(self, pdf_path: str, output_path: str = None, use_hf_models: bool = False) -> str:
        """End-to-end pipeline: extract PDF content and render it as HTML.

        Args:
            pdf_path: Path of the input PDF (must exist).
            output_path: Optional path to also write the HTML file.
            use_hf_models: Placeholder flag; the Hugging Face enhancement
                pass is not implemented yet and only logs a note.

        Returns:
            The generated HTML as a string.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        print(f"π Processing PDF: {pdf_path}")
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        print("π Extracting PDF content...")
        pdf_content = self.extract_pdf_content(pdf_path)
        if use_hf_models and self.hf_token:
            print("π€ Attempting to enhance with Hugging Face models...")
            try:
                print("Note: Hugging Face model integration requires further implementation.")
            except Exception as e:
                print(f"β οΈ Hugging Face enhancement failed: {e}")
        print("π Converting to HTML...")
        html_content = self.convert_to_html(pdf_content, output_path)
        print("β Processing complete!")
        return html_content
def main():
    """Script entry point: convert a hard-coded sample PDF to HTML."""
    token = os.getenv("HF_API_TOKEN")
    converter = PDFToHTMLConverter(huggingface_token=token)
    source_pdf = "new-pdf.pdf"
    destination = "sample_converted.html"
    try:
        converter.process_pdf(
            pdf_path=source_pdf,
            output_path=destination,
            use_hf_models=False,
        )
    except FileNotFoundError as e:
        print(f"β Error: {e}")
        print("Please ensure the PDF file exists at the specified path.")
    except Exception as e:
        print(f"β An unexpected error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
    else:
        # Success path: only reached when process_pdf raised nothing.
        print(f"β Successfully converted '{source_pdf}' to '{destination}'")
        print(f"π Open '{destination}' in your web browser to view the result!")


if __name__ == "__main__":
    main()