"""Flask blueprint that converts uploaded PDF files to LaTeX.

Pages are rendered to PNG images with PyMuPDF and, when the optional
``pix2text`` package is installed, recognised as LaTeX with Pix2Text.
"""

import os
import re

import fitz  # PyMuPDF
from flask import Blueprint, jsonify, render_template, request
from PIL import Image  # noqa: F401  (unused here; kept from the original imports)
from werkzeug.utils import secure_filename

try:
    from pix2text import Pix2Text
    PIX2TEXT_AVAILABLE = True
except ImportError:
    PIX2TEXT_AVAILABLE = False
    print("⚠️ Pix2Text not available. Install with: pip install pix2text")

pdffly_bp = Blueprint('pdffly', __name__)

UPLOAD_FOLDER = 'static/uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Load Pix2Text model once (for efficiency)
p2t = None
if PIX2TEXT_AVAILABLE:
    print("🔹 Loading Pix2Text model for PDF → LaTeX...")
    try:
        p2t = Pix2Text()
        print("✅ Pix2Text model loaded successfully")
    except Exception as e:
        # A model that fails to load must not prevent the blueprint from
        # being imported; the routes fall back to the "not available" path.
        print(f"⚠️ Error loading Pix2Text: {e}")

# Operator names that exist as LaTeX commands (\lim, \sin, ...).  Only these
# are rewritten from a spaced-out \operatorname{l i m} into a bare command;
# anything else would produce an undefined (or wrong) control sequence.
_KNOWN_OPERATORS = frozenset({
    'lim', 'sin', 'cos', 'tan', 'log', 'ln', 'exp', 'max', 'min',
    'det', 'arg', 'sup', 'inf', 'gcd', 'dim', 'ker', 'deg',
    'sec', 'csc', 'cot', 'hom',
})

_OPERATORNAME_RE = re.compile(r'\\operatorname(\*?)\s*\{([^}]+)\}')

# Characters that are special in LaTeX text mode and their escaped forms.
_LATEX_TEXT_ESCAPES = {
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
}


def _escape_latex_text(text):
    """Return *text* with LaTeX text-mode special characters escaped."""
    # Single pass over the characters so already-escaped output is never
    # escaped a second time.
    return "".join(_LATEX_TEXT_ESCAPES.get(ch, ch) for ch in str(text))


def _count_unescaped(text, char):
    """Count occurrences of *char* in *text* not preceded by an escaping backslash."""
    count = 0
    escaped = False
    for ch in text:
        if escaped:
            escaped = False
        elif ch == '\\':
            escaped = True
        elif ch == char:
            count += 1
    return count


def _normalize_operatorname(match):
    """Rewrite one ``\\operatorname{...}`` match produced by ``_OPERATORNAME_RE``."""
    star, body = match.group(1), match.group(2)
    pieces = body.split()
    name = "".join(pieces)
    spaced_out = len(pieces) > 1 and all(len(piece) == 1 for piece in pieces)
    if spaced_out and name.lower() in _KNOWN_OPERATORS:
        return '\\' + name.lower()
    # Unknown operator: keep \operatorname (and its star), drop the spaces.
    return f'\\operatorname{star}{{{name}}}'


def pdf_to_images(pdf_path, prefix=""):
    """Convert PDF pages to images.

    Each page is rendered at 200 dpi and saved in ``UPLOAD_FOLDER`` as
    ``<prefix>page_<n>.png`` (1-based page number).

    Args:
        pdf_path: Path of the PDF file to render.
        prefix: Optional string prepended to every image file name so that
            images of different PDFs do not overwrite each other.

    Returns:
        List of the image paths that were written, in page order.
    """
    image_paths = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            pix = page.get_pixmap(dpi=200)
            img_path = os.path.join(UPLOAD_FOLDER, f"{prefix}page_{i+1}.png")
            pix.save(img_path)
            image_paths.append(img_path)
    return image_paths


def extract_text_from_pdf(pdf_path):
    """Extract raw text from PDF (fallback method).

    Returns:
        One string in which every page is introduced by a ``Page <n>:`` line.
    """
    with fitz.open(pdf_path) as doc:
        all_text = [
            f"Page {page_num + 1}:\n{page.get_text()}\n"
            for page_num, page in enumerate(doc)
        ]
    return "\n".join(all_text)


def clean_latex_code(latex_str):
    """Clean and format LaTeX code for Overleaf compilation.

    Args:
        latex_str: Recognised LaTeX; anything that is not a non-empty string
            yields ``""``.

    Returns:
        The cleaned LaTeX as a single line with surrounding whitespace removed.
    """
    if not latex_str or not isinstance(latex_str, str):
        return ""

    # Fix spaced-out operators: known ones become \lim, \sin, ...; the rest
    # stay \operatorname{...} with the inner spaces removed.
    latex_str = _OPERATORNAME_RE.sub(_normalize_operatorname, latex_str)

    # Replace $$ with \[ \]
    latex_str = re.sub(r'\$\$([^$]+)\$\$', r'\\[\1\\]', latex_str)

    # Remove obvious OCR gibberish (control characters).
    # NOTE(review): the \x7f-\xff range also removes Latin-1 characters such
    # as accented letters; kept as in the original.
    latex_str = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\xff]+', '', latex_str)

    # Balance braces.  Escaped \{ and \} are literal characters, not group
    # delimiters (e.g. "\left\{ ... \right."), so they are not counted.
    open_braces = _count_unescaped(latex_str, '{')
    close_braces = _count_unescaped(latex_str, '}')
    if open_braces > close_braces:
        latex_str += '}' * (open_braces - close_braces)
    elif close_braces > open_braces:
        latex_str = '{' * (close_braces - open_braces) + latex_str

    # Balance brackets
    open_brackets = latex_str.count('[')
    close_brackets = latex_str.count(']')
    if open_brackets > close_brackets:
        latex_str += ']' * (open_brackets - close_brackets)
    elif close_brackets > open_brackets:
        latex_str = '[' * (close_brackets - open_brackets) + latex_str

    # Remove invalid math syntax patterns
    latex_str = re.sub(r'\\\\+', r'\\\\', latex_str)  # Multiple backslashes
    latex_str = re.sub(r'\s+', ' ', latex_str)  # Multiple spaces

    return latex_str.strip()


def create_complete_latex_document(latex_content, title="PDF to LaTeX Conversion"):
    """Wrap LaTeX content in a complete compilable document.

    Args:
        latex_content: LaTeX placed in the document body.
        title: Inserted verbatim into ``\\title{...}``; it must already be
            valid LaTeX (callers escape plain text themselves).

    Returns:
        The full document source as a string.
    """
    document = r'''\documentclass{article}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{graphicx}

\title{''' + title + r'''}
\author{PDFly}
\date{\today}

\begin{document}

\maketitle

\begin{center}
\textit{This document was automatically generated from a PDF file using OCR and LaTeX conversion.}
\end{center}

\section*{Content}

''' + latex_content + r'''

\end{document}'''
    return document


@pdffly_bp.route("/", methods=["GET"])
def pdffly_page():
    """Render the main PDFfly page."""
    return render_template("pdffly.html")


@pdffly_bp.route('/upload', methods=['POST'])
def upload_and_convert_pdf():
    """Upload PDF and convert to LaTeX.

    Expects a multipart form with a ``file`` field holding a ``.pdf`` file.
    Responds with JSON containing per-page results and the complete LaTeX
    document, or with an ``error`` message and status 400/500.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file found'}), 400

    upload = request.files['file']
    if not upload or upload.filename == '':
        return jsonify({'error': 'No file selected'}), 400

    if not upload.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'Only PDF files are allowed'}), 400

    filename = secure_filename(upload.filename)
    # Sanitising can strip the whole stem (e.g. non-ASCII names), leaving a
    # name without the ".pdf" extension or nothing at all.
    if not filename.lower().endswith('.pdf'):
        filename = f"{filename or 'upload'}.pdf"
    pdf_path = os.path.join(UPLOAD_FOLDER, filename)

    try:
        upload.save(pdf_path)

        # Convert PDF → images; the prefix keeps the page images of
        # different uploads from overwriting each other.
        image_prefix = f"{os.path.splitext(filename)[0]}_"
        images = pdf_to_images(pdf_path, prefix=image_prefix)
        page_count = len(images)  # one image is rendered per page

        # Run LaTeX recognition for each image
        latex_results = []
        all_latex_pages = []

        for page_no, img_path in enumerate(images, start=1):
            image_url = img_path.replace('static/', '/static/')
            try:
                if p2t:
                    result = p2t.recognize(img_path, resized_shape=768)
                    latex_code = result if isinstance(result, str) else str(result)

                    # Clean the LaTeX code
                    latex_code = clean_latex_code(latex_code)
                    all_latex_pages.append(f"% Page {page_no}\n{latex_code}")
                else:
                    # Placeholder used when no recognition model is loaded
                    latex_code = "Text extraction (Pix2Text not available)"
                    all_latex_pages.append(latex_code)

                latex_results.append({
                    'page': page_no,
                    'image': image_url,
                    'latex': latex_code
                })
            except Exception as e:
                # Best effort: one failing page must not abort the others.
                latex_results.append({
                    'page': page_no,
                    'image': image_url,
                    'error': str(e)
                })
                # Collapse whitespace so a multi-line message stays inside
                # the single-line LaTeX comment.
                one_line_error = " ".join(str(e).split())
                all_latex_pages.append(f"% Page {page_no}: Error - {one_line_error}")

        # Create complete document; the file name is plain text and usually
        # contains "_" which must be escaped inside \title{}.
        combined_latex = "\n\n".join(all_latex_pages)
        complete_document = create_complete_latex_document(
            combined_latex, _escape_latex_text(filename)
        )

        return jsonify({
            'success': True,
            'message': 'PDF converted successfully!',
            'pdf_path': pdf_path.replace('static/', '/static/'),
            'filename': filename,
            'pages': page_count,
            'results': latex_results,
            'complete_document': complete_document
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': f'Error processing PDF: {str(e)}'
        }), 500


@pdffly_bp.route('/process', methods=['POST'])
def process_pdf():
    """Process a single page or the entire PDF.

    Reads a JSON body with ``filename``, optional ``convert_all`` (default
    ``False``) and optional ``page`` (default ``0``).  A ``coordinates``
    field may be sent but is currently ignored.
    """
    # silent=True: a missing or malformed body becomes an empty request
    # instead of an unhandled exception.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    filename = data.get('filename')
    convert_all = data.get('convert_all', False)
    page_num = data.get('page', 0)

    if not filename:
        return jsonify({'success': False, 'error': 'No filename provided'}), 400

    # The name comes from the client: sanitise it the same way uploads are
    # stored so it cannot point outside UPLOAD_FOLDER.
    safe_name = secure_filename(filename) if isinstance(filename, str) else ''
    if not safe_name:
        return jsonify({'success': False, 'error': 'Invalid filename'}), 400

    if isinstance(page_num, bool) or not isinstance(page_num, int):
        return jsonify({'success': False, 'error': 'Invalid page number'}), 400

    pdf_path = os.path.join(UPLOAD_FOLDER, safe_name)

    if not os.path.exists(pdf_path):
        return jsonify({'success': False, 'error': 'PDF file not found'}), 404

    try:
        if convert_all:
            # Extract text from entire PDF
            text = extract_text_from_pdf(pdf_path)
            latex = f"\\text{{{text}}}"
        else:
            # Extract from specific page
            with fitz.open(pdf_path) as doc:
                if page_num < len(doc):
                    text = doc[page_num].get_text()
                    latex = f"\\text{{{text}}}"
                else:
                    latex = "Page not found"

        return jsonify({
            'success': True,
            'latex': latex
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500


@pdffly_bp.route('/solve', methods=['POST'])
def solve_latex():
    """Solve mathematical content.

    Placeholder endpoint: the request body is not used yet and a fixed
    "pending" message is returned.
    """
    # Simple solver response
    return jsonify({
        'success': True,
        'solution': {
            'type': 'info',
            'message': 'Math solver integration pending'
        }
    })