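# Spaces: Sleeping / Sleeping
# (The line above appears to be hosting-page status text captured together with
# the file, not program code; it is kept as a comment so the module still parses.)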
import os
import traceback
import uuid

from flask import Flask, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from werkzeug.utils import secure_filename

from pdf_html import PDFToHTMLConverter
from pdf_word import PDFToWordConverter
from pdf_json import PDFToJSONConverter
from pdf_excel import PDFToExcelConverter
app = Flask(__name__, static_folder='static')
CORS(app)

# Configure file size limits and folders.
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100 MB limit
# Anchor the working folders to the application root. Flask's
# send_from_directory() resolves a relative directory against app.root_path,
# whereas os.makedirs()/os.path.join() resolve it against the current working
# directory; bare relative names therefore only work when both coincide.
app.config['UPLOAD_FOLDER'] = os.path.join(app.root_path, 'uploads')
app.config['OUTPUT_FOLDER'] = os.path.join(app.root_path, 'outputs')
# IMPORTANT: set SECRET_KEY in the environment for production; the literal
# below is only a development fallback.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-secret-key-here')

# Create necessary directories if they don't exist.
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Hugging Face API token handed to the converters. Taken from the HF_TOKEN
# environment variable when set; otherwise the original placeholder is used.
HF_TOKEN = os.environ.get('HF_TOKEN', 'Api_token')
# Define allowed file extensions for uploads (compared case-insensitively).
ALLOWED_EXTENSIONS = {'pdf'}


def allowed_file(filename):
    """Return True if *filename* ends in an allowed extension.

    The extension is the text after the last '.', compared in lower case
    against ALLOWED_EXTENSIONS. A name without a dot, an empty name, or a
    non-string value (e.g. ``None`` when the upload carried no filename)
    is rejected instead of raising.
    """
    if not isinstance(filename, str):
        return False
    _, dot, extension = filename.rpartition('.')
    # `dot` is empty when the name contains no '.' at all.
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
# NOTE(review): no @app.route registration for this view is visible in this
# file -- confirm the intended URL rule.
def serve_index():
    """Send index.html from the 'static' directory."""
    index_page = 'index.html'
    return send_from_directory('static', index_page)
# NOTE(review): no @app.route registration for this view is visible in this
# file -- confirm the intended URL rule.
def serve_static(filename):
    """Send the requested file (CSS, JS, etc.) from the 'static' directory."""
    static_dir = 'static'
    return send_from_directory(static_dir, filename)
# Output file extension for each supported conversion format.
_OUTPUT_EXTENSIONS = {
    'html': '.html',
    'word': '.docx',
    'json': '.json',
    'excel': '.xlsx',
}


def _remove_file_quietly(path, description):
    """Best-effort delete of *path*; a missing file is fine, other failures are printed."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        print(f"Warning: Could not remove {description} file {path}: {exc}")


def _run_conversion(format_type, input_path, output_path):
    """Convert the PDF at *input_path* into *output_path* and return the success message."""
    if format_type == 'html':
        converter = PDFToHTMLConverter(huggingface_token=HF_TOKEN)
        try:
            # First try with HF models.
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=True)
        except AttributeError as ae:
            if '_group_overlapping_text' not in str(ae):
                raise
            # Fall back to non-HF mode if the method is missing.
            converter.process_pdf(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to HTML!"
    if format_type == 'word':
        converter = PDFToWordConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_word(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Word!"
    if format_type == 'json':
        converter = PDFToJSONConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_json(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to JSON!"
    if format_type == 'excel':
        converter = PDFToExcelConverter(huggingface_token=HF_TOKEN)
        converter.process_pdf_to_excel(pdf_path=input_path, output_path=output_path, use_hf_models=False)
        return "Successfully converted to Excel!"
    raise ValueError(f"Unsupported format: {format_type}")


# NOTE(review): no @app.route registration for this view is visible in this
# file -- confirm the intended URL rule (the body reads an uploaded file and
# form fields from the request).
def convert_pdf():
    """
    Handle a PDF conversion request.

    Expects form data with 'file' (PDF), 'format' (html, word, json, excel)
    and an optional 'output_name' (defaults to 'converted_file').

    Returns a (JSON response, status) pair: 200 with 'message' and
    'download_url' on success, 400 for invalid input, 500 when the
    conversion itself fails. The uploaded file is always deleted afterwards;
    a partially written output file is deleted on failure.
    """
    # Validation is kept outside the conversion try/except so that HTTP errors
    # raised while Flask parses the request (e.g. 413 for an oversized body)
    # are not swallowed and reported as a 500 conversion failure.
    if 'file' not in request.files:
        return jsonify({'success': False, 'error': 'No file uploaded.'}), 400
    file = request.files['file']
    format_type = request.form.get('format')
    output_name = request.form.get('output_name', 'converted_file')

    # `not file.filename` also covers a missing (None) filename.
    if not file.filename:
        return jsonify({'success': False, 'error': 'No file selected.'}), 400
    if format_type not in _OUTPUT_EXTENSIONS:
        return jsonify({'success': False, 'error': 'Invalid format specified. Must be html, word, json, or excel.'}), 400
    if not allowed_file(file.filename):
        return jsonify({'success': False, 'error': 'Only PDF files are allowed.'}), 400

    # A random prefix keeps concurrent uploads that share a client filename
    # from overwriting (or deleting) each other's input file.
    upload_name = f"{uuid.uuid4().hex}_{secure_filename(file.filename)}"
    input_path = os.path.join(app.config['UPLOAD_FOLDER'], upload_name)

    # output_name is client-controlled: sanitise it so it cannot contain path
    # separators or be absolute (os.path.join would then discard OUTPUT_FOLDER).
    output_stem = secure_filename(output_name.replace('.', '')) or 'converted_file'
    output_filename = f"{output_stem}{_OUTPUT_EXTENSIONS[format_type]}"
    output_path = os.path.join(app.config['OUTPUT_FOLDER'], output_filename)

    try:
        file.save(input_path)
        success_message = _run_conversion(format_type, input_path, output_path)
    except Exception as e:
        # Do not leave a partial output file behind after a failed conversion.
        _remove_file_quietly(output_path, 'output')
        traceback.print_exc()
        error_msg = str(e)
        if '_group_overlapping_text' in error_msg:
            error_msg = "HTML conversion failed due to incompatible converter version. Please try another format."
        return jsonify({
            'success': False,
            'error': f'Conversion failed: {error_msg}'
        }), 500
    finally:
        # The upload is only needed for the duration of the conversion.
        _remove_file_quietly(input_path, 'input')

    # Return success response with download URL.
    return jsonify({
        'success': True,
        'message': success_message,
        'download_url': f'/download/{output_filename}'
    }), 200
# URL rule matches the 'download_url' that convert_pdf returns.
@app.route('/download/<filename>')
def download_file(filename):
    """Send a converted file from OUTPUT_FOLDER as an attachment.

    Returns a JSON 404 when *filename* is not a regular file in the output
    folder, and a JSON 500 with the error text if sending fails.
    """
    # Resolve once to an absolute path so the existence check and
    # send_from_directory() (which resolves relative directories against
    # app.root_path rather than the working directory) look in the same place.
    output_dir = os.path.abspath(app.config['OUTPUT_FOLDER'])
    try:
        # isfile (not exists): a directory name such as '..' must be a 404,
        # not an exception inside send_from_directory reported as a 500.
        if not os.path.isfile(os.path.join(output_dir, filename)):
            return jsonify({'error': 'File not found.'}), 404
        return send_from_directory(output_dir, filename, as_attachment=True)
    except Exception as e:
        traceback.print_exc()
        return jsonify({'error': str(e)}), 500
# NOTE(review): no @app.route registration for this view is visible in this
# file -- confirm the intended URL rule.
def health_check():
    """Report that the API is up: JSON status payload with HTTP 200."""
    payload = {'status': 'healthy', 'message': 'PDF Converter API is running.'}
    return jsonify(payload), 200
# Registered for HTTP 413 so oversized requests get this JSON body instead of
# Flask's default error page.
@app.errorhandler(413)
def too_large(e):
    """Return a JSON 413 response for a request that is too large.

    *e* is the error passed in by Flask; it is not inspected.
    """
    return jsonify({'success': False, 'error': 'File too large. Maximum size is 100MB.'}), 413
# Registered for HTTP 500 so unhandled server errors are answered with JSON.
@app.errorhandler(500)
def internal_error(e):
    """Print the current traceback and return a generic JSON 500 response.

    *e* is the error passed in by Flask; its details are deliberately not
    included in the response body.
    """
    traceback.print_exc()
    return jsonify({'success': False, 'error': 'Internal server error occurred.'}), 500
if __name__ == '__main__':
    # Debug mode enables the Werkzeug interactive debugger, which allows
    # arbitrary code execution; with host 0.0.0.0 it would be reachable from
    # other machines. Keep it off unless explicitly requested via FLASK_DEBUG.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)