#!/usr/bin/env python3
"""
Chapter Extraction Worker - Runs chapter extraction in a separate process to prevent GUI freezing
"""
import sys
import os
import io
# Force UTF-8 encoding for stdout/stderr on Windows
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
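# Without this, printing non-ASCII text (e.g. Korean chapter titles) can raise
# UnicodeEncodeError under legacy Windows console code pages such as cp1252.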
import json
import zipfile
import time
import traceback
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
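# TransateKRtoEN is expected to live alongside this script; see the import
# inside run_chapter_extraction below.
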
def run_chapter_extraction(epub_path, output_dir, extraction_mode="smart", progress_callback=None):
"""
Run chapter extraction in this worker process
Args:
epub_path: Path to EPUB file
output_dir: Output directory for extracted content
extraction_mode: Extraction mode (smart, comprehensive, full, enhanced)
progress_callback: Callback function for progress updates (uses print for IPC)
Returns:
dict: Extraction results including chapters and metadata
"""
    try:
        # Import here to avoid loading heavy modules until needed
        from TransateKRtoEN import ChapterExtractor

        # Progress callback that prints to stdout for IPC with the parent process
        def worker_progress_callback(message):
            # Use a special prefix so the parent can tell progress apart from logs
            print(f"[PROGRESS] {message}", flush=True)
            # Also forward to an explicit callback if the caller supplied one
            if progress_callback:
                progress_callback(message)

        # Create extractor with progress callback
        extractor = ChapterExtractor(progress_callback=worker_progress_callback)

        # Communicate the extraction mode to the extractor via the environment
        os.environ["EXTRACTION_MODE"] = extraction_mode
        print(f"[INFO] Starting extraction of: {epub_path}", flush=True)
        print(f"[INFO] Output directory: {output_dir}", flush=True)
        print(f"[INFO] Extraction mode: {extraction_mode}", flush=True)

        # Open the EPUB and extract chapters
        with zipfile.ZipFile(epub_path, 'r') as zf:
            # Extract metadata first
            metadata = extractor._extract_epub_metadata(zf)
            print(f"[INFO] Extracted metadata: {list(metadata.keys())}", flush=True)

            # Extract chapters
            chapters = extractor.extract_chapters(zf, output_dir)
            print(f"[INFO] Extracted {len(chapters)} chapters", flush=True)

            # extract_chapters already handles OPF sorting internally;
            # just log whether an OPF file was available for ordering
            opf_path = os.path.join(output_dir, 'content.opf')
            if os.path.exists(opf_path):
                print("[INFO] OPF file available for chapter ordering", flush=True)
        # CRITICAL: save the full chapters with body content;
        # this is what the main process needs to load
        chapters_full_path = os.path.join(output_dir, "chapters_full.json")
        try:
            with open(chapters_full_path, 'w', encoding='utf-8') as f:
                json.dump(chapters, f, ensure_ascii=False)
            print(f"[INFO] Saved full chapters data to: {chapters_full_path}", flush=True)
        except Exception as e:
            print(f"[WARNING] Could not save full chapters: {e}", flush=True)
            # Fall back to saving each chapter body as an individual HTML file
            for chapter in chapters:
                try:
                    safe_name = chapter.get('filename', 'content').replace('/', '_')
                    chapter_file = f"chapter_{chapter['num']:04d}_{safe_name}.html"
                    chapter_path = os.path.join(output_dir, chapter_file)
                    with open(chapter_path, 'w', encoding='utf-8') as f:
                        f.write(chapter.get('body', ''))
                    print(f"[INFO] Saved chapter {chapter['num']} to {chapter_file}", flush=True)
                except Exception as ce:
                    print(f"[WARNING] Could not save chapter {chapter.get('num')}: {ce}", flush=True)
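        # Chapter records are dicts produced by ChapterExtractor; the keys this
        # file relies on are: num, title, body, filename, has_images, file_size,
        # and content_hash (any other keys pass through to chapters_full.json).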
        # Summarize results for IPC; chapter bodies stay on disk, and only
        # lightweight per-chapter info is sent back over stdout
        result = {
            "success": True,
            "chapters": len(chapters),
            "metadata": metadata,
            "chapter_info": [
                {
                    "num": ch.get("num"),
                    "title": ch.get("title"),
                    "has_images": ch.get("has_images", False),
                    "file_size": ch.get("file_size", 0),
                    "content_hash": ch.get("content_hash", "")
                }
                for ch in chapters
            ]
        }

        # Output the result as a single JSON line
        print(f"[RESULT] {json.dumps(result)}", flush=True)
        return result
    except Exception as e:
        # Report the failure to the parent in the same [RESULT] JSON format
        error_info = {
            "success": False,
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        print(f"[ERROR] {e}", flush=True)
        print(f"[RESULT] {json.dumps(error_info)}", flush=True)
        return error_info

def main():
    """Main entry point for the worker process"""
    # Parse command-line arguments
    if len(sys.argv) < 3:
        print("[ERROR] Usage: chapter_extraction_worker.py <epub_path> <output_dir> [extraction_mode]", flush=True)
        sys.exit(1)

    epub_path = sys.argv[1]
    output_dir = sys.argv[2]
    extraction_mode = sys.argv[3] if len(sys.argv) > 3 else "smart"

    # Validate inputs
    if not os.path.exists(epub_path):
        print(f"[ERROR] EPUB file not found: {epub_path}", flush=True)
        sys.exit(1)

    # Create the output directory if needed
    os.makedirs(output_dir, exist_ok=True)

    # Run extraction
    result = run_chapter_extraction(epub_path, output_dir, extraction_mode)

    # Exit code mirrors the result so the parent can detect failure cheaply
    sys.exit(0 if result.get("success", False) else 1)
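
# Example invocation (hypothetical paths, for illustration only):
#   python chapter_extraction_worker.py novel.epub extracted_chapters/ comprehensive
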
if __name__ == "__main__":
    # Ensure freeze support when running as a frozen Windows executable
    try:
        import multiprocessing
        multiprocessing.freeze_support()
    except Exception:
        pass
    main()