#!/usr/bin/env python3
"""
Chapter Extraction Worker - Runs chapter extraction in a separate process to prevent GUI freezing
"""
import sys
import os
import io
# Force UTF-8 encoding for stdout/stderr on Windows
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
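# Without this, printing non-ASCII text (e.g. Korean chapter titles) can raise
# UnicodeEncodeError under legacy Windows console code pages such as cp1252.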
import json
import zipfile
import time
import traceback
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
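# TransateKRtoEN is expected to live alongside this script; see the import
# inside run_chapter_extraction below.
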
def run_chapter_extraction(epub_path, output_dir, extraction_mode="smart", progress_callback=None):
"""
Run chapter extraction in this worker process
Args:
epub_path: Path to EPUB file
output_dir: Output directory for extracted content
extraction_mode: Extraction mode (smart, comprehensive, full, enhanced)
progress_callback: Callback function for progress updates (uses print for IPC)
Returns:
dict: Extraction results including chapters and metadata
"""
    try:
        # Import here to avoid loading heavy modules until needed
        from TransateKRtoEN import ChapterExtractor

        # Progress callback that prints to stdout for IPC with the parent process
        def worker_progress_callback(message):
            # Use a special prefix so the parent can tell progress apart from logs
            print(f"[PROGRESS] {message}", flush=True)
            # Also forward to an explicit callback if the caller supplied one
            if progress_callback:
                progress_callback(message)

        # Create extractor with progress callback
        extractor = ChapterExtractor(progress_callback=worker_progress_callback)

        # Communicate the extraction mode to the extractor via the environment
        os.environ["EXTRACTION_MODE"] = extraction_mode
        print(f"[INFO] Starting extraction of: {epub_path}", flush=True)
        print(f"[INFO] Output directory: {output_dir}", flush=True)
        print(f"[INFO] Extraction mode: {extraction_mode}", flush=True)

        # Open the EPUB and extract chapters
        with zipfile.ZipFile(epub_path, 'r') as zf:
            # Extract metadata first
            metadata = extractor._extract_epub_metadata(zf)
            print(f"[INFO] Extracted metadata: {list(metadata.keys())}", flush=True)

            # Extract chapters
            chapters = extractor.extract_chapters(zf, output_dir)
            print(f"[INFO] Extracted {len(chapters)} chapters", flush=True)

            # extract_chapters already handles OPF sorting internally;
            # just log whether an OPF file was available for ordering
            opf_path = os.path.join(output_dir, 'content.opf')
            if os.path.exists(opf_path):
                print("[INFO] OPF file available for chapter ordering", flush=True)
        # CRITICAL: save the full chapters with body content;
        # this is what the main process needs to load
        chapters_full_path = os.path.join(output_dir, "chapters_full.json")
        try:
            with open(chapters_full_path, 'w', encoding='utf-8') as f:
                json.dump(chapters, f, ensure_ascii=False)
            print(f"[INFO] Saved full chapters data to: {chapters_full_path}", flush=True)
        except Exception as e:
            print(f"[WARNING] Could not save full chapters: {e}", flush=True)
            # Fall back to saving each chapter body as an individual HTML file
            for chapter in chapters:
                try:
                    safe_name = chapter.get('filename', 'content').replace('/', '_')
                    chapter_file = f"chapter_{chapter['num']:04d}_{safe_name}.html"
                    chapter_path = os.path.join(output_dir, chapter_file)
                    with open(chapter_path, 'w', encoding='utf-8') as f:
                        f.write(chapter.get('body', ''))
                    print(f"[INFO] Saved chapter {chapter['num']} to {chapter_file}", flush=True)
                except Exception as ce:
                    print(f"[WARNING] Could not save chapter {chapter.get('num')}: {ce}", flush=True)
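        # Chapter records are dicts produced by ChapterExtractor; the keys this
        # file relies on are: num, title, body, filename, has_images, file_size,
        # and content_hash (any other keys pass through to chapters_full.json).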
        # Summarize results for IPC; chapter bodies stay on disk, and only
        # lightweight per-chapter info is sent back over stdout
        result = {
            "success": True,
            "chapters": len(chapters),
            "metadata": metadata,
            "chapter_info": [
                {
                    "num": ch.get("num"),
                    "title": ch.get("title"),
                    "has_images": ch.get("has_images", False),
                    "file_size": ch.get("file_size", 0),
                    "content_hash": ch.get("content_hash", "")
                }
                for ch in chapters
            ]
        }

        # Output the result as a single JSON line
        print(f"[RESULT] {json.dumps(result)}", flush=True)
        return result
    except Exception as e:
        # Report the failure to the parent in the same [RESULT] JSON format
        error_info = {
            "success": False,
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        print(f"[ERROR] {e}", flush=True)
        print(f"[RESULT] {json.dumps(error_info)}", flush=True)
        return error_info

def main():
    """Main entry point for the worker process"""
    # Parse command-line arguments
    if len(sys.argv) < 3:
        print("[ERROR] Usage: chapter_extraction_worker.py <epub_path> <output_dir> [extraction_mode]", flush=True)
        sys.exit(1)

    epub_path = sys.argv[1]
    output_dir = sys.argv[2]
    extraction_mode = sys.argv[3] if len(sys.argv) > 3 else "smart"

    # Validate inputs
    if not os.path.exists(epub_path):
        print(f"[ERROR] EPUB file not found: {epub_path}", flush=True)
        sys.exit(1)

    # Create the output directory if needed
    os.makedirs(output_dir, exist_ok=True)

    # Run extraction
    result = run_chapter_extraction(epub_path, output_dir, extraction_mode)

    # Exit code mirrors the result so the parent can detect failure cheaply
    sys.exit(0 if result.get("success", False) else 1)
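
# Example invocation (hypothetical paths, for illustration only):
#   python chapter_extraction_worker.py novel.epub extracted_chapters/ comprehensive
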
if __name__ == "__main__":
    # Ensure freeze support when running as a frozen Windows executable
    try:
        import multiprocessing
        multiprocessing.freeze_support()
    except Exception:
        pass
    main()