| """ | |
| Process-safe glossary generation worker | |
| ======================================== | |
| This module provides a pickleable function for glossary generation | |
| that can be run in a separate process using ProcessPoolExecutor. | |
| """ | |
import os
import sys
import json
import time
import traceback
import concurrent.futures
def generate_glossary_in_process(output_dir, chapters_data, instructions, env_vars, log_queue=None):
    """
    Generate glossary in a separate process to avoid GIL blocking.

    Args:
        output_dir: Output directory path
        chapters_data: Serialized chapters data
        instructions: Glossary instructions
        env_vars: Environment variables to set in the worker process
        log_queue: Optional queue used to send log lines back to the main process

    Returns:
        Dictionary with glossary results or error info
    """
    # Capture ALL output - both stdout and stderr
    captured_logs = []
    class LogCapture:
        """Minimal file-like object that records writes and forwards complete lines."""

        def __init__(self, queue=None):
            self.queue = queue
            self.buffer = ""

        def write(self, text):
            if text:
                # Buffer text and send complete lines
                self.buffer += text
                while '\n' in self.buffer:
                    line, self.buffer = self.buffer.split('\n', 1)
                    if line:
                        captured_logs.append(line)
                        if self.queue:
                            try:
                                self.queue.put(line)
                            except Exception:
                                pass

        def flush(self):
            if self.buffer:
                captured_logs.append(self.buffer)
                if self.queue:
                    try:
                        self.queue.put(self.buffer)
                    except Exception:
                        pass
                self.buffer = ""
    try:
        # Redirect BOTH stdout and stderr to capture ALL output
        log_capture = LogCapture(log_queue)
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = log_capture
        sys.stderr = log_capture

        # Set environment variables passed in from the parent process
        for key, value in env_vars.items():
            os.environ[key] = str(value)

        # Import here to avoid circular imports
        from TransateKRtoEN import GlossaryManager

        # Create glossary manager instance
        glossary_manager = GlossaryManager()

        # Generate glossary
        print("Starting glossary generation in subprocess...")
        result = glossary_manager.save_glossary(output_dir, chapters_data, instructions)
        print("Glossary generation completed")

        # Flush any remaining output
        log_capture.flush()

        # Restore stdout and stderr
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        return {
            'success': True,
            'result': result,
            'pid': os.getpid(),
            'logs': captured_logs
        }
    except Exception as e:
        # Restore stdout and stderr if they were redirected
        if 'old_stdout' in locals():
            sys.stdout = old_stdout
        if 'old_stderr' in locals():
            sys.stderr = old_stderr

        error_msg = f"Glossary generation error: {str(e)}"
        captured_logs.append(error_msg)
        return {
            'success': False,
            'error': error_msg,
            'traceback': traceback.format_exc(),
            'pid': os.getpid(),
            'logs': captured_logs
        }
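
# Sketch (not part of the original API): a parent-side helper for draining the
# log queue that generate_glossary_in_process fills. It assumes the queue is a
# multiprocessing.Manager().Queue(), since a plain multiprocessing.Queue cannot
# be pickled into ProcessPoolExecutor tasks.
def drain_log_queue(log_queue, sink=print):
    """Forward any pending worker log lines to `sink` without blocking."""
    import queue
    while True:
        try:
            sink(log_queue.get_nowait())
        except queue.Empty:
            break
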
def generate_glossary_async(output_dir, chapters, instructions, extraction_workers=None):
    """
    Generate glossary asynchronously using ProcessPoolExecutor.

    Running the work in a separate process bypasses the GIL, so the GUI
    stays responsive while the glossary is generated.
    """
    import multiprocessing

    # Ensure freeze support for Windows frozen executables
    try:
        multiprocessing.freeze_support()
    except Exception:
        pass

    # Determine worker count
    if extraction_workers is None:
        extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
        if extraction_workers == 1:
            # Auto-detect a reasonable worker count (capped at 4)
            extraction_workers = min(multiprocessing.cpu_count() or 4, 4)
            print(f"Auto-detected {extraction_workers} CPU cores for glossary generation")
    # Collect relevant environment variables to forward to the subprocess
    env_vars = {}
    important_vars = [
        'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES',
        'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS',
        'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'AUTO_GLOSSARY_PROMPT',
        'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED',
        'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION',
        'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS',
        'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY'
    ]
    for var in important_vars:
        if var in os.environ:
            env_vars[var] = os.environ[var]

    # Forward the (possibly auto-detected) worker count to the subprocess
    env_vars['EXTRACTION_WORKERS'] = str(extraction_workers)
    # Use a ProcessPoolExecutor for true parallelism. Note: a `with` block is
    # deliberately avoided here, because exiting the context manager calls
    # shutdown(wait=True), which would block until the task finishes and
    # defeat the purpose of returning a future.
    executor = concurrent.futures.ProcessPoolExecutor(max_workers=1)
    future = executor.submit(
        generate_glossary_in_process,
        output_dir,
        chapters,
        instructions,
        env_vars
    )
    # shutdown(wait=False) lets the pool clean up once the task completes
    # without blocking this call; the submitted task still runs to completion.
    executor.shutdown(wait=False)

    # Return the future for the caller to monitor
    return future
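
# Sketch (an assumption, not the original API): a variant that also wires up a
# live log queue. It uses multiprocessing.Manager().Queue() because a plain
# multiprocessing.Queue cannot be pickled into ProcessPoolExecutor tasks. The
# manager must outlive the future, so it is returned alongside it; pair the
# queue with drain_log_queue() above to stream output into a GUI.
def generate_glossary_async_with_logs(output_dir, chapters, instructions):
    import multiprocessing

    manager = multiprocessing.Manager()
    log_queue = manager.Queue()

    # For simplicity this sketch forwards the parent's full environment rather
    # than the curated important_vars list used above.
    env_vars = dict(os.environ)

    executor = concurrent.futures.ProcessPoolExecutor(max_workers=1)
    future = executor.submit(
        generate_glossary_in_process,
        output_dir, chapters, instructions, env_vars, log_queue
    )
    executor.shutdown(wait=False)  # non-blocking; the task still runs
    return future, log_queue, manager
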
def check_glossary_completion(future, timeout=0.01):
    """
    Check if glossary generation is complete without blocking.

    Args:
        future: Future object from generate_glossary_async
        timeout: Timeout in seconds for checking

    Returns:
        Tuple of (is_done, result_or_none)
    """
    try:
        if future.done():
            result = future.result(timeout=timeout)
            return True, result
        else:
            # Not done yet
            return False, None
    except concurrent.futures.TimeoutError:
        return False, None
    except Exception as e:
        # Error occurred while running the task
        return True, {'success': False, 'error': str(e)}
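
# Sketch of how a caller might use the functions above. The arguments here are
# placeholders (an assumption for illustration); real values come from the
# surrounding application, and a GUI would schedule the poll on its event loop
# (e.g. a Tk `after` timer) rather than sleeping.
if __name__ == "__main__":
    demo_future = generate_glossary_async("glossary_output", [], "")
    while True:
        done, result = check_glossary_completion(demo_future)
        if done:
            print("success:", result.get('success'), "pid:", result.get('pid'))
            break
        time.sleep(0.1)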