# Glossarion / glossary_process_worker.py
"""
Process-safe glossary generation worker
========================================
This module provides a pickleable function for glossary generation
that can be run in a separate process using ProcessPoolExecutor.
"""
import os
import sys
def generate_glossary_in_process(output_dir, chapters_data, instructions, env_vars, log_queue=None):
"""
Generate glossary in a separate process to avoid GIL blocking.
Args:
output_dir: Output directory path
chapters_data: Serialized chapters data
instructions: Glossary instructions
env_vars: Environment variables to set
log_queue: Queue to send logs back to main process
Returns:
Dictionary with glossary results or error info
"""
    # Capture ALL output - both stdout and stderr
captured_logs = []
class LogCapture:
def __init__(self, queue=None):
self.queue = queue
self.buffer = ""
def write(self, text):
if text:
# Buffer text and send complete lines
self.buffer += text
while '\n' in self.buffer:
line, self.buffer = self.buffer.split('\n', 1)
if line:
captured_logs.append(line)
if self.queue:
try:
self.queue.put(line)
                            except Exception:
                                # A closed or broken queue must never crash the worker
                                pass
def flush(self):
if self.buffer:
captured_logs.append(self.buffer)
if self.queue:
try:
self.queue.put(self.buffer)
                    except Exception:
                        # As in write(): a failing queue is ignored
                        pass
self.buffer = ""
try:
# Redirect BOTH stdout and stderr to capture ALL output
log_capture = LogCapture(log_queue)
old_stdout = sys.stdout
old_stderr = sys.stderr
sys.stdout = log_capture
sys.stderr = log_capture
# Set environment variables from parent process
for key, value in env_vars.items():
os.environ[key] = str(value)
# Import here to avoid circular imports
from TransateKRtoEN import GlossaryManager
# Create glossary manager instance
glossary_manager = GlossaryManager()
        # Generate glossary
        print("📑 Starting glossary generation in subprocess...")
        result = glossary_manager.save_glossary(output_dir, chapters_data, instructions)
        print("📑 Glossary generation completed")
# Flush any remaining output
log_capture.flush()
# Restore stdout and stderr
sys.stdout = old_stdout
sys.stderr = old_stderr
return {
'success': True,
'result': result,
'pid': os.getpid(),
'logs': captured_logs
}
except Exception as e:
import traceback
# Restore stdout and stderr if needed
if 'old_stdout' in locals():
sys.stdout = old_stdout
if 'old_stderr' in locals():
sys.stderr = old_stderr
        error_msg = f"Glossary generation error: {e}"
        captured_logs.append(f"📑 ❌ {error_msg}")
return {
'success': False,
'error': error_msg,
'traceback': traceback.format_exc(),
'pid': os.getpid(),
'logs': captured_logs
}
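
# Illustrative sketch, not part of the original module: how a parent process
# might wire up `log_queue` and drain it while the worker runs. A
# multiprocessing.Manager().Queue() is assumed here, because a plain
# multiprocessing.Queue cannot be pickled into a ProcessPoolExecutor task;
# the helper name `drain_log_queue` is hypothetical.
def drain_log_queue(log_queue, emit=print):
    """Forward any log lines the worker has queued so far, without blocking."""
    import queue as queue_module
    while True:
        try:
            emit(log_queue.get_nowait())
        except queue_module.Empty:
            break
# A GUI would typically call drain_log_queue(log_queue) from a timer tick,
# alongside check_glossary_completion() below.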
def generate_glossary_async(output_dir, chapters, instructions, extraction_workers=None):
"""
Generate glossary asynchronously using ProcessPoolExecutor.
This function completely bypasses the GIL by running in a separate process,
ensuring the GUI remains fully responsive.
"""
import concurrent.futures
import multiprocessing
    # Best-effort freeze support for Windows frozen executables; the
    # authoritative call still has to happen in the app's main module
    try:
        multiprocessing.freeze_support()
    except Exception:
        pass
# Determine worker count
if extraction_workers is None:
extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
        if extraction_workers == 1:
            # Treat the stored default of 1 as "auto" and use up to 4 cores
            extraction_workers = min(multiprocessing.cpu_count() or 4, 4)
            print(f"📑 Auto-detected {extraction_workers} CPU cores for glossary generation")
    # Collect the relevant environment variables so the worker process sees
    # the same configuration as the parent, regardless of how it is spawned
env_vars = {}
important_vars = [
'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES',
'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS',
'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'AUTO_GLOSSARY_PROMPT',
'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED',
'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION',
'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS',
'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY'
]
for var in important_vars:
if var in os.environ:
env_vars[var] = os.environ[var]
    # Use a single-worker ProcessPoolExecutor for true parallelism.
    # Deliberately not a `with` block: exiting the context manager calls
    # shutdown(wait=True), which would block until the glossary finished
    # and defeat the point of running it asynchronously.
    executor = concurrent.futures.ProcessPoolExecutor(max_workers=1)
    # Submit the task
    future = executor.submit(
        generate_glossary_in_process,
        output_dir,
        chapters,
        instructions,
        env_vars
    )
    # Release the pool's resources once the task completes, without blocking
    executor.shutdown(wait=False)
    # Return the future for the caller to monitor
    return future
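
# Illustrative GUI-side helper, not in the original file: re-arm a Tk timer
# until the future completes. `root` is assumed to be any tkinter widget and
# the 100 ms interval is arbitrary. It calls check_glossary_completion(),
# defined below, which is resolved at call time rather than at import.
def poll_glossary_future(root, future, on_done):
    done, result = check_glossary_completion(future)
    if done:
        on_done(result)
    else:
        root.after(100, poll_glossary_future, root, future, on_done)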
def check_glossary_completion(future, timeout=0.01):
"""
Check if glossary generation is complete without blocking.
Args:
future: Future object from generate_glossary_async
timeout: Timeout in seconds for checking
Returns:
Tuple of (is_done, result_or_none)
"""
    import concurrent.futures  # needed for TimeoutError in the handler below
    try:
if future.done():
result = future.result(timeout=timeout)
return True, result
else:
# Not done yet
return False, None
except concurrent.futures.TimeoutError:
return False, None
except Exception as e:
# Error occurred
return True, {'success': False, 'error': str(e)}
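
# Minimal end-to-end sketch under assumptions: the arguments are placeholders,
# and the real TransateKRtoEN module must be importable inside the worker for
# the run to succeed. A GUI would drive the same poll from an event-loop timer
# (see poll_glossary_future above) instead of sleeping in a loop.
if __name__ == "__main__":
    import time
    future = generate_glossary_async("output", chapters=[], instructions="")
    while True:
        done, result = check_glossary_completion(future)
        if done:
            print(f"worker finished: success={result.get('success')}")
            break
        time.sleep(0.1)  # arbitrary polling interval for this demo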