# Glossarion / glossary_process_worker.py
"""
Process-safe glossary generation worker
========================================
This module provides a pickleable function for glossary generation
that can be run in a separate process using ProcessPoolExecutor.
"""
import os
import sys
def generate_glossary_in_process(output_dir, chapters_data, instructions, env_vars, log_queue=None):
"""
Generate glossary in a separate process to avoid GIL blocking.
Args:
output_dir: Output directory path
chapters_data: Serialized chapters data
instructions: Glossary instructions
env_vars: Environment variables to set
log_queue: Queue to send logs back to main process
Returns:
Dictionary with glossary results or error info
"""
    # Capture ALL output - both stdout and stderr
captured_logs = []
class LogCapture:
def __init__(self, queue=None):
self.queue = queue
self.buffer = ""
def write(self, text):
if text:
# Buffer text and send complete lines
self.buffer += text
while '\n' in self.buffer:
line, self.buffer = self.buffer.split('\n', 1)
if line:
captured_logs.append(line)
if self.queue:
try:
self.queue.put(line)
                            except Exception:
                                # A closed or broken queue must never crash the worker
                                pass
def flush(self):
if self.buffer:
captured_logs.append(self.buffer)
if self.queue:
try:
self.queue.put(self.buffer)
                    except Exception:
                        # As in write(): a failing queue is ignored
                        pass
self.buffer = ""
try:
# Redirect BOTH stdout and stderr to capture ALL output
log_capture = LogCapture(log_queue)
old_stdout = sys.stdout
old_stderr = sys.stderr
sys.stdout = log_capture
sys.stderr = log_capture
# Set environment variables from parent process
for key, value in env_vars.items():
os.environ[key] = str(value)
# Import here to avoid circular imports
from TransateKRtoEN import GlossaryManager
# Create glossary manager instance
glossary_manager = GlossaryManager()
        # Generate glossary
        print("📑 Starting glossary generation in subprocess...")
        result = glossary_manager.save_glossary(output_dir, chapters_data, instructions)
        print("📑 Glossary generation completed")
# Flush any remaining output
log_capture.flush()
# Restore stdout and stderr
sys.stdout = old_stdout
sys.stderr = old_stderr
return {
'success': True,
'result': result,
'pid': os.getpid(),
'logs': captured_logs
}
except Exception as e:
import traceback
# Restore stdout and stderr if needed
if 'old_stdout' in locals():
sys.stdout = old_stdout
if 'old_stderr' in locals():
sys.stderr = old_stderr
        error_msg = f"Glossary generation error: {e}"
        captured_logs.append(f"📑 ❌ {error_msg}")
return {
'success': False,
'error': error_msg,
'traceback': traceback.format_exc(),
'pid': os.getpid(),
'logs': captured_logs
}
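
# Illustrative sketch, not part of the original module: how a parent process
# might wire up `log_queue` and drain it while the worker runs. A
# multiprocessing.Manager().Queue() is assumed here, because a plain
# multiprocessing.Queue cannot be pickled into a ProcessPoolExecutor task;
# the helper name `drain_log_queue` is hypothetical.
def drain_log_queue(log_queue, emit=print):
    """Forward any log lines the worker has queued so far, without blocking."""
    import queue as queue_module
    while True:
        try:
            emit(log_queue.get_nowait())
        except queue_module.Empty:
            break
# A GUI would typically call drain_log_queue(log_queue) from a timer tick,
# alongside check_glossary_completion() below.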
def generate_glossary_async(output_dir, chapters, instructions, extraction_workers=None):
"""
Generate glossary asynchronously using ProcessPoolExecutor.
This function completely bypasses the GIL by running in a separate process,
ensuring the GUI remains fully responsive.
"""
import concurrent.futures
import multiprocessing
    # Best-effort freeze support for Windows frozen executables; the
    # authoritative call still has to happen in the app's main module
    try:
        multiprocessing.freeze_support()
    except Exception:
        pass
# Determine worker count
if extraction_workers is None:
extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
        if extraction_workers == 1:
            # Treat the stored default of 1 as "auto" and use up to 4 cores
            extraction_workers = min(multiprocessing.cpu_count() or 4, 4)
            print(f"📑 Auto-detected {extraction_workers} CPU cores for glossary generation")
    # Collect the relevant environment variables so the worker process sees
    # the same configuration as the parent, regardless of how it is spawned
env_vars = {}
important_vars = [
'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES',
'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS',
'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'AUTO_GLOSSARY_PROMPT',
'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED',
'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION',
'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS',
'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY'
]
for var in important_vars:
if var in os.environ:
env_vars[var] = os.environ[var]
    # Use a single-worker ProcessPoolExecutor for true parallelism.
    # Deliberately not a `with` block: exiting the context manager calls
    # shutdown(wait=True), which would block until the glossary finished
    # and defeat the point of running it asynchronously.
    executor = concurrent.futures.ProcessPoolExecutor(max_workers=1)
    # Submit the task
    future = executor.submit(
        generate_glossary_in_process,
        output_dir,
        chapters,
        instructions,
        env_vars
    )
    # Release the pool's resources once the task completes, without blocking
    executor.shutdown(wait=False)
    # Return the future for the caller to monitor
    return future
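
# Illustrative GUI-side helper, not in the original file: re-arm a Tk timer
# until the future completes. `root` is assumed to be any tkinter widget and
# the 100 ms interval is arbitrary. It calls check_glossary_completion(),
# defined below, which is resolved at call time rather than at import.
def poll_glossary_future(root, future, on_done):
    done, result = check_glossary_completion(future)
    if done:
        on_done(result)
    else:
        root.after(100, poll_glossary_future, root, future, on_done)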
def check_glossary_completion(future, timeout=0.01):
"""
Check if glossary generation is complete without blocking.
Args:
future: Future object from generate_glossary_async
timeout: Timeout in seconds for checking
Returns:
Tuple of (is_done, result_or_none)
"""
    import concurrent.futures  # needed for TimeoutError in the handler below
    try:
if future.done():
result = future.result(timeout=timeout)
return True, result
else:
# Not done yet
return False, None
except concurrent.futures.TimeoutError:
return False, None
except Exception as e:
# Error occurred
return True, {'success': False, 'error': str(e)}
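
# Minimal end-to-end sketch under assumptions: the arguments are placeholders,
# and the real TransateKRtoEN module must be importable inside the worker for
# the run to succeed. A GUI would drive the same poll from an event-loop timer
# (see poll_glossary_future above) instead of sleeping in a loop.
if __name__ == "__main__":
    import time
    future = generate_glossary_async("output", chapters=[], instructions="")
    while True:
        done, result = check_glossary_completion(future)
        if done:
            print(f"worker finished: success={result.get('success')}")
            break
        time.sleep(0.1)  # arbitrary polling interval for this demo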