Spaces:

Shirochi
/

Glossarion

Running

File size: 6,695 Bytes

457b8fd

"""

Process-safe glossary generation worker

========================================

This module provides a pickleable function for glossary generation

that can be run in a separate process using ProcessPoolExecutor.

"""

import os
import sys
import json
import time

def generate_glossary_in_process(output_dir, chapters_data, instructions, env_vars, log_queue=None):
    """

    Generate glossary in a separate process to avoid GIL blocking.

    

    Args:

        output_dir: Output directory path

        chapters_data: Serialized chapters data

        instructions: Glossary instructions

        env_vars: Environment variables to set

        log_queue: Queue to send logs back to main process

    

    Returns:

        Dictionary with glossary results or error info

    """
    import io
    import sys
    from io import StringIO
    
    # Capture ALL output - both stdout and stderr
    captured_logs = []
    
    class LogCapture:
        def __init__(self, queue=None):
            self.queue = queue
            self.buffer = ""
            
        def write(self, text):
            if text:
                # Buffer text and send complete lines
                self.buffer += text
                while '\n' in self.buffer:
                    line, self.buffer = self.buffer.split('\n', 1)
                    if line:
                        captured_logs.append(line)
                        if self.queue:
                            try:
                                self.queue.put(line)
                            except:
                                pass
        
        def flush(self):
            if self.buffer:
                captured_logs.append(self.buffer)
                if self.queue:
                    try:
                        self.queue.put(self.buffer)
                    except:
                        pass
                self.buffer = ""
    
    try:
        # Redirect BOTH stdout and stderr to capture ALL output
        log_capture = LogCapture(log_queue)
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = log_capture
        sys.stderr = log_capture
        
        # Set environment variables from parent process
        for key, value in env_vars.items():
            os.environ[key] = str(value)
        
        # Import here to avoid circular imports
        from TransateKRtoEN import GlossaryManager
        
        # Create glossary manager instance
        glossary_manager = GlossaryManager()
        
        # Generate glossary
        print(f"📑 Starting glossary generation in subprocess...")
        result = glossary_manager.save_glossary(output_dir, chapters_data, instructions)
        
        print(f"📑 Glossary generation completed")
        
        # Flush any remaining output
        log_capture.flush()
        
        # Restore stdout and stderr
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        
        return {
            'success': True,
            'result': result,
            'pid': os.getpid(),
            'logs': captured_logs
        }
        
    except Exception as e:
        import traceback
        
        # Restore stdout and stderr if needed
        if 'old_stdout' in locals():
            sys.stdout = old_stdout
        if 'old_stderr' in locals():
            sys.stderr = old_stderr
        
        error_msg = f"Glossary generation error: {str(e)}"
        captured_logs.append(f"📑 ❌ {error_msg}")
        
        return {
            'success': False,
            'error': error_msg,
            'traceback': traceback.format_exc(),
            'pid': os.getpid(),
            'logs': captured_logs
        }

def generate_glossary_async(output_dir, chapters, instructions, extraction_workers=None):
    """

    Generate glossary asynchronously using ProcessPoolExecutor.

    

    This function completely bypasses the GIL by running in a separate process,

    ensuring the GUI remains fully responsive.

    """
    import concurrent.futures
    import multiprocessing
    
    # Ensure freeze support for Windows frozen executables
    try:
        multiprocessing.freeze_support()
    except Exception:
        pass
    
    # Determine worker count
    if extraction_workers is None:
        extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
    
    if extraction_workers == 1:
        # Auto-detect optimal workers
        extraction_workers = min(multiprocessing.cpu_count() or 4, 4)
        print(f"📑 Auto-detected {extraction_workers} CPU cores for glossary generation")
    
    # Collect relevant environment variables
    env_vars = {}
    important_vars = [
        'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES',
        'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS',
        'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'AUTO_GLOSSARY_PROMPT',
        'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED',
        'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION',
        'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS',
        'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY'
    ]
    
    for var in important_vars:
        if var in os.environ:
            env_vars[var] = os.environ[var]
    
    # Use ProcessPoolExecutor for true parallelism
    with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
        # Submit the task
        future = executor.submit(
            generate_glossary_in_process,
            output_dir,
            chapters,
            instructions,
            env_vars
        )
        
        # Return the future for the caller to monitor
        return future

def check_glossary_completion(future, timeout=0.01):
    """

    Check if glossary generation is complete without blocking.

    

    Args:

        future: Future object from generate_glossary_async

        timeout: Timeout in seconds for checking

    

    Returns:

        Tuple of (is_done, result_or_none)

    """
    try:
        if future.done():
            result = future.result(timeout=timeout)
            return True, result
        else:
            # Not done yet
            return False, None
    except concurrent.futures.TimeoutError:
        return False, None
    except Exception as e:
        # Error occurred
        return True, {'success': False, 'error': str(e)}