"""

Process-safe glossary generation worker

========================================

This module provides a pickleable function for glossary generation

that can be run in a separate process using ProcessPoolExecutor.

"""

import os
import sys
import time
import concurrent.futures  # needed at module level by check_glossary_completion

def generate_glossary_in_process(output_dir, chapters_data, instructions, env_vars, log_queue=None):
    """

    Generate glossary in a separate process to avoid GIL blocking.

    

    Args:

        output_dir: Output directory path

        chapters_data: Serialized chapters data

        instructions: Glossary instructions

        env_vars: Environment variables to set

        log_queue: Queue to send logs back to main process

    

    Returns:

        Dictionary with glossary results or error info

    """
    
    # Capture ALL output - both stdout and stderr
    captured_logs = []
    
    class LogCapture:
        def __init__(self, queue=None):
            self.queue = queue
            self.buffer = ""
            
        def write(self, text):
            if text:
                # Buffer text and send complete lines
                self.buffer += text
                while '\n' in self.buffer:
                    line, self.buffer = self.buffer.split('\n', 1)
                    if line:
                        captured_logs.append(line)
                        if self.queue:
                            try:
                                self.queue.put(line)
                            except Exception:
                                pass  # queue may be closed; drop the line
        
        def flush(self):
            if self.buffer:
                captured_logs.append(self.buffer)
                if self.queue:
                    try:
                        self.queue.put(self.buffer)
                    except Exception:
                        pass  # queue may be closed; drop the line
                self.buffer = ""
    
    try:
        # Redirect BOTH stdout and stderr to capture ALL output
        log_capture = LogCapture(log_queue)
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = log_capture
        sys.stderr = log_capture
        
        # Set environment variables from parent process
        for key, value in env_vars.items():
            os.environ[key] = str(value)
        
        # Import here to avoid circular imports
        from TransateKRtoEN import GlossaryManager
        
        # Create glossary manager instance
        glossary_manager = GlossaryManager()
        
        # Generate glossary
        print(f"πŸ“‘ Starting glossary generation in subprocess...")
        result = glossary_manager.save_glossary(output_dir, chapters_data, instructions)
        
        print(f"πŸ“‘ Glossary generation completed")
        
        # Flush any remaining output
        log_capture.flush()
        
        # Restore stdout and stderr
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        
        return {
            'success': True,
            'result': result,
            'pid': os.getpid(),
            'logs': captured_logs
        }
        
    except Exception as e:
        import traceback
        
        # Restore stdout and stderr if needed
        if 'old_stdout' in locals():
            sys.stdout = old_stdout
        if 'old_stderr' in locals():
            sys.stderr = old_stderr
        
        error_msg = f"Glossary generation error: {str(e)}"
        captured_logs.append(f"πŸ“‘ ❌ {error_msg}")
        
        return {
            'success': False,
            'error': error_msg,
            'traceback': traceback.format_exc(),
            'pid': os.getpid(),
            'logs': captured_logs
        }
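
# --- Illustrative helper (a sketch, not part of the original API) ---
# The log_queue parameter above is expected to be a process-safe queue
# (e.g. multiprocessing.Manager().Queue()) that receives captured log lines
# from the worker. A parent process could drain it periodically like this;
# the name drain_log_queue is hypothetical.
def drain_log_queue(log_queue, handler=print):
    """Forward any pending worker log lines without blocking."""
    import queue
    while True:
        try:
            handler(log_queue.get_nowait())
        except queue.Empty:
            break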

def generate_glossary_async(output_dir, chapters, instructions, extraction_workers=None):
    """

    Generate glossary asynchronously using ProcessPoolExecutor.

    

    This function completely bypasses the GIL by running in a separate process,

    ensuring the GUI remains fully responsive.

    """
    import multiprocessing
    
    # Ensure freeze support for Windows frozen executables
    try:
        multiprocessing.freeze_support()
    except Exception:
        pass
    
    # Determine worker count
    if extraction_workers is None:
        extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))

    if extraction_workers == 1:
        # Auto-detect a sensible worker count (capped at 4)
        extraction_workers = min(multiprocessing.cpu_count() or 4, 4)
        print(f"πŸ“‘ Auto-detected {extraction_workers} CPU cores for glossary generation")

    # Propagate the resolved count so it is forwarded to the subprocess below
    os.environ["EXTRACTION_WORKERS"] = str(extraction_workers)
    
    # Collect relevant environment variables
    env_vars = {}
    important_vars = [
        'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES',
        'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS',
        'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'AUTO_GLOSSARY_PROMPT',
        'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED',
        'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION',
        'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS',
        'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY'
    ]
    
    for var in important_vars:
        if var in os.environ:
            env_vars[var] = os.environ[var]
    
    # Use a ProcessPoolExecutor for true parallelism.
    # NOTE: deliberately not a `with` block - exiting the context manager
    # calls shutdown(wait=True), which would block until the task finishes
    # and defeat the purpose of running asynchronously.
    executor = concurrent.futures.ProcessPoolExecutor(max_workers=1)
    future = executor.submit(
        generate_glossary_in_process,
        output_dir,
        chapters,
        instructions,
        env_vars
    )

    # Reap the worker process once the task completes, then hand the
    # future back to the caller to monitor.
    future.add_done_callback(lambda f: executor.shutdown(wait=False))
    return future
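
# --- Sketch: streaming logs live (hypothetical wiring, not original code) ---
# generate_glossary_async does not pass a log_queue, so captured log lines
# only arrive with the final result dict. To stream them as they are
# produced, a caller could wire in a manager queue; the function name below
# is hypothetical.
def generate_glossary_async_with_logs(output_dir, chapters, instructions):
    import multiprocessing
    manager = multiprocessing.Manager()
    log_queue = manager.Queue()  # the proxy keeps the manager process alive
    executor = concurrent.futures.ProcessPoolExecutor(max_workers=1)
    future = executor.submit(
        generate_glossary_in_process,
        output_dir, chapters, instructions, dict(os.environ), log_queue
    )
    future.add_done_callback(lambda f: executor.shutdown(wait=False))
    return future, log_queue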

def check_glossary_completion(future, timeout=0.01):
    """

    Check if glossary generation is complete without blocking.

    

    Args:

        future: Future object from generate_glossary_async

        timeout: Timeout in seconds for checking

    

    Returns:

        Tuple of (is_done, result_or_none)

    """
    try:
        if future.done():
            result = future.result(timeout=timeout)
            return True, result
        else:
            # Not done yet
            return False, None
    except concurrent.futures.TimeoutError:
        return False, None
    except Exception as e:
        # Error occurred
        return True, {'success': False, 'error': str(e)}
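
# --- Usage sketch (illustrative): how a caller might submit a job and poll
# for completion without blocking. The output directory and chapter payload
# below are placeholders, not real project data.
if __name__ == "__main__":
    sample_chapters = [{"num": 1, "title": "Chapter 1", "body": "..."}]
    fut = generate_glossary_async("output", sample_chapters, instructions="")
    while True:
        done, result = check_glossary_completion(fut)
        if done:
            print(result)
            break
        time.sleep(0.1)  # a GUI would use a timer tick instead of sleeping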