Upload 41 files
- .gitattributes +1 -0
- Halgakos.ico +3 -0
- TransateKRtoEN.py +0 -0
- ai_hunter_enhanced.py +1385 -0
- api_key_encryption.py +244 -0
- async_api_processor.py +0 -0
- bubble_detector.py +1881 -0
- chapter_extraction_manager.py +403 -0
- chapter_extraction_worker.py +158 -0
- chapter_splitter.py +195 -0
- check_epub_directory.py +152 -0
- direct_imports.py +38 -0
- enhanced_text_extractor.py +597 -0
- epub_converter.py +0 -0
- extract_glossary_from_epub.py +2081 -0
- extract_glossary_from_txt.py +59 -0
- glossarion_web.py +0 -0
- glossary_process_worker.py +198 -0
- history_manager.py +136 -0
- image_translator.py +0 -0
- individual_endpoint_dialog.py +229 -0
- launch_Glossarion.bat +11 -0
- launch_Glossarion.vbs +3 -0
- launch_web.bat +37 -0
- launch_web_advanced.bat +107 -0
- local_inpainter.py +0 -0
- manga_integration.py +0 -0
- manga_settings_dialog.py +0 -0
- manga_translator.py +0 -0
- memory_usage_reporter.py +225 -0
- metadata_batch_translator.py +0 -0
- model_options.py +128 -0
- multi_api_key_manager.py +0 -0
- ocr_manager.py +1879 -0
- scan_html_folder.py +0 -0
- splash_utils.py +347 -0
- tqdm_safety.py +96 -0
- translator_gui.py +0 -0
- txt_processor.py +304 -0
- unified_api_client.py +0 -0
- update_manager.py +826 -0
- wait_and_open.ps1 +31 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Halgakos.ico filter=lfs diff=lfs merge=lfs -text
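The added line routes Halgakos.ico through Git LFS, so the repository stores a small pointer while the icon binary lives in LFS storage. Entries like this are normally written by git lfs track rather than edited by hand; as a minimal illustration (a hypothetical pattern rule, not part of this commit), tracking every icon at once would add a line of the form:

*.ico filter=lfs diff=lfs merge=lfs -text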
Halgakos.ico
ADDED
Git LFS Details
TransateKRtoEN.py
ADDED
The diff for this file is too large to render.
See raw diff
ai_hunter_enhanced.py
ADDED
@@ -0,0 +1,1385 @@
# ai_hunter_enhanced.py
# Combined AI Hunter configuration GUI and detection logic

import tkinter as tk
from tkinter import ttk
import ttkbootstrap as tb
import json
import os
import re
import unicodedata
from difflib import SequenceMatcher
from collections import Counter

class AIHunterConfigGUI:
    """GUI for configuring AI Hunter detection parameters"""
    def __init__(self, parent, config_dict, callback=None):
        """
        Initialize with reference to main config dictionary

        Args:
            parent: Parent window
            config_dict: Reference to main translator config dictionary
            callback: Function to call after saving
        """
        self.parent = parent
        self.config = config_dict  # Reference to main config
        self.callback = callback
        self.window = None

        # Default AI Hunter settings structure
        self.default_ai_hunter = {
            'enabled': True,
            'ai_hunter_max_workers': 1,
            'retry_attempts': 6,
            'disable_temperature_change': False,
            'sample_size': 3000,
            'thresholds': {
                'exact': 90,
                'text': 35,
                'semantic': 85,
                'structural': 85,
                'character': 90,
                'pattern': 80
            },
            'weights': {
                'exact': 1.5,
                'text': 1.2,
                'semantic': 1.0,
                'structural': 1.0,
                'character': 0.8,
                'pattern': 0.8
            },
            'detection_mode': 'weighted_average',
            'multi_method_requirements': {
                'methods_required': 3,
                'min_methods': ['semantic', 'structural']
            },
            'preprocessing': {
                'remove_html_spacing': True,
                'normalize_unicode': True,
                'ignore_case': True,
                'remove_extra_whitespace': True
            },
            'edge_filters': {
                'min_text_length': 500,
                'max_length_ratio': 1.3,
                'min_length_ratio': 0.7
            },
            'language_detection': {
                'enabled': False,
                'target_language': 'english',
                'threshold_characters': 500,
                'languages': {
                    'english': ['en'],
                    'japanese': ['ja', 'jp'],
                    'korean': ['ko', 'kr'],
                    'chinese': ['zh', 'zh-cn', 'zh-tw'],
                    'spanish': ['es'],
                    'french': ['fr'],
                    'german': ['de'],
                    'russian': ['ru'],
                    'arabic': ['ar'],
                    'hindi': ['hi'],
                    'portuguese': ['pt'],
                    'italian': ['it'],
                    'dutch': ['nl'],
                    'thai': ['th'],
                    'vietnamese': ['vi'],
                    'turkish': ['tr'],
                    'polish': ['pl'],
                    'swedish': ['sv'],
                    'danish': ['da'],
                    'norwegian': ['no'],
                    'finnish': ['fi']
                }
            }
        }

        # Initialize AI Hunter config in main config if not present
        if 'ai_hunter_config' not in self.config:
            self.config['ai_hunter_config'] = self.default_ai_hunter.copy()
        else:
            # Merge with defaults to ensure all keys exist
            self.config['ai_hunter_config'] = self._merge_configs(
                self.default_ai_hunter,
                self.config['ai_hunter_config']
            )

    def _merge_configs(self, default, existing):
        """Recursively merge existing config with defaults"""
        result = default.copy()
        for key, value in existing.items():
            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
                result[key] = self._merge_configs(result[key], value)
            else:
                result[key] = value
        return result

    def get_ai_config(self):
        """Get AI Hunter configuration from main config"""
        return self.config.get('ai_hunter_config', self.default_ai_hunter)

    def show_ai_hunter_config(self):
        """Display the AI Hunter configuration window with scrollbar using WindowManager"""
        if self.window and self.window.winfo_exists():
            self.window.lift()
            return

        # Import WindowManager if not already available
        if not hasattr(self, 'wm'):
            from translator_gui import WindowManager
            import sys
            import os
            base_dir = getattr(sys, '_MEIPASS', os.path.dirname(os.path.abspath(__file__)))
            self.wm = WindowManager(base_dir)

        # Create scrollable dialog using WindowManager
        dialog, scrollable_frame, canvas = self.wm.setup_scrollable(
            self.parent,
            "AI Hunter Configuration",
            width=820,
            height=None,  # Will use default height
            max_width_ratio=0.9,
            max_height_ratio=0.85
        )

        self.window = dialog

        # Create notebook inside scrollable frame
        notebook = ttk.Notebook(scrollable_frame)
        notebook.pack(fill='both', expand=True, padx=10, pady=10)

        # Tab 1: Detection Thresholds
        self.create_thresholds_tab(notebook)

        # Tab 2: Detection Mode
        self.create_mode_tab(notebook)

        # Tab 3: Preprocessing
        self.create_preprocessing_tab(notebook)

        # Tab 4: Advanced Settings
        self.create_advanced_tab(notebook)

        # Buttons at the bottom (inside scrollable frame)
        button_frame = tk.Frame(scrollable_frame)
        button_frame.pack(fill='x', padx=10, pady=(10, 20))

        tb.Button(button_frame, text="Save", command=self.apply_ai_hunter_settings,
                  bootstyle="success").pack(side='right', padx=5)
        tb.Button(button_frame, text="Cancel", command=self.window.destroy,
                  bootstyle="secondary").pack(side='right')
        tb.Button(button_frame, text="Reset to Defaults", command=self.reset_defaults,
                  bootstyle="warning").pack(side='left')

        # Auto-resize and show
        self.wm.auto_resize_dialog(dialog, canvas, max_width_ratio=0.9, max_height_ratio=1.1)

        # Handle window close
        dialog.protocol("WM_DELETE_WINDOW", lambda: [dialog._cleanup_scrolling(), dialog.destroy()])

    def create_thresholds_tab(self, notebook):
        """Create the thresholds configuration tab"""
        frame = ttk.Frame(notebook)
        notebook.add(frame, text="Detection Thresholds")

        # Title
        tk.Label(frame, text="Detection Method Thresholds",
                 font=('TkDefaultFont', 12, 'bold')).pack(pady=10)

        tk.Label(frame, text="Higher values = fewer false positives (more strict)\n"
                 "Lower values = more false positives (more sensitive)",
                 font=('TkDefaultFont', 10), fg='gray').pack(pady=(0, 20))

        # Threshold controls
        self.threshold_vars = {}
        threshold_frame = tk.Frame(frame)
        threshold_frame.pack(fill='both', expand=True, padx=20)

        descriptions = {
            'exact': 'Exact Text Match - Direct character-by-character comparison',
            'text': 'Smart Text Similarity - Intelligent text comparison with sampling',
            'semantic': 'Semantic Analysis - Character names, dialogue patterns, numbers',
            'structural': 'Structural Patterns - Paragraph structure, dialogue distribution',
            'character': 'Character Overlap - Common character names between chapters',
            'pattern': 'Pattern Analysis - Narrative flow and structure patterns'
        }

        ai_config = self.get_ai_config()

        for method, desc in descriptions.items():
            method_frame = tk.Frame(threshold_frame)
            method_frame.pack(fill='x', pady=10)

            # Method name and description
            label_frame = tk.Frame(method_frame)
            label_frame.pack(fill='x')

            tk.Label(label_frame, text=f"{method.title()}:",
                     font=('TkDefaultFont', 10, 'bold')).pack(side='left')
            tk.Label(label_frame, text=f" {desc}",
                     font=('TkDefaultFont', 9), fg='gray').pack(side='left', padx=(10, 0))

            # Slider and value
            slider_frame = tk.Frame(method_frame)
            slider_frame.pack(fill='x', pady=(5, 0))

            self.threshold_vars[method] = tk.IntVar(value=ai_config['thresholds'][method])

            slider = tb.Scale(slider_frame, from_=10, to=100,
                              variable=self.threshold_vars[method],
                              bootstyle="info", length=400)
            slider.pack(side='left', padx=(20, 10))

            value_label = tk.Label(slider_frame, text="", width=4)
            value_label.pack(side='left')

            # Update label when slider changes
            def update_label(val, label=value_label, var=self.threshold_vars[method]):
                label.config(text=f"{int(var.get())}%")

            self.threshold_vars[method].trace('w', lambda *args, f=update_label: f(None))
            update_label(None)

        # Weight configuration
        tk.Label(frame, text="Method Weights (for weighted average mode)",
                 font=('TkDefaultFont', 11, 'bold')).pack(pady=(30, 10))

        self.weight_vars = {}
        weight_frame = tk.Frame(frame)
        weight_frame.pack(fill='x', padx=20)

        for method in descriptions.keys():
            w_frame = tk.Frame(weight_frame)
            w_frame.pack(fill='x', pady=5)

            tk.Label(w_frame, text=f"{method.title()} weight:", width=20,
                     anchor='w').pack(side='left')

            self.weight_vars[method] = tk.DoubleVar(value=ai_config['weights'][method])

            tb.Spinbox(w_frame, from_=0.1, to=2.0, increment=0.1,
                       textvariable=self.weight_vars[method],
                       width=10).pack(side='left', padx=10)

    def create_mode_tab(self, notebook):
        """Create the detection mode configuration tab"""
        frame = ttk.Frame(notebook)
        notebook.add(frame, text="Detection Mode")

        tk.Label(frame, text="Detection Mode Configuration",
                 font=('TkDefaultFont', 12, 'bold')).pack(pady=10)

        # Detection mode selection
        mode_frame = tk.LabelFrame(frame, text="Detection Mode", padx=20, pady=20)
        mode_frame.pack(fill='x', padx=20, pady=10)

        ai_config = self.get_ai_config()
        self.mode_var = tk.StringVar(value=ai_config['detection_mode'])

        modes = [
            ('single_method', 'Single Method',
             'Flag as duplicate if ANY method exceeds its threshold\n(Most sensitive, most false positives)'),
            ('multi_method', 'Multi-Method Agreement',
             'Require multiple methods to agree before flagging\n(Balanced approach)'),
            ('weighted_average', 'Weighted Average',
             'Calculate weighted average of all methods\n(Most nuanced, least false positives)')
        ]

        for value, text, desc in modes:
            rb_frame = tk.Frame(mode_frame)
            rb_frame.pack(fill='x', pady=10)

            tb.Radiobutton(rb_frame, text=text, variable=self.mode_var,
                           value=value, bootstyle="primary").pack(anchor='w')
            tk.Label(rb_frame, text=desc, font=('TkDefaultFont', 9),
                     fg='gray').pack(anchor='w', padx=(25, 0))

        # Multi-method configuration
        multi_frame = tk.LabelFrame(frame, text="Multi-Method Settings", padx=20, pady=20)
        multi_frame.pack(fill='x', padx=20, pady=10)

        tk.Label(multi_frame, text="Number of methods required to agree:",
                 font=('TkDefaultFont', 10)).pack(anchor='w')

        self.methods_required_var = tk.IntVar(
            value=ai_config['multi_method_requirements']['methods_required'])

        tb.Spinbox(multi_frame, from_=1, to=6, textvariable=self.methods_required_var,
                   width=10).pack(anchor='w', pady=5)

        tk.Label(multi_frame, text="Required methods (at least one must be included):",
                 font=('TkDefaultFont', 10)).pack(anchor='w', pady=(10, 5))

        self.required_method_vars = {}
        for method in ['exact', 'text', 'semantic', 'structural', 'character', 'pattern']:
            var = tk.BooleanVar(
                value=method in ai_config['multi_method_requirements']['min_methods'])
            self.required_method_vars[method] = var

            tb.Checkbutton(multi_frame, text=method.title(), variable=var,
                           bootstyle="round-toggle").pack(anchor='w', padx=20)

    def create_preprocessing_tab(self, notebook):
        """Create the preprocessing configuration tab"""
        frame = ttk.Frame(notebook)
        notebook.add(frame, text="Preprocessing")

        tk.Label(frame, text="Text Preprocessing Options",
                 font=('TkDefaultFont', 12, 'bold')).pack(pady=10)

        tk.Label(frame, text="Configure how text is processed before comparison",
                 font=('TkDefaultFont', 10), fg='gray').pack(pady=(0, 20))

        # Preprocessing options
        prep_frame = tk.Frame(frame)
        prep_frame.pack(fill='both', expand=True, padx=20)

        self.prep_vars = {}
        ai_config = self.get_ai_config()

        options = [
            ('remove_html_spacing', 'Remove HTML with spacing',
             'Replace HTML tags with spaces instead of removing completely'),
            ('normalize_unicode', 'Normalize Unicode',
             'Normalize unicode characters (recommended)'),
            ('ignore_case', 'Case-insensitive comparison',
             'Ignore character case when comparing'),
            ('remove_extra_whitespace', 'Remove extra whitespace',
             'Collapse multiple spaces/newlines into single spaces')
        ]

        for key, text, desc in options:
            var = tk.BooleanVar(value=ai_config['preprocessing'][key])
            self.prep_vars[key] = var

            opt_frame = tk.Frame(prep_frame)
            opt_frame.pack(fill='x', pady=10)

            tb.Checkbutton(opt_frame, text=text, variable=var,
                           bootstyle="round-toggle").pack(anchor='w')
            tk.Label(opt_frame, text=desc, font=('TkDefaultFont', 9),
                     fg='gray').pack(anchor='w', padx=(25, 0))

    def create_advanced_tab(self, notebook):
        """Create the advanced settings tab"""
        frame = ttk.Frame(notebook)
        notebook.add(frame, text="Advanced")

        tk.Label(frame, text="Advanced Settings",
                 font=('TkDefaultFont', 12, 'bold')).pack(pady=10)

        # General settings
        general_frame = tk.LabelFrame(frame, text="General", padx=20, pady=20)
        general_frame.pack(fill='x', padx=20, pady=10)

        ai_config = self.get_ai_config()

        # Add separator for better organization
        ttk.Separator(general_frame, orient='horizontal').pack(fill='x', pady=(0, 10))

        # Sample size
        ss_frame = tk.Frame(general_frame)
        ss_frame.pack(fill='x', pady=5)

        tk.Label(ss_frame, text="Sample size:", width=20, anchor='w').pack(side='left')
        self.sample_size_var = tk.IntVar(value=ai_config['sample_size'])
        tb.Spinbox(ss_frame, from_=1000, to=10000, increment=500,
                   textvariable=self.sample_size_var, width=10).pack(side='left', padx=10)
        tk.Label(ss_frame, text="characters",
                 font=('TkDefaultFont', 9)).pack(side='left')

        # AI Hunter Behavior Settings
        tk.Label(general_frame, text="AI Hunter Behavior",
                 font=('TkDefaultFont', 10, 'bold')).pack(anchor='w', pady=(0, 5))

        # Retry Attempts
        retry_frame = tk.Frame(general_frame)
        retry_frame.pack(fill='x', pady=5)

        tk.Label(retry_frame, text="Retry attempts:", width=20, anchor='w').pack(side='left')
        self.retry_attempts_var = tk.IntVar(value=ai_config.get('retry_attempts', 3))
        tb.Spinbox(retry_frame, from_=1, to=10, textvariable=self.retry_attempts_var, width=10).pack(side='left', padx=10)
        tk.Label(retry_frame, text="attempts", font=('TkDefaultFont', 9)).pack(side='left')

        # Temperature Change Toggle
        temp_frame = tk.Frame(general_frame)
        temp_frame.pack(fill='x', pady=10)

        self.disable_temp_change_var = tk.BooleanVar(value=ai_config.get('disable_temperature_change', False))
        tb.Checkbutton(temp_frame, text="Disable temperature change behavior",
                       variable=self.disable_temp_change_var, bootstyle="round-toggle").pack(anchor='w')
        tk.Label(temp_frame, text="Prevents AI Hunter from modifying temperature settings during retries",
                 font=('TkDefaultFont', 9), fg='gray').pack(anchor='w', padx=(25, 0))

        # Edge filters
        edge_frame = tk.LabelFrame(frame, text="Edge Case Filters", padx=20, pady=20)
        edge_frame.pack(fill='x', padx=20, pady=10)

        # Min text length
        min_frame = tk.Frame(edge_frame)
        min_frame.pack(fill='x', pady=5)

        tk.Label(min_frame, text="Minimum text length:", width=20, anchor='w').pack(side='left')
        self.min_length_var = tk.IntVar(value=ai_config['edge_filters']['min_text_length'])
        tb.Spinbox(min_frame, from_=100, to=2000, increment=100,
                   textvariable=self.min_length_var, width=10).pack(side='left', padx=10)
        tk.Label(min_frame, text="characters",
                 font=('TkDefaultFont', 9)).pack(side='left')

        # Length ratios
        ratio_frame = tk.Frame(edge_frame)
        ratio_frame.pack(fill='x', pady=10)

        tk.Label(ratio_frame, text="Length ratio limits:").pack(anchor='w')

        r_frame = tk.Frame(ratio_frame)
        r_frame.pack(fill='x', pady=5)

        tk.Label(r_frame, text="Min ratio:", width=10, anchor='w').pack(side='left', padx=(20, 5))
        self.min_ratio_var = tk.DoubleVar(value=ai_config['edge_filters']['min_length_ratio'])
        tb.Spinbox(r_frame, from_=0.5, to=0.9, increment=0.1,
                   textvariable=self.min_ratio_var, width=8).pack(side='left')

        tk.Label(r_frame, text="Max ratio:", width=10, anchor='w').pack(side='left', padx=(20, 5))
        self.max_ratio_var = tk.DoubleVar(value=ai_config['edge_filters']['max_length_ratio'])
        tb.Spinbox(r_frame, from_=1.1, to=2.0, increment=0.1,
                   textvariable=self.max_ratio_var, width=8).pack(side='left')

        tk.Label(edge_frame, text="Chapters with vastly different lengths won't be compared",
                 font=('TkDefaultFont', 9), fg='gray').pack(anchor='w', padx=20)

        # Language Detection
        lang_frame = tk.LabelFrame(frame, text="Non-Target Language Detection", padx=20, pady=20)
        lang_frame.pack(fill='x', padx=20, pady=10)

        # Enable toggle
        enable_frame = tk.Frame(lang_frame)
        enable_frame.pack(fill='x', pady=5)

        self.lang_enabled_var = tk.BooleanVar(value=ai_config['language_detection']['enabled'])
        tb.Checkbutton(enable_frame, text="Enable non-target language detection",
                       variable=self.lang_enabled_var, bootstyle="round-toggle").pack(anchor='w')
        tk.Label(enable_frame, text="Trigger retranslation when too much non-target language is detected",
                 font=('TkDefaultFont', 9), fg='gray').pack(anchor='w', padx=(25, 0))

        # Target language selection
        target_frame = tk.Frame(lang_frame)
        target_frame.pack(fill='x', pady=10)

        tk.Label(target_frame, text="Target language:", width=20, anchor='w').pack(side='left')
        self.target_lang_var = tk.StringVar(value=ai_config['language_detection']['target_language'])

        lang_options = list(ai_config['language_detection']['languages'].keys())
        target_combo = ttk.Combobox(target_frame, textvariable=self.target_lang_var,
                                    values=lang_options, state='readonly', width=15)
        target_combo.pack(side='left', padx=10)

        tk.Label(target_frame, text="Language that should be in the translation",
                 font=('TkDefaultFont', 9), fg='gray').pack(side='left', padx=(10, 0))

        # Threshold setting
        thresh_frame = tk.Frame(lang_frame)
        thresh_frame.pack(fill='x', pady=5)

        tk.Label(thresh_frame, text="Character threshold:", width=20, anchor='w').pack(side='left')
        self.lang_threshold_var = tk.IntVar(value=ai_config['language_detection']['threshold_characters'])
        tb.Spinbox(thresh_frame, from_=100, to=2000, increment=50,
                   textvariable=self.lang_threshold_var, width=10).pack(side='left', padx=10)
        tk.Label(thresh_frame, text="non-target language characters to trigger retranslation",
                 font=('TkDefaultFont', 9), fg='gray').pack(side='left')

    def apply_ai_hunter_settings(self):
        """Apply AI Hunter settings to the main config"""
        ai_config = self.get_ai_config()

        # Update from GUI variables
        for method, var in self.threshold_vars.items():
            ai_config['thresholds'][method] = var.get()

        for method, var in self.weight_vars.items():
            ai_config['weights'][method] = var.get()

        ai_config['detection_mode'] = self.mode_var.get()
        ai_config['multi_method_requirements']['methods_required'] = self.methods_required_var.get()

        min_methods = [method for method, var in self.required_method_vars.items() if var.get()]
        ai_config['multi_method_requirements']['min_methods'] = min_methods

        for key, var in self.prep_vars.items():
            ai_config['preprocessing'][key] = var.get()

        ai_config['sample_size'] = self.sample_size_var.get()

        ai_config['edge_filters']['min_text_length'] = self.min_length_var.get()
        ai_config['edge_filters']['min_length_ratio'] = self.min_ratio_var.get()
        ai_config['edge_filters']['max_length_ratio'] = self.max_ratio_var.get()

        # Language detection settings
        ai_config['language_detection']['enabled'] = self.lang_enabled_var.get()
        ai_config['language_detection']['target_language'] = self.target_lang_var.get()
        ai_config['language_detection']['threshold_characters'] = self.lang_threshold_var.get()

        # Update retry attempts and temperature change settings
        ai_config['retry_attempts'] = self.retry_attempts_var.get()
        ai_config['disable_temperature_change'] = self.disable_temp_change_var.get()

        # Update main config
        self.config['ai_hunter_config'] = ai_config

        # Call callback if provided (this should trigger main save_configuration)
        if self.callback:
            self.callback()

        self.window.destroy()

    def reset_defaults(self):
        """Reset all values to defaults"""
        import tkinter.messagebox as messagebox
        result = messagebox.askyesno("Reset to Defaults",
                                     "Are you sure you want to reset all settings to defaults?")
        if result:
            self.config['ai_hunter_config'] = self.default_ai_hunter.copy()
            self.window.destroy()
            self.show_ai_hunter_config()  # Reopen with default values


class ImprovedAIHunterDetection:
    """Improved AI Hunter detection methods for TranslateKRtoEN"""

    def __init__(self, main_config):
        """
        Initialize with reference to main config

        Args:
            main_config: Reference to main translator config dictionary
        """
        self.main_config = main_config

        # Default AI Hunter settings
        self.default_ai_hunter = {
            'enabled': True,
            'lookback_chapters': 5,
            'retry_attempts': 3,
            'disable_temperature_change': False,
            'sample_size': 3000,
            'thresholds': {
                'exact': 90,
                'text': 85,
                'semantic': 85,
                'structural': 85,
                'character': 80,
                'pattern': 80
            },
            'weights': {
                'exact': 1.5,
                'text': 1.2,
                'semantic': 1.0,
                'structural': 1.0,
                'character': 0.8,
                'pattern': 0.8
            },
            'detection_mode': 'multi_method',
            'multi_method_requirements': {
                'methods_required': 2,
                'min_methods': ['semantic', 'structural']
            },
            'preprocessing': {
                'remove_html_spacing': True,
                'normalize_unicode': True,
                'ignore_case': True,
                'remove_extra_whitespace': True
            },
            'edge_filters': {
                'min_text_length': 500,
                'max_length_ratio': 1.3,
                'min_length_ratio': 0.7
            },
            'language_detection': {
                'enabled': False,
                'target_language': 'english',
                'threshold_characters': 500,
                'languages': {
                    'english': ['en'],
                    'japanese': ['ja', 'jp'],
                    'korean': ['ko', 'kr'],
                    'chinese': ['zh', 'zh-cn', 'zh-tw'],
                    'spanish': ['es'],
                    'french': ['fr'],
                    'german': ['de'],
                    'russian': ['ru'],
                    'arabic': ['ar'],
                    'hindi': ['hi'],
                    'portuguese': ['pt'],
                    'italian': ['it'],
                    'dutch': ['nl'],
                    'thai': ['th'],
                    'vietnamese': ['vi'],
                    'turkish': ['tr'],
                    'polish': ['pl'],
                    'swedish': ['sv'],
                    'danish': ['da'],
                    'norwegian': ['no'],
                    'finnish': ['fi']
                }
            }
        }

    def get_ai_config(self):
        """Get AI Hunter configuration from main config"""
        return self.main_config.get('ai_hunter_config', self.default_ai_hunter)

    def detect_duplicate_ai_hunter_enhanced(self, result, idx, prog, out, current_chapter_num=None):
        """Enhanced AI Hunter duplicate detection with configurable parameters"""
        try:
            print(f"\n ========== AI HUNTER DEBUG START ==========")
            print(f" 📍 Current chapter index: {idx}")
            if current_chapter_num:
                print(f" 📖 Current chapter number: {current_chapter_num}")

            # Get configuration
            config = self.get_ai_config()

            if not config.get('enabled', True):
                print(f" ⚠️ AI Hunter is disabled")
                print(f" ========== AI HUNTER DEBUG END ==========\n")
                return False, 0

            # Preprocess text
            result_clean = self._preprocess_text(result, config['preprocessing'])
            print(f" 📄 Text length after preprocessing: {len(result_clean)} chars")

            # Check for non-target language detection
            if config['language_detection']['enabled']:
                non_target_detected, non_target_count = self._check_non_target_language(
                    result_clean, config['language_detection']
                )
                if non_target_detected:
                    print(f"\n 🌐 NON-TARGET LANGUAGE DETECTED!")
                    print(f" Non-target characters found: {non_target_count}")
                    print(f" Threshold: {config['language_detection']['threshold_characters']}")
                    print(f" Target language: {config['language_detection']['target_language']}")
                    print(f" ========== AI HUNTER DEBUG END ==========\n")
                    return True, 100  # High confidence for language detection

            # Check edge cases
            if len(result_clean) < config['edge_filters']['min_text_length']:
                print(f" ⚠️ Text too short ({len(result_clean)} < {config['edge_filters']['min_text_length']})")
                print(f" ========== AI HUNTER DEBUG END ==========\n")
                return False, 0

            # Extract features
            print(f" 🔬 Extracting text features...")
            result_features = self._extract_text_features(result_clean)

            # Get lookback from main config, then fall back to env var if not found
            lookback = self.main_config.get('duplicate_lookback_chapters',
                                            int(os.getenv('DUPLICATE_LOOKBACK_CHAPTERS', '5')))

            # Log configuration
            print(f"\n 🔧 Configuration:")
            print(f" Detection mode: {config['detection_mode']}")
            print(f" Lookback chapters: {lookback}")
            print(f" Sample size: {config['sample_size']}")

            # FIX: Get all completed chapters sorted by actual chapter number
            completed_chapters = []
            for chapter_key, chapter_info in prog["chapters"].items():
                if chapter_info.get("status") == "completed" and chapter_info.get("output_file"):
                    # Handle both numeric and hash-based chapter keys
                    try:
                        # Get actual_num from progress (this is the real chapter number)
                        chapter_num = chapter_info.get("actual_num")
                        if chapter_num is None:
                            # Try chapter_num as fallback
                            chapter_num = chapter_info.get("chapter_num")
                        if chapter_num is None:
                            # Skip chapters without valid numbers
                            print(f" ⚠️ No chapter number found for key {chapter_key}, skipping")
                            continue

                        completed_chapters.append({
                            'key': chapter_key,
                            'num': chapter_num,
                            'file': chapter_info.get("output_file"),
                            'ai_features': chapter_info.get("ai_features")
                        })
                    except Exception as e:
                        print(f" ⚠️ Error processing chapter {chapter_key}: {e}")
                        continue

            # Sort by actual chapter number
            completed_chapters.sort(key=lambda x: x['num'])

            # If no current chapter number provided, try to infer it
            if current_chapter_num is None:
                # The current chapter should be passed in, but if not, we need to find it
                # Since we're using content hash keys, we can't use idx directly
                print(f" ⚠️ No current chapter number provided")
                print(f" 📊 Current index: {idx}")

                # The current chapter number should have been passed from the wrapper
                # If it wasn't, we have a problem
                print(f" ❌ ERROR: Current chapter number not provided to AI Hunter!")
                print(f" ❌ This indicates the wrapper function is not passing the chapter number correctly")

                # Emergency: just use a high number so we don't compare against anything
                current_chapter_num = 999999
                print(f" ⚠️ Using index-based chapter number: {current_chapter_num}")

            print(f"\n 📚 Found {len(completed_chapters)} completed chapters in progress")
            if completed_chapters:
                chapter_nums = [ch['num'] for ch in completed_chapters]
                print(f" 📊 Chapter numbers in progress: {sorted(chapter_nums)[:10]}{'...' if len(chapter_nums) > 10 else ''}")
            print(f" 🎯 Current chapter number: {current_chapter_num}")
            print(f" 🔍 Will check against last {lookback} chapters before chapter {current_chapter_num}")

            # Check previous chapters
            all_similarities = []
            highest_similarity = 0.0
            detected_method = None
            detected_chapter = None

            # FIX: Look at chapters by actual number, not index
            chapters_checked = 0
            for completed_chapter in reversed(completed_chapters):
                # Only check chapters that come before the current one
                if completed_chapter['num'] >= current_chapter_num:
                    continue

                # Only check up to lookback number of chapters
                if chapters_checked >= lookback:
                    break

                chapters_checked += 1

                print(f"\n 📝 Checking against chapter {completed_chapter['num']}...")

                # Get previous chapter features
                prev_features = completed_chapter.get('ai_features')
                prev_clean = None

                # Try to get cached features first
                if prev_features:
                    print(f" ✅ Using cached features")
                else:
                    # Read and extract features
                    prev_path = os.path.join(out, completed_chapter['file'])

                    if os.path.exists(prev_path):
                        try:
                            with open(prev_path, 'r', encoding='utf-8') as f:
                                prev_content = f.read()
                            prev_clean = self._preprocess_text(prev_content, config['preprocessing'])

                            # Check length ratio
                            len_ratio = len(result_clean) / max(1, len(prev_clean))
                            if (len_ratio < config['edge_filters']['min_length_ratio'] or
                                    len_ratio > config['edge_filters']['max_length_ratio']):
                                print(f" ⚠️ Length ratio out of bounds: {len_ratio:.2f}")
                                continue

                            prev_features = self._extract_text_features(prev_clean)
                            print(f" 📄 Extracted features from file")
                        except Exception as e:
                            print(f" ❌ Failed to read file: {e}")
                            continue
                    else:
                        print(f" ❌ File not found: {prev_path}")
                        continue

                # Calculate similarities
                print(f" 🔍 Calculating similarities...")
                similarities = self._calculate_all_similarities(
                    result_clean, result_features,
                    prev_clean, prev_features, config
                )

                # Store for reporting
                all_similarities.append({
                    'chapter': completed_chapter['num'],
                    'similarities': similarities
                })

                # Log similarity scores
                for method, score in similarities.items():
                    if score > 0:
                        print(f" {method}: {int(score*100)}%")

                # Check if duplicate based on configured mode
                is_duplicate, confidence, methods_triggered = self._evaluate_duplicate(
                    similarities, config
                )

                if is_duplicate:
                    print(f"\n 🚨 DUPLICATE DETECTED!")
                    print(f" Detection mode: {config['detection_mode']}")
                    print(f" Confidence: {int(confidence*100)}%")
                    print(f" Triggered methods: {', '.join(methods_triggered)}")
                    print(f" Match with: Chapter {completed_chapter['num']}")
                    print(f" ========== AI HUNTER DEBUG END ==========\n")
                    return True, int(confidence * 100)

                # Track highest for reporting
                for method, sim in similarities.items():
                    if sim > highest_similarity:
                        highest_similarity = sim
                        detected_method = method
                        detected_chapter = completed_chapter['num']

            # No duplicate found
            print(f"\n ✅ No duplicate found")
            if detected_method:
                print(f" Highest similarity: {int(highest_similarity*100)}% via {detected_method}")
                print(f" Closest match: Chapter {detected_chapter}")

            # Show top 3 closest matches
            if all_similarities:
                print(f"\n 📊 Top 3 closest matches:")
                sorted_chapters = sorted(all_similarities,
                                         key=lambda x: self._get_chapter_score(x['similarities'], config),
                                         reverse=True)[:3]
                for i, chapter_data in enumerate(sorted_chapters, 1):
                    score = self._get_chapter_score(chapter_data['similarities'], config)
                    print(f" {i}. Chapter {chapter_data['chapter']}: {int(score*100)}%")

            print(f" ========== AI HUNTER DEBUG END ==========\n")
            return False, 0

        except Exception as e:
            print(f" ❌ AI Hunter detection failed with error: {e}")
            import traceback
            print(f" {traceback.format_exc()}")
            print(f" ========== AI HUNTER DEBUG END ==========\n")
            return False, 0

    def _preprocess_text(self, text, prep_config):
        """Preprocess text according to configuration"""
        # Remove HTML
        if prep_config.get('remove_html_spacing', True):
            text = re.sub(r'<[^>]+>', ' ', text)
        else:
            text = re.sub(r'<[^>]+>', '', text)

        # Normalize unicode
        if prep_config.get('normalize_unicode', True):
            text = unicodedata.normalize('NFKD', text)

        # Remove extra whitespace
        if prep_config.get('remove_extra_whitespace', True):
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'\n\s*\n', '\n\n', text)

        text = text.strip()

        # Convert to lowercase if case-insensitive
        if prep_config.get('ignore_case', True):
            text = text.lower()

        return text

    def _calculate_all_similarities(self, result_clean, result_features,
                                    prev_clean, prev_features, config):
        """Calculate all similarity metrics"""
        similarities = {}

        # Method 1: Exact content match
        if prev_clean is not None:
            sample_size = min(config['sample_size'], len(result_clean), len(prev_clean))
            exact_sim = self._calculate_exact_similarity(
                result_clean[:sample_size],
                prev_clean[:sample_size]
            )
            similarities['exact'] = exact_sim

            # Method 2: Smart text similarity
            text_sim = self._calculate_smart_similarity(
                result_clean, prev_clean, config['sample_size']
            )
            similarities['text'] = text_sim
        else:
            similarities['exact'] = 0.0
            similarities['text'] = 0.0

        # Method 3: Semantic fingerprint
        semantic_sim = self._calculate_semantic_similarity(
            result_features.get('semantic', {}),
            prev_features.get('semantic', {})
        )
        similarities['semantic'] = semantic_sim

        # Method 4: Structural signature
        structural_sim = self._calculate_structural_similarity(
            result_features.get('structural', {}),
            prev_features.get('structural', {})
        )
        similarities['structural'] = structural_sim

        # Method 5: Character analysis
        char_sim = self._calculate_character_similarity(
            result_features.get('characters', []),
            prev_features.get('characters', [])
        )
        similarities['character'] = char_sim

        # Method 6: Pattern analysis
        pattern_sim = self._calculate_pattern_similarity(
            result_features.get('patterns', {}),
            prev_features.get('patterns', {})
        )
        similarities['pattern'] = pattern_sim

        return similarities

    def _evaluate_duplicate(self, similarities, config):
        """Evaluate if similarities indicate a duplicate based on detection mode"""
        mode = config['detection_mode']
        thresholds = {k: v/100.0 for k, v in config['thresholds'].items()}

        if mode == 'single_method':
            # Any method exceeding threshold
            for method, sim in similarities.items():
                if sim >= thresholds.get(method, 0.85):
                    return True, sim, [method]
            return False, 0, []

        elif mode == 'multi_method':
            # Multiple methods must agree
            triggered_methods = []
            for method, sim in similarities.items():
                if sim >= thresholds.get(method, 0.85):
                    triggered_methods.append(method)

            # Check if enough methods triggered
            required = config.get('multi_method_requirements', {}).get('methods_required', 2)
            min_methods = config.get('multi_method_requirements', {}).get('min_methods', [])

            if len(triggered_methods) >= required:
                # Check if at least one required method is included
                if not min_methods or any(m in triggered_methods for m in min_methods):
                    # Calculate average confidence of triggered methods
                    confidence = sum(similarities[m] for m in triggered_methods) / len(triggered_methods)
                    return True, confidence, triggered_methods

            return False, 0, []

        elif mode == 'weighted_average':
            # Calculate weighted average
            weights = config.get('weights', {})
            total_weight = sum(weights.get(m, 1.0) for m in similarities)
            weighted_sum = sum(similarities[m] * weights.get(m, 1.0) for m in similarities)
            weighted_avg = weighted_sum / total_weight if total_weight > 0 else 0

            # Check if weighted average exceeds average threshold
            avg_threshold = sum(thresholds.values()) / len(thresholds) if thresholds else 0.85

            if weighted_avg >= avg_threshold:
                # Find which methods contributed most
                triggered = [m for m, sim in similarities.items()
                             if sim >= thresholds.get(m, 0.85)]
                return True, weighted_avg, triggered

            return False, 0, []

        return False, 0, []

    def _get_chapter_score(self, similarities, config):
        """Calculate overall score for a chapter comparison"""
        if config['detection_mode'] == 'weighted_average':
            weights = config.get('weights', {})
            total_weight = sum(weights.get(m, 1.0) for m in similarities)
            return sum(similarities.get(m, 0) * weights.get(m, 1.0) for m in similarities) / total_weight if total_weight > 0 else 0
        else:
            return max(similarities.values()) if similarities else 0

    def _extract_text_features(self, text):
        """Extract multiple features from text for AI Hunter analysis"""
        features = {
            'semantic': {},
            'structural': {},
            'characters': [],
            'patterns': {}
        }

        # Semantic fingerprint
        lines = text.split('\n')

        # Character extraction (names that appear 3+ times)
        words = re.findall(r'\b[A-Z][a-z]+\b', text)
        word_freq = Counter(words)
        features['characters'] = [name for name, count in word_freq.items()
                                  if count >= 3 and name not in {
                                      'The', 'A', 'An', 'In', 'On', 'At', 'To',
                                      'From', 'With', 'By', 'For', 'Of', 'As',
                                      'But', 'And', 'Or', 'He', 'She', 'It',
                                      'They', 'We', 'You', 'What', 'When', 'Where',
                                      'Who', 'Why', 'How', 'That', 'This', 'These'
                                  }]

        # Dialogue patterns
        dialogue_patterns = re.findall(r'"([^"]+)"', text)
        features['semantic']['dialogue_count'] = len(dialogue_patterns)
        features['semantic']['dialogue_lengths'] = [len(d) for d in dialogue_patterns[:10]]

        # Speaker patterns
        speaker_patterns = re.findall(r'(\w+)\s+(?:said|asked|replied|shouted|whispered)', text.lower())
        features['semantic']['speakers'] = list(set(speaker_patterns[:20]))

        # Number extraction
        numbers = re.findall(r'\b\d+\b', text)
        features['patterns']['numbers'] = numbers[:20]

        # Structural signature
        para_lengths = []
        dialogue_count = 0
        for para in text.split('\n\n'):
            if para.strip():
                para_lengths.append(len(para))
                if '"' in para:
                    dialogue_count += 1

        features['structural']['para_count'] = len(para_lengths)
        features['structural']['avg_para_length'] = sum(para_lengths) / max(1, len(para_lengths))
        features['structural']['dialogue_ratio'] = dialogue_count / max(1, len(para_lengths))

        # Create structural pattern string
        pattern = []
        for para in text.split('\n\n')[:20]:  # First 20 paragraphs
            if para.strip():
                if '"' in para:
                    pattern.append('D')  # Dialogue
                elif len(para) > 300:
                    pattern.append('L')  # Long
                elif len(para) < 100:
                    pattern.append('S')  # Short
                else:
                    pattern.append('M')  # Medium
        features['structural']['pattern'] = ''.join(pattern)

        # Action density
        action_verbs = len(re.findall(r'\b\w+ed\b', text))
        features['semantic']['action_density'] = action_verbs / max(1, len(text.split()))

        # Text length
        features['semantic']['text_length'] = len(text)

        return features

    def _calculate_exact_similarity(self, text1, text2):
        """Calculate exact text similarity"""
        return SequenceMatcher(None, text1, text2).ratio()

    def _calculate_smart_similarity(self, text1, text2, sample_size):
        """Smart similarity with configurable sample size"""
        if len(text1) > sample_size * 3 and len(text2) > sample_size * 3:
            # Use multiple samples
            samples1 = [
                text1[:sample_size],
                text1[len(text1)//2 - sample_size//2:len(text1)//2 + sample_size//2],
                text1[-sample_size:]
            ]
            samples2 = [
                text2[:sample_size],
                text2[len(text2)//2 - sample_size//2:len(text2)//2 + sample_size//2],
                text2[-sample_size:]
            ]
            similarities = [SequenceMatcher(None, s1, s2).ratio()
                            for s1, s2 in zip(samples1, samples2)]
            return sum(similarities) / len(similarities)
        else:
            # Use full text up to sample size
            return SequenceMatcher(None, text1[:sample_size], text2[:sample_size]).ratio()

    def _calculate_semantic_similarity(self, sem1, sem2):
        """Calculate semantic fingerprint similarity"""
        score = 0.0
        weights = 0.0

        # Compare dialogue counts
        if 'dialogue_count' in sem1 and 'dialogue_count' in sem2:
            weights += 0.3
            if sem1['dialogue_count'] > 0 or sem2['dialogue_count'] > 0:
                ratio = min(sem1['dialogue_count'], sem2['dialogue_count']) / \
                        max(1, max(sem1['dialogue_count'], sem2['dialogue_count']))
                score += ratio * 0.3

        # Compare speakers
        if 'speakers' in sem1 and 'speakers' in sem2:
            weights += 0.4
            if sem1['speakers'] and sem2['speakers']:
                overlap = len(set(sem1['speakers']) & set(sem2['speakers']))
                total = len(set(sem1['speakers']) | set(sem2['speakers']))
                score += (overlap / max(1, total)) * 0.4
            elif not sem1['speakers'] and not sem2['speakers']:
                score += 0.4  # Both have no speakers

        # Compare dialogue lengths pattern
        if 'dialogue_lengths' in sem1 and 'dialogue_lengths' in sem2:
            weights += 0.2
            if sem1['dialogue_lengths'] and sem2['dialogue_lengths']:
                len1 = sem1['dialogue_lengths'][:10]
                len2 = sem2['dialogue_lengths'][:10]
                if len1 and len2:
                    avg1 = sum(len1) / len(len1)
                    avg2 = sum(len2) / len(len2)
                    ratio = min(avg1, avg2) / max(1, max(avg1, avg2))
                    score += ratio * 0.2
|
| 1128 |
+
elif not sem1['dialogue_lengths'] and not sem2['dialogue_lengths']:
|
| 1129 |
+
score += 0.2 # Both have no dialogue
|
| 1130 |
+
|
| 1131 |
+
# Action density
|
| 1132 |
+
if 'action_density' in sem1 and 'action_density' in sem2:
|
| 1133 |
+
weights += 0.1
|
| 1134 |
+
act_sim = 1 - abs(sem1['action_density'] - sem2['action_density'])
|
| 1135 |
+
score += act_sim * 0.1
|
| 1136 |
+
|
| 1137 |
+
return score / max(0.1, weights)
|
| 1138 |
+
|
| 1139 |
+
def _calculate_structural_similarity(self, struct1, struct2):
|
| 1140 |
+
"""Calculate structural signature similarity"""
|
| 1141 |
+
score = 0.0
|
| 1142 |
+
|
| 1143 |
+
# Compare paragraph patterns
|
| 1144 |
+
if 'pattern' in struct1 and 'pattern' in struct2:
|
| 1145 |
+
pattern_sim = SequenceMatcher(None, struct1['pattern'], struct2['pattern']).ratio()
|
| 1146 |
+
score += pattern_sim * 0.5
|
| 1147 |
+
|
| 1148 |
+
# Compare paragraph statistics
|
| 1149 |
+
if all(k in struct1 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']) and \
|
| 1150 |
+
all(k in struct2 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']):
|
| 1151 |
+
|
| 1152 |
+
# Paragraph count ratio
|
| 1153 |
+
para_ratio = min(struct1['para_count'], struct2['para_count']) / \
|
| 1154 |
+
max(1, max(struct1['para_count'], struct2['para_count']))
|
| 1155 |
+
score += para_ratio * 0.2
|
| 1156 |
+
|
| 1157 |
+
# Average length ratio
|
| 1158 |
+
avg_ratio = min(struct1['avg_para_length'], struct2['avg_para_length']) / \
|
| 1159 |
+
max(1, max(struct1['avg_para_length'], struct2['avg_para_length']))
|
| 1160 |
+
score += avg_ratio * 0.15
|
| 1161 |
+
|
| 1162 |
+
# Dialogue ratio similarity
|
| 1163 |
+
dialogue_diff = abs(struct1['dialogue_ratio'] - struct2['dialogue_ratio'])
|
| 1164 |
+
score += (1 - min(1, dialogue_diff)) * 0.15
|
| 1165 |
+
|
| 1166 |
+
return score
|
| 1167 |
+
|
| 1168 |
+
def _calculate_character_similarity(self, chars1, chars2):
|
| 1169 |
+
"""Calculate character overlap similarity"""
|
| 1170 |
+
if not chars1 or not chars2:
|
| 1171 |
+
return 0.0
|
| 1172 |
+
|
| 1173 |
+
# Convert to sets
|
| 1174 |
+
set1 = set(chars1)
|
| 1175 |
+
set2 = set(chars2)
|
| 1176 |
+
|
| 1177 |
+
# If no overlap at all, return 0
|
| 1178 |
+
intersection = set1 & set2
|
| 1179 |
+
if not intersection:
|
| 1180 |
+
return 0.0
|
| 1181 |
+
|
| 1182 |
+
# Calculate Jaccard index (intersection over union)
|
| 1183 |
+
union = set1 | set2
|
| 1184 |
+
jaccard = len(intersection) / len(union)
|
| 1185 |
+
|
| 1186 |
+
# Also consider the proportion of matching characters relative to each set
|
| 1187 |
+
# This prevents small overlaps from scoring too high
|
| 1188 |
+
overlap1 = len(intersection) / len(set1)
|
| 1189 |
+
overlap2 = len(intersection) / len(set2)
|
| 1190 |
+
|
| 1191 |
+
# Take the minimum overlap to be more conservative
|
| 1192 |
+
min_overlap = min(overlap1, overlap2)
|
| 1193 |
+
|
| 1194 |
+
# Combine jaccard and overlap scores
|
| 1195 |
+
# Jaccard penalizes when sets are very different sizes
|
| 1196 |
+
# Min overlap ensures both texts share a significant portion of characters
|
| 1197 |
+
score = (jaccard + min_overlap) / 2
|
| 1198 |
+
|
| 1199 |
+
return score
|
| 1200 |
+
|
| 1201 |
+
def _calculate_pattern_similarity(self, pat1, pat2):
|
| 1202 |
+
"""Calculate pattern similarity (numbers, etc.)"""
|
| 1203 |
+
score = 0.0
|
| 1204 |
+
|
| 1205 |
+
# Number overlap
|
| 1206 |
+
if 'numbers' in pat1 and 'numbers' in pat2:
|
| 1207 |
+
nums1 = set(pat1['numbers'])
|
| 1208 |
+
nums2 = set(pat2['numbers'])
|
| 1209 |
+
|
| 1210 |
+
if nums1 or nums2:
|
| 1211 |
+
overlap = len(nums1 & nums2)
|
| 1212 |
+
total = len(nums1 | nums2)
|
| 1213 |
+
score = overlap / max(1, total)
|
| 1214 |
+
else:
|
| 1215 |
+
score = 1.0 # Both have no numbers
|
| 1216 |
+
|
| 1217 |
+
return score
|
| 1218 |
+
|
| 1219 |
+
def _check_non_target_language(self, text, lang_config):
|
| 1220 |
+
"""Check if text contains too much non-target language"""
|
| 1221 |
+
target_language = lang_config['target_language'].lower()
|
| 1222 |
+
threshold = lang_config['threshold_characters']
|
| 1223 |
+
|
| 1224 |
+
# Character ranges for different languages
|
| 1225 |
+
language_ranges = {
|
| 1226 |
+
'english': [ # Latin script + basic symbols
|
| 1227 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1228 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1229 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1230 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1231 |
+
(0x2000, 0x206F), # General Punctuation
|
| 1232 |
+
(0x20A0, 0x20CF), # Currency Symbols
|
| 1233 |
+
(0xFF00, 0xFFEF), # Halfwidth and Fullwidth Forms
|
| 1234 |
+
],
|
| 1235 |
+
'japanese': [
|
| 1236 |
+
(0x3040, 0x309F), # Hiragana
|
| 1237 |
+
(0x30A0, 0x30FF), # Katakana
|
| 1238 |
+
(0x4E00, 0x9FAF), # CJK Unified Ideographs
|
| 1239 |
+
(0x3400, 0x4DBF), # CJK Extension A
|
| 1240 |
+
(0xFF66, 0xFF9F), # Halfwidth Katakana
|
| 1241 |
+
],
|
| 1242 |
+
'korean': [
|
| 1243 |
+
(0xAC00, 0xD7AF), # Hangul Syllables
|
| 1244 |
+
(0x1100, 0x11FF), # Hangul Jamo
|
| 1245 |
+
(0x3130, 0x318F), # Hangul Compatibility Jamo
|
| 1246 |
+
(0xA960, 0xA97F), # Hangul Jamo Extended-A
|
| 1247 |
+
(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
|
| 1248 |
+
],
|
| 1249 |
+
'chinese': [
|
| 1250 |
+
(0x4E00, 0x9FAF), # CJK Unified Ideographs
|
| 1251 |
+
(0x3400, 0x4DBF), # CJK Extension A
|
| 1252 |
+
(0x20000, 0x2A6DF), # CJK Extension B
|
| 1253 |
+
(0x2A700, 0x2B73F), # CJK Extension C
|
| 1254 |
+
(0x2B740, 0x2B81F), # CJK Extension D
|
| 1255 |
+
(0x3000, 0x303F), # CJK Symbols and Punctuation
|
| 1256 |
+
],
|
| 1257 |
+
'arabic': [
|
| 1258 |
+
(0x0600, 0x06FF), # Arabic
|
| 1259 |
+
(0x0750, 0x077F), # Arabic Supplement
|
| 1260 |
+
(0x08A0, 0x08FF), # Arabic Extended-A
|
| 1261 |
+
(0xFB50, 0xFDFF), # Arabic Presentation Forms-A
|
| 1262 |
+
(0xFE70, 0xFEFF), # Arabic Presentation Forms-B
|
| 1263 |
+
],
|
| 1264 |
+
'russian': [
|
| 1265 |
+
(0x0400, 0x04FF), # Cyrillic
|
| 1266 |
+
(0x0500, 0x052F), # Cyrillic Supplement
|
| 1267 |
+
(0x2DE0, 0x2DFF), # Cyrillic Extended-A
|
| 1268 |
+
(0xA640, 0xA69F), # Cyrillic Extended-B
|
| 1269 |
+
],
|
| 1270 |
+
'thai': [
|
| 1271 |
+
(0x0E00, 0x0E7F), # Thai
|
| 1272 |
+
],
|
| 1273 |
+
'hindi': [
|
| 1274 |
+
(0x0900, 0x097F), # Devanagari
|
| 1275 |
+
(0xA8E0, 0xA8FF), # Devanagari Extended
|
| 1276 |
+
],
|
| 1277 |
+
'spanish': [ # Same as English (Latin script)
|
| 1278 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1279 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1280 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1281 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1282 |
+
],
|
| 1283 |
+
'french': [ # Same as English (Latin script)
|
| 1284 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1285 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1286 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1287 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1288 |
+
],
|
| 1289 |
+
'german': [ # Same as English (Latin script)
|
| 1290 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1291 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1292 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1293 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1294 |
+
],
|
| 1295 |
+
'portuguese': [ # Same as English (Latin script)
|
| 1296 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1297 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1298 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1299 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1300 |
+
],
|
| 1301 |
+
'italian': [ # Same as English (Latin script)
|
| 1302 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1303 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1304 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1305 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1306 |
+
],
|
| 1307 |
+
'dutch': [ # Same as English (Latin script)
|
| 1308 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1309 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1310 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1311 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1312 |
+
],
|
| 1313 |
+
'vietnamese': [
|
| 1314 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1315 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1316 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1317 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1318 |
+
(0x1EA0, 0x1EFF), # Latin Extended Additional (Vietnamese)
|
| 1319 |
+
],
|
| 1320 |
+
'turkish': [
|
| 1321 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1322 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1323 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1324 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1325 |
+
],
|
| 1326 |
+
'polish': [
|
| 1327 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1328 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1329 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1330 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1331 |
+
],
|
| 1332 |
+
'swedish': [ # Same as English (Latin script)
|
| 1333 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1334 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1335 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1336 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1337 |
+
],
|
| 1338 |
+
'danish': [ # Same as English (Latin script)
|
| 1339 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1340 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1341 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1342 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1343 |
+
],
|
| 1344 |
+
'norwegian': [ # Same as English (Latin script)
|
| 1345 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1346 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1347 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1348 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1349 |
+
],
|
| 1350 |
+
'finnish': [ # Same as English (Latin script)
|
| 1351 |
+
(0x0000, 0x007F), # Basic Latin
|
| 1352 |
+
(0x0080, 0x00FF), # Latin-1 Supplement
|
| 1353 |
+
(0x0100, 0x017F), # Latin Extended-A
|
| 1354 |
+
(0x0180, 0x024F), # Latin Extended-B
|
| 1355 |
+
],
|
| 1356 |
+
}
|
| 1357 |
+
|
| 1358 |
+
# Get target language ranges
|
| 1359 |
+
target_ranges = language_ranges.get(target_language, language_ranges['english'])
|
| 1360 |
+
|
| 1361 |
+
# Count characters that are NOT in target language ranges
|
| 1362 |
+
non_target_count = 0
|
| 1363 |
+
total_letters = 0
|
| 1364 |
+
|
| 1365 |
+
for char in text:
|
| 1366 |
+
# Skip whitespace, punctuation, and numbers for counting
|
| 1367 |
+
if char.isspace() or char.isdigit():
|
| 1368 |
+
continue
|
| 1369 |
+
|
| 1370 |
+
# Count as letter character
|
| 1371 |
+
total_letters += 1
|
| 1372 |
+
|
| 1373 |
+
# Check if character is in any target language range
|
| 1374 |
+
char_code = ord(char)
|
| 1375 |
+
is_target_char = any(start <= char_code <= end for start, end in target_ranges)
|
| 1376 |
+
|
| 1377 |
+
if not is_target_char:
|
| 1378 |
+
non_target_count += 1
|
| 1379 |
+
|
| 1380 |
+
# Debug logging
|
| 1381 |
+
if non_target_count > 0:
|
| 1382 |
+
print(f" 🌐 Language detection: {non_target_count}/{total_letters} non-target chars ({target_language})")
|
| 1383 |
+
|
| 1384 |
+
# Return True if non-target character count exceeds threshold
|
| 1385 |
+
return non_target_count >= threshold, non_target_count
|
api_key_encryption.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simple API Key Encryption Module for Glossarion
|
| 3 |
+
Encrypts only specific API key fields including multi-key support
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import base64
|
| 9 |
+
from cryptography.fernet import Fernet
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class APIKeyEncryption:
|
| 14 |
+
"""Simple encryption handler for API keys"""
|
| 15 |
+
|
| 16 |
+
def __init__(self):
|
| 17 |
+
self.key_file = Path('.glossarion_key')
|
| 18 |
+
self.cipher = self._get_or_create_cipher()
|
| 19 |
+
|
| 20 |
+
# Define which fields to encrypt
|
| 21 |
+
self.api_key_fields = [
|
| 22 |
+
'api_key',
|
| 23 |
+
'replicate_api_key',
|
| 24 |
+
# Add more field names here if needed
|
| 25 |
+
]
|
| 26 |
+
|
| 27 |
+
def _get_or_create_cipher(self):
|
| 28 |
+
"""Get existing cipher or create new one"""
|
| 29 |
+
if self.key_file.exists():
|
| 30 |
+
try:
|
| 31 |
+
key = self.key_file.read_bytes()
|
| 32 |
+
return Fernet(key)
|
| 33 |
+
except:
|
| 34 |
+
pass
|
| 35 |
+
|
| 36 |
+
# Generate new key
|
| 37 |
+
key = Fernet.generate_key()
|
| 38 |
+
self.key_file.write_bytes(key)
|
| 39 |
+
|
| 40 |
+
# Hide file on Windows
|
| 41 |
+
if os.name == 'nt':
|
| 42 |
+
import ctypes
|
| 43 |
+
ctypes.windll.kernel32.SetFileAttributesW(str(self.key_file), 2)
|
| 44 |
+
else:
|
| 45 |
+
# Restrict permissions on Unix
|
| 46 |
+
os.chmod(self.key_file, 0o600)
|
| 47 |
+
|
| 48 |
+
return Fernet(key)
|
| 49 |
+
|
| 50 |
+
def encrypt_value(self, value):
|
| 51 |
+
"""Encrypt a single value"""
|
| 52 |
+
try:
|
| 53 |
+
encrypted = self.cipher.encrypt(value.encode())
|
| 54 |
+
return f"ENC:{base64.b64encode(encrypted).decode()}"
|
| 55 |
+
except:
|
| 56 |
+
return value
|
| 57 |
+
|
| 58 |
+
def decrypt_value(self, value):
|
| 59 |
+
"""Decrypt a single value"""
|
| 60 |
+
if not isinstance(value, str) or not value.startswith('ENC:'):
|
| 61 |
+
return value
|
| 62 |
+
|
| 63 |
+
try:
|
| 64 |
+
encrypted_data = base64.b64decode(value[4:])
|
| 65 |
+
return self.cipher.decrypt(encrypted_data).decode()
|
| 66 |
+
except:
|
| 67 |
+
return value
|
| 68 |
+
|
| 69 |
+
def encrypt_multi_keys(self, multi_keys):
|
| 70 |
+
"""Encrypt API keys in multi_api_keys array"""
|
| 71 |
+
if not isinstance(multi_keys, list):
|
| 72 |
+
return multi_keys
|
| 73 |
+
|
| 74 |
+
encrypted_keys = []
|
| 75 |
+
for key_entry in multi_keys:
|
| 76 |
+
if isinstance(key_entry, dict):
|
| 77 |
+
encrypted_entry = key_entry.copy()
|
| 78 |
+
# Encrypt the api_key field in each entry
|
| 79 |
+
if 'api_key' in encrypted_entry and encrypted_entry['api_key']:
|
| 80 |
+
value = encrypted_entry['api_key']
|
| 81 |
+
if isinstance(value, str) and not value.startswith('ENC:'):
|
| 82 |
+
encrypted_entry['api_key'] = self.encrypt_value(value)
|
| 83 |
+
encrypted_keys.append(encrypted_entry)
|
| 84 |
+
else:
|
| 85 |
+
encrypted_keys.append(key_entry)
|
| 86 |
+
|
| 87 |
+
return encrypted_keys
|
| 88 |
+
|
| 89 |
+
def decrypt_multi_keys(self, multi_keys):
|
| 90 |
+
"""Decrypt API keys in multi_api_keys array"""
|
| 91 |
+
if not isinstance(multi_keys, list):
|
| 92 |
+
return multi_keys
|
| 93 |
+
|
| 94 |
+
decrypted_keys = []
|
| 95 |
+
for key_entry in multi_keys:
|
| 96 |
+
if isinstance(key_entry, dict):
|
| 97 |
+
decrypted_entry = key_entry.copy()
|
| 98 |
+
# Decrypt the api_key field in each entry
|
| 99 |
+
if 'api_key' in decrypted_entry and decrypted_entry['api_key']:
|
| 100 |
+
decrypted_entry['api_key'] = self.decrypt_value(decrypted_entry['api_key'])
|
| 101 |
+
decrypted_keys.append(decrypted_entry)
|
| 102 |
+
else:
|
| 103 |
+
decrypted_keys.append(key_entry)
|
| 104 |
+
|
| 105 |
+
return decrypted_keys
|
| 106 |
+
|
| 107 |
+
def encrypt_config(self, config):
|
| 108 |
+
"""Encrypt specific API key fields including multi-key support"""
|
| 109 |
+
encrypted = config.copy()
|
| 110 |
+
|
| 111 |
+
# Encrypt regular API key fields
|
| 112 |
+
for field in self.api_key_fields:
|
| 113 |
+
if field in encrypted and encrypted[field]:
|
| 114 |
+
value = encrypted[field]
|
| 115 |
+
# Only encrypt if not already encrypted
|
| 116 |
+
if isinstance(value, str) and not value.startswith('ENC:'):
|
| 117 |
+
encrypted[field] = self.encrypt_value(value)
|
| 118 |
+
|
| 119 |
+
# Encrypt multi_api_keys if present
|
| 120 |
+
if 'multi_api_keys' in encrypted:
|
| 121 |
+
encrypted['multi_api_keys'] = self.encrypt_multi_keys(encrypted['multi_api_keys'])
|
| 122 |
+
|
| 123 |
+
# Encrypt fallback_keys if present
|
| 124 |
+
if 'fallback_keys' in encrypted:
|
| 125 |
+
encrypted['fallback_keys'] = self.encrypt_multi_keys(encrypted['fallback_keys'])
|
| 126 |
+
|
| 127 |
+
return encrypted
|
| 128 |
+
|
| 129 |
+
def decrypt_config(self, config):
|
| 130 |
+
"""Decrypt specific API key fields including multi-key support"""
|
| 131 |
+
decrypted = config.copy()
|
| 132 |
+
|
| 133 |
+
# Decrypt regular API key fields
|
| 134 |
+
for field in self.api_key_fields:
|
| 135 |
+
if field in decrypted and decrypted[field]:
|
| 136 |
+
decrypted[field] = self.decrypt_value(decrypted[field])
|
| 137 |
+
|
| 138 |
+
# Decrypt multi_api_keys if present
|
| 139 |
+
if 'multi_api_keys' in decrypted:
|
| 140 |
+
decrypted['multi_api_keys'] = self.decrypt_multi_keys(decrypted['multi_api_keys'])
|
| 141 |
+
|
| 142 |
+
# Decrypt fallback_keys if present
|
| 143 |
+
if 'fallback_keys' in decrypted:
|
| 144 |
+
decrypted['fallback_keys'] = self.decrypt_multi_keys(decrypted['fallback_keys'])
|
| 145 |
+
|
| 146 |
+
return decrypted
|
| 147 |
+
|
| 148 |
+
# Simple interface functions
|
| 149 |
+
_handler = None
|
| 150 |
+
|
| 151 |
+
def get_handler():
|
| 152 |
+
global _handler
|
| 153 |
+
if _handler is None:
|
| 154 |
+
_handler = APIKeyEncryption()
|
| 155 |
+
return _handler
|
| 156 |
+
|
| 157 |
+
def encrypt_config(config):
|
| 158 |
+
"""Encrypt API keys in config"""
|
| 159 |
+
return get_handler().encrypt_config(config)
|
| 160 |
+
|
| 161 |
+
def decrypt_config(config):
|
| 162 |
+
"""Decrypt API keys in config"""
|
| 163 |
+
return get_handler().decrypt_config(config)
|
| 164 |
+
|
| 165 |
+
def migrate_config_file(config_file='config.json'):
|
| 166 |
+
"""Migrate existing config to encrypted format"""
|
| 167 |
+
try:
|
| 168 |
+
# Read config
|
| 169 |
+
with open(config_file, 'r', encoding='utf-8') as f:
|
| 170 |
+
config = json.load(f)
|
| 171 |
+
|
| 172 |
+
# Check if already encrypted
|
| 173 |
+
handler = get_handler()
|
| 174 |
+
needs_encryption = False
|
| 175 |
+
|
| 176 |
+
# Check regular API key fields
|
| 177 |
+
for field in handler.api_key_fields:
|
| 178 |
+
if field in config and config[field]:
|
| 179 |
+
if isinstance(config[field], str) and not config[field].startswith('ENC:'):
|
| 180 |
+
needs_encryption = True
|
| 181 |
+
break
|
| 182 |
+
|
| 183 |
+
# Check multi_api_keys
|
| 184 |
+
if 'multi_api_keys' in config and isinstance(config['multi_api_keys'], list):
|
| 185 |
+
for key_entry in config['multi_api_keys']:
|
| 186 |
+
if isinstance(key_entry, dict) and 'api_key' in key_entry:
|
| 187 |
+
if key_entry['api_key'] and not key_entry['api_key'].startswith('ENC:'):
|
| 188 |
+
needs_encryption = True
|
| 189 |
+
break
|
| 190 |
+
|
| 191 |
+
# Check fallback_keys
|
| 192 |
+
if 'fallback_keys' in config and isinstance(config['fallback_keys'], list):
|
| 193 |
+
for key_entry in config['fallback_keys']:
|
| 194 |
+
if isinstance(key_entry, dict) and 'api_key' in key_entry:
|
| 195 |
+
if key_entry['api_key'] and not key_entry['api_key'].startswith('ENC:'):
|
| 196 |
+
needs_encryption = True
|
| 197 |
+
break
|
| 198 |
+
|
| 199 |
+
if not needs_encryption:
|
| 200 |
+
print("Config already encrypted or no API keys found.")
|
| 201 |
+
return True
|
| 202 |
+
|
| 203 |
+
# Backup
|
| 204 |
+
backup_file = f"{config_file}.backup"
|
| 205 |
+
with open(backup_file, 'w', encoding='utf-8') as f:
|
| 206 |
+
json.dump(config, f, ensure_ascii=False, indent=2)
|
| 207 |
+
print(f"Created backup: {backup_file}")
|
| 208 |
+
|
| 209 |
+
# Encrypt
|
| 210 |
+
encrypted = encrypt_config(config)
|
| 211 |
+
|
| 212 |
+
# Save
|
| 213 |
+
with open(config_file, 'w', encoding='utf-8') as f:
|
| 214 |
+
json.dump(encrypted, f, ensure_ascii=False, indent=2)
|
| 215 |
+
|
| 216 |
+
print("✅ Successfully encrypted API keys!")
|
| 217 |
+
|
| 218 |
+
# Show summary
|
| 219 |
+
if 'multi_api_keys' in config:
|
| 220 |
+
print(f" - Encrypted {len(config['multi_api_keys'])} multi-key entries")
|
| 221 |
+
|
| 222 |
+
if 'fallback_keys' in config:
|
| 223 |
+
print(f" - Encrypted {len(config['fallback_keys'])} fallback-key entries")
|
| 224 |
+
|
| 225 |
+
return True
|
| 226 |
+
|
| 227 |
+
except Exception as e:
|
| 228 |
+
print(f"❌ Error: {e}")
|
| 229 |
+
return False
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
if __name__ == "__main__":
|
| 233 |
+
# Simple migration script
|
| 234 |
+
import sys
|
| 235 |
+
|
| 236 |
+
config_file = 'config.json'
|
| 237 |
+
if len(sys.argv) > 1:
|
| 238 |
+
config_file = sys.argv[1]
|
| 239 |
+
|
| 240 |
+
if os.path.exists(config_file):
|
| 241 |
+
print(f"Encrypting API keys in {config_file}...")
|
| 242 |
+
migrate_config_file(config_file)
|
| 243 |
+
else:
|
| 244 |
+
print(f"Config file not found: {config_file}")
|
async_api_processor.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
bubble_detector.py
ADDED
|
@@ -0,0 +1,1881 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
bubble_detector.py - Modified version that works in frozen PyInstaller executables
|
| 3 |
+
Replace your bubble_detector.py with this version
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import json
|
| 8 |
+
import numpy as np
|
| 9 |
+
import cv2
|
| 10 |
+
from typing import List, Tuple, Optional, Dict, Any
|
| 11 |
+
import logging
|
| 12 |
+
import traceback
|
| 13 |
+
import hashlib
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
import threading
|
| 16 |
+
import time
|
| 17 |
+
|
| 18 |
+
logging.basicConfig(level=logging.INFO)
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
# Check if we're running in a frozen environment
|
| 22 |
+
IS_FROZEN = getattr(sys, 'frozen', False)
|
| 23 |
+
if IS_FROZEN:
|
| 24 |
+
# In frozen environment, set proper paths for ML libraries
|
| 25 |
+
MEIPASS = sys._MEIPASS
|
| 26 |
+
os.environ['TORCH_HOME'] = MEIPASS
|
| 27 |
+
os.environ['TRANSFORMERS_CACHE'] = os.path.join(MEIPASS, 'transformers')
|
| 28 |
+
os.environ['HF_HOME'] = os.path.join(MEIPASS, 'huggingface')
|
| 29 |
+
logger.info(f"Running in frozen environment: {MEIPASS}")
|
| 30 |
+
|
| 31 |
+
# Modified import checks for frozen environment
|
| 32 |
+
YOLO_AVAILABLE = False
|
| 33 |
+
YOLO = None
|
| 34 |
+
torch = None
|
| 35 |
+
TORCH_AVAILABLE = False
|
| 36 |
+
ONNX_AVAILABLE = False
|
| 37 |
+
TRANSFORMERS_AVAILABLE = False
|
| 38 |
+
RTDetrForObjectDetection = None
|
| 39 |
+
RTDetrImageProcessor = None
|
| 40 |
+
PIL_AVAILABLE = False
|
| 41 |
+
|
| 42 |
+
# Try to import YOLO dependencies with better error handling
|
| 43 |
+
if IS_FROZEN:
|
| 44 |
+
# In frozen environment, try harder to import
|
| 45 |
+
try:
|
| 46 |
+
# First try to import torch components individually
|
| 47 |
+
import torch
|
| 48 |
+
import torch.nn
|
| 49 |
+
import torch.cuda
|
| 50 |
+
TORCH_AVAILABLE = True
|
| 51 |
+
logger.info("✓ PyTorch loaded in frozen environment")
|
| 52 |
+
except Exception as e:
|
| 53 |
+
logger.warning(f"PyTorch not available in frozen environment: {e}")
|
| 54 |
+
TORCH_AVAILABLE = False
|
| 55 |
+
torch = None
|
| 56 |
+
|
| 57 |
+
# Try ultralytics after torch
|
| 58 |
+
if TORCH_AVAILABLE:
|
| 59 |
+
try:
|
| 60 |
+
from ultralytics import YOLO
|
| 61 |
+
YOLO_AVAILABLE = True
|
| 62 |
+
logger.info("✓ Ultralytics YOLO loaded in frozen environment")
|
| 63 |
+
except Exception as e:
|
| 64 |
+
logger.warning(f"Ultralytics not available in frozen environment: {e}")
|
| 65 |
+
YOLO_AVAILABLE = False
|
| 66 |
+
|
| 67 |
+
# Try transformers
|
| 68 |
+
try:
|
| 69 |
+
import transformers
|
| 70 |
+
# Try specific imports
|
| 71 |
+
try:
|
| 72 |
+
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
|
| 73 |
+
TRANSFORMERS_AVAILABLE = True
|
| 74 |
+
logger.info("✓ Transformers RT-DETR loaded in frozen environment")
|
| 75 |
+
except ImportError:
|
| 76 |
+
# Try alternative import
|
| 77 |
+
try:
|
| 78 |
+
from transformers import AutoModel, AutoImageProcessor
|
| 79 |
+
RTDetrForObjectDetection = AutoModel
|
| 80 |
+
RTDetrImageProcessor = AutoImageProcessor
|
| 81 |
+
TRANSFORMERS_AVAILABLE = True
|
| 82 |
+
logger.info("✓ Transformers loaded with AutoModel fallback")
|
| 83 |
+
except:
|
| 84 |
+
TRANSFORMERS_AVAILABLE = False
|
| 85 |
+
logger.warning("Transformers RT-DETR not available in frozen environment")
|
| 86 |
+
except Exception as e:
|
| 87 |
+
logger.warning(f"Transformers not available in frozen environment: {e}")
|
| 88 |
+
TRANSFORMERS_AVAILABLE = False
|
| 89 |
+
else:
|
| 90 |
+
# Normal environment - original import logic
|
| 91 |
+
try:
|
| 92 |
+
from ultralytics import YOLO
|
| 93 |
+
YOLO_AVAILABLE = True
|
| 94 |
+
except:
|
| 95 |
+
YOLO_AVAILABLE = False
|
| 96 |
+
logger.warning("Ultralytics YOLO not available")
|
| 97 |
+
|
| 98 |
+
try:
|
| 99 |
+
import torch
|
| 100 |
+
# Test if cuda attribute exists
|
| 101 |
+
_ = torch.cuda
|
| 102 |
+
TORCH_AVAILABLE = True
|
| 103 |
+
except (ImportError, AttributeError):
|
| 104 |
+
TORCH_AVAILABLE = False
|
| 105 |
+
torch = None
|
| 106 |
+
logger.warning("PyTorch not available or incomplete")
|
| 107 |
+
|
| 108 |
+
try:
|
| 109 |
+
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
|
| 110 |
+
try:
|
| 111 |
+
from transformers import RTDetrV2ForObjectDetection
|
| 112 |
+
RTDetrForObjectDetection = RTDetrV2ForObjectDetection
|
| 113 |
+
except ImportError:
|
| 114 |
+
pass
|
| 115 |
+
TRANSFORMERS_AVAILABLE = True
|
| 116 |
+
except:
|
| 117 |
+
TRANSFORMERS_AVAILABLE = False
|
| 118 |
+
logger.info("Transformers not available for RT-DETR")
|
| 119 |
+
|
| 120 |
+
# Configure ORT memory behavior before importing
|
| 121 |
+
try:
|
| 122 |
+
os.environ.setdefault('ORT_DISABLE_MEMORY_ARENA', '1')
|
| 123 |
+
except Exception:
|
| 124 |
+
pass
|
| 125 |
+
# ONNX Runtime - works well in frozen environments
|
| 126 |
+
try:
|
| 127 |
+
import onnxruntime as ort
|
| 128 |
+
ONNX_AVAILABLE = True
|
| 129 |
+
logger.info("✓ ONNX Runtime available")
|
| 130 |
+
except ImportError:
|
| 131 |
+
ONNX_AVAILABLE = False
|
| 132 |
+
logger.warning("ONNX Runtime not available")
|
| 133 |
+
|
| 134 |
+
# PIL
|
| 135 |
+
try:
|
| 136 |
+
from PIL import Image
|
| 137 |
+
PIL_AVAILABLE = True
|
| 138 |
+
except ImportError:
|
| 139 |
+
PIL_AVAILABLE = False
|
| 140 |
+
logger.info("PIL not available")
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
class BubbleDetector:
|
| 144 |
+
"""
|
| 145 |
+
Combined YOLOv8 and RT-DETR speech bubble detector for comics and manga.
|
| 146 |
+
Supports multiple model formats and provides configurable detection.
|
| 147 |
+
Backward compatible with existing code while adding RT-DETR support.
|
| 148 |
+
"""
|
| 149 |
+
|
| 150 |
+
# Process-wide shared RT-DETR to avoid concurrent meta-device loads
|
| 151 |
+
_rtdetr_init_lock = threading.Lock()
|
| 152 |
+
_rtdetr_shared_model = None
|
| 153 |
+
_rtdetr_shared_processor = None
|
| 154 |
+
_rtdetr_loaded = False
|
| 155 |
+
_rtdetr_repo_id = 'ogkalu/comic-text-and-bubble-detector'
|
| 156 |
+
|
| 157 |
+
# Shared RT-DETR (ONNX) across process to avoid device/context storms
|
| 158 |
+
_rtdetr_onnx_init_lock = threading.Lock()
|
| 159 |
+
_rtdetr_onnx_shared_session = None
|
| 160 |
+
_rtdetr_onnx_loaded = False
|
| 161 |
+
_rtdetr_onnx_providers = None
|
| 162 |
+
_rtdetr_onnx_model_path = None
|
| 163 |
+
# Limit DML concurrent runs to avoid DXGI device hang. Adjustable via env DML_MAX_CONCURRENT
|
| 164 |
+
try:
|
| 165 |
+
_rtdetr_onnx_max_concurrent = int(os.environ.get('DML_MAX_CONCURRENT', '1'))
|
| 166 |
+
except Exception:
|
| 167 |
+
_rtdetr_onnx_max_concurrent = 1
|
| 168 |
+
_rtdetr_onnx_sema = threading.Semaphore(max(1, _rtdetr_onnx_max_concurrent))
|
| 169 |
+
|
| 170 |
+
def __init__(self, config_path: str = "config.json"):
|
| 171 |
+
"""
|
| 172 |
+
Initialize the bubble detector.
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
config_path: Path to configuration file
|
| 176 |
+
"""
|
| 177 |
+
self.config_path = config_path
|
| 178 |
+
self.config = self._load_config()
|
| 179 |
+
|
| 180 |
+
# YOLOv8 components (original)
|
| 181 |
+
self.model = None
|
| 182 |
+
self.model_loaded = False
|
| 183 |
+
self.model_type = None # 'yolo', 'onnx', or 'torch'
|
| 184 |
+
self.onnx_session = None
|
| 185 |
+
|
| 186 |
+
# RT-DETR components (new)
|
| 187 |
+
self.rtdetr_model = None
|
| 188 |
+
self.rtdetr_processor = None
|
| 189 |
+
self.rtdetr_loaded = False
|
| 190 |
+
self.rtdetr_repo = 'ogkalu/comic-text-and-bubble-detector'
|
| 191 |
+
|
| 192 |
+
# RT-DETR (ONNX) backend components
|
| 193 |
+
self.rtdetr_onnx_session = None
|
| 194 |
+
self.rtdetr_onnx_loaded = False
|
| 195 |
+
self.rtdetr_onnx_repo = 'ogkalu/comic-text-and-bubble-detector'
|
| 196 |
+
|
| 197 |
+
# RT-DETR class definitions
|
| 198 |
+
self.CLASS_BUBBLE = 0 # Empty speech bubble
|
| 199 |
+
self.CLASS_TEXT_BUBBLE = 1 # Bubble with text
|
| 200 |
+
self.CLASS_TEXT_FREE = 2 # Text without bubble
|
| 201 |
+
|
| 202 |
+
# Detection settings
|
| 203 |
+
self.default_confidence = 0.5
|
| 204 |
+
self.default_iou_threshold = 0.45
|
| 205 |
+
# Allow override from settings
|
| 206 |
+
try:
|
| 207 |
+
ocr_cfg = self.config.get('manga_settings', {}).get('ocr', {}) if isinstance(self.config, dict) else {}
|
| 208 |
+
self.default_max_detections = int(ocr_cfg.get('bubble_max_detections', 100))
|
| 209 |
+
self.max_det_yolo = int(ocr_cfg.get('bubble_max_detections_yolo', self.default_max_detections))
|
| 210 |
+
self.max_det_rtdetr = int(ocr_cfg.get('bubble_max_detections_rtdetr', self.default_max_detections))
|
| 211 |
+
except Exception:
|
| 212 |
+
self.default_max_detections = 100
|
| 213 |
+
self.max_det_yolo = 100
|
| 214 |
+
self.max_det_rtdetr = 100
|
| 215 |
+
|
| 216 |
+
# Cache directory for ONNX conversions
|
| 217 |
+
self.cache_dir = os.environ.get('BUBBLE_CACHE_DIR', 'models')
|
| 218 |
+
os.makedirs(self.cache_dir, exist_ok=True)
|
| 219 |
+
|
| 220 |
+
# GPU availability
|
| 221 |
+
self.use_gpu = TORCH_AVAILABLE and torch.cuda.is_available()
|
| 222 |
+
self.device = 'cuda' if self.use_gpu else 'cpu'
|
| 223 |
+
|
| 224 |
+
# Quantization/precision settings
|
| 225 |
+
adv_cfg = self.config.get('manga_settings', {}).get('advanced', {}) if isinstance(self.config, dict) else {}
|
| 226 |
+
ocr_cfg = self.config.get('manga_settings', {}).get('ocr', {}) if isinstance(self.config, dict) else {}
|
| 227 |
+
env_quant = os.environ.get('MODEL_QUANTIZE', 'false').lower() == 'true'
|
| 228 |
+
self.quantize_enabled = bool(env_quant or adv_cfg.get('quantize_models', False) or ocr_cfg.get('quantize_bubble_detector', False))
|
| 229 |
+
self.quantize_dtype = str(adv_cfg.get('torch_precision', os.environ.get('TORCH_PRECISION', 'auto'))).lower()
|
| 230 |
+
# Prefer advanced.onnx_quantize; fall back to env or global quantize
|
| 231 |
+
self.onnx_quantize_enabled = bool(adv_cfg.get('onnx_quantize', os.environ.get('ONNX_QUANTIZE', 'false').lower() == 'true' or self.quantize_enabled))
|
| 232 |
+
|
| 233 |
+
# Stop flag support
|
| 234 |
+
self.stop_flag = None
|
| 235 |
+
self._stopped = False
|
| 236 |
+
self.log_callback = None
|
| 237 |
+
|
| 238 |
+
logger.info(f"🗨️ BubbleDetector initialized")
|
| 239 |
+
logger.info(f" GPU: {'Available' if self.use_gpu else 'Not available'}")
|
| 240 |
+
logger.info(f" YOLO: {'Available' if YOLO_AVAILABLE else 'Not installed'}")
|
| 241 |
+
logger.info(f" ONNX: {'Available' if ONNX_AVAILABLE else 'Not installed'}")
|
| 242 |
+
logger.info(f" RT-DETR: {'Available' if TRANSFORMERS_AVAILABLE else 'Not installed'}")
|
| 243 |
+
logger.info(f" Quantization: {'ENABLED' if self.quantize_enabled else 'disabled'} (torch_precision={self.quantize_dtype}, onnx_quantize={'on' if self.onnx_quantize_enabled else 'off'})" )
|
| 244 |
+
|
| 245 |
+
def _load_config(self) -> Dict[str, Any]:
|
| 246 |
+
"""Load configuration from file."""
|
| 247 |
+
if os.path.exists(self.config_path):
|
| 248 |
+
try:
|
| 249 |
+
with open(self.config_path, 'r', encoding='utf-8') as f:
|
| 250 |
+
return json.load(f)
|
| 251 |
+
except Exception as e:
|
| 252 |
+
logger.warning(f"Failed to load config: {e}")
|
| 253 |
+
return {}
|
| 254 |
+
|
| 255 |
+
def _save_config(self):
|
| 256 |
+
"""Save configuration to file."""
|
| 257 |
+
try:
|
| 258 |
+
with open(self.config_path, 'w', encoding='utf-8') as f:
|
| 259 |
+
json.dump(self.config, f, indent=2)
|
| 260 |
+
except Exception as e:
|
| 261 |
+
logger.error(f"Failed to save config: {e}")
|
| 262 |
+
|
| 263 |
+
def set_stop_flag(self, stop_flag):
|
| 264 |
+
"""Set the stop flag for checking interruptions"""
|
| 265 |
+
self.stop_flag = stop_flag
|
| 266 |
+
self._stopped = False
|
| 267 |
+
|
| 268 |
+
def set_log_callback(self, log_callback):
|
| 269 |
+
"""Set log callback for GUI integration"""
|
| 270 |
+
self.log_callback = log_callback
|
| 271 |
+
|
| 272 |
+
def _check_stop(self) -> bool:
|
| 273 |
+
"""Check if stop has been requested"""
|
| 274 |
+
if self._stopped:
|
| 275 |
+
return True
|
| 276 |
+
if self.stop_flag and self.stop_flag.is_set():
|
| 277 |
+
self._stopped = True
|
| 278 |
+
return True
|
| 279 |
+
# Check global manga translator cancellation
|
| 280 |
+
try:
|
| 281 |
+
from manga_translator import MangaTranslator
|
| 282 |
+
if MangaTranslator.is_globally_cancelled():
|
| 283 |
+
self._stopped = True
|
| 284 |
+
return True
|
| 285 |
+
except Exception:
|
| 286 |
+
pass
|
| 287 |
+
return False
|
| 288 |
+
|
| 289 |
+
def _log(self, message: str, level: str = "info"):
|
| 290 |
+
"""Log message with stop suppression"""
|
| 291 |
+
# Suppress logs when stopped (allow only essential stop confirmation messages)
|
| 292 |
+
if self._check_stop():
|
| 293 |
+
essential_stop_keywords = [
|
| 294 |
+
"⏹️ Translation stopped by user",
|
| 295 |
+
"⏹️ Bubble detection stopped",
|
| 296 |
+
"cleanup", "🧹"
|
| 297 |
+
]
|
| 298 |
+
if not any(keyword in message for keyword in essential_stop_keywords):
|
| 299 |
+
return
|
| 300 |
+
|
| 301 |
+
if self.log_callback:
|
| 302 |
+
self.log_callback(message, level)
|
| 303 |
+
else:
|
| 304 |
+
logger.info(message) if level == 'info' else getattr(logger, level, logger.info)(message)
|
| 305 |
+
|
| 306 |
+
def reset_stop_flags(self):
|
| 307 |
+
"""Reset stop flags when starting new processing"""
|
| 308 |
+
self._stopped = False
|
| 309 |
+
|
| 310 |
+
def load_model(self, model_path: str, force_reload: bool = False) -> bool:
|
| 311 |
+
"""
|
| 312 |
+
Load a YOLOv8 model for bubble detection.
|
| 313 |
+
|
| 314 |
+
Args:
|
| 315 |
+
model_path: Path to model file (.pt, .onnx, or .torchscript)
|
| 316 |
+
force_reload: Force reload even if model is already loaded
|
| 317 |
+
|
| 318 |
+
Returns:
|
| 319 |
+
True if model loaded successfully, False otherwise
|
| 320 |
+
"""
|
| 321 |
+
try:
|
| 322 |
+
# If given a Hugging Face repo ID (e.g., 'owner/name'), fetch detector.onnx into models/
|
| 323 |
+
if model_path and (('/' in model_path) and not os.path.exists(model_path)):
|
| 324 |
+
try:
|
| 325 |
+
from huggingface_hub import hf_hub_download
|
| 326 |
+
os.makedirs(self.cache_dir, exist_ok=True)
|
| 327 |
+
logger.info(f"📥 Resolving repo '{model_path}' to detector.onnx in {self.cache_dir}...")
|
| 328 |
+
resolved = hf_hub_download(repo_id=model_path, filename='detector.onnx', cache_dir=self.cache_dir, local_dir=self.cache_dir, local_dir_use_symlinks=False)
|
| 329 |
+
if resolved and os.path.exists(resolved):
|
| 330 |
+
model_path = resolved
|
| 331 |
+
logger.info(f"✅ Downloaded detector.onnx to: {model_path}")
|
| 332 |
+
except Exception as repo_err:
|
| 333 |
+
logger.error(f"Failed to download from repo '{model_path}': {repo_err}")
|
| 334 |
+
if not os.path.exists(model_path):
|
| 335 |
+
logger.error(f"Model file not found: {model_path}")
|
| 336 |
+
return False
|
| 337 |
+
|
| 338 |
+
# Check if it's the same model already loaded
|
| 339 |
+
if self.model_loaded and not force_reload:
|
| 340 |
+
last_path = self.config.get('last_model_path', '')
|
| 341 |
+
if last_path == model_path:
|
| 342 |
+
logger.info("Model already loaded (same path)")
|
| 343 |
+
return True
|
| 344 |
+
else:
|
| 345 |
+
logger.info(f"Model path changed from {last_path} to {model_path}, reloading...")
|
| 346 |
+
force_reload = True
|
| 347 |
+
|
| 348 |
+
# Clear previous model if force reload
|
| 349 |
+
if force_reload:
|
| 350 |
+
logger.info("Force reloading model...")
|
| 351 |
+
self.model = None
|
| 352 |
+
self.onnx_session = None
|
| 353 |
+
self.model_loaded = False
|
| 354 |
+
self.model_type = None
|
| 355 |
+
|
| 356 |
+
logger.info(f"📥 Loading bubble detection model: {model_path}")
|
| 357 |
+
|
| 358 |
+
# Determine model type by extension
|
| 359 |
+
ext = Path(model_path).suffix.lower()
|
| 360 |
+
|
| 361 |
+
if ext in ['.pt', '.pth']:
|
| 362 |
+
if not YOLO_AVAILABLE:
|
| 363 |
+
logger.warning("Ultralytics package not available in this build")
|
| 364 |
+
logger.info("Bubble detection will be disabled - this is normal for lightweight builds")
|
| 365 |
+
# Don't return False immediately, try other fallbacks
|
| 366 |
+
self.model_loaded = False
|
| 367 |
+
return False
|
| 368 |
+
|
| 369 |
+
# Load YOLOv8 model
|
| 370 |
+
try:
|
| 371 |
+
self.model = YOLO(model_path)
|
| 372 |
+
self.model_type = 'yolo'
|
| 373 |
+
|
| 374 |
+
# Set to eval mode
|
| 375 |
+
if hasattr(self.model, 'model'):
|
| 376 |
+
self.model.model.eval()
|
| 377 |
+
|
| 378 |
+
# Move to GPU if available
|
| 379 |
+
if self.use_gpu and TORCH_AVAILABLE:
|
| 380 |
+
try:
|
| 381 |
+
self.model.to('cuda')
|
| 382 |
+
except Exception as gpu_error:
|
| 383 |
+
logger.warning(f"Could not move model to GPU: {gpu_error}")
|
| 384 |
+
|
| 385 |
+
logger.info("✅ YOLOv8 model loaded successfully")
|
| 386 |
+
# Apply optional FP16 precision to reduce VRAM if enabled
|
| 387 |
+
if self.quantize_enabled and self.use_gpu and TORCH_AVAILABLE:
|
| 388 |
+
try:
|
| 389 |
+
m = self.model.model if hasattr(self.model, 'model') else self.model
|
| 390 |
+
m.half()
|
| 391 |
+
logger.info("🔻 Applied FP16 precision to YOLO model (GPU)")
|
| 392 |
+
except Exception as _e:
|
| 393 |
+
logger.warning(f"Could not switch YOLO model to FP16: {_e}")
|
| 394 |
+
|
| 395 |
+
except Exception as yolo_error:
|
| 396 |
+
logger.error(f"Failed to load YOLO model: {yolo_error}")
|
| 397 |
+
return False
|
| 398 |
+
|
| 399 |
+
elif ext == '.onnx':
|
| 400 |
+
if not ONNX_AVAILABLE:
|
| 401 |
+
logger.warning("ONNX Runtime not available in this build")
|
| 402 |
+
logger.info("ONNX model support disabled - this is normal for lightweight builds")
|
| 403 |
+
return False
|
| 404 |
+
|
| 405 |
+
try:
|
| 406 |
+
# Load ONNX model
|
| 407 |
+
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if self.use_gpu else ['CPUExecutionProvider']
|
| 408 |
+
session_path = model_path
|
| 409 |
+
if self.quantize_enabled:
|
| 410 |
+
try:
|
| 411 |
+
from onnxruntime.quantization import quantize_dynamic, QuantType
|
| 412 |
+
quant_path = os.path.splitext(model_path)[0] + ".int8.onnx"
|
| 413 |
+
if not os.path.exists(quant_path) or os.environ.get('FORCE_ONNX_REBUILD', 'false').lower() == 'true':
|
| 414 |
+
logger.info("🔻 Quantizing ONNX model weights to INT8 (dynamic)...")
|
| 415 |
+
quantize_dynamic(model_input=model_path, model_output=quant_path, weight_type=QuantType.QInt8, op_types_to_quantize=['Conv', 'MatMul'])
|
| 416 |
+
session_path = quant_path
|
| 417 |
+
self.config['last_onnx_quantized_path'] = quant_path
|
| 418 |
+
self._save_config()
|
| 419 |
+
logger.info(f"✅ Using quantized ONNX model: {quant_path}")
|
| 420 |
+
except Exception as qe:
|
| 421 |
+
logger.warning(f"ONNX quantization not applied: {qe}")
|
| 422 |
+
# Use conservative ORT memory options to reduce RAM growth
|
| 423 |
+
so = ort.SessionOptions()
|
| 424 |
+
try:
|
| 425 |
+
so.enable_mem_pattern = False
|
| 426 |
+
so.enable_cpu_mem_arena = False
|
| 427 |
+
except Exception:
|
| 428 |
+
pass
|
| 429 |
+
self.onnx_session = ort.InferenceSession(session_path, sess_options=so, providers=providers)
|
| 430 |
+
self.model_type = 'onnx'
|
| 431 |
+
|
| 432 |
+
logger.info("✅ ONNX model loaded successfully")
|
| 433 |
+
|
| 434 |
+
except Exception as onnx_error:
|
| 435 |
+
logger.error(f"Failed to load ONNX model: {onnx_error}")
|
| 436 |
+
return False
|
| 437 |
+
|
| 438 |
+
elif ext == '.torchscript':
|
| 439 |
+
if not TORCH_AVAILABLE:
|
| 440 |
+
logger.warning("PyTorch not available in this build")
|
| 441 |
+
logger.info("TorchScript model support disabled - this is normal for lightweight builds")
|
| 442 |
+
return False
|
| 443 |
+
|
| 444 |
+
try:
|
| 445 |
+
# Add safety check for torch being None
|
| 446 |
+
if torch is None:
|
| 447 |
+
logger.error("PyTorch module is None - cannot load TorchScript model")
|
| 448 |
+
return False
|
| 449 |
+
|
| 450 |
+
# Load TorchScript model
|
| 451 |
+
self.model = torch.jit.load(model_path, map_location='cpu')
|
| 452 |
+
self.model.eval()
|
| 453 |
+
self.model_type = 'torch'
|
| 454 |
+
|
| 455 |
+
if self.use_gpu:
|
| 456 |
+
try:
|
| 457 |
+
self.model = self.model.cuda()
|
| 458 |
+
except Exception as gpu_error:
|
| 459 |
+
logger.warning(f"Could not move TorchScript model to GPU: {gpu_error}")
|
| 460 |
+
|
| 461 |
+
logger.info("✅ TorchScript model loaded successfully")
|
| 462 |
+
|
| 463 |
+
# Optional FP16 precision on GPU
|
| 464 |
+
if self.quantize_enabled and self.use_gpu and TORCH_AVAILABLE:
|
| 465 |
+
try:
|
| 466 |
+
self.model = self.model.half()
|
| 467 |
+
logger.info("🔻 Applied FP16 precision to TorchScript model (GPU)")
|
| 468 |
+
except Exception as _e:
|
| 469 |
+
logger.warning(f"Could not switch TorchScript model to FP16: {_e}")
|
| 470 |
+
|
| 471 |
+
except Exception as torch_error:
|
| 472 |
+
logger.error(f"Failed to load TorchScript model: {torch_error}")
|
| 473 |
+
return False
|
| 474 |
+
|
| 475 |
+
else:
|
| 476 |
+
logger.error(f"Unsupported model format: {ext}")
|
| 477 |
+
logger.info("Supported formats: .pt/.pth (YOLOv8), .onnx (ONNX), .torchscript (TorchScript)")
|
| 478 |
+
return False
|
| 479 |
+
|
| 480 |
+
# Only set loaded if we actually succeeded
|
| 481 |
+
self.model_loaded = True
|
| 482 |
+
self.config['last_model_path'] = model_path
|
| 483 |
+
self.config['model_type'] = self.model_type
|
| 484 |
+
self._save_config()
|
| 485 |
+
|
| 486 |
+
return True
|
| 487 |
+
|
| 488 |
+
except Exception as e:
|
| 489 |
+
logger.error(f"Failed to load model: {e}")
|
| 490 |
+
logger.error(traceback.format_exc())
|
| 491 |
+
self.model_loaded = False
|
| 492 |
+
|
| 493 |
+
# Provide helpful context for .exe users
|
| 494 |
+
logger.info("Note: If running from .exe, some ML libraries may not be included")
|
| 495 |
+
logger.info("This is normal for lightweight builds - bubble detection will be disabled")
|
| 496 |
+
|
| 497 |
+
return False
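# Illustrative usage sketch for load_model(): load a local YOLOv8 bubble detector
# and run a basic detection pass. The model path and image name are placeholders,
# not files shipped with this module.
#
#     detector = BubbleDetector()
#     if detector.load_model("models/comic-speech-bubble-detector-yolov8m.pt"):
#         boxes = detector.detect_bubbles("page_001.png", confidence=0.5)
#         print(f"Found {len(boxes)} bubbles")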
|
| 498 |
+
|
| 499 |
+
def load_rtdetr_model(self, model_path: str = None, model_id: str = None, force_reload: bool = False) -> bool:
|
| 500 |
+
"""
|
| 501 |
+
Load RT-DETR model for advanced bubble and text detection.
|
| 502 |
+
This implementation avoids the 'meta tensor' copy error by:
|
| 503 |
+
- Serializing the entire load under a class lock (no concurrent loads)
|
| 504 |
+
- Loading directly onto the target device (CUDA if available) via device_map='auto'
|
| 505 |
+
- Avoiding .to() on a potentially-meta model; no device migration post-load
|
| 506 |
+
|
| 507 |
+
Args:
|
| 508 |
+
model_path: Optional path to local model
|
| 509 |
+
model_id: Optional HuggingFace model ID (default: 'ogkalu/comic-text-and-bubble-detector')
|
| 510 |
+
force_reload: Force reload even if already loaded
|
| 511 |
+
|
| 512 |
+
Returns:
|
| 513 |
+
True if successful, False otherwise
|
| 514 |
+
"""
|
| 515 |
+
if not TRANSFORMERS_AVAILABLE:
|
| 516 |
+
logger.error("Transformers library required for RT-DETR. Install with: pip install transformers")
|
| 517 |
+
return False
|
| 518 |
+
|
| 519 |
+
if not PIL_AVAILABLE:
|
| 520 |
+
logger.error("PIL required for RT-DETR. Install with: pip install pillow")
|
| 521 |
+
return False
|
| 522 |
+
|
| 523 |
+
if self.rtdetr_loaded and not force_reload:
|
| 524 |
+
logger.info("RT-DETR model already loaded")
|
| 525 |
+
return True
|
| 526 |
+
|
| 527 |
+
# Fast path: if shared already loaded and not forcing reload, attach
|
| 528 |
+
if BubbleDetector._rtdetr_loaded and not force_reload:
|
| 529 |
+
self.rtdetr_model = BubbleDetector._rtdetr_shared_model
|
| 530 |
+
self.rtdetr_processor = BubbleDetector._rtdetr_shared_processor
|
| 531 |
+
self.rtdetr_loaded = True
|
| 532 |
+
logger.info("RT-DETR model attached from shared cache")
|
| 533 |
+
return True
|
| 534 |
+
|
| 535 |
+
# Serialize the ENTIRE loading sequence to avoid concurrent init issues
|
| 536 |
+
with BubbleDetector._rtdetr_init_lock:
|
| 537 |
+
try:
|
| 538 |
+
# Re-check after acquiring lock
|
| 539 |
+
if BubbleDetector._rtdetr_loaded and not force_reload:
|
| 540 |
+
self.rtdetr_model = BubbleDetector._rtdetr_shared_model
|
| 541 |
+
self.rtdetr_processor = BubbleDetector._rtdetr_shared_processor
|
| 542 |
+
self.rtdetr_loaded = True
|
| 543 |
+
logger.info("RT-DETR model attached from shared cache (post-lock)")
|
| 544 |
+
return True
|
| 545 |
+
|
| 546 |
+
# Use custom model_id if provided, otherwise use default
|
| 547 |
+
repo_id = model_id if model_id else self.rtdetr_repo
|
| 548 |
+
logger.info(f"📥 Loading RT-DETR model from {repo_id}...")
|
| 549 |
+
|
| 550 |
+
# Ensure TorchDynamo/compile doesn't interfere on some builds
|
| 551 |
+
try:
|
| 552 |
+
os.environ.setdefault('TORCHDYNAMO_DISABLE', '1')
|
| 553 |
+
except Exception:
|
| 554 |
+
pass
|
| 555 |
+
|
| 556 |
+
# Decide device strategy
|
| 557 |
+
gpu_available = bool(TORCH_AVAILABLE and hasattr(torch, 'cuda') and torch.cuda.is_available())
|
| 558 |
+
device_map = 'auto' if gpu_available else None
|
| 559 |
+
# Choose dtype
|
| 560 |
+
dtype = None
|
| 561 |
+
if TORCH_AVAILABLE:
|
| 562 |
+
try:
|
| 563 |
+
dtype = torch.float16 if gpu_available else torch.float32
|
| 564 |
+
except Exception:
|
| 565 |
+
dtype = None
|
| 566 |
+
low_cpu = True if gpu_available else False
|
| 567 |
+
|
| 568 |
+
# Load processor (once)
|
| 569 |
+
self.rtdetr_processor = RTDetrImageProcessor.from_pretrained(
|
| 570 |
+
repo_id,
|
| 571 |
+
size={"width": 640, "height": 640},
|
| 572 |
+
cache_dir=self.cache_dir if not model_path else None
|
| 573 |
+
)
|
| 574 |
+
|
| 575 |
+
# Prepare kwargs for from_pretrained
|
| 576 |
+
from_kwargs = {
|
| 577 |
+
'cache_dir': self.cache_dir if not model_path else None,
|
| 578 |
+
'low_cpu_mem_usage': low_cpu,
|
| 579 |
+
'device_map': device_map,
|
| 580 |
+
}
|
| 581 |
+
if dtype is not None:
|
| 582 |
+
from_kwargs['dtype'] = dtype
|
| 583 |
+
|
| 584 |
+
# First attempt: load directly to target (CUDA if available)
|
| 585 |
+
try:
|
| 586 |
+
self.rtdetr_model = RTDetrForObjectDetection.from_pretrained(
|
| 587 |
+
model_path if model_path else repo_id,
|
| 588 |
+
**from_kwargs,
|
| 589 |
+
)
|
| 590 |
+
except Exception as primary_err:
|
| 591 |
+
# Fallback to a simple CPU load (no device move) if CUDA path fails
|
| 592 |
+
logger.warning(f"RT-DETR primary load failed ({primary_err}); retrying on CPU...")
|
| 593 |
+
from_kwargs_fallback = {
|
| 594 |
+
'cache_dir': self.cache_dir if not model_path else None,
|
| 595 |
+
'low_cpu_mem_usage': False,
|
| 596 |
+
'device_map': None,
|
| 597 |
+
}
|
| 598 |
+
if TORCH_AVAILABLE:
|
| 599 |
+
from_kwargs_fallback['dtype'] = torch.float32
|
| 600 |
+
self.rtdetr_model = RTDetrForObjectDetection.from_pretrained(
|
| 601 |
+
model_path if model_path else repo_id,
|
| 602 |
+
**from_kwargs_fallback,
|
| 603 |
+
)
|
| 604 |
+
|
| 605 |
+
# Optional dynamic quantization for linear layers (CPU only)
|
| 606 |
+
if self.quantize_enabled and TORCH_AVAILABLE and (not gpu_available):
|
| 607 |
+
try:
|
| 608 |
+
try:
|
| 609 |
+
import torch.ao.quantization as tq
|
| 610 |
+
quantize_dynamic = tq.quantize_dynamic # type: ignore
|
| 611 |
+
except Exception:
|
| 612 |
+
import torch.quantization as tq # type: ignore
|
| 613 |
+
quantize_dynamic = tq.quantize_dynamic # type: ignore
|
| 614 |
+
self.rtdetr_model = quantize_dynamic(self.rtdetr_model, {torch.nn.Linear}, dtype=torch.qint8)
|
| 615 |
+
logger.info("🔻 Applied dynamic INT8 quantization to RT-DETR linear layers (CPU)")
|
| 616 |
+
except Exception as qe:
|
| 617 |
+
logger.warning(f"RT-DETR dynamic quantization skipped: {qe}")
|
| 618 |
+
|
| 619 |
+
# Finalize
|
| 620 |
+
self.rtdetr_model.eval()
|
| 621 |
+
|
| 622 |
+
# Sanity check: ensure no parameter is left on 'meta' device
|
| 623 |
+
try:
|
| 624 |
+
for n, p in self.rtdetr_model.named_parameters():
|
| 625 |
+
dev = getattr(p, 'device', None)
|
| 626 |
+
if dev is not None and getattr(dev, 'type', '') == 'meta':
|
| 627 |
+
raise RuntimeError(f"Parameter {n} is on 'meta' device after load")
|
| 628 |
+
except Exception as e:
|
| 629 |
+
logger.error(f"RT-DETR load sanity check failed: {e}")
|
| 630 |
+
self.rtdetr_loaded = False
|
| 631 |
+
return False
|
| 632 |
+
|
| 633 |
+
# Publish shared cache
|
| 634 |
+
BubbleDetector._rtdetr_shared_model = self.rtdetr_model
|
| 635 |
+
BubbleDetector._rtdetr_shared_processor = self.rtdetr_processor
|
| 636 |
+
BubbleDetector._rtdetr_loaded = True
|
| 637 |
+
BubbleDetector._rtdetr_repo_id = repo_id
|
| 638 |
+
|
| 639 |
+
self.rtdetr_loaded = True
|
| 640 |
+
|
| 641 |
+
# Save the model ID that was used
|
| 642 |
+
self.config['rtdetr_loaded'] = True
|
| 643 |
+
self.config['rtdetr_model_id'] = repo_id
|
| 644 |
+
self._save_config()
|
| 645 |
+
|
| 646 |
+
loc = 'CUDA' if gpu_available else 'CPU'
|
| 647 |
+
logger.info(f"✅ RT-DETR model loaded successfully ({loc})")
|
| 648 |
+
logger.info(" Classes: Empty bubbles, Text bubbles, Free text")
|
| 649 |
+
|
| 650 |
+
# Auto-convert to ONNX for RT-DETR only if explicitly enabled
|
| 651 |
+
if os.environ.get('AUTO_CONVERT_RTDETR_ONNX', 'false').lower() == 'true':
|
| 652 |
+
onnx_path = os.path.join(self.cache_dir, 'rtdetr_comic.onnx')
|
| 653 |
+
if self.convert_to_onnx('rtdetr', onnx_path):
|
| 654 |
+
logger.info("🚀 RT-DETR converted to ONNX for faster inference")
|
| 655 |
+
# Store ONNX path for later use
|
| 656 |
+
self.config['rtdetr_onnx_path'] = onnx_path
|
| 657 |
+
self._save_config()
|
| 658 |
+
# Optionally quantize ONNX for reduced RAM
|
| 659 |
+
if self.onnx_quantize_enabled:
|
| 660 |
+
try:
|
| 661 |
+
from onnxruntime.quantization import quantize_dynamic, QuantType
|
| 662 |
+
quant_path = os.path.splitext(onnx_path)[0] + ".int8.onnx"
|
| 663 |
+
if not os.path.exists(quant_path) or os.environ.get('FORCE_ONNX_REBUILD', 'false').lower() == 'true':
|
| 664 |
+
logger.info("🔻 Quantizing RT-DETR ONNX to INT8 (dynamic)...")
|
| 665 |
+
quantize_dynamic(model_input=onnx_path, model_output=quant_path, weight_type=QuantType.QInt8, op_types_to_quantize=['Conv', 'MatMul'])
|
| 666 |
+
self.config['rtdetr_onnx_quantized_path'] = quant_path
|
| 667 |
+
self._save_config()
|
| 668 |
+
logger.info(f"✅ Quantized RT-DETR ONNX saved to: {quant_path}")
|
| 669 |
+
except Exception as qe:
|
| 670 |
+
logger.warning(f"ONNX quantization for RT-DETR skipped: {qe}")
|
| 671 |
+
else:
|
| 672 |
+
logger.info("ℹ️ Skipping RT-DETR ONNX export (converter not supported in current environment)")
|
| 673 |
+
|
| 674 |
+
return True
|
| 675 |
+
except Exception as e:
|
| 676 |
+
logger.error(f"❌ Failed to load RT-DETR: {e}")
|
| 677 |
+
self.rtdetr_loaded = False
|
| 678 |
+
return False
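# Illustrative sketch for the RT-DETR path: load the default HuggingFace checkpoint
# (or a custom model_id) and inspect the three detection classes. "page_001.png"
# is a placeholder image path.
#
#     detector = BubbleDetector()
#     if detector.load_rtdetr_model():  # defaults to ogkalu/comic-text-and-bubble-detector
#         dets = detector.detect_with_rtdetr(image_path="page_001.png", return_all_bubbles=False)
#         print(len(dets['bubbles']), len(dets['text_bubbles']), len(dets['text_free']))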
|
| 679 |
+
|
| 680 |
+
def check_rtdetr_available(self, model_id: str = None) -> bool:
|
| 681 |
+
"""
|
| 682 |
+
Check if RT-DETR model is available (cached).
|
| 683 |
+
|
| 684 |
+
Args:
|
| 685 |
+
model_id: Optional HuggingFace model ID
|
| 686 |
+
|
| 687 |
+
Returns:
|
| 688 |
+
True if model is cached and available
|
| 689 |
+
"""
|
| 690 |
+
try:
|
| 691 |
+
from pathlib import Path
|
| 692 |
+
|
| 693 |
+
# Use provided model_id or default
|
| 694 |
+
repo_id = model_id if model_id else self.rtdetr_repo
|
| 695 |
+
|
| 696 |
+
# Check HuggingFace cache
|
| 697 |
+
cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
|
| 698 |
+
model_id_formatted = repo_id.replace("/", "--")
|
| 699 |
+
|
| 700 |
+
# Look for model folder
|
| 701 |
+
model_folders = list(cache_dir.glob(f"models--{model_id_formatted}*"))
|
| 702 |
+
|
| 703 |
+
if model_folders:
|
| 704 |
+
for folder in model_folders:
|
| 705 |
+
if (folder / "snapshots").exists():
|
| 706 |
+
snapshots = list((folder / "snapshots").iterdir())
|
| 707 |
+
if snapshots:
|
| 708 |
+
return True
|
| 709 |
+
|
| 710 |
+
return False
|
| 711 |
+
|
| 712 |
+
except Exception:
|
| 713 |
+
return False
|
| 714 |
+
|
| 715 |
+
def detect_bubbles(self,
|
| 716 |
+
image_path: str,
|
| 717 |
+
confidence: float = None,
|
| 718 |
+
iou_threshold: float = None,
|
| 719 |
+
max_detections: int = None,
|
| 720 |
+
use_rtdetr: bool = None) -> List[Tuple[int, int, int, int]]:
|
| 721 |
+
"""
|
| 722 |
+
Detect speech bubbles in an image (backward compatible method).
|
| 723 |
+
|
| 724 |
+
Args:
|
| 725 |
+
image_path: Path to image file
|
| 726 |
+
confidence: Minimum confidence threshold (0-1)
|
| 727 |
+
iou_threshold: IOU threshold for NMS (0-1)
|
| 728 |
+
max_detections: Maximum number of detections to return
|
| 729 |
+
use_rtdetr: If True, use RT-DETR instead of YOLOv8 (if available)
|
| 730 |
+
|
| 731 |
+
Returns:
|
| 732 |
+
List of bubble bounding boxes as (x, y, width, height) tuples
|
| 733 |
+
"""
|
| 734 |
+
# Check for stop at start
|
| 735 |
+
if self._check_stop():
|
| 736 |
+
self._log("⏹️ Bubble detection stopped by user", "warning")
|
| 737 |
+
return []
|
| 738 |
+
|
| 739 |
+
# Decide which model to use
|
| 740 |
+
if use_rtdetr is None:
|
| 741 |
+
# Auto-select: prefer RT-DETR if available
|
| 742 |
+
use_rtdetr = self.rtdetr_loaded
|
| 743 |
+
|
| 744 |
+
if use_rtdetr:
|
| 745 |
+
# Prefer ONNX backend if available, else PyTorch
|
| 746 |
+
if getattr(self, 'rtdetr_onnx_loaded', False):
|
| 747 |
+
results = self.detect_with_rtdetr_onnx(
|
| 748 |
+
image_path=image_path,
|
| 749 |
+
confidence=confidence,
|
| 750 |
+
return_all_bubbles=True
|
| 751 |
+
)
|
| 752 |
+
return results
|
| 753 |
+
if self.rtdetr_loaded:
|
| 754 |
+
results = self.detect_with_rtdetr(
|
| 755 |
+
image_path=image_path,
|
| 756 |
+
confidence=confidence,
|
| 757 |
+
return_all_bubbles=True
|
| 758 |
+
)
|
| 759 |
+
return results
|
| 760 |
+
|
| 761 |
+
# Original YOLOv8 detection
|
| 762 |
+
if not self.model_loaded:
|
| 763 |
+
logger.error("No model loaded. Call load_model() first.")
|
| 764 |
+
return []
|
| 765 |
+
|
| 766 |
+
# Use defaults if not specified
|
| 767 |
+
confidence = confidence or self.default_confidence
|
| 768 |
+
iou_threshold = iou_threshold or self.default_iou_threshold
|
| 769 |
+
max_detections = max_detections or self.default_max_detections
|
| 770 |
+
|
| 771 |
+
try:
|
| 772 |
+
# Load image
|
| 773 |
+
image = cv2.imread(image_path)
|
| 774 |
+
if image is None:
|
| 775 |
+
logger.error(f"Failed to load image: {image_path}")
|
| 776 |
+
return []
|
| 777 |
+
|
| 778 |
+
h, w = image.shape[:2]
|
| 779 |
+
self._log(f"🔍 Detecting bubbles in {w}x{h} image")
|
| 780 |
+
|
| 781 |
+
# Check for stop before inference
|
| 782 |
+
if self._check_stop():
|
| 783 |
+
self._log("⏹️ Bubble detection inference stopped by user", "warning")
|
| 784 |
+
return []
|
| 785 |
+
|
| 786 |
+
if self.model_type == 'yolo':
|
| 787 |
+
# YOLOv8 inference
|
| 788 |
+
results = self.model(
|
| 789 |
+
image_path,
|
| 790 |
+
conf=confidence,
|
| 791 |
+
iou=iou_threshold,
|
| 792 |
+
max_det=min(max_detections, getattr(self, 'max_det_yolo', max_detections)),
|
| 793 |
+
verbose=False
|
| 794 |
+
)
|
| 795 |
+
|
| 796 |
+
bubbles = []
|
| 797 |
+
for r in results:
|
| 798 |
+
if r.boxes is not None:
|
| 799 |
+
for box in r.boxes:
|
| 800 |
+
# Get box coordinates
|
| 801 |
+
x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
|
| 802 |
+
x, y = int(x1), int(y1)
|
| 803 |
+
width = int(x2 - x1)
|
| 804 |
+
height = int(y2 - y1)
|
| 805 |
+
|
| 806 |
+
# Get confidence
|
| 807 |
+
conf = float(box.conf[0])
|
| 808 |
+
|
| 809 |
+
# Add to list
|
| 810 |
+
if len(bubbles) < max_detections:
|
| 811 |
+
bubbles.append((x, y, width, height))
|
| 812 |
+
|
| 813 |
+
logger.debug(f" Bubble: ({x},{y}) {width}x{height} conf={conf:.2f}")
|
| 814 |
+
|
| 815 |
+
elif self.model_type == 'onnx':
|
| 816 |
+
# ONNX inference
|
| 817 |
+
bubbles = self._detect_with_onnx(image, confidence, iou_threshold, max_detections)
|
| 818 |
+
|
| 819 |
+
elif self.model_type == 'torch':
|
| 820 |
+
# TorchScript inference
|
| 821 |
+
bubbles = self._detect_with_torchscript(image, confidence, iou_threshold, max_detections)
|
| 822 |
+
|
| 823 |
+
else:
|
| 824 |
+
logger.error(f"Unknown model type: {self.model_type}")
|
| 825 |
+
return []
|
| 826 |
+
|
| 827 |
+
logger.info(f"✅ Detected {len(bubbles)} speech bubbles")
|
| 828 |
+
time.sleep(0.1) # Brief pause for stability
|
| 829 |
+
logger.debug("💤 Bubble detection pausing briefly for stability")
|
| 830 |
+
return bubbles
|
| 831 |
+
|
| 832 |
+
except Exception as e:
|
| 833 |
+
logger.error(f"Detection failed: {e}")
|
| 834 |
+
logger.error(traceback.format_exc())
|
| 835 |
+
return []
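# Illustrative sketch for detect_bubbles(): thresholds fall back to the configured
# defaults when omitted, and use_rtdetr can force a specific backend. Paths are
# placeholders.
#
#     boxes = detector.detect_bubbles(
#         "page_001.png",
#         confidence=0.4,
#         iou_threshold=0.5,
#         max_detections=50,
#         use_rtdetr=False,   # force the YOLOv8 path even if RT-DETR is loaded
#     )
#     # Each entry is (x, y, width, height) in pixel coordinates.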
|
| 836 |
+
|
| 837 |
+
def detect_with_rtdetr(self,
|
| 838 |
+
image_path: str = None,
|
| 839 |
+
image: np.ndarray = None,
|
| 840 |
+
confidence: float = None,
|
| 841 |
+
return_all_bubbles: bool = False) -> Any:
|
| 842 |
+
"""
|
| 843 |
+
Detect using RT-DETR model with 3-class detection (PyTorch backend).
|
| 844 |
+
|
| 845 |
+
Args:
|
| 846 |
+
image_path: Path to image file
|
| 847 |
+
image: Image array (BGR format)
|
| 848 |
+
confidence: Confidence threshold
|
| 849 |
+
return_all_bubbles: If True, return list of bubble boxes (for compatibility)
|
| 850 |
+
If False, return dict with all classes
|
| 851 |
+
|
| 852 |
+
Returns:
|
| 853 |
+
List of bubbles if return_all_bubbles=True, else dict with classes
|
| 854 |
+
"""
|
| 855 |
+
# Check for stop at start
|
| 856 |
+
if self._check_stop():
|
| 857 |
+
self._log("⏹️ RT-DETR detection stopped by user", "warning")
|
| 858 |
+
if return_all_bubbles:
|
| 859 |
+
return []
|
| 860 |
+
return {'bubbles': [], 'text_bubbles': [], 'text_free': []}
|
| 861 |
+
|
| 862 |
+
if not self.rtdetr_loaded:
|
| 863 |
+
self._log("RT-DETR not loaded. Call load_rtdetr_model() first.", "warning")
|
| 864 |
+
if return_all_bubbles:
|
| 865 |
+
return []
|
| 866 |
+
return {'bubbles': [], 'text_bubbles': [], 'text_free': []}
|
| 867 |
+
|
| 868 |
+
confidence = confidence or self.default_confidence
|
| 869 |
+
|
| 870 |
+
try:
|
| 871 |
+
# Load image
|
| 872 |
+
if image_path:
|
| 873 |
+
image = cv2.imread(image_path)
|
| 874 |
+
elif image is None:
|
| 875 |
+
logger.error("No image provided")
|
| 876 |
+
if return_all_bubbles:
|
| 877 |
+
return []
|
| 878 |
+
return {'bubbles': [], 'text_bubbles': [], 'text_free': []}
|
| 879 |
+
|
| 880 |
+
# Convert BGR to RGB for PIL
|
| 881 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 882 |
+
pil_image = Image.fromarray(image_rgb)
|
| 883 |
+
|
| 884 |
+
# Prepare image for model
|
| 885 |
+
inputs = self.rtdetr_processor(images=pil_image, return_tensors="pt")
|
| 886 |
+
|
| 887 |
+
# Move inputs to the same device as the model and match model dtype for floating tensors
|
| 888 |
+
model_device = next(self.rtdetr_model.parameters()).device if self.rtdetr_model is not None else (torch.device('cpu') if TORCH_AVAILABLE else 'cpu')
|
| 889 |
+
model_dtype = None
|
| 890 |
+
if TORCH_AVAILABLE and self.rtdetr_model is not None:
|
| 891 |
+
try:
|
| 892 |
+
model_dtype = next(self.rtdetr_model.parameters()).dtype
|
| 893 |
+
except Exception:
|
| 894 |
+
model_dtype = None
|
| 895 |
+
|
| 896 |
+
if TORCH_AVAILABLE:
|
| 897 |
+
new_inputs = {}
|
| 898 |
+
for k, v in inputs.items():
|
| 899 |
+
if isinstance(v, torch.Tensor):
|
| 900 |
+
v = v.to(model_device)
|
| 901 |
+
if model_dtype is not None and torch.is_floating_point(v):
|
| 902 |
+
v = v.to(model_dtype)
|
| 903 |
+
new_inputs[k] = v
|
| 904 |
+
inputs = new_inputs
|
| 905 |
+
|
| 906 |
+
# Run inference with autocast when model is half/bfloat16 on CUDA
|
| 907 |
+
use_amp = TORCH_AVAILABLE and hasattr(model_device, 'type') and model_device.type == 'cuda' and (model_dtype in (torch.float16, torch.bfloat16))
|
| 908 |
+
autocast_dtype = model_dtype if model_dtype in (torch.float16, torch.bfloat16) else None
|
| 909 |
+
|
| 910 |
+
with torch.no_grad():
|
| 911 |
+
if use_amp and autocast_dtype is not None:
|
| 912 |
+
with torch.autocast('cuda', dtype=autocast_dtype):
|
| 913 |
+
outputs = self.rtdetr_model(**inputs)
|
| 914 |
+
else:
|
| 915 |
+
outputs = self.rtdetr_model(**inputs)
|
| 916 |
+
|
| 917 |
+
# Brief pause for stability after inference
|
| 918 |
+
time.sleep(0.1)
|
| 919 |
+
logger.debug("💤 RT-DETR inference pausing briefly for stability")
|
| 920 |
+
|
| 921 |
+
# Post-process results
|
| 922 |
+
target_sizes = torch.tensor([pil_image.size[::-1]]) if TORCH_AVAILABLE else None
|
| 923 |
+
if TORCH_AVAILABLE and hasattr(model_device, 'type') and model_device.type == "cuda":
|
| 924 |
+
target_sizes = target_sizes.to(model_device)
|
| 925 |
+
|
| 926 |
+
results = self.rtdetr_processor.post_process_object_detection(
|
| 927 |
+
outputs,
|
| 928 |
+
target_sizes=target_sizes,
|
| 929 |
+
threshold=confidence
|
| 930 |
+
)[0]
|
| 931 |
+
|
| 932 |
+
# Apply per-detector cap if configured
|
| 933 |
+
cap = getattr(self, 'max_det_rtdetr', self.default_max_detections)
|
| 934 |
+
if cap and len(results['boxes']) > cap:
|
| 935 |
+
# Keep top-scoring first
|
| 936 |
+
scores = results['scores']
|
| 937 |
+
top_idx = scores.topk(k=cap).indices if hasattr(scores, 'topk') else range(cap)
|
| 938 |
+
results = {
|
| 939 |
+
'boxes': [results['boxes'][i] for i in top_idx],
|
| 940 |
+
'scores': [results['scores'][i] for i in top_idx],
|
| 941 |
+
'labels': [results['labels'][i] for i in top_idx]
|
| 942 |
+
}
|
| 943 |
+
|
| 944 |
+
logger.info(f"📊 RT-DETR found {len(results['boxes'])} detections above {confidence:.2f} confidence")
|
| 945 |
+
|
| 946 |
+
# Organize detections by class
|
| 947 |
+
detections = {
|
| 948 |
+
'bubbles': [], # Empty speech bubbles
|
| 949 |
+
'text_bubbles': [], # Bubbles with text
|
| 950 |
+
'text_free': [] # Text without bubbles
|
| 951 |
+
}
|
| 952 |
+
|
| 953 |
+
for box, score, label in zip(results['boxes'], results['scores'], results['labels']):
|
| 954 |
+
x1, y1, x2, y2 = map(int, box.tolist())
|
| 955 |
+
width = x2 - x1
|
| 956 |
+
height = y2 - y1
|
| 957 |
+
|
| 958 |
+
# Store as (x, y, width, height) to match YOLOv8 format
|
| 959 |
+
bbox = (x1, y1, width, height)
|
| 960 |
+
|
| 961 |
+
label_id = label.item()
|
| 962 |
+
if label_id == self.CLASS_BUBBLE:
|
| 963 |
+
detections['bubbles'].append(bbox)
|
| 964 |
+
elif label_id == self.CLASS_TEXT_BUBBLE:
|
| 965 |
+
detections['text_bubbles'].append(bbox)
|
| 966 |
+
elif label_id == self.CLASS_TEXT_FREE:
|
| 967 |
+
detections['text_free'].append(bbox)
|
| 968 |
+
|
| 969 |
+
# Stop early if we hit the configured cap across all classes
|
| 970 |
+
total_count = len(detections['bubbles']) + len(detections['text_bubbles']) + len(detections['text_free'])
|
| 971 |
+
if total_count >= (self.config.get('manga_settings', {}).get('ocr', {}).get('bubble_max_detections', self.default_max_detections) if isinstance(self.config, dict) else self.default_max_detections):
|
| 972 |
+
break
|
| 973 |
+
|
| 974 |
+
# Log results
|
| 975 |
+
total = len(detections['bubbles']) + len(detections['text_bubbles']) + len(detections['text_free'])
|
| 976 |
+
logger.info(f"✅ RT-DETR detected {total} objects:")
|
| 977 |
+
logger.info(f" - Empty bubbles: {len(detections['bubbles'])}")
|
| 978 |
+
logger.info(f" - Text bubbles: {len(detections['text_bubbles'])}")
|
| 979 |
+
logger.info(f" - Free text: {len(detections['text_free'])}")
|
| 980 |
+
|
| 981 |
+
# Return format based on compatibility mode
|
| 982 |
+
if return_all_bubbles:
|
| 983 |
+
# Return all bubbles (empty + with text) for backward compatibility
|
| 984 |
+
all_bubbles = detections['bubbles'] + detections['text_bubbles']
|
| 985 |
+
return all_bubbles
|
| 986 |
+
else:
|
| 987 |
+
return detections
|
| 988 |
+
|
| 989 |
+
except Exception as e:
|
| 990 |
+
logger.error(f"RT-DETR detection failed: {e}")
|
| 991 |
+
logger.error(traceback.format_exc())
|
| 992 |
+
if return_all_bubbles:
|
| 993 |
+
return []
|
| 994 |
+
return {'bubbles': [], 'text_bubbles': [], 'text_free': []}
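# Illustrative sketch of consuming the class-keyed result. With
# return_all_bubbles=False the method returns a dict with 'bubbles',
# 'text_bubbles' and 'text_free' lists of (x, y, w, h) tuples.
# The image path is a placeholder.
#
#     img = cv2.imread("page_001.png")
#     dets = detector.detect_with_rtdetr(image=img, confidence=0.35)
#     for x, y, w, h in dets['text_bubbles']:
#         crop = img[y:y + h, x:x + w]  # region to hand to OCR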
|
| 995 |
+
|
| 996 |
+
def detect_all_text_regions(self, image_path: str = None, image: np.ndarray = None) -> List[Tuple[int, int, int, int]]:
|
| 997 |
+
"""
|
| 998 |
+
Detect all text regions using RT-DETR (both in bubbles and free text).
|
| 999 |
+
|
| 1000 |
+
Returns:
|
| 1001 |
+
List of bounding boxes for all text regions
|
| 1002 |
+
"""
|
| 1003 |
+
if not self.rtdetr_loaded:
|
| 1004 |
+
logger.warning("RT-DETR required for text detection")
|
| 1005 |
+
return []
|
| 1006 |
+
|
| 1007 |
+
detections = self.detect_with_rtdetr(image_path=image_path, image=image, return_all_bubbles=False)
|
| 1008 |
+
|
| 1009 |
+
# Combine text bubbles and free text
|
| 1010 |
+
all_text = detections['text_bubbles'] + detections['text_free']
|
| 1011 |
+
|
| 1012 |
+
logger.info(f"📝 Found {len(all_text)} text regions total")
|
| 1013 |
+
return all_text
|
| 1014 |
+
|
| 1015 |
+
def _detect_with_onnx(self, image: np.ndarray, confidence: float,
|
| 1016 |
+
iou_threshold: float, max_detections: int) -> List[Tuple[int, int, int, int]]:
|
| 1017 |
+
"""Run detection using ONNX model."""
|
| 1018 |
+
# Preprocess image
|
| 1019 |
+
img_size = 640 # Standard YOLOv8 input size
|
| 1020 |
+
img_resized = cv2.resize(image, (img_size, img_size))
|
| 1021 |
+
img_norm = img_resized.astype(np.float32) / 255.0
|
| 1022 |
+
img_transposed = np.transpose(img_norm, (2, 0, 1))
|
| 1023 |
+
img_batch = np.expand_dims(img_transposed, axis=0)
|
| 1024 |
+
|
| 1025 |
+
# Run inference
|
| 1026 |
+
input_name = self.onnx_session.get_inputs()[0].name
|
| 1027 |
+
outputs = self.onnx_session.run(None, {input_name: img_batch})
|
| 1028 |
+
|
| 1029 |
+
# Process outputs (YOLOv8 format)
|
| 1030 |
+
predictions = outputs[0][0] # Remove batch dimension
|
| 1031 |
+
|
| 1032 |
+
# Filter by confidence and apply NMS
|
| 1033 |
+
bubbles = []
|
| 1034 |
+
boxes = []
|
| 1035 |
+
scores = []
|
| 1036 |
+
|
| 1037 |
+
for pred in predictions.T: # Transpose to get predictions per detection
|
| 1038 |
+
if len(pred) >= 5:
|
| 1039 |
+
x_center, y_center, width, height, obj_conf = pred[:5]
|
| 1040 |
+
|
| 1041 |
+
if obj_conf >= confidence:
|
| 1042 |
+
# Convert to corner coordinates
|
| 1043 |
+
x1 = x_center - width / 2
|
| 1044 |
+
y1 = y_center - height / 2
|
| 1045 |
+
|
| 1046 |
+
# Scale to original image size
|
| 1047 |
+
h, w = image.shape[:2]
|
| 1048 |
+
x1 = int(x1 * w / img_size)
|
| 1049 |
+
y1 = int(y1 * h / img_size)
|
| 1050 |
+
width = int(width * w / img_size)
|
| 1051 |
+
height = int(height * h / img_size)
|
| 1052 |
+
|
| 1053 |
+
boxes.append([x1, y1, width, height])  # cv2.dnn.NMSBoxes expects (x, y, w, h) rects
|
| 1054 |
+
scores.append(float(obj_conf))
|
| 1055 |
+
|
| 1056 |
+
# Apply NMS
|
| 1057 |
+
if boxes:
|
| 1058 |
+
indices = cv2.dnn.NMSBoxes(boxes, scores, confidence, iou_threshold)
|
| 1059 |
+
if len(indices) > 0:
|
| 1060 |
+
indices = indices.flatten()[:max_detections]
|
| 1061 |
+
for i in indices:
x, y, bw, bh = boxes[i]
bubbles.append((x, y, bw, bh))
|
| 1064 |
+
|
| 1065 |
+
return bubbles
|
| 1066 |
+
|
| 1067 |
+
def _detect_with_torchscript(self, image: np.ndarray, confidence: float,
|
| 1068 |
+
iou_threshold: float, max_detections: int) -> List[Tuple[int, int, int, int]]:
|
| 1069 |
+
"""Run detection using TorchScript model."""
|
| 1070 |
+
# Similar to ONNX but using PyTorch tensors
|
| 1071 |
+
img_size = 640
|
| 1072 |
+
img_resized = cv2.resize(image, (img_size, img_size))
|
| 1073 |
+
img_norm = img_resized.astype(np.float32) / 255.0
|
| 1074 |
+
img_tensor = torch.from_numpy(img_norm).permute(2, 0, 1).unsqueeze(0)
|
| 1075 |
+
|
| 1076 |
+
if self.use_gpu:
|
| 1077 |
+
img_tensor = img_tensor.cuda()
|
| 1078 |
+
|
| 1079 |
+
with torch.no_grad():
|
| 1080 |
+
outputs = self.model(img_tensor)
|
| 1081 |
+
|
| 1082 |
+
# Process outputs similar to ONNX
|
| 1083 |
+
# Implementation depends on exact model output format
|
| 1084 |
+
# This is a placeholder - adjust based on your model
|
| 1085 |
+
return []
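# A possible decoding sketch for the TorchScript placeholder above, assuming the
# exported model emits YOLO-style (num_attrs, num_preds) tensors like the ONNX
# path. This mirrors _detect_with_onnx and is not a confirmed output format.
#
#     preds = outputs[0].cpu().numpy()[0]          # (attrs, N) for a YOLO-style export
#     for pred in preds.T:
#         x_c, y_c, bw, bh, score = pred[:5]
#         if score >= confidence:
#             ...  # scale back to the original image size and collect boxes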
|
| 1086 |
+
|
| 1087 |
+
def visualize_detections(self, image_path: str, bubbles: List[Tuple[int, int, int, int]] = None,
|
| 1088 |
+
output_path: str = None, use_rtdetr: bool = False) -> np.ndarray:
|
| 1089 |
+
"""
|
| 1090 |
+
Visualize detected bubbles on the image.
|
| 1091 |
+
|
| 1092 |
+
Args:
|
| 1093 |
+
image_path: Path to original image
|
| 1094 |
+
bubbles: List of bubble bounding boxes (if None, will detect)
|
| 1095 |
+
output_path: Optional path to save visualization
|
| 1096 |
+
use_rtdetr: Use RT-DETR for visualization with class colors
|
| 1097 |
+
|
| 1098 |
+
Returns:
|
| 1099 |
+
Image with drawn bounding boxes
|
| 1100 |
+
"""
|
| 1101 |
+
image = cv2.imread(image_path)
|
| 1102 |
+
if image is None:
|
| 1103 |
+
logger.error(f"Failed to load image: {image_path}")
|
| 1104 |
+
return None
|
| 1105 |
+
|
| 1106 |
+
vis_image = image.copy()
|
| 1107 |
+
|
| 1108 |
+
if use_rtdetr and self.rtdetr_loaded:
|
| 1109 |
+
# RT-DETR visualization with different colors per class
|
| 1110 |
+
detections = self.detect_with_rtdetr(image_path=image_path, return_all_bubbles=False)
|
| 1111 |
+
|
| 1112 |
+
# Colors for each class
|
| 1113 |
+
colors = {
|
| 1114 |
+
'bubbles': (0, 255, 0), # Green for empty bubbles
|
| 1115 |
+
'text_bubbles': (255, 0, 0), # Blue for text bubbles
|
| 1116 |
+
'text_free': (0, 0, 255) # Red for free text
|
| 1117 |
+
}
|
| 1118 |
+
|
| 1119 |
+
# Draw detections
|
| 1120 |
+
for class_name, bboxes in detections.items():
|
| 1121 |
+
color = colors[class_name]
|
| 1122 |
+
|
| 1123 |
+
for i, (x, y, w, h) in enumerate(bboxes):
|
| 1124 |
+
# Draw rectangle
|
| 1125 |
+
cv2.rectangle(vis_image, (x, y), (x + w, y + h), color, 2)
|
| 1126 |
+
|
| 1127 |
+
# Add label
|
| 1128 |
+
label = f"{class_name.replace('_', ' ').title()} {i+1}"
|
| 1129 |
+
label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
|
| 1130 |
+
cv2.rectangle(vis_image, (x, y - label_size[1] - 4),
|
| 1131 |
+
(x + label_size[0], y), color, -1)
|
| 1132 |
+
cv2.putText(vis_image, label, (x, y - 2),
|
| 1133 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
|
| 1134 |
+
else:
|
| 1135 |
+
# Original YOLOv8 visualization
|
| 1136 |
+
if bubbles is None:
|
| 1137 |
+
bubbles = self.detect_bubbles(image_path)
|
| 1138 |
+
|
| 1139 |
+
# Draw bounding boxes
|
| 1140 |
+
for i, (x, y, w, h) in enumerate(bubbles):
|
| 1141 |
+
# Draw rectangle
|
| 1142 |
+
color = (0, 255, 0) # Green
|
| 1143 |
+
thickness = 2
|
| 1144 |
+
cv2.rectangle(vis_image, (x, y), (x + w, y + h), color, thickness)
|
| 1145 |
+
|
| 1146 |
+
# Add label
|
| 1147 |
+
label = f"Bubble {i+1}"
|
| 1148 |
+
label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
|
| 1149 |
+
cv2.rectangle(vis_image, (x, y - label_size[1] - 4), (x + label_size[0], y), color, -1)
|
| 1150 |
+
cv2.putText(vis_image, label, (x, y - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
|
| 1151 |
+
|
| 1152 |
+
# Save if output path provided
|
| 1153 |
+
if output_path:
|
| 1154 |
+
cv2.imwrite(output_path, vis_image)
|
| 1155 |
+
logger.info(f"💾 Visualization saved to: {output_path}")
|
| 1156 |
+
|
| 1157 |
+
return vis_image
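# Illustrative sketch for visualize_detections(): draw either plain YOLO boxes or
# RT-DETR class-coloured boxes and save the overlay. File names are placeholders.
#
#     detector.visualize_detections(
#         "page_001.png",
#         output_path="page_001_annotated.png",
#         use_rtdetr=detector.rtdetr_loaded,
#     )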
|
| 1158 |
+
|
| 1159 |
+
def convert_to_onnx(self, model_path: str, output_path: str = None) -> bool:
|
| 1160 |
+
"""
|
| 1161 |
+
Convert a YOLOv8 or RT-DETR model to ONNX format.
|
| 1162 |
+
|
| 1163 |
+
Args:
|
| 1164 |
+
model_path: Path to model file or 'rtdetr' for loaded RT-DETR
|
| 1165 |
+
output_path: Path for ONNX output (auto-generated if None)
|
| 1166 |
+
|
| 1167 |
+
Returns:
|
| 1168 |
+
True if conversion successful, False otherwise
|
| 1169 |
+
"""
|
| 1170 |
+
try:
|
| 1171 |
+
logger.info(f"🔄 Converting {model_path} to ONNX...")
|
| 1172 |
+
|
| 1173 |
+
# Generate output path if not provided
|
| 1174 |
+
if output_path is None:
|
| 1175 |
+
if model_path == 'rtdetr' and self.rtdetr_loaded:
|
| 1176 |
+
base_name = 'rtdetr_comic'
|
| 1177 |
+
else:
|
| 1178 |
+
base_name = Path(model_path).stem
|
| 1179 |
+
output_path = os.path.join(self.cache_dir, f"{base_name}.onnx")
|
| 1180 |
+
|
| 1181 |
+
# Check if already exists
|
| 1182 |
+
if os.path.exists(output_path) and not os.environ.get('FORCE_ONNX_REBUILD', 'false').lower() == 'true':
|
| 1183 |
+
logger.info(f"✅ ONNX model already exists: {output_path}")
|
| 1184 |
+
return True
|
| 1185 |
+
|
| 1186 |
+
# Handle RT-DETR conversion
|
| 1187 |
+
if model_path == 'rtdetr' and self.rtdetr_loaded:
|
| 1188 |
+
if not TORCH_AVAILABLE:
|
| 1189 |
+
logger.error("PyTorch required for RT-DETR ONNX conversion")
|
| 1190 |
+
return False
|
| 1191 |
+
|
| 1192 |
+
# RT-DETR specific conversion
|
| 1193 |
+
self.rtdetr_model.eval()
|
| 1194 |
+
|
| 1195 |
+
# Create dummy input (pixel values): BxCxHxW
|
| 1196 |
+
dummy_input = torch.randn(1, 3, 640, 640)
|
| 1197 |
+
if self.device == 'cuda':
|
| 1198 |
+
dummy_input = dummy_input.to('cuda')
|
| 1199 |
+
|
| 1200 |
+
# Wrap the model to return only tensors (logits, pred_boxes)
|
| 1201 |
+
class _RTDetrExportWrapper(torch.nn.Module):
|
| 1202 |
+
def __init__(self, mdl):
|
| 1203 |
+
super().__init__()
|
| 1204 |
+
self.mdl = mdl
|
| 1205 |
+
def forward(self, images):
|
| 1206 |
+
out = self.mdl(pixel_values=images)
|
| 1207 |
+
# Handle dict/ModelOutput/tuple outputs
|
| 1208 |
+
logits = None
|
| 1209 |
+
boxes = None
|
| 1210 |
+
try:
|
| 1211 |
+
if isinstance(out, dict):
|
| 1212 |
+
logits = out.get('logits', None)
|
| 1213 |
+
boxes = out.get('pred_boxes', out.get('boxes', None))
|
| 1214 |
+
else:
|
| 1215 |
+
logits = getattr(out, 'logits', None)
|
| 1216 |
+
boxes = getattr(out, 'pred_boxes', getattr(out, 'boxes', None))
|
| 1217 |
+
except Exception:
|
| 1218 |
+
pass
|
| 1219 |
+
if (logits is None or boxes is None) and isinstance(out, (tuple, list)) and len(out) >= 2:
|
| 1220 |
+
logits, boxes = out[0], out[1]
|
| 1221 |
+
return logits, boxes
|
| 1222 |
+
|
| 1223 |
+
wrapper = _RTDetrExportWrapper(self.rtdetr_model)
|
| 1224 |
+
if self.device == 'cuda':
|
| 1225 |
+
wrapper = wrapper.to('cuda')
|
| 1226 |
+
|
| 1227 |
+
# Try PyTorch 2.x dynamo_export first (more tolerant of newer aten ops)
|
| 1228 |
+
try:
|
| 1229 |
+
success = False
|
| 1230 |
+
try:
|
| 1231 |
+
from torch.onnx import dynamo_export
|
| 1232 |
+
exp = dynamo_export(wrapper, dummy_input)
|
| 1237 |
+
# exp may have save(); otherwise, it may expose model_proto
|
| 1238 |
+
try:
|
| 1239 |
+
exp.save(output_path) # type: ignore
|
| 1240 |
+
success = True
|
| 1241 |
+
except Exception:
|
| 1242 |
+
try:
|
| 1243 |
+
import onnx as _onnx
|
| 1244 |
+
_onnx.save(exp.model_proto, output_path) # type: ignore
|
| 1245 |
+
success = True
|
| 1246 |
+
except Exception as _se:
|
| 1247 |
+
logger.warning(f"dynamo_export produced model but could not save: {_se}")
|
| 1248 |
+
except Exception as de:
|
| 1249 |
+
logger.warning(f"dynamo_export failed; falling back to legacy exporter: {de}")
|
| 1250 |
+
if success:
|
| 1251 |
+
logger.info(f"✅ RT-DETR ONNX saved to: {output_path} (dynamo_export)")
|
| 1252 |
+
return True
|
| 1253 |
+
except Exception as de2:
|
| 1254 |
+
logger.warning(f"dynamo_export path error: {de2}")
|
| 1255 |
+
|
| 1256 |
+
# Legacy exporter with opset fallback
|
| 1257 |
+
last_err = None
|
| 1258 |
+
for opset in [19, 18, 17, 16, 15, 14, 13]:
|
| 1259 |
+
try:
|
| 1260 |
+
torch.onnx.export(
|
| 1261 |
+
wrapper,
|
| 1262 |
+
dummy_input,
|
| 1263 |
+
output_path,
|
| 1264 |
+
export_params=True,
|
| 1265 |
+
opset_version=opset,
|
| 1266 |
+
do_constant_folding=True,
|
| 1267 |
+
input_names=['pixel_values'],
|
| 1268 |
+
output_names=['logits', 'boxes'],
|
| 1269 |
+
dynamic_axes={
|
| 1270 |
+
'pixel_values': {0: 'batch', 2: 'height', 3: 'width'},
|
| 1271 |
+
'logits': {0: 'batch'},
|
| 1272 |
+
'boxes': {0: 'batch'}
|
| 1273 |
+
}
|
| 1274 |
+
)
|
| 1275 |
+
logger.info(f"✅ RT-DETR ONNX saved to: {output_path} (opset {opset})")
|
| 1276 |
+
return True
|
| 1277 |
+
except Exception as _e:
|
| 1278 |
+
last_err = _e
|
| 1279 |
+
try:
|
| 1280 |
+
msg = str(_e)
|
| 1281 |
+
except Exception:
|
| 1282 |
+
msg = ''
|
| 1283 |
+
logger.warning(f"RT-DETR ONNX export failed at opset {opset}: {msg}")
|
| 1284 |
+
continue
|
| 1285 |
+
|
| 1286 |
+
logger.error(f"All RT-DETR ONNX export attempts failed. Last error: {last_err}")
|
| 1287 |
+
return False
|
| 1288 |
+
|
| 1289 |
+
# Handle YOLOv8 conversion - FIXED
|
| 1290 |
+
elif YOLO_AVAILABLE and os.path.exists(model_path):
|
| 1291 |
+
logger.info(f"Loading YOLOv8 model from: {model_path}")
|
| 1292 |
+
|
| 1293 |
+
# Load model
|
| 1294 |
+
model = YOLO(model_path)
|
| 1295 |
+
|
| 1296 |
+
# Export to ONNX - this returns the path to the exported model
|
| 1297 |
+
logger.info("Exporting to ONNX format...")
|
| 1298 |
+
exported_path = model.export(format='onnx', imgsz=640, simplify=True)
|
| 1299 |
+
|
| 1300 |
+
# exported_path could be a string or Path object
|
| 1301 |
+
exported_path = str(exported_path) if exported_path else None
|
| 1302 |
+
|
| 1303 |
+
if exported_path and os.path.exists(exported_path):
|
| 1304 |
+
# Move to desired location if different
|
| 1305 |
+
if exported_path != output_path:
|
| 1306 |
+
import shutil
|
| 1307 |
+
logger.info(f"Moving ONNX from {exported_path} to {output_path}")
|
| 1308 |
+
shutil.move(exported_path, output_path)
|
| 1309 |
+
|
| 1310 |
+
logger.info(f"✅ YOLOv8 ONNX saved to: {output_path}")
|
| 1311 |
+
return True
|
| 1312 |
+
else:
|
| 1313 |
+
# Fallback: check if it was created with expected name
|
| 1314 |
+
expected_onnx = model_path.replace('.pt', '.onnx')
|
| 1315 |
+
if os.path.exists(expected_onnx):
|
| 1316 |
+
if expected_onnx != output_path:
|
| 1317 |
+
import shutil
|
| 1318 |
+
shutil.move(expected_onnx, output_path)
|
| 1319 |
+
logger.info(f"✅ YOLOv8 ONNX saved to: {output_path}")
|
| 1320 |
+
return True
|
| 1321 |
+
else:
|
| 1322 |
+
logger.error(f"ONNX export failed - no output file found")
|
| 1323 |
+
return False
|
| 1324 |
+
|
| 1325 |
+
else:
|
| 1326 |
+
logger.error(f"Cannot convert {model_path}: Model not found or dependencies missing")
|
| 1327 |
+
return False
|
| 1328 |
+
|
| 1329 |
+
except Exception as e:
|
| 1330 |
+
logger.error(f"Conversion failed: {e}")
|
| 1331 |
+
# Avoid noisy full stack trace in production logs; return False gracefully
|
| 1332 |
+
return False
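# Illustrative sketch for convert_to_onnx(): the special value 'rtdetr' exports the
# currently loaded RT-DETR model, while a .pt path exports a YOLOv8 checkpoint.
# Output paths are placeholders.
#
#     detector.convert_to_onnx("models/comic-speech-bubble-detector-yolov8m.pt",
#                              "models/bubbles_yolov8m.onnx")
#     if detector.load_rtdetr_model():
#         detector.convert_to_onnx("rtdetr", "models/rtdetr_comic.onnx")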
|
| 1333 |
+
|
| 1334 |
+
def batch_detect(self, image_paths: List[str], **kwargs) -> Dict[str, List[Tuple[int, int, int, int]]]:
|
| 1335 |
+
"""
|
| 1336 |
+
Detect bubbles in multiple images.
|
| 1337 |
+
|
| 1338 |
+
Args:
|
| 1339 |
+
image_paths: List of image paths
|
| 1340 |
+
**kwargs: Detection parameters (confidence, iou_threshold, max_detections, use_rtdetr)
|
| 1341 |
+
|
| 1342 |
+
Returns:
|
| 1343 |
+
Dictionary mapping image paths to bubble lists
|
| 1344 |
+
"""
|
| 1345 |
+
results = {}
|
| 1346 |
+
|
| 1347 |
+
for i, image_path in enumerate(image_paths):
|
| 1348 |
+
logger.info(f"Processing image {i+1}/{len(image_paths)}: {os.path.basename(image_path)}")
|
| 1349 |
+
bubbles = self.detect_bubbles(image_path, **kwargs)
|
| 1350 |
+
results[image_path] = bubbles
|
| 1351 |
+
|
| 1352 |
+
return results
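# Illustrative sketch for batch_detect(): keyword arguments are forwarded to
# detect_bubbles() for every image. Paths are placeholders.
#
#     results = detector.batch_detect(["p1.png", "p2.png"], confidence=0.4, use_rtdetr=True)
#     for path, boxes in results.items():
#         print(path, len(boxes))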
|
| 1353 |
+
|
| 1354 |
+
def unload(self, release_shared: bool = False):
|
| 1355 |
+
"""Release model resources held by this detector instance.
|
| 1356 |
+
Args:
|
| 1357 |
+
release_shared: If True, also clear class-level shared RT-DETR caches.
|
| 1358 |
+
"""
|
| 1359 |
+
try:
|
| 1360 |
+
# Release instance-level models and sessions
|
| 1361 |
+
try:
|
| 1362 |
+
if getattr(self, 'onnx_session', None) is not None:
|
| 1363 |
+
self.onnx_session = None
|
| 1364 |
+
except Exception:
|
| 1365 |
+
pass
|
| 1366 |
+
try:
|
| 1367 |
+
if getattr(self, 'rtdetr_onnx_session', None) is not None:
|
| 1368 |
+
self.rtdetr_onnx_session = None
|
| 1369 |
+
except Exception:
|
| 1370 |
+
pass
|
| 1371 |
+
for attr in ['model', 'rtdetr_model', 'rtdetr_processor']:
|
| 1372 |
+
try:
|
| 1373 |
+
if hasattr(self, attr):
|
| 1374 |
+
setattr(self, attr, None)
|
| 1375 |
+
except Exception:
|
| 1376 |
+
pass
|
| 1377 |
+
for flag in ['model_loaded', 'rtdetr_loaded', 'rtdetr_onnx_loaded']:
|
| 1378 |
+
try:
|
| 1379 |
+
if hasattr(self, flag):
|
| 1380 |
+
setattr(self, flag, False)
|
| 1381 |
+
except Exception:
|
| 1382 |
+
pass
|
| 1383 |
+
|
| 1384 |
+
# Optional: release shared caches
|
| 1385 |
+
if release_shared:
|
| 1386 |
+
try:
|
| 1387 |
+
BubbleDetector._rtdetr_shared_model = None
|
| 1388 |
+
BubbleDetector._rtdetr_shared_processor = None
|
| 1389 |
+
BubbleDetector._rtdetr_loaded = False
|
| 1390 |
+
except Exception:
|
| 1391 |
+
pass
|
| 1392 |
+
|
| 1393 |
+
# Free CUDA cache and trigger GC
|
| 1394 |
+
try:
|
| 1395 |
+
if TORCH_AVAILABLE and torch is not None and torch.cuda.is_available():
|
| 1396 |
+
torch.cuda.empty_cache()
|
| 1397 |
+
except Exception:
|
| 1398 |
+
pass
|
| 1399 |
+
try:
|
| 1400 |
+
import gc
|
| 1401 |
+
gc.collect()
|
| 1402 |
+
except Exception:
|
| 1403 |
+
pass
|
| 1404 |
+
except Exception:
|
| 1405 |
+
# Best-effort only
|
| 1406 |
+
pass
|
| 1407 |
+
|
| 1408 |
+
def get_bubble_masks(self, image_path: str, bubbles: List[Tuple[int, int, int, int]]) -> np.ndarray:
|
| 1409 |
+
"""
|
| 1410 |
+
Create a mask image with bubble regions.
|
| 1411 |
+
|
| 1412 |
+
Args:
|
| 1413 |
+
image_path: Path to original image
|
| 1414 |
+
bubbles: List of bubble bounding boxes
|
| 1415 |
+
|
| 1416 |
+
Returns:
|
| 1417 |
+
Binary mask with bubble regions as white (255)
|
| 1418 |
+
"""
|
| 1419 |
+
image = cv2.imread(image_path)
|
| 1420 |
+
if image is None:
|
| 1421 |
+
return None
|
| 1422 |
+
|
| 1423 |
+
h, w = image.shape[:2]
|
| 1424 |
+
mask = np.zeros((h, w), dtype=np.uint8)
|
| 1425 |
+
|
| 1426 |
+
# Fill bubble regions
|
| 1427 |
+
for x, y, bw, bh in bubbles:
|
| 1428 |
+
cv2.rectangle(mask, (x, y), (x + bw, y + bh), 255, -1)
|
| 1429 |
+
|
| 1430 |
+
return mask
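# Illustrative sketch for get_bubble_masks(): build a binary mask from detections,
# e.g. for use as an inpainting mask. The image path is a placeholder.
#
#     boxes = detector.detect_bubbles("page_001.png")
#     mask = detector.get_bubble_masks("page_001.png", boxes)
#     cv2.imwrite("page_001_mask.png", mask)  # white (255) inside bubble rectangles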
|
| 1431 |
+
|
| 1432 |
+
def filter_bubbles_by_size(self, bubbles: List[Tuple[int, int, int, int]],
|
| 1433 |
+
min_area: int = 100,
|
| 1434 |
+
max_area: int = None) -> List[Tuple[int, int, int, int]]:
|
| 1435 |
+
"""
|
| 1436 |
+
Filter bubbles by area.
|
| 1437 |
+
|
| 1438 |
+
Args:
|
| 1439 |
+
bubbles: List of bubble bounding boxes
|
| 1440 |
+
min_area: Minimum area in pixels
|
| 1441 |
+
max_area: Maximum area in pixels (None for no limit)
|
| 1442 |
+
|
| 1443 |
+
Returns:
|
| 1444 |
+
Filtered list of bubbles
|
| 1445 |
+
"""
|
| 1446 |
+
filtered = []
|
| 1447 |
+
|
| 1448 |
+
for x, y, w, h in bubbles:
|
| 1449 |
+
area = w * h
|
| 1450 |
+
if area >= min_area and (max_area is None or area <= max_area):
|
| 1451 |
+
filtered.append((x, y, w, h))
|
| 1452 |
+
|
| 1453 |
+
return filtered
|
| 1454 |
+
|
| 1455 |
+
def merge_overlapping_bubbles(self, bubbles: List[Tuple[int, int, int, int]],
|
| 1456 |
+
overlap_threshold: float = 0.1) -> List[Tuple[int, int, int, int]]:
|
| 1457 |
+
"""
|
| 1458 |
+
Merge overlapping bubble detections.
|
| 1459 |
+
|
| 1460 |
+
Args:
|
| 1461 |
+
bubbles: List of bubble bounding boxes
|
| 1462 |
+
overlap_threshold: Minimum overlap ratio to merge
|
| 1463 |
+
|
| 1464 |
+
Returns:
|
| 1465 |
+
Merged list of bubbles
|
| 1466 |
+
"""
|
| 1467 |
+
if not bubbles:
|
| 1468 |
+
return []
|
| 1469 |
+
|
| 1470 |
+
# Convert to numpy array for easier manipulation
|
| 1471 |
+
boxes = np.array([(x, y, x+w, y+h) for x, y, w, h in bubbles])
|
| 1472 |
+
|
| 1473 |
+
merged = []
|
| 1474 |
+
used = set()
|
| 1475 |
+
|
| 1476 |
+
for i, box1 in enumerate(boxes):
|
| 1477 |
+
if i in used:
|
| 1478 |
+
continue
|
| 1479 |
+
|
| 1480 |
+
# Start with current box
|
| 1481 |
+
x1, y1, x2, y2 = box1
|
| 1482 |
+
|
| 1483 |
+
# Check for overlaps with remaining boxes
|
| 1484 |
+
for j in range(i + 1, len(boxes)):
|
| 1485 |
+
if j in used:
|
| 1486 |
+
continue
|
| 1487 |
+
|
| 1488 |
+
box2 = boxes[j]
|
| 1489 |
+
|
| 1490 |
+
# Calculate intersection
|
| 1491 |
+
ix1 = max(x1, box2[0])
|
| 1492 |
+
iy1 = max(y1, box2[1])
|
| 1493 |
+
ix2 = min(x2, box2[2])
|
| 1494 |
+
iy2 = min(y2, box2[3])
|
| 1495 |
+
|
| 1496 |
+
if ix1 < ix2 and iy1 < iy2:
|
| 1497 |
+
# Calculate overlap ratio
|
| 1498 |
+
intersection = (ix2 - ix1) * (iy2 - iy1)
|
| 1499 |
+
area1 = (x2 - x1) * (y2 - y1)
|
| 1500 |
+
area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
|
| 1501 |
+
overlap = intersection / min(area1, area2)
|
| 1502 |
+
|
| 1503 |
+
if overlap >= overlap_threshold:
|
| 1504 |
+
# Merge boxes
|
| 1505 |
+
x1 = min(x1, box2[0])
|
| 1506 |
+
y1 = min(y1, box2[1])
|
| 1507 |
+
x2 = max(x2, box2[2])
|
| 1508 |
+
y2 = max(y2, box2[3])
|
| 1509 |
+
used.add(j)
|
| 1510 |
+
|
| 1511 |
+
merged.append((int(x1), int(y1), int(x2 - x1), int(y2 - y1)))
|
| 1512 |
+
|
| 1513 |
+
return merged
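# Illustrative sketch combining the post-processing helpers: drop tiny detections,
# then merge boxes that overlap by at least 10% of the smaller box's area.
#
#     boxes = detector.detect_bubbles("page_001.png")
#     boxes = detector.filter_bubbles_by_size(boxes, min_area=400)
#     boxes = detector.merge_overlapping_bubbles(boxes, overlap_threshold=0.1)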
|
| 1514 |
+
|
| 1515 |
+
# ============================
|
| 1516 |
+
# RT-DETR (ONNX) BACKEND
|
| 1517 |
+
# ============================
|
| 1518 |
+
def load_rtdetr_onnx_model(self, model_id: str = None, force_reload: bool = False) -> bool:
|
| 1519 |
+
"""
|
| 1520 |
+
Load RT-DETR ONNX model using onnxruntime. Downloads detector.onnx and config.json
|
| 1521 |
+
from the provided Hugging Face repo if not already cached.
|
| 1522 |
+
"""
|
| 1523 |
+
if not ONNX_AVAILABLE:
|
| 1524 |
+
logger.error("ONNX Runtime not available for RT-DETR ONNX backend")
|
| 1525 |
+
return False
|
| 1526 |
+
try:
|
| 1527 |
+
# If singleton mode and already loaded, just attach shared session
|
| 1528 |
+
try:
|
| 1529 |
+
adv = (self.config or {}).get('manga_settings', {}).get('advanced', {}) if isinstance(self.config, dict) else {}
|
| 1530 |
+
singleton = bool(adv.get('use_singleton_models', True))
|
| 1531 |
+
except Exception:
|
| 1532 |
+
singleton = True
|
| 1533 |
+
if singleton and BubbleDetector._rtdetr_onnx_loaded and not force_reload and BubbleDetector._rtdetr_onnx_shared_session is not None:
|
| 1534 |
+
self.rtdetr_onnx_session = BubbleDetector._rtdetr_onnx_shared_session
|
| 1535 |
+
self.rtdetr_onnx_loaded = True
|
| 1536 |
+
return True
|
| 1537 |
+
|
| 1538 |
+
repo = model_id or self.rtdetr_onnx_repo
|
| 1539 |
+
try:
|
| 1540 |
+
from huggingface_hub import hf_hub_download
|
| 1541 |
+
except Exception as e:
|
| 1542 |
+
logger.error(f"huggingface-hub required to fetch RT-DETR ONNX: {e}")
|
| 1543 |
+
return False
|
| 1544 |
+
|
| 1545 |
+
# Ensure local models dir (use configured cache_dir directly: e.g., 'models')
|
| 1546 |
+
cache_dir = self.cache_dir
|
| 1547 |
+
os.makedirs(cache_dir, exist_ok=True)
|
| 1548 |
+
|
| 1549 |
+
# Download files into models/ and avoid symlinks so the file is visible there
|
| 1550 |
+
try:
|
| 1551 |
+
_ = hf_hub_download(repo_id=repo, filename='config.json', cache_dir=cache_dir, local_dir=cache_dir, local_dir_use_symlinks=False)
|
| 1552 |
+
except Exception:
|
| 1553 |
+
pass
|
| 1554 |
+
onnx_fp = hf_hub_download(repo_id=repo, filename='detector.onnx', cache_dir=cache_dir, local_dir=cache_dir, local_dir_use_symlinks=False)
|
| 1555 |
+
BubbleDetector._rtdetr_onnx_model_path = onnx_fp
|
| 1556 |
+
|
| 1557 |
+
# Pick providers: prefer CUDA if available; otherwise CPU. Do NOT use DML.
|
| 1558 |
+
providers = ['CPUExecutionProvider']
|
| 1559 |
+
try:
|
| 1560 |
+
avail = ort.get_available_providers() if ONNX_AVAILABLE else []
|
| 1561 |
+
if 'CUDAExecutionProvider' in avail:
|
| 1562 |
+
providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
| 1563 |
+
except Exception:
|
| 1564 |
+
pass
|
| 1565 |
+
|
| 1566 |
+
# Session options with reduced memory arena and optional thread limiting in singleton mode
|
| 1567 |
+
so = ort.SessionOptions()
|
| 1568 |
+
try:
|
| 1569 |
+
so.enable_mem_pattern = False
|
| 1570 |
+
so.enable_cpu_mem_arena = False
|
| 1571 |
+
except Exception:
|
| 1572 |
+
pass
|
| 1573 |
+
# If singleton models mode is enabled in config, limit ORT threading to reduce CPU spikes
|
| 1574 |
+
try:
|
| 1575 |
+
adv = (self.config or {}).get('manga_settings', {}).get('advanced', {}) if isinstance(self.config, dict) else {}
|
| 1576 |
+
if bool(adv.get('use_singleton_models', True)):
|
| 1577 |
+
so.intra_op_num_threads = 1
|
| 1578 |
+
so.inter_op_num_threads = 1
|
| 1579 |
+
try:
|
| 1580 |
+
so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
|
| 1581 |
+
except Exception:
|
| 1582 |
+
pass
|
| 1583 |
+
try:
|
| 1584 |
+
so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
|
| 1585 |
+
except Exception:
|
| 1586 |
+
pass
|
| 1587 |
+
except Exception:
|
| 1588 |
+
pass
|
| 1589 |
+
|
| 1590 |
+
# Create session (serialize creation in singleton mode to avoid device storms)
|
| 1591 |
+
if singleton:
|
| 1592 |
+
with BubbleDetector._rtdetr_onnx_init_lock:
|
| 1593 |
+
# Re-check after acquiring lock
|
| 1594 |
+
if BubbleDetector._rtdetr_onnx_loaded and BubbleDetector._rtdetr_onnx_shared_session is not None and not force_reload:
|
| 1595 |
+
self.rtdetr_onnx_session = BubbleDetector._rtdetr_onnx_shared_session
|
| 1596 |
+
self.rtdetr_onnx_loaded = True
|
| 1597 |
+
return True
|
| 1598 |
+
sess = ort.InferenceSession(onnx_fp, providers=providers, sess_options=so)
|
| 1599 |
+
BubbleDetector._rtdetr_onnx_shared_session = sess
|
| 1600 |
+
BubbleDetector._rtdetr_onnx_loaded = True
|
| 1601 |
+
BubbleDetector._rtdetr_onnx_providers = providers
|
| 1602 |
+
self.rtdetr_onnx_session = sess
|
| 1603 |
+
self.rtdetr_onnx_loaded = True
|
| 1604 |
+
else:
|
| 1605 |
+
self.rtdetr_onnx_session = ort.InferenceSession(onnx_fp, providers=providers, sess_options=so)
|
| 1606 |
+
self.rtdetr_onnx_loaded = True
|
| 1607 |
+
logger.info("✅ RT-DETR (ONNX) model ready")
|
| 1608 |
+
return True
|
| 1609 |
+
except Exception as e:
|
| 1610 |
+
logger.error(f"Failed to load RT-DETR ONNX: {e}")
|
| 1611 |
+
self.rtdetr_onnx_session = None
|
| 1612 |
+
self.rtdetr_onnx_loaded = False
|
| 1613 |
+
return False
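# Illustrative sketch for the ONNX backend: load the detector.onnx weights from the
# configured repo and run the same class-keyed detection as the PyTorch path.
# The image path is a placeholder.
#
#     if detector.load_rtdetr_onnx_model():
#         dets = detector.detect_with_rtdetr_onnx(image_path="page_001.png", confidence=0.3)
#         print(len(dets['text_bubbles']))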
|
| 1614 |
+
|
| 1615 |
+
def detect_with_rtdetr_onnx(self,
|
| 1616 |
+
image_path: str = None,
|
| 1617 |
+
image: np.ndarray = None,
|
| 1618 |
+
confidence: float = 0.3,
|
| 1619 |
+
return_all_bubbles: bool = False) -> Any:
|
| 1620 |
+
"""Detect using RT-DETR ONNX backend.
|
| 1621 |
+
Returns a list of bubble boxes when return_all_bubbles is True; otherwise a dict keyed by class, matching the PyTorch backend's format.
|
| 1622 |
+
"""
|
| 1623 |
+
if not self.rtdetr_onnx_loaded or self.rtdetr_onnx_session is None:
|
| 1624 |
+
logger.warning("RT-DETR ONNX not loaded")
|
| 1625 |
+
return [] if return_all_bubbles else {'bubbles': [], 'text_bubbles': [], 'text_free': []}
|
| 1626 |
+
try:
|
| 1627 |
+
# Acquire image
|
| 1628 |
+
if image_path is not None:
|
| 1629 |
+
import cv2
|
| 1630 |
+
image = cv2.imread(image_path)
|
| 1631 |
+
if image is None:
|
| 1632 |
+
raise RuntimeError(f"Failed to read image: {image_path}")
|
| 1633 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 1634 |
+
else:
|
| 1635 |
+
if image is None:
|
| 1636 |
+
raise RuntimeError("No image provided")
|
| 1637 |
+
# Assume image is BGR np.ndarray if from OpenCV
|
| 1638 |
+
try:
|
| 1639 |
+
import cv2
|
| 1640 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 1641 |
+
except Exception:
|
| 1642 |
+
image_rgb = image
|
| 1643 |
+
|
| 1644 |
+
# To PIL then resize 640x640 as in reference
|
| 1645 |
+
from PIL import Image as _PILImage
|
| 1646 |
+
pil_image = _PILImage.fromarray(image_rgb)
|
| 1647 |
+
im_resized = pil_image.resize((640, 640))
|
| 1648 |
+
arr = np.asarray(im_resized, dtype=np.float32) / 255.0
|
| 1649 |
+
arr = np.transpose(arr, (2, 0, 1)) # (3,H,W)
|
| 1650 |
+
im_data = arr[np.newaxis, ...]
|
| 1651 |
+
|
| 1652 |
+
w, h = pil_image.size
|
| 1653 |
+
orig_size = np.array([[w, h]], dtype=np.int64)
|
| 1654 |
+
|
| 1655 |
+
# Run with a concurrency guard when using DML to prevent device hangs
|
| 1656 |
+
providers = BubbleDetector._rtdetr_onnx_providers or []
|
| 1657 |
+
def _do_run(session):
|
| 1658 |
+
return session.run(None, {
|
| 1659 |
+
'images': im_data,
|
| 1660 |
+
'orig_target_sizes': orig_size
|
| 1661 |
+
})
|
| 1662 |
+
if 'DmlExecutionProvider' in providers:
|
| 1663 |
+
acquired = False
|
| 1664 |
+
try:
|
| 1665 |
+
BubbleDetector._rtdetr_onnx_sema.acquire()
|
| 1666 |
+
acquired = True
|
| 1667 |
+
outputs = _do_run(self.rtdetr_onnx_session)
|
| 1668 |
+
except Exception as dml_err:
|
| 1669 |
+
msg = str(dml_err)
|
| 1670 |
+
if '887A0005' in msg or '887A0006' in msg or 'Dml' in msg:
|
| 1671 |
+
# Rebuild CPU session and retry once
|
| 1672 |
+
try:
|
| 1673 |
+
base_path = BubbleDetector._rtdetr_onnx_model_path
|
| 1674 |
+
if base_path:
|
| 1675 |
+
so = ort.SessionOptions()
|
| 1676 |
+
so.enable_mem_pattern = False
|
| 1677 |
+
so.enable_cpu_mem_arena = False
|
| 1678 |
+
cpu_providers = ['CPUExecutionProvider']
|
| 1679 |
+
# Serialize rebuild
|
| 1680 |
+
with BubbleDetector._rtdetr_onnx_init_lock:
|
| 1681 |
+
sess = ort.InferenceSession(base_path, providers=cpu_providers, sess_options=so)
|
| 1682 |
+
BubbleDetector._rtdetr_onnx_shared_session = sess
|
| 1683 |
+
BubbleDetector._rtdetr_onnx_providers = cpu_providers
|
| 1684 |
+
self.rtdetr_onnx_session = sess
|
| 1685 |
+
outputs = _do_run(self.rtdetr_onnx_session)
|
| 1686 |
+
else:
|
| 1687 |
+
raise
|
| 1688 |
+
except Exception:
|
| 1689 |
+
raise
|
| 1690 |
+
else:
|
| 1691 |
+
raise
|
| 1692 |
+
finally:
|
| 1693 |
+
if acquired:
|
| 1694 |
+
try:
|
| 1695 |
+
BubbleDetector._rtdetr_onnx_sema.release()
|
| 1696 |
+
except Exception:
|
| 1697 |
+
pass
|
| 1698 |
+
else:
|
| 1699 |
+
outputs = _do_run(self.rtdetr_onnx_session)
|
| 1700 |
+
|
| 1701 |
+
# outputs expected: labels, boxes, scores
|
| 1702 |
+
labels, boxes, scores = outputs[:3]
|
| 1703 |
+
if labels.ndim == 2 and labels.shape[0] == 1:
|
| 1704 |
+
labels = labels[0]
|
| 1705 |
+
if scores.ndim == 2 and scores.shape[0] == 1:
|
| 1706 |
+
scores = scores[0]
|
| 1707 |
+
if boxes.ndim == 3 and boxes.shape[0] == 1:
|
| 1708 |
+
boxes = boxes[0]
|
| 1709 |
+
|
| 1710 |
+
detections = {'bubbles': [], 'text_bubbles': [], 'text_free': []}
|
| 1711 |
+
bubbles_all = []
|
| 1712 |
+
for lab, box, scr in zip(labels, boxes, scores):
|
| 1713 |
+
if float(scr) < float(confidence):
|
| 1714 |
+
continue
|
| 1715 |
+
x1, y1, x2, y2 = map(int, box)
|
| 1716 |
+
bbox = (x1, y1, x2 - x1, y2 - y1)
|
| 1717 |
+
label_id = int(lab)
|
| 1718 |
+
if label_id == self.CLASS_BUBBLE:
|
| 1719 |
+
detections['bubbles'].append(bbox)
|
| 1720 |
+
bubbles_all.append(bbox)
|
| 1721 |
+
elif label_id == self.CLASS_TEXT_BUBBLE:
|
| 1722 |
+
detections['text_bubbles'].append(bbox)
|
| 1723 |
+
bubbles_all.append(bbox)
|
| 1724 |
+
elif label_id == self.CLASS_TEXT_FREE:
|
| 1725 |
+
detections['text_free'].append(bbox)
|
| 1726 |
+
|
| 1727 |
+
return bubbles_all if return_all_bubbles else detections
|
| 1728 |
+
except Exception as e:
|
| 1729 |
+
logger.error(f"RT-DETR ONNX detection failed: {e}")
|
| 1730 |
+
return [] if return_all_bubbles else {'bubbles': [], 'text_bubbles': [], 'text_free': []}
|
| 1731 |
+
|
| 1732 |
+
|
| 1733 |
+
# Standalone utility functions
|
| 1734 |
+
def download_model_from_huggingface(repo_id: str = "ogkalu/comic-speech-bubble-detector-yolov8m",
|
| 1735 |
+
filename: str = "comic-speech-bubble-detector-yolov8m.pt",
|
| 1736 |
+
cache_dir: str = "models") -> str:
|
| 1737 |
+
"""
|
| 1738 |
+
Download model from Hugging Face Hub.
|
| 1739 |
+
|
| 1740 |
+
Args:
|
| 1741 |
+
repo_id: Hugging Face repository ID
|
| 1742 |
+
filename: Model filename in the repository
|
| 1743 |
+
cache_dir: Local directory to cache the model
|
| 1744 |
+
|
| 1745 |
+
Returns:
|
| 1746 |
+
Path to downloaded model file
|
| 1747 |
+
"""
|
| 1748 |
+
try:
|
| 1749 |
+
from huggingface_hub import hf_hub_download
|
| 1750 |
+
|
| 1751 |
+
os.makedirs(cache_dir, exist_ok=True)
|
| 1752 |
+
|
| 1753 |
+
logger.info(f"📥 Downloading {filename} from {repo_id}...")
|
| 1754 |
+
|
| 1755 |
+
model_path = hf_hub_download(
|
| 1756 |
+
repo_id=repo_id,
|
| 1757 |
+
filename=filename,
|
| 1758 |
+
cache_dir=cache_dir,
|
| 1759 |
+
local_dir=cache_dir
|
| 1760 |
+
)
|
| 1761 |
+
|
| 1762 |
+
logger.info(f"✅ Model downloaded to: {model_path}")
|
| 1763 |
+
return model_path
|
| 1764 |
+
|
| 1765 |
+
except ImportError:
|
| 1766 |
+
logger.error("huggingface-hub package required. Install with: pip install huggingface-hub")
|
| 1767 |
+
return None
|
| 1768 |
+
except Exception as e:
|
| 1769 |
+
logger.error(f"Download failed: {e}")
|
| 1770 |
+
return None
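# Illustrative sketch: pre-fetch the default YOLOv8 weights into the local models/
# directory before constructing a detector.
#
#     path = download_model_from_huggingface(cache_dir="models")
#     if path:
#         BubbleDetector().load_model(path)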
|
| 1771 |
+
|
| 1772 |
+
|
| 1773 |
+
def download_rtdetr_model(cache_dir: str = "models") -> bool:
|
| 1774 |
+
"""
|
| 1775 |
+
Download RT-DETR model for advanced detection.
|
| 1776 |
+
|
| 1777 |
+
Args:
|
| 1778 |
+
cache_dir: Directory to cache the model
|
| 1779 |
+
|
| 1780 |
+
Returns:
|
| 1781 |
+
True if successful
|
| 1782 |
+
"""
|
| 1783 |
+
if not TRANSFORMERS_AVAILABLE:
|
| 1784 |
+
logger.error("Transformers required. Install with: pip install transformers")
|
| 1785 |
+
return False
|
| 1786 |
+
|
| 1787 |
+
try:
|
| 1788 |
+
logger.info("📥 Downloading RT-DETR model...")
|
| 1789 |
+
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
|
| 1790 |
+
|
| 1791 |
+
# This will download and cache the model
|
| 1792 |
+
processor = RTDetrImageProcessor.from_pretrained(
|
| 1793 |
+
"ogkalu/comic-text-and-bubble-detector",
|
| 1794 |
+
cache_dir=cache_dir
|
| 1795 |
+
)
|
| 1796 |
+
model = RTDetrForObjectDetection.from_pretrained(
|
| 1797 |
+
"ogkalu/comic-text-and-bubble-detector",
|
| 1798 |
+
cache_dir=cache_dir
|
| 1799 |
+
)
|
| 1800 |
+
|
| 1801 |
+
logger.info("✅ RT-DETR model downloaded successfully")
|
| 1802 |
+
return True
|
| 1803 |
+
|
| 1804 |
+
except Exception as e:
|
| 1805 |
+
logger.error(f"Download failed: {e}")
|
| 1806 |
+
return False
|
| 1807 |
+
|
| 1808 |
+
|
| 1809 |
+
# Example usage and testing
|
| 1810 |
+
if __name__ == "__main__":
|
| 1811 |
+
import sys
|
| 1812 |
+
|
| 1813 |
+
# Create detector
|
| 1814 |
+
detector = BubbleDetector()
|
| 1815 |
+
|
| 1816 |
+
if len(sys.argv) > 1:
|
| 1817 |
+
if sys.argv[1] == "download":
|
| 1818 |
+
# Download model from Hugging Face
|
| 1819 |
+
model_path = download_model_from_huggingface()
|
| 1820 |
+
if model_path:
|
| 1821 |
+
print(f"YOLOv8 model downloaded to: {model_path}")
|
| 1822 |
+
|
| 1823 |
+
# Also download RT-DETR
|
| 1824 |
+
if download_rtdetr_model():
|
| 1825 |
+
print("RT-DETR model downloaded")
|
| 1826 |
+
|
| 1827 |
+
elif sys.argv[1] == "detect" and len(sys.argv) > 3:
|
| 1828 |
+
# Detect bubbles in an image
|
| 1829 |
+
model_path = sys.argv[2]
|
| 1830 |
+
image_path = sys.argv[3]
|
| 1831 |
+
|
| 1832 |
+
# Load appropriate model
|
| 1833 |
+
if 'rtdetr' in model_path.lower():
|
| 1834 |
+
if detector.load_rtdetr_model():
|
| 1835 |
+
# Use RT-DETR
|
| 1836 |
+
results = detector.detect_with_rtdetr(image_path)
|
| 1837 |
+
print(f"RT-DETR Detection:")
|
| 1838 |
+
print(f" Empty bubbles: {len(results['bubbles'])}")
|
| 1839 |
+
print(f" Text bubbles: {len(results['text_bubbles'])}")
|
| 1840 |
+
print(f" Free text: {len(results['text_free'])}")
|
| 1841 |
+
else:
|
| 1842 |
+
if detector.load_model(model_path):
|
| 1843 |
+
bubbles = detector.detect_bubbles(image_path, confidence=0.5)
|
| 1844 |
+
print(f"YOLOv8 detected {len(bubbles)} bubbles:")
|
| 1845 |
+
for i, (x, y, w, h) in enumerate(bubbles):
|
| 1846 |
+
print(f" Bubble {i+1}: position=({x},{y}) size=({w}x{h})")
|
| 1847 |
+
|
| 1848 |
+
# Optionally visualize
|
| 1849 |
+
if len(sys.argv) > 4:
|
| 1850 |
+
output_path = sys.argv[4]
|
| 1851 |
+
detector.visualize_detections(image_path, output_path=output_path,
|
| 1852 |
+
use_rtdetr='rtdetr' in model_path.lower())
|
| 1853 |
+
|
| 1854 |
+
elif sys.argv[1] == "test-both" and len(sys.argv) > 2:
|
| 1855 |
+
# Test both models
|
| 1856 |
+
image_path = sys.argv[2]
|
| 1857 |
+
|
| 1858 |
+
# Load YOLOv8
|
| 1859 |
+
yolo_path = "models/comic-speech-bubble-detector-yolov8m.pt"
|
| 1860 |
+
if os.path.exists(yolo_path):
|
| 1861 |
+
detector.load_model(yolo_path)
|
| 1862 |
+
yolo_bubbles = detector.detect_bubbles(image_path, use_rtdetr=False)
|
| 1863 |
+
print(f"YOLOv8: {len(yolo_bubbles)} bubbles")
|
| 1864 |
+
|
| 1865 |
+
# Load RT-DETR
|
| 1866 |
+
if detector.load_rtdetr_model():
|
| 1867 |
+
rtdetr_bubbles = detector.detect_bubbles(image_path, use_rtdetr=True)
|
| 1868 |
+
print(f"RT-DETR: {len(rtdetr_bubbles)} bubbles")
|
| 1869 |
+
|
| 1870 |
+
else:
|
| 1871 |
+
print("Usage:")
|
| 1872 |
+
print(" python bubble_detector.py download")
|
| 1873 |
+
print(" python bubble_detector.py detect <model_path> <image_path> [output_path]")
|
| 1874 |
+
print(" python bubble_detector.py test-both <image_path>")
|
| 1875 |
+
|
| 1876 |
+
else:
|
| 1877 |
+
print("Bubble Detector Module (YOLOv8 + RT-DETR)")
|
| 1878 |
+
print("Usage:")
|
| 1879 |
+
print(" python bubble_detector.py download")
|
| 1880 |
+
print(" python bubble_detector.py detect <model_path> <image_path> [output_path]")
|
| 1881 |
+
print(" python bubble_detector.py test-both <image_path>")
|
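The CLI test harness above only exercises the module from the command line; a minimal Python sketch of the same calls is given below. It is not part of the uploaded file and only assumes the methods that appear in this listing (load_rtdetr_model, load_model, detect_with_rtdetr, detect_bubbles); the image and model paths are placeholders.

# Hypothetical usage sketch: driving BubbleDetector from Python instead of the CLI.
from bubble_detector import BubbleDetector

detector = BubbleDetector()
if detector.load_rtdetr_model():
    # detect_with_rtdetr returns a dict of (x, y, w, h) boxes per class
    results = detector.detect_with_rtdetr("page_001.png")  # placeholder image path
    for kind in ("bubbles", "text_bubbles", "text_free"):
        print(kind, len(results[kind]))
elif detector.load_model("models/comic-speech-bubble-detector-yolov8m.pt"):
    # Fall back to the YOLOv8 weights fetched by download_model_from_huggingface()
    print(detector.detect_bubbles("page_001.png", confidence=0.5))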
chapter_extraction_manager.py
ADDED
|
@@ -0,0 +1,403 @@
#!/usr/bin/env python3
"""
Chapter Extraction Manager - Manages chapter extraction in subprocess to prevent GUI freezing
"""

import subprocess
import sys
import os
import json
import threading
import queue
import time
from pathlib import Path


class ChapterExtractionManager:
    """
    Manages chapter extraction in a separate process to prevent GUI freezing
    Similar to GlossaryManager but for chapter extraction
    """

    def __init__(self, log_callback=None):
        """
        Initialize the extraction manager

        Args:
            log_callback: Function to call with log messages (for GUI integration)
        """
        self.log_callback = log_callback
        self.process = None
        self.output_queue = queue.Queue()
        self.error_queue = queue.Queue()
        self.result = None
        self.is_running = False
        self.stop_requested = False

    def extract_chapters_async(self, epub_path, output_dir, extraction_mode="smart",
                               progress_callback=None, completion_callback=None):
        """
        Start chapter extraction in a subprocess

        Args:
            epub_path: Path to EPUB file
            output_dir: Output directory for extracted content
            extraction_mode: Extraction mode (smart, comprehensive, full, enhanced)
            progress_callback: Function to call with progress updates
            completion_callback: Function to call when extraction completes
        """
        if self.is_running:
            self._log("⚠️ Chapter extraction already in progress")
            return False

        self.is_running = True
        self.stop_requested = False
        self.result = None

        # Start extraction in a thread that manages the subprocess
        thread = threading.Thread(
            target=self._run_extraction_subprocess,
            args=(epub_path, output_dir, extraction_mode, progress_callback, completion_callback),
            daemon=True
        )
        thread.start()

        return True

    def _run_extraction_subprocess(self, epub_path, output_dir, extraction_mode,
                                   progress_callback, completion_callback):
        """
        Run the extraction subprocess and handle its output
        """
        try:
            # Build command differently for frozen vs dev mode
            if getattr(sys, 'frozen', False):
                # In a frozen one-file build, sys.executable is our GUI .exe, not Python.
                # Use an internal worker-mode flag handled by translator_gui.py to run the worker.
                cmd = [
                    sys.executable,
                    '--run-chapter-extraction',
                    epub_path,
                    output_dir,
                    extraction_mode
                ]
            else:
                # In dev mode, invoke the worker script with the Python interpreter
                base_dir = Path(__file__).parent
                worker_script = base_dir / "chapter_extraction_worker.py"
                cmd = [
                    sys.executable,
                    str(worker_script),
                    epub_path,
                    output_dir,
                    extraction_mode
                ]

            # Set environment to force UTF-8 encoding
            env = os.environ.copy()
            env['PYTHONIOENCODING'] = 'utf-8'
            env['PYTHONLEGACYWINDOWSSTDIO'] = '0'  # Use new Windows console API

            self._log(f"🚀 Starting chapter extraction subprocess...")
            self._log(f"📚 EPUB: {os.path.basename(epub_path)}")
            self._log(f"📂 Output: {output_dir}")
            self._log(f"⚙️ Mode: {extraction_mode}")

            # Start the subprocess with UTF-8 encoding
            self.process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                encoding='utf-8',
                errors='replace',  # Replace invalid chars instead of failing
                bufsize=1,
                universal_newlines=True,
                env=env  # Pass the environment with UTF-8 settings
            )

            # Read output in real-time
            while True:
                if self.stop_requested:
                    self._terminate_process()
                    break

                # Check if process is still running
                if self.process.poll() is not None:
                    break

                # Read stdout line by line with error handling
                try:
                    line = self.process.stdout.readline()
                    if not line:
                        continue

                    line = line.strip()
                    if not line:
                        continue
                except UnicodeDecodeError as e:
                    self._log(f"⚠️ Encoding error reading output: {e}")
                    continue

                # Skip all processing if stop is requested to suppress logs
                if self.stop_requested:
                    continue

                # Parse output based on prefix
                if line.startswith("[PROGRESS]"):
                    # Progress update
                    message = line[10:].strip()
                    if progress_callback:
                        progress_callback(message)
                    self._log(f"📊 {message}")

                elif line.startswith("[INFO]"):
                    # Information message
                    message = line[6:].strip()
                    self._log(f"ℹ️ {message}")

                elif line.startswith("[ERROR]"):
                    # Error message
                    message = line[7:].strip()
                    self._log(f"❌ {message}")
                    self.error_queue.put(message)

                elif line.startswith("[RESULT]"):
                    # Final result as JSON
                    try:
                        json_str = line[8:].strip()
                        self.result = json.loads(json_str)

                        if self.result.get("success"):
                            self._log(f"✅ Extraction completed successfully!")
                            self._log(f"📚 Extracted {self.result.get('chapters', 0)} chapters")
                        else:
                            error = self.result.get("error", "Unknown error")
                            self._log(f"❌ Extraction failed: {error}")

                    except json.JSONDecodeError as e:
                        self._log(f"⚠️ Failed to parse result: {e}")

                elif line.startswith("["):
                    # Other prefixed messages - skip
                    pass
                else:
                    # Regular output - only log if not too verbose
                    if not any(skip in line for skip in ["📁 Searching for", "📁 Found", "📁 ✓", "📁 ✗"]):
                        self._log(line)

            # Get any remaining output - but only process if not stopped
            if not self.stop_requested:
                remaining_output, remaining_error = self.process.communicate(timeout=1)

                # Process any remaining output
                if remaining_output:
                    for line in remaining_output.strip().split('\n'):
                        if line and not line.startswith("["):
                            self._log(line)

                # Check for errors
                if remaining_error:
                    for line in remaining_error.strip().split('\n'):
                        if line:
                            self._log(f"⚠️ {line}")

                # Check final status
                if self.process.returncode != 0:
                    self._log(f"⚠️ Process exited with code {self.process.returncode}")
            else:
                # If stopped, just clean up without processing output
                try:
                    self.process.communicate(timeout=0.1)
                except subprocess.TimeoutExpired:
                    pass  # Ignore timeout when cleaning up

        except subprocess.TimeoutExpired:
            if not self.stop_requested:
                self._log("⚠️ Subprocess communication timeout")
            self._terminate_process()

        except Exception as e:
            # Only log errors if not stopping (unless it's a critical error)
            if not self.stop_requested or "Subprocess error" in str(e):
                self._log(f"❌ Subprocess error: {e}")
            self.result = {
                "success": False,
                "error": str(e) if not self.stop_requested else "Extraction stopped by user"
            }

        finally:
            self.is_running = False
            # Store process reference before clearing it in case termination is needed
            process_ref = self.process
            self.process = None

            # If process is still running, try to clean it up
            if process_ref and process_ref.poll() is None:
                try:
                    process_ref.terminate()
                    time.sleep(0.1)  # Brief wait
                    if process_ref.poll() is None:
                        process_ref.kill()
                except Exception:
                    pass  # Ignore cleanup errors in finally block

            # Ensure result is never None
            if self.result is None:
                if self.stop_requested:
                    self.result = {
                        "success": False,
                        "error": "Extraction stopped by user"
                    }
                else:
                    self.result = {
                        "success": False,
                        "error": "Extraction process ended unexpectedly"
                    }

            # Call completion callback
            if completion_callback:
                completion_callback(self.result)

    def stop_extraction(self):
        """Stop the extraction process"""
        if not self.is_running:
            return False

        # Set stop flag first to suppress subsequent logs
        self.stop_requested = True
        self._log("🛑 Stopping chapter extraction...")

        # Store process reference to avoid race condition
        process_ref = self.process

        # Give it a moment to stop gracefully
        time.sleep(0.5)

        # Force terminate if still running and process still exists
        if process_ref:
            self._terminate_process_ref(process_ref)

        return True

    def _terminate_process(self):
        """Terminate the subprocess using current process reference"""
        if self.process:
            self._terminate_process_ref(self.process)

    def _terminate_process_ref(self, process_ref):
        """Terminate a specific process reference"""
        if not process_ref:
            return

        try:
            # Check if process is still alive before attempting termination
            if process_ref.poll() is None:
                process_ref.terminate()
                # Give it a moment to terminate
                time.sleep(0.5)

                # Force kill if still running
                if process_ref.poll() is None:
                    process_ref.kill()
                    time.sleep(0.1)  # Brief wait after kill

                # Only log termination if not stopping (user already knows they stopped it)
                if not self.stop_requested:
                    self._log("✅ Process terminated")
            else:
                # Only log if not stopping
                if not self.stop_requested:
                    self._log("✅ Process already terminated")
        except Exception as e:
            # Always log termination errors as they might indicate a problem
            self._log(f"⚠️ Error terminating process: {e}")

    def _log(self, message):
        """Log a message using the callback if available"""
        # Suppress logs when stop is requested (except for stop/termination messages)
        if self.stop_requested and not any(keyword in message for keyword in ["🛑", "✅ Process terminated", "❌ Subprocess error"]):
            return

        if self.log_callback:
            self.log_callback(message)
        else:
            print(message)

    def is_extraction_running(self):
        """Check if extraction is currently running"""
        return self.is_running

    def get_result(self):
        """Get the extraction result if available"""
        return self.result


# Example usage
if __name__ == "__main__":
    import tkinter as tk
    from tkinter import filedialog

    def test_extraction():
        """Test the extraction manager"""

        # Create a simple GUI for testing
        root = tk.Tk()
        root.title("Chapter Extraction Test")
        root.geometry("800x600")

        # Text widget for logs
        text = tk.Text(root, wrap=tk.WORD)
        text.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

        # Log callback
        def log_message(msg):
            text.insert(tk.END, msg + "\n")
            text.see(tk.END)
            root.update_idletasks()

        # Progress callback
        def progress_update(msg):
            log_message(f"📊 Progress: {msg}")

        # Completion callback
        def extraction_complete(result):
            if result and result.get("success"):
                log_message(f"✅ Extraction completed!")
                log_message(f"   Chapters: {result.get('chapters', 0)}")
            else:
                log_message(f"❌ Extraction failed!")

        # Create manager
        manager = ChapterExtractionManager(log_callback=log_message)

        # File selection
        epub_path = filedialog.askopenfilename(
            title="Select EPUB file",
            filetypes=[("EPUB files", "*.epub"), ("All files", "*.*")]
        )

        if epub_path:
            output_dir = os.path.splitext(os.path.basename(epub_path))[0]

            # Start extraction
            manager.extract_chapters_async(
                epub_path,
                output_dir,
                extraction_mode="smart",
                progress_callback=progress_update,
                completion_callback=extraction_complete
            )

            # Button to stop
            stop_btn = tk.Button(
                root,
                text="Stop Extraction",
                command=lambda: manager.stop_extraction()
            )
            stop_btn.pack(pady=5)

        root.mainloop()

    # Run test
    test_extraction()
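For reference, the worker-to-manager protocol parsed above is line-oriented: the subprocess prints prefixed lines on stdout and the read loop dispatches on the prefix. An illustrative (not captured) exchange for a successful run looks like:

[INFO] Starting extraction of: book.epub
[PROGRESS] Processing chapter 3/120
[RESULT] {"success": true, "chapters": 120, "metadata": {"title": "..."}, "chapter_info": [...]}

Any line that does not start with "[" is treated as a plain log message, which is why the worker keeps all of its own prints prefixed.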
chapter_extraction_worker.py
ADDED
|
@@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""
Chapter Extraction Worker - Runs chapter extraction in a separate process to prevent GUI freezing
"""

import sys
import os
import io

# Force UTF-8 encoding for stdout/stderr on Windows
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
import json
import zipfile
import time
import traceback
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

def run_chapter_extraction(epub_path, output_dir, extraction_mode="smart", progress_callback=None):
    """
    Run chapter extraction in this worker process

    Args:
        epub_path: Path to EPUB file
        output_dir: Output directory for extracted content
        extraction_mode: Extraction mode (smart, comprehensive, full, enhanced)
        progress_callback: Callback function for progress updates (uses print for IPC)

    Returns:
        dict: Extraction results including chapters and metadata
    """
    try:
        # Import here to avoid loading heavy modules until needed
        from TransateKRtoEN import ChapterExtractor

        # Create progress callback that prints to stdout for IPC
        def worker_progress_callback(message):
            # Use special prefix for progress messages
            print(f"[PROGRESS] {message}", flush=True)

        # Create extractor with progress callback
        extractor = ChapterExtractor(progress_callback=worker_progress_callback)

        # Set extraction mode
        os.environ["EXTRACTION_MODE"] = extraction_mode

        # Open EPUB and extract chapters
        print(f"[INFO] Starting extraction of: {epub_path}", flush=True)
        print(f"[INFO] Output directory: {output_dir}", flush=True)
        print(f"[INFO] Extraction mode: {extraction_mode}", flush=True)

        with zipfile.ZipFile(epub_path, 'r') as zf:
            # Extract metadata first
            metadata = extractor._extract_epub_metadata(zf)
            print(f"[INFO] Extracted metadata: {list(metadata.keys())}", flush=True)

            # Extract chapters
            chapters = extractor.extract_chapters(zf, output_dir)

            print(f"[INFO] Extracted {len(chapters)} chapters", flush=True)

            # The extract_chapters method already handles OPF sorting internally
            # Just log if OPF was used
            opf_path = os.path.join(output_dir, 'content.opf')
            if os.path.exists(opf_path):
                print(f"[INFO] OPF file available for chapter ordering", flush=True)

        # CRITICAL: Save the full chapters with body content!
        # This is what the main process needs to load
        chapters_full_path = os.path.join(output_dir, "chapters_full.json")
        try:
            with open(chapters_full_path, 'w', encoding='utf-8') as f:
                json.dump(chapters, f, ensure_ascii=False)
            print(f"[INFO] Saved full chapters data to: {chapters_full_path}", flush=True)
        except Exception as e:
            print(f"[WARNING] Could not save full chapters: {e}", flush=True)
            # Fall back to saving individual files
            for chapter in chapters:
                try:
                    chapter_file = f"chapter_{chapter['num']:04d}_{chapter.get('filename', 'content').replace('/', '_')}.html"
                    chapter_path = os.path.join(output_dir, chapter_file)
                    with open(chapter_path, 'w', encoding='utf-8') as f:
                        f.write(chapter.get('body', ''))
                    print(f"[INFO] Saved chapter {chapter['num']} to {chapter_file}", flush=True)
                except Exception as ce:
                    print(f"[WARNING] Could not save chapter {chapter.get('num')}: {ce}", flush=True)

        # Return results as JSON for IPC
        result = {
            "success": True,
            "chapters": len(chapters),
            "metadata": metadata,
            "chapter_info": [
                {
                    "num": ch.get("num"),
                    "title": ch.get("title"),
                    "has_images": ch.get("has_images", False),
                    "file_size": ch.get("file_size", 0),
                    "content_hash": ch.get("content_hash", "")
                }
                for ch in chapters
            ]
        }

        # Output result as JSON
        print(f"[RESULT] {json.dumps(result)}", flush=True)
        return result

    except Exception as e:
        # Send error information
        error_info = {
            "success": False,
            "error": str(e),
            "traceback": traceback.format_exc()
        }
        print(f"[ERROR] {str(e)}", flush=True)
        print(f"[RESULT] {json.dumps(error_info)}", flush=True)
        return error_info


def main():
    """Main entry point for worker process"""

    # Parse command line arguments
    if len(sys.argv) < 3:
        print("[ERROR] Usage: chapter_extraction_worker.py <epub_path> <output_dir> [extraction_mode]", flush=True)
        sys.exit(1)

    epub_path = sys.argv[1]
    output_dir = sys.argv[2]
    extraction_mode = sys.argv[3] if len(sys.argv) > 3 else "smart"

    # Validate inputs
    if not os.path.exists(epub_path):
        print(f"[ERROR] EPUB file not found: {epub_path}", flush=True)
        sys.exit(1)

    # Create output directory if needed
    os.makedirs(output_dir, exist_ok=True)

    # Run extraction
    result = run_chapter_extraction(epub_path, output_dir, extraction_mode)

    # Exit with appropriate code
    sys.exit(0 if result.get("success", False) else 1)


if __name__ == "__main__":
    # Ensure freeze support for Windows frozen exe
    try:
        import multiprocessing
        multiprocessing.freeze_support()
    except Exception:
        pass
    main()
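Because the manager simply launches this script with three positional arguments in dev mode, it can also be run by hand for debugging; a hypothetical invocation (paths are placeholders):

python chapter_extraction_worker.py book.epub book_output smart

The [PROGRESS]/[INFO]/[ERROR]/[RESULT] lines it prints are exactly what ChapterExtractionManager consumes on the other end of the pipe, and the process exit code mirrors the "success" field of the final [RESULT] JSON.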
chapter_splitter.py
ADDED
|
@@ -0,0 +1,195 @@
import re
from bs4 import BeautifulSoup
import tiktoken

class ChapterSplitter:
    """Split large chapters into smaller chunks while preserving structure"""

    def __init__(self, model_name="gpt-3.5-turbo", target_tokens=80000, compression_factor=1.0):
        """
        Initialize splitter with token counter
        target_tokens: Target size for each chunk (leaving room for system prompt & history)
        compression_factor: Expected compression ratio from source to target language (0.7-1.0)
        """
        try:
            self.enc = tiktoken.encoding_for_model(model_name)
        except:
            self.enc = tiktoken.get_encoding("cl100k_base")
        self.target_tokens = target_tokens
        self.compression_factor = compression_factor

    def count_tokens(self, text):
        """Count tokens in text"""
        try:
            return len(self.enc.encode(text))
        except:
            # Fallback estimation
            return len(text) // 4

    def split_chapter(self, chapter_html, max_tokens=None):
        """
        Split a chapter into smaller chunks if it exceeds token limit
        Returns: List of (chunk_html, chunk_index, total_chunks)
        """
        if max_tokens is None:
            max_tokens = self.target_tokens

        # Apply compression factor to output token limit
        # If compression_factor is 0.7 and max_tokens is 4096,
        # we expect output to be 4096 * 0.7 = 2867 tokens
        effective_max_tokens = int(max_tokens * self.compression_factor)

        # First check if splitting is needed
        total_tokens = self.count_tokens(chapter_html)
        if total_tokens <= effective_max_tokens:
            return [(chapter_html, 1, 1)]  # No split needed

        # Parse HTML
        soup = BeautifulSoup(chapter_html, 'html.parser')

        # Try to find natural break points
        chunks = []
        current_chunk = []
        current_tokens = 0

        # Get all direct children of body, or all top-level elements
        if soup.body:
            elements = list(soup.body.children)
        else:
            elements = list(soup.children)

        for element in elements:
            if isinstance(element, str) and element.strip() == '':
                continue

            element_html = str(element)
            element_tokens = self.count_tokens(element_html)

            # If single element is too large, try to split it
            if element_tokens > effective_max_tokens:
                sub_chunks = self._split_large_element(element, effective_max_tokens)
                for sub_chunk in sub_chunks:
                    chunks.append(sub_chunk)
            else:
                # Check if adding this element would exceed limit
                if current_tokens + element_tokens > effective_max_tokens and current_chunk:
                    # Save current chunk
                    chunks.append(self._create_chunk_html(current_chunk))
                    current_chunk = [element_html]
                    current_tokens = element_tokens
                else:
                    current_chunk.append(element_html)
                    current_tokens += element_tokens

        # Don't forget the last chunk
        if current_chunk:
            chunks.append(self._create_chunk_html(current_chunk))

        # Return chunks with metadata
        total_chunks = len(chunks)
        return [(chunk, i+1, total_chunks) for i, chunk in enumerate(chunks)]

    def _split_large_element(self, element, max_tokens):
        """Split a single large element (like a long paragraph)"""
        chunks = []

        if element.name == 'p' or not hasattr(element, 'children'):
            # For paragraphs or text elements, split by sentences
            text = element.get_text()
            sentences = re.split(r'(?<=[.!?])\s+', text)

            current_chunk = []
            current_tokens = 0

            for sentence in sentences:
                sentence_tokens = self.count_tokens(sentence)

                if current_tokens + sentence_tokens > max_tokens * 0.8 and current_chunk:
                    # Create paragraph with current sentences
                    chunk_text = ' '.join(current_chunk)
                    chunks.append(f"<p>{chunk_text}</p>")
                    current_chunk = [sentence]
                    current_tokens = sentence_tokens
                else:
                    current_chunk.append(sentence)
                    current_tokens += sentence_tokens

            if current_chunk:
                chunk_text = ' '.join(current_chunk)
                chunks.append(f"<p>{chunk_text}</p>")

        else:
            # For other elements, try to split by children
            children = list(element.children)
            current_chunk = []
            current_tokens = 0

            for child in children:
                child_html = str(child)
                child_tokens = self.count_tokens(child_html)

                if current_tokens + child_tokens > max_tokens * 0.8 and current_chunk:
                    # Wrap in parent element type
                    wrapper = BeautifulSoup(f"<{element.name}></{element.name}>", 'html.parser')
                    wrapper_elem = wrapper.find(element.name)
                    for item in current_chunk:
                        wrapper_elem.append(BeautifulSoup(item, 'html.parser'))
                    chunks.append(str(wrapper))

                    current_chunk = [child_html]
                    current_tokens = child_tokens
                else:
                    current_chunk.append(child_html)
                    current_tokens += child_tokens

            if current_chunk:
                wrapper = BeautifulSoup(f"<{element.name}></{element.name}>", 'html.parser')
                wrapper_elem = wrapper.find(element.name)
                for item in current_chunk:
                    wrapper_elem.append(BeautifulSoup(item, 'html.parser'))
                chunks.append(str(wrapper))

        return chunks

    def _create_chunk_html(self, elements):
        """Create a valid HTML chunk from list of elements"""
        # Join elements and wrap in basic HTML structure if needed
        content = '\n'.join(elements)

        # Check if it already has body tags
        if '<body' not in content.lower():
            # Just return the content, let the translation handle it
            return content
        else:
            return content

    def merge_translated_chunks(self, translated_chunks):
        """
        Merge translated chunks back together
        translated_chunks: List of (translated_html, chunk_index, total_chunks)
        """
        # Sort by chunk index to ensure correct order
        sorted_chunks = sorted(translated_chunks, key=lambda x: x[1])

        # Extract just the HTML content
        html_parts = [chunk[0] for chunk in sorted_chunks]

        # Simply concatenate - the chunks should maintain structure
        merged = '\n'.join(html_parts)

        # Clean up any duplicate body tags if they exist
        soup = BeautifulSoup(merged, 'html.parser')

        # If multiple body tags, merge their contents
        bodies = soup.find_all('body')
        if len(bodies) > 1:
            # Keep first body, move all content from others into it
            main_body = bodies[0]
            for extra_body in bodies[1:]:
                for child in list(extra_body.children):
                    main_body.append(child)
                extra_body.decompose()

            return str(soup)

        return merged
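A minimal sketch of the intended round trip with this class, using only the methods defined above; translate_html is a stand-in for whatever translation call the pipeline actually makes and is purely hypothetical:

# Hypothetical usage sketch for ChapterSplitter: split, translate each piece, merge back.
from chapter_splitter import ChapterSplitter

splitter = ChapterSplitter(model_name="gpt-3.5-turbo", target_tokens=80000, compression_factor=0.8)

chunks = splitter.split_chapter(chapter_html)   # [(chunk_html, index, total_chunks), ...]
translated = []
for chunk_html, idx, total in chunks:
    out_html = translate_html(chunk_html)       # placeholder for the real translation call
    translated.append((out_html, idx, total))

merged_html = splitter.merge_translated_chunks(translated)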
check_epub_directory.py
ADDED
|
@@ -0,0 +1,152 @@
import os
import re

def diagnose_epub_directory(directory="."):
    """Diagnose issues with EPUB output directory"""

    print(f"\n{'='*60}")
    print(f"EPUB Directory Diagnostic Tool")
    print(f"{'='*60}\n")

    # Get absolute path
    abs_path = os.path.abspath(directory)
    print(f"📁 Checking directory: {abs_path}")

    # Check if directory exists
    if not os.path.exists(abs_path):
        print(f"❌ ERROR: Directory does not exist!")
        return

    if not os.path.isdir(abs_path):
        print(f"❌ ERROR: Path is not a directory!")
        return

    # List contents
    try:
        contents = os.listdir(abs_path)
        print(f"✅ Directory is accessible")
        print(f"📊 Total items: {len(contents)}\n")
    except Exception as e:
        print(f"❌ ERROR: Cannot read directory: {e}")
        return

    # Categorize files
    html_files = []
    response_files = []
    css_files = []
    image_files = []
    directories = []
    other_files = []

    for item in contents:
        item_path = os.path.join(abs_path, item)

        if os.path.isdir(item_path):
            directories.append(item)
        elif item.endswith('.html'):
            html_files.append(item)
            if item.startswith('response_'):
                response_files.append(item)
        elif item.endswith('.css'):
            css_files.append(item)
        elif item.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.svg')):
            image_files.append(item)
        else:
            other_files.append(item)

    # Report findings
    print("📋 Directory Contents Summary:")
    print(f"   • HTML files: {len(html_files)}")
    print(f"   • Response files (translated chapters): {len(response_files)}")
    print(f"   • CSS files: {len(css_files)}")
    print(f"   • Image files: {len(image_files)}")
    print(f"   • Subdirectories: {len(directories)}")
    print(f"   • Other files: {len(other_files)}")

    # Check for required items
    print(f"\n📍 Checking Required Items:")

    # Check for metadata.json
    if 'metadata.json' in contents:
        print("   ✅ metadata.json found")
    else:
        print("   ❌ metadata.json NOT FOUND")

    # Check for response files
    if response_files:
        print(f"   ✅ {len(response_files)} translated chapter files found")

        # Analyze chapter numbers
        chapter_nums = []
        for f in response_files:
            m = re.match(r'response_(\d+)_', f)
            if m:
                chapter_nums.append(int(m.group(1)))

        if chapter_nums:
            chapter_nums.sort()
            print(f"   📖 Chapter range: {min(chapter_nums)} to {max(chapter_nums)}")

            # Check for missing chapters
            expected = set(range(min(chapter_nums), max(chapter_nums) + 1))
            actual = set(chapter_nums)
            missing = expected - actual
            if missing:
                print(f"   ⚠️ Missing chapters: {sorted(missing)}")
    else:
        print("   ❌ No response_*.html files found!")

        if html_files:
            print(f"\n   🔍 Found {len(html_files)} HTML files with different names:")
            for i, f in enumerate(html_files[:5]):
                print(f"      {i+1}. {f}")
            if len(html_files) > 5:
                print(f"      ... and {len(html_files) - 5} more")

    # Check subdirectories
    if directories:
        print(f"\n📂 Subdirectories found:")
        for d in directories:
            print(f"   • {d}/")

            # Check contents of important subdirectories
            if d in ['css', 'images', 'fonts']:
                try:
                    sub_contents = os.listdir(os.path.join(abs_path, d))
                    print(f"     Contains {len(sub_contents)} items")
                except:
                    print(f"     Cannot read contents")

    # Sample file check
    if response_files:
        print(f"\n🔍 Checking a sample chapter file...")
        sample_file = response_files[0]
        sample_path = os.path.join(abs_path, sample_file)

        try:
            with open(sample_path, 'r', encoding='utf-8') as f:
                content = f.read()
            print(f"   ✅ {sample_file} is readable")
            print(f"   📏 File size: {len(content):,} characters")

            # Check for basic HTML structure
            if '<html' in content.lower():
                print("   ✅ Contains HTML tag")
            if '<body' in content.lower():
                print("   ✅ Contains BODY tag")
            if '<p>' in content or '<p ' in content:
                print("   ✅ Contains paragraph tags")

        except Exception as e:
            print(f"   ❌ Cannot read {sample_file}: {e}")

    print(f"\n{'='*60}")
    print("Diagnostic complete!")
    print(f"{'='*60}\n")

if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        diagnose_epub_directory(sys.argv[1])
    else:
        diagnose_epub_directory(".")
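The diagnostic's chapter-number check assumes the pipeline's response_<number>_<name>.html naming; a quick illustration of what the regex does and does not match (file names are made up):

# Illustrative only: names the response_ regex above is written for.
import re
print(re.match(r'response_(\d+)_', 'response_0012_chapter.html').group(1))  # '0012'
print(re.match(r'response_(\d+)_', 'chapter_12.html'))                      # None, counted as a "different name"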
direct_imports.py
ADDED
|
@@ -0,0 +1,38 @@
import sys
import os

# Add the current directory to Python path so we can import our modules
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

# When running as executable, modules might be in _MEIPASS
if hasattr(sys, '_MEIPASS'):
    meipass_dir = sys._MEIPASS
    if meipass_dir not in sys.path:
        sys.path.insert(0, meipass_dir)

# Now we can safely import our modules
try:
    from extract_glossary_from_epub import main as glossary_main
except ImportError as e:
    print(f"Failed to import glossary module: {e}")
    glossary_main = None

try:
    from TransateKRtoEN import main as translation_main
except ImportError as e:
    print(f"Failed to import translation module: {e}")
    translation_main = None

try:
    from epub_converter import fallback_compile_epub
except ImportError as e:
    print(f"Failed to import epub converter: {e}")
    fallback_compile_epub = None

try:
    from scan_html_folder import scan_html_folder
except ImportError as e:
    print(f"Failed to import scanner: {e}")
    scan_html_folder = None
enhanced_text_extractor.py
ADDED
|
@@ -0,0 +1,597 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enhanced Text Extractor Module with CJK Support
Provides superior text extraction from HTML with proper Unicode handling
Optimized for Korean, Japanese, and Chinese content extraction
"""

import os
import re
import html
import unicodedata
from typing import Tuple, Optional
import chardet

# BEAUTIFUL SOUP IMPORT MONKEY FIX - Import BeautifulSoup BEFORE html2text
# This prevents certain parser initialization issues
try:
    from bs4 import BeautifulSoup
    # Force BeautifulSoup to initialize its parsers
    _ = BeautifulSoup("", 'html.parser')
except ImportError:
    BeautifulSoup = None
    raise ImportError("BeautifulSoup is required. Install with: pip install beautifulsoup4")

# Now import html2text AFTER BeautifulSoup
try:
    import html2text
except ImportError:
    html2text = None
    raise ImportError("html2text is required. Install with: pip install html2text")


class EnhancedTextExtractor:
    """Enhanced text extraction with proper Unicode and CJK handling"""

    # Unicode preservation mappings
    UNICODE_QUOTES = {
        # Western quotes
        '&ldquo;': '\u201c',   # Left double quotation mark
        '&rdquo;': '\u201d',   # Right double quotation mark
        '&lsquo;': '\u2018',   # Left single quotation mark
        '&rsquo;': '\u2019',   # Right single quotation mark
        '&quot;': '"',         # Standard double quote
        '&apos;': "'",         # Standard apostrophe

        # CJK quotes and punctuation
        '「': '「',   # Japanese left corner bracket
        '」': '」',   # Japanese right corner bracket
        '『': '『',   # Japanese left white corner bracket
        '』': '』',   # Japanese right white corner bracket
        '(': '(',   # Fullwidth left parenthesis
        ')': ')',   # Fullwidth right parenthesis
        '【': '【',   # Left black lenticular bracket
        '】': '】',   # Right black lenticular bracket
        '《': '《',   # Left double angle bracket
        '》': '》',   # Right double angle bracket
        ';': ';',   # Fullwidth semicolon
        ':': ':',   # Fullwidth colon
        '。': '。',   # Ideographic full stop
        '?': '?',   # Fullwidth question mark
        '!': '!',   # Fullwidth exclamation mark
        '、': '、',   # Ideographic comma

        # Numeric entities
        '&#8220;': '\u201c',   # Left double quote (numeric)
        '&#8221;': '\u201d',   # Right double quote (numeric)
        '&#8216;': '\u2018',   # Left single quote (numeric)
        '&#8217;': '\u2019',   # Right single quote (numeric)

        # Common CJK entities
        '&hellip;': '…',       # Horizontal ellipsis
        '&mdash;': '—',        # Em dash
        '&ndash;': '–',        # En dash
        '&nbsp;': '\u00A0',    # Non-breaking space
    }

    # CJK-specific punctuation to preserve
    CJK_PUNCTUATION = {
        '。', '、', '!', '?', '…', '—', '~', '・',
        '「', '」', '『', '』', '(', ')', '【', '】',
        '《', '》', '〈', '〉', '〔', '〕', '[', ']',
        ':', ';', '\u201c', '\u201d', '\u2018', '\u2019',
        ',', '.', '?', '!', ':', ';',
        '"', '"', '‚', '„', '«', '»',
    }

    # Quote protection markers
    QUOTE_MARKERS = {
        '\u201c': '␥',  # Opening double quote marker
        '\u201d': '␦',  # Closing double quote marker
        '"': '␦',       # Alternative closing quote
        '\u2018': '␣',  # Opening single quote marker
        '\u2019': '',   # Closing single quote marker
        "'": '',        # Alternative closing quote
    }


    def __init__(self, filtering_mode: str = "smart", preserve_structure: bool = True):
        """Initialize the enhanced text extractor"""
        if not html2text:
            raise ImportError("html2text is required for enhanced extraction")

        if not BeautifulSoup:
            raise ImportError("BeautifulSoup is required for enhanced extraction")

        self.filtering_mode = filtering_mode
        self.preserve_structure = preserve_structure
        self.h2t = None
        self.detected_language = None

        self._configure_html2text()

    def _detect_encoding(self, content: bytes) -> str:
        """Detect the encoding of the content"""
        try:
            # Try chardet detection
            detected = chardet.detect(content)
            if detected['confidence'] > 0.7:
                return detected['encoding']
        except Exception:
            pass

        # Try common CJK encodings in order
        for encoding in ['utf-8', 'gb2312', 'gbk', 'gb18030', 'big5', 'shift_jis', 'euc-kr', 'euc-jp']:
            try:
                content.decode(encoding)
                return encoding
            except Exception:
                continue

        return 'utf-8'  # Default fallback

    def _detect_content_language(self, text: str) -> str:
        """Detect the primary language of content"""
        if not text:
            return 'unknown'

        # Take a sample of the text
        sample = text[:5000]

        # Count characters by script
        korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF)
        japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF))
        chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF)
        latin_chars = sum(1 for char in sample if 0x0041 <= ord(char) <= 0x007A)

        # Determine primary language
        if korean_chars > 50:
            return 'korean'
        elif japanese_kana > 20:
            return 'japanese'
        elif chinese_chars > 50 and japanese_kana < 10:
            return 'chinese'
        elif latin_chars > 100:
            return 'english'
        else:
            return 'unknown'

    def _configure_html2text(self):
        """Configure html2text with optimal Unicode and CJK settings"""
        self.h2t = html2text.HTML2Text()

        # Core settings for Unicode preservation
        self.h2t.unicode_snob = True
        self.h2t.escape_snob = True
        self.h2t.use_automatic_links = False

        # Layout settings
        self.h2t.body_width = 0
        self.h2t.single_line_break = False

        # Content filtering
        self.h2t.ignore_links = False
        self.h2t.ignore_images = False
        self.h2t.ignore_anchors = False
        self.h2t.skip_internal_links = False
        self.h2t.ignore_tables = False

        # Image handling - CRITICAL: Force html2text to preserve img tags as HTML
        self.h2t.images_as_html = True    # Keep images as <img> tags instead of ![]()
        self.h2t.images_to_alt = False    # Don't convert to alt text only
        self.h2t.images_with_size = True  # Include width/height attributes

        # Additional settings
        self.h2t.wrap_links = False
        self.h2t.wrap_list_items = False
        self.h2t.protect_links = True

        # Structure preservation settings
        if self.preserve_structure:
            self.h2t.bypass_tables = False
            self.h2t.ignore_emphasis = False
            self.h2t.mark_code = True
            self.h2t.ul_item_mark = '•'
        else:
            self.h2t.bypass_tables = True
            self.h2t.ignore_emphasis = True
            self.h2t.mark_code = False

    def _decode_entities(self, text: str) -> str:
        """Decode HTML entities to Unicode characters with CJK support"""
        if not text:
            return text

        # First pass: Apply known CJK-aware replacements
        for entity, unicode_char in self.UNICODE_QUOTES.items():
            text = text.replace(entity, unicode_char)

        # Second pass: standard HTML unescape
        text = html.unescape(text)

        # Third pass: handle numeric entities
        def decode_decimal(match):
            try:
                code = int(match.group(1))
                if code < 0x110000:
                    return chr(code)
            except Exception:
                pass
            return match.group(0)

        def decode_hex(match):
            try:
                code = int(match.group(1), 16)
                if code < 0x110000:
                    return chr(code)
            except Exception:
                pass
            return match.group(0)

        text = re.sub(r'&#(\d+);?', decode_decimal, text)
        text = re.sub(r'&#x([0-9a-fA-F]+);?', decode_hex, text)

        # Fourth pass: handle special CJK entities
        cjk_special_entities = {
            '⟨': '〈', '⟩': '〉',
            '⌈': '⌈', '⌉': '⌉',
            '⌊': '⌊', '⌋': '⌋',
        }

        for entity, char in cjk_special_entities.items():
            text = text.replace(entity, char)

        return text

    def _normalize_unicode(self, text: str) -> str:
        """Normalize Unicode with CJK awareness"""
        if self.detected_language in ['korean', 'japanese', 'chinese']:
            return text
        else:
            return unicodedata.normalize('NFC', text)

    def _protect_quotes(self, text: str) -> str:
        """Protect quotes by replacing with special markers"""
        for original, marker in self.QUOTE_MARKERS.items():
            text = text.replace(original, marker)
        return text

    def _restore_quotes(self, text: str) -> str:
        """Restore quotes from special markers"""
        for original, marker in self.QUOTE_MARKERS.items():
            text = text.replace(marker, original)
        return text



    def _preprocess_html_for_quotes(self, html_content: str) -> str:
        """Pre-process HTML to protect quotes from conversion"""
        def protect_quotes_in_text(match):
            text = match.group(1)
            return f'>{self._protect_quotes(text)}<'

        # Apply to all text between tags
|
| 275 |
+
html_content = re.sub(r'>([^<]+)<', protect_quotes_in_text, html_content)
|
| 276 |
+
return html_content
|
| 277 |
+
|
| 278 |
+
def _protect_quotes_in_soup(self, soup: BeautifulSoup) -> None:
|
| 279 |
+
"""Protect quotes in BeautifulSoup object before processing"""
|
| 280 |
+
for element in soup.find_all(string=True):
|
| 281 |
+
if element.parent.name not in ['script', 'style', 'noscript']:
|
| 282 |
+
original_text = str(element)
|
| 283 |
+
protected_text = self._protect_quotes(original_text)
|
| 284 |
+
element.replace_with(protected_text)
|
| 285 |
+
|
| 286 |
+
def _minimal_parser_fix(self, html_content: str) -> str:
|
| 287 |
+
"""Apply minimal fixes only for parser errors"""
|
| 288 |
+
# Fix tags with ="" pattern
|
| 289 |
+
html_content = re.sub(r'<[^>]*?=\s*""\s*[^>]*?>', '', html_content)
|
| 290 |
+
|
| 291 |
+
# Fix malformed closing tags
|
| 292 |
+
html_content = re.sub(r'</\s+(\w+)>', r'</\1>', html_content)
|
| 293 |
+
html_content = re.sub(r'</\s*>', '', html_content)
|
| 294 |
+
html_content = re.sub(r'<//+(\w+)>', r'</\1>', html_content)
|
| 295 |
+
|
| 296 |
+
# Fix orphaned brackets
|
| 297 |
+
html_content = re.sub(r'<(?![a-zA-Z/!?])', '<', html_content)
|
| 298 |
+
html_content = re.sub(r'(?<![a-zA-Z0-9"/])>', '>', html_content)
|
| 299 |
+
|
| 300 |
+
# Fix unclosed tags at the end
|
| 301 |
+
if html_content.rstrip().endswith('<'):
|
| 302 |
+
html_content = html_content.rstrip()[:-1]
|
| 303 |
+
|
| 304 |
+
# Remove nested opening brackets
|
| 305 |
+
html_content = re.sub(r'<[^>]*?<[^>]*?>', '', html_content)
|
| 306 |
+
|
| 307 |
+
return html_content
|
| 308 |
+
|
| 309 |
+
def _clean_text_cjk_aware(self, text: str, preserve_structure: bool) -> str:
|
| 310 |
+
"""Clean extracted text with CJK awareness"""
|
| 311 |
+
if not preserve_structure and self.detected_language not in ['korean', 'japanese', 'chinese']:
|
| 312 |
+
# Only do aggressive cleanup for non-CJK text
|
| 313 |
+
text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
|
| 314 |
+
text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
|
| 315 |
+
text = re.sub(r'\*(.*?)\*', r'\1', text)
|
| 316 |
+
text = re.sub(r'__(.*?)__', r'\1', text)
|
| 317 |
+
text = re.sub(r'_(.*?)_', r'\1', text)
|
| 318 |
+
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
|
| 319 |
+
text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', '', text)
|
| 320 |
+
text = re.sub(r'`([^`]+)`', r'\1', text)
|
| 321 |
+
text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
|
| 322 |
+
text = re.sub(r'^[-*+]\s+', '', text, flags=re.MULTILINE)
|
| 323 |
+
text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)
|
| 324 |
+
text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
|
| 325 |
+
text = re.sub(r'^[-_*]{3,}$', '', text, flags=re.MULTILINE)
|
| 326 |
+
|
| 327 |
+
# Clean whitespace
|
| 328 |
+
if self.detected_language in ['korean', 'japanese', 'chinese']:
|
| 329 |
+
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 330 |
+
text = re.sub(r'[ ]{3,}', ' ', text)
|
| 331 |
+
else:
|
| 332 |
+
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 333 |
+
text = re.sub(r' {2,}', ' ', text)
|
| 334 |
+
|
| 335 |
+
# Remove invisible characters
|
| 336 |
+
invisible_chars = ['\u200b', '\u200c', '\u200d', '\ufeff', '\u2060']
|
| 337 |
+
for char in invisible_chars:
|
| 338 |
+
text = text.replace(char, '')
|
| 339 |
+
|
| 340 |
+
return text.strip()
|
| 341 |
+
|
| 342 |
+
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
|
| 343 |
+
"""Extract chapter title from various sources"""
|
| 344 |
+
# Try title tag first
|
| 345 |
+
if soup.title and soup.title.string:
|
| 346 |
+
title = soup.title.string.strip()
|
| 347 |
+
title = self._decode_entities(title)
|
| 348 |
+
return title
|
| 349 |
+
|
| 350 |
+
# Try headers in order
|
| 351 |
+
for header_tag in ['h1', 'h2', 'h3', 'h4']:
|
| 352 |
+
headers = soup.find_all(header_tag)
|
| 353 |
+
for header in headers:
|
| 354 |
+
title = header.get_text(strip=True)
|
| 355 |
+
if title:
|
| 356 |
+
title = self._decode_entities(title)
|
| 357 |
+
if self._is_chapter_title(title):
|
| 358 |
+
return title
|
| 359 |
+
|
| 360 |
+
return None
|
| 361 |
+
|
| 362 |
+
def _is_chapter_title(self, text: str) -> bool:
|
| 363 |
+
"""Check if text looks like a chapter title"""
|
| 364 |
+
if not text or len(text) > 200:
|
| 365 |
+
return False
|
| 366 |
+
|
| 367 |
+
# Common chapter patterns
|
| 368 |
+
patterns = [
|
| 369 |
+
r'第.{1,10}[章回話话]',
|
| 370 |
+
r'Chapter\s+\d+',
|
| 371 |
+
r'제\s*\d+\s*화',
|
| 372 |
+
r'第\d+話',
|
| 373 |
+
r'\d+\s*화',
|
| 374 |
+
r'EP\.?\s*\d+',
|
| 375 |
+
r'Part\s+\d+',
|
| 376 |
+
]
|
| 377 |
+
|
| 378 |
+
for pattern in patterns:
|
| 379 |
+
if re.search(pattern, text, re.IGNORECASE):
|
| 380 |
+
return True
|
| 381 |
+
|
| 382 |
+
# Check if it's short and doesn't contain too much punctuation
|
| 383 |
+
if len(text) < 100:
|
| 384 |
+
punct_count = sum(1 for c in text if c in '.,;:!?。、!?')
|
| 385 |
+
if punct_count < len(text) * 0.2:
|
| 386 |
+
return True
|
| 387 |
+
|
| 388 |
+
return False
|
| 389 |
+
|
| 390 |
+
def _extract_body_content(self, soup: BeautifulSoup, full_html: str) -> str:
|
| 391 |
+
"""Extract body content while preserving Unicode"""
|
| 392 |
+
# Remove script and style elements first
|
| 393 |
+
for element in soup(['script', 'style', 'noscript']):
|
| 394 |
+
element.decompose()
|
| 395 |
+
|
| 396 |
+
if soup.body:
|
| 397 |
+
return str(soup.body)
|
| 398 |
+
else:
|
| 399 |
+
return str(soup)
|
| 400 |
+
|
| 401 |
+
    def extract_chapter_content(self, html_content: str, extraction_mode: str = None) -> Tuple[str, str, Optional[str]]:
        """Extract chapter content with proper Unicode and CJK handling"""
        try:
            # Use instance filtering_mode if not overridden
            if extraction_mode is None:
                extraction_mode = self.filtering_mode

            # Handle encoding if content is bytes
            if isinstance(html_content, bytes):
                encoding = self._detect_encoding(html_content)
                html_content = html_content.decode(encoding, errors='replace')

            # Pre-process HTML to protect quotes
            html_content = self._preprocess_html_for_quotes(html_content)

            # Pre-process HTML to decode all entities
            html_content = self._decode_entities(html_content)

            # Detect language early
            self.detected_language = self._detect_content_language(html_content)
            print(f"🌐 Detected language: {self.detected_language}")

            # Parse with BeautifulSoup
            parser = 'html.parser'
            if self.detected_language in ['korean', 'japanese', 'chinese']:
                # For CJK content, lxml might handle encoding better if available
                try:
                    import lxml
                    parser = 'lxml'
                except ImportError:
                    pass

            soup = BeautifulSoup(html_content, parser)

            # Protect quotes before any processing
            self._protect_quotes_in_soup(soup)

            # Extract title
            chapter_title = self._extract_title(soup)

            # Respect GUI toggles to exclude headers/titles BEFORE conversion
            try:
                batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
                ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
                ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
                if ignore_title_tag and soup.title:
                    # Remove <title> so it isn't included when using full extraction
                    soup.title.decompose()
                if ignore_header_tags:
                    # Remove visible headers from body prior to conversion
                    for tag_name in ['h1', 'h2', 'h3']:
                        for hdr in soup.find_all(tag_name):
                            hdr.decompose()
            except Exception:
                # Non-fatal – proceed with original soup if anything goes wrong
                pass

            # Determine content to convert (after removals)
            if extraction_mode == "full":
                content_to_convert = str(soup)
            else:
                content_to_convert = self._extract_body_content(soup, html_content)

            # Convert using html2text
            content_to_convert = self._decode_entities(content_to_convert)

            # Convert to text with error handling
            try:
                clean_text = self.h2t.handle(content_to_convert)
            except (AssertionError, UnboundLocalError) as e:
                error_msg = str(e)
                if "cannot access local variable" in error_msg or "we should not get here!" in error_msg or "unexpected call to parse_endtag" in error_msg or "unexpected call to parse_starttag" in error_msg:
                    print(f"⚠️ html2text encountered malformed HTML: {error_msg}")
                    print(f"⚠️ Applying minimal fixes...")
                    # Apply minimal fixes
                    content_to_convert = self._minimal_parser_fix(content_to_convert)
                    try:
                        clean_text = self.h2t.handle(content_to_convert)
                        print(f"✅ Successfully processed after minimal fixes")
                    except Exception as e2:
                        print(f"⚠️ html2text still failing: {e2}")
                        # Last resort fallback
                        clean_text = soup.get_text(separator='\n', strip=True)
                        print(f"✅ Used BeautifulSoup fallback")
                else:
                    # Re-raise if it's a different error
                    raise
            except Exception as e:
                print(f"⚠️ Unexpected error in html2text: {e}")
                # Fallback to BeautifulSoup
                clean_text = soup.get_text(separator='\n', strip=True)

            # Normalize only if appropriate
            clean_text = self._normalize_unicode(clean_text)

            # Clean based on settings and language
            clean_text = self._clean_text_cjk_aware(clean_text, self.preserve_structure)

            # Restore protected quotes
            clean_text = self._restore_quotes(clean_text)

            # For enhanced mode, both display and translation content are the same
            return clean_text, clean_text, chapter_title

        except Exception as e:
            print(f"❌ Enhanced extraction failed: {e}")
            raise

# Test function
def test_cjk_preservation():
    """Test that CJK characters and quotes are properly preserved"""
    test_cases = [
        # Korean test with quotes
        '''<html>
<head><title>제국의 붉은 사신</title></head>
<body>
<p>"왜 이러는 겁니까? 우리가 무슨 잘못을 했다고!"</p>
<p>"......"</p>
<p>"한 번만 살려주시오! 가족을 지키려면 어쩔 수 없었소!"</p>
<p>"응애! 응애! 응애!"</p>
<p>"미안하구나. 모든 죄는 내가 짊어지고 사마."</p>
</body>
</html>''',

        # Japanese test with quotes
        '''<html>
<head><title>第1話:始まり</title></head>
<body>
<h1>第1話:始まり</h1>
<p>「こんにちは!これは日本語のテストです。」</p>
<p>彼は言った。「これで全部ですか?」</p>
<p>「はい、そうです」と答えた。</p>
</body>
</html>''',

        # Chinese test with quotes
        '''<html>
<head><title>第一章:开始</title></head>
<body>
<h1>第一章:开始</h1>
<p>"你好!这是中文测试。"</p>
<p>他说:"这就是全部吗?"</p>
<p>"是的,"她回答道。</p>
</body>
</html>''',
    ]

    extractor = EnhancedTextExtractor()

    print("=== CJK and Quote Preservation Test ===\n")

    for i, test_html in enumerate(test_cases, 1):
        print(f"--- Test Case {i} ---")
        try:
            content, _, title = extractor.extract_chapter_content(test_html)

            print(f"Title: {title}")
            print(f"Content:\n{content}\n")

            # Check for quotes preservation
            quote_checks = [
                ('"', 'Western double quotes'),
                ('「', 'Japanese left bracket'),
                ('」', 'Japanese right bracket'),
                ('“', 'Chinese double quote'),
            ]

            print("Quote preservation check:")
            quote_found = False

            for quote_char, desc in quote_checks:
                if quote_char in content:
                    print(f" ✓ Found {desc}: {quote_char}")
                    quote_found = True

            if not quote_found:
                print(" ❌ No quotes found!")
            else:
                print(" ✅ Quotes preserved successfully!")

            # Check for image tag preservation (html2text now preserves them natively)
            img_count = content.count('<img')
            if img_count > 0:
                print(f" ✓ Found {img_count} HTML img tags (preserved natively by html2text)")
                print(" ✅ Image tags preserved successfully!")
            else:
                print(" ℹ️ No images in this test case")

        except Exception as e:
            print(f"Error processing test case {i}: {e}")

        print("-" * 50 + "\n")


if __name__ == "__main__":
    test_cjk_preservation()
epub_converter.py
ADDED
The diff for this file is too large to render.
See raw diff

extract_glossary_from_epub.py
ADDED
@@ -0,0 +1,2081 @@
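Before the file body below, a small illustrative sketch (not part of the upload) of the cooperative-stop pattern this script relies on, using the set_stop_flag / is_stop_requested / interruptible_sleep helpers defined further down:

# Illustrative only: a worker loop that a GUI thread can interrupt between steps.
import threading

def worker():
    for chapter_index in range(100):
        if is_stop_requested():                      # polled flag, set from the GUI thread
            print("stopping early")
            return
        # ... process one chapter / send one API request here ...
        if not interruptible_sleep(2.0, is_stop_requested):
            return                                   # sleep returned False: interrupted

t = threading.Thread(target=worker, daemon=True)
t.start()
set_stop_flag(True)   # e.g. wired to a Stop button; the worker exits at its next check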
# extract_glossary_from_epub.py
import os
import json
import argparse
import zipfile
import time
import sys
import tiktoken
import threading
import queue
import ebooklib
import re
from ebooklib import epub
from chapter_splitter import ChapterSplitter
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Tuple
from unified_api_client import UnifiedClient, UnifiedClientError

# Fix for PyInstaller - handle stdout reconfigure more carefully
if sys.platform.startswith("win"):
    try:
        # Try to reconfigure if the method exists
        if hasattr(sys.stdout, 'reconfigure'):
            sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    except (AttributeError, ValueError):
        # If reconfigure doesn't work, try to set up UTF-8 another way
        import io
        import locale
        if sys.stdout and hasattr(sys.stdout, 'buffer'):
            sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

MODEL = os.getenv("MODEL", "gemini-2.0-flash")

def interruptible_sleep(duration, check_stop_fn, interval=0.1):
    """Sleep that can be interrupted by stop request"""
    elapsed = 0
    while elapsed < duration:
        if check_stop_fn and check_stop_fn():  # Add safety check for None
            return False  # Interrupted
        sleep_time = min(interval, duration - elapsed)
        time.sleep(sleep_time)
        elapsed += sleep_time
    return True  # Completed normally

def cancel_all_futures(futures):
    """Cancel all pending futures immediately"""
    cancelled_count = 0
    for future in futures:
        if not future.done() and future.cancel():
            cancelled_count += 1
    return cancelled_count

def create_client_with_multi_key_support(api_key, model, output_dir, config):
    """Create a UnifiedClient with multi API key support if enabled"""

    # Check if multi API key mode is enabled
    use_multi_keys = config.get('use_multi_api_keys', False)

    # Set environment variables for UnifiedClient to pick up
    if use_multi_keys and 'multi_api_keys' in config and config['multi_api_keys']:
        print("🔑 Multi API Key mode enabled for glossary extraction")

        # Set environment variables that UnifiedClient will read
        os.environ['USE_MULTI_API_KEYS'] = '1'
        os.environ['MULTI_API_KEYS'] = json.dumps(config['multi_api_keys'])
        os.environ['FORCE_KEY_ROTATION'] = '1' if config.get('force_key_rotation', True) else '0'
        os.environ['ROTATION_FREQUENCY'] = str(config.get('rotation_frequency', 1))

        print(f" • Keys configured: {len(config['multi_api_keys'])}")
        print(f" • Force rotation: {config.get('force_key_rotation', True)}")
        print(f" • Rotation frequency: every {config.get('rotation_frequency', 1)} request(s)")
    else:
        # Ensure multi-key mode is disabled in environment
        os.environ['USE_MULTI_API_KEYS'] = '0'

    # Create UnifiedClient normally - it will check environment variables
    return UnifiedClient(api_key=api_key, model=model, output_dir=output_dir)

def send_with_interrupt(messages, client, temperature, max_tokens, stop_check_fn, chunk_timeout=None):
    """Send API request with interrupt capability and optional timeout retry"""
    result_queue = queue.Queue()

    def api_call():
        try:
            start_time = time.time()
            result = client.send(messages, temperature=temperature, max_tokens=max_tokens, context='glossary')
            elapsed = time.time() - start_time
            result_queue.put((result, elapsed))
        except Exception as e:
            result_queue.put(e)

    api_thread = threading.Thread(target=api_call)
    api_thread.daemon = True
    api_thread.start()

    timeout = chunk_timeout if chunk_timeout is not None else 86400
    check_interval = 0.1
    elapsed = 0

    while elapsed < timeout:
        try:
            # Check for results with shorter timeout
            result = result_queue.get(timeout=check_interval)
            if isinstance(result, Exception):
                raise result
            if isinstance(result, tuple):
                api_result, api_time = result
                if chunk_timeout and api_time > chunk_timeout:
                    if hasattr(client, '_in_cleanup'):
                        client._in_cleanup = True
                    if hasattr(client, 'cancel_current_operation'):
                        client.cancel_current_operation()
                    raise UnifiedClientError(f"API call took {api_time:.1f}s (timeout: {chunk_timeout}s)")
                return api_result
            return result
        except queue.Empty:
            if stop_check_fn():
                # More aggressive cancellation
                print("🛑 Stop requested - cancelling API call immediately...")

                # Set cleanup flag
                if hasattr(client, '_in_cleanup'):
                    client._in_cleanup = True

                # Try to cancel the operation
                if hasattr(client, 'cancel_current_operation'):
                    client.cancel_current_operation()

                # Don't wait for the thread to finish - just raise immediately
                raise UnifiedClientError("Glossary extraction stopped by user")

            elapsed += check_interval

    # Timeout occurred
    if hasattr(client, '_in_cleanup'):
        client._in_cleanup = True
    if hasattr(client, 'cancel_current_operation'):
        client.cancel_current_operation()
    raise UnifiedClientError(f"API call timed out after {timeout} seconds")

# Parse token limit from environment variable (same logic as translation)
def parse_glossary_token_limit():
    """Parse token limit from environment variable"""
    env_value = os.getenv("GLOSSARY_TOKEN_LIMIT", "1000000").strip()

    if not env_value or env_value == "":
        return None, "unlimited"

    if env_value.lower() == "unlimited":
        return None, "unlimited"

    if env_value.isdigit() and int(env_value) > 0:
        limit = int(env_value)
        return limit, str(limit)

    # Default fallback
    return 1000000, "1000000 (default)"

MAX_GLOSSARY_TOKENS, GLOSSARY_LIMIT_STR = parse_glossary_token_limit()

# Global stop flag for GUI integration
_stop_requested = False

def set_stop_flag(value):
    """Set the global stop flag"""
    global _stop_requested
    _stop_requested = value

    # When clearing the stop flag, also clear the multi-key environment variable
    if not value:
        os.environ['TRANSLATION_CANCELLED'] = '0'

        # Also clear UnifiedClient global flag
        try:
            import unified_api_client
            if hasattr(unified_api_client, 'UnifiedClient'):
                unified_api_client.UnifiedClient._global_cancelled = False
        except:
            pass

def is_stop_requested():
    """Check if stop was requested"""
    global _stop_requested
    return _stop_requested

# ─── resilient tokenizer setup ───
try:
    enc = tiktoken.encoding_for_model(MODEL)
except Exception:
    try:
        enc = tiktoken.get_encoding("cl100k_base")
    except Exception:
        enc = None

def count_tokens(text: str) -> int:
    if enc:
        return len(enc.encode(text))
    # crude fallback: assume ~1 token per 4 chars
    return max(1, len(text) // 4)

from ebooklib import epub
from bs4 import BeautifulSoup
from unified_api_client import UnifiedClient
from typing import List, Dict
import re

PROGRESS_FILE = "glossary_progress.json"

def remove_honorifics(name):
    """Remove common honorifics from names"""
    if not name:
        return name

    # Check if honorifics filtering is disabled
    if os.getenv('GLOSSARY_DISABLE_HONORIFICS_FILTER', '0') == '1':
        return name.strip()

    # Modern Korean honorifics
    korean_honorifics = [
        '님', '씨', '씨는', '군', '양', '선생님', '선생', '사장님', '사장',
        '과장님', '과장', '대리님', '대리', '주임님', '주임', '이사님', '이사',
        '부장님', '부장', '차장님', '차장', '팀장님', '팀장', '실장님', '실장',
        '교수님', '교수', '박사님', '박사', '원장님', '원장', '회장님', '회장',
        '소장님', '소장', '전무님', '전무', '상무님', '상무', '이사장님', '이사장'
    ]

    # Archaic/Historical Korean honorifics
    korean_archaic = [
        '공', '옹', '어른', '나리', '나으리', '대감', '영감', '마님', '마마',
        '대군', '군', '옹주', '공주', '왕자', '세자', '영애', '영식', '도령',
        '낭자', '낭군', '서방', '영감님', '대감님', '마님', '아씨', '도련님',
        '아가씨', '나으리', '진사', '첨지', '영의정', '좌의정', '우의정',
        '판서', '참판', '정승', '대원군'
    ]

    # Modern Japanese honorifics
    japanese_honorifics = [
        'さん', 'さま', '様', 'くん', '君', 'ちゃん', 'せんせい', '先生',
        'どの', '殿', 'たん', 'ぴょん', 'ぽん', 'ちん', 'りん', 'せんぱい',
        '先輩', 'こうはい', '後輩', 'し', '氏', 'ふじん', '夫人', 'かちょう',
        '課長', 'ぶちょう', '部長', 'しゃちょう', '社長'
    ]

    # Archaic/Historical Japanese honorifics
    japanese_archaic = [
        'どの', '殿', 'たいゆう', '大夫', 'きみ', '公', 'あそん', '朝臣',
        'おみ', '臣', 'むらじ', '連', 'みこと', '命', '尊', 'ひめ', '姫',
        'みや', '宮', 'おう', '王', 'こう', '侯', 'はく', '伯', 'し', '子',
        'だん', '男', 'じょ', '女', 'ひこ', '彦', 'ひめみこ', '姫御子',
        'すめらみこと', '天皇', 'きさき', '后', 'みかど', '帝'
    ]

    # Modern Chinese honorifics
    chinese_honorifics = [
        '先生', '女士', '小姐', '老师', '师傅', '大人', '公', '君', '总',
        '老总', '老板', '经理', '主任', '处长', '科长', '股长', '教授',
        '博士', '院长', '校长', '同志', '师兄', '师姐', '师弟', '师妹',
        '学长', '学姐', '前辈', '阁下'
    ]

    # Archaic/Historical Chinese honorifics
    chinese_archaic = [
        '公', '侯', '伯', '子', '男', '王', '君', '卿', '大夫', '士',
        '陛下', '殿下', '阁下', '爷', '老爷', '大人', '夫人', '娘娘',
        '公子', '公主', '郡主', '世子', '太子', '皇上', '皇后', '贵妃',
        '娘子', '相公', '官人', '郎君', '小姐', '姑娘', '公公', '嬷嬷',
        '大侠', '少侠', '前辈', '晚辈', '在下', '足下', '兄台', '仁兄',
        '贤弟', '老夫', '老朽', '本座', '本尊', '真人', '上人', '尊者'
    ]

    # Combine all honorifics
    all_honorifics = (
        korean_honorifics + korean_archaic +
        japanese_honorifics + japanese_archaic +
        chinese_honorifics + chinese_archaic
    )

    # Remove honorifics from the end of the name
    name_cleaned = name.strip()

    # Sort by length (longest first) to avoid partial matches
    sorted_honorifics = sorted(all_honorifics, key=len, reverse=True)

    for honorific in sorted_honorifics:
        if name_cleaned.endswith(honorific):
            name_cleaned = name_cleaned[:-len(honorific)].strip()
            # Only remove one honorific per pass
            break

    return name_cleaned

def set_output_redirect(log_callback=None):
    """Redirect print statements to a callback function for GUI integration"""
    if log_callback:
        import sys
        import io

        class CallbackWriter:
            def __init__(self, callback):
                self.callback = callback
                self.buffer = ""

            def write(self, text):
                if text.strip():
                    self.callback(text.strip())

            def flush(self):
                pass

        sys.stdout = CallbackWriter(log_callback)

def load_config(path: str) -> Dict:
    with open(path, 'r', encoding='utf-8') as f:
        cfg = json.load(f)

    # override context_limit_chapters if GUI passed GLOSSARY_CONTEXT_LIMIT
    env_limit = os.getenv("GLOSSARY_CONTEXT_LIMIT")
    if env_limit is not None:
        try:
            cfg['context_limit_chapters'] = int(env_limit)
        except ValueError:
            pass  # keep existing config value on parse error

    # override temperature if GUI passed GLOSSARY_TEMPERATURE
    env_temp = os.getenv("GLOSSARY_TEMPERATURE")
    if env_temp is not None:
        try:
            cfg['temperature'] = float(env_temp)
        except ValueError:
            pass  # keep existing config value on parse error

    return cfg

def get_custom_entry_types():
    """Get custom entry types configuration from environment"""
    try:
        types_json = os.getenv('GLOSSARY_CUSTOM_ENTRY_TYPES', '{}')
        result = json.loads(types_json)
        # If empty, return defaults
        if not result:
            return {
                'character': {'enabled': True, 'has_gender': True},
                'term': {'enabled': True, 'has_gender': False}
            }
        return result
    except:
        # Default configuration
        return {
            'character': {'enabled': True, 'has_gender': True},
            'term': {'enabled': True, 'has_gender': False}
        }

def save_glossary_json(glossary: List[Dict], output_path: str):
    """Save glossary in the new simple format with automatic sorting by type"""
    # Get custom types for sorting order
    custom_types = get_custom_entry_types()

    # Create sorting order: character=0, term=1, others alphabetically starting from 2
    type_order = {'character': 0, 'term': 1}
    other_types = sorted([t for t in custom_types.keys() if t not in ['character', 'term']])
    for i, t in enumerate(other_types):
        type_order[t] = i + 2

    # Sort glossary by type order, then by raw_name
    sorted_glossary = sorted(glossary, key=lambda x: (
        type_order.get(x.get('type', 'term'), 999),  # Unknown types go last
        x.get('raw_name', '').lower()
    ))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(sorted_glossary, f, ensure_ascii=False, indent=2)

def save_glossary_csv(glossary: List[Dict], output_path: str):
    """Save glossary in CSV or token-efficient format based on environment variable"""
    import csv

    csv_path = output_path.replace('.json', '.csv')

    # Get custom types for sorting order and gender info
    custom_types = get_custom_entry_types()

    # Create sorting order
    type_order = {'character': 0, 'term': 1}
    other_types = sorted([t for t in custom_types.keys() if t not in ['character', 'term']])
    for i, t in enumerate(other_types):
        type_order[t] = i + 2

    # Sort glossary
    sorted_glossary = sorted(glossary, key=lambda x: (
        type_order.get(x.get('type', 'term'), 999),
        x.get('raw_name', '').lower()
    ))

    # Check if we should use legacy CSV format
    use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1'

    if use_legacy_format:
        # LEGACY CSV FORMAT
        with open(csv_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)

            # Build header row
            header = ['type', 'raw_name', 'translated_name', 'gender']

            # Add any custom fields to header
            custom_fields_json = os.getenv('GLOSSARY_CUSTOM_FIELDS', '[]')
            try:
                custom_fields = json.loads(custom_fields_json)
                header.extend(custom_fields)
            except:
                custom_fields = []

            # Write header row
            writer.writerow(header)

            # Write data rows
            for entry in sorted_glossary:
                entry_type = entry.get('type', 'term')
                type_config = custom_types.get(entry_type, {})

                # Base row: type, raw_name, translated_name
                row = [entry_type, entry.get('raw_name', ''), entry.get('translated_name', '')]

                # Add gender only if type supports it
                if type_config.get('has_gender', False):
                    row.append(entry.get('gender', ''))

                # Add custom field values
                for field in custom_fields:
                    row.append(entry.get(field, ''))

                # Count how many fields we SHOULD have
                expected_fields = 4 + len(custom_fields)  # type, raw_name, translated_name, gender + custom fields

                # Only trim if we have MORE than expected (extra trailing empties)
                while len(row) > expected_fields and row[-1] == '':
                    row.pop()

                # Ensure minimum required fields (type, raw_name, translated_name)
                while len(row) < 3:
                    row.append('')

                # Write row
                writer.writerow(row)

        print(f"✅ Saved legacy CSV format: {csv_path}")

    else:
        # NEW TOKEN-EFFICIENT FORMAT (DEFAULT)
        # Group entries by type
        grouped_entries = {}
        for entry in sorted_glossary:
            entry_type = entry.get('type', 'term')
            if entry_type not in grouped_entries:
                grouped_entries[entry_type] = []
            grouped_entries[entry_type].append(entry)

        # Get custom fields configuration
        custom_fields_json = os.getenv('GLOSSARY_CUSTOM_FIELDS', '[]')
        try:
            custom_fields = json.loads(custom_fields_json)
        except:
            custom_fields = []

        # Write as plain text format for token efficiency
        with open(csv_path, 'w', encoding='utf-8') as f:
            # Write header
            f.write("Glossary: Characters, Terms, and Important Elements\n\n")

            # Process each type group
            for entry_type in sorted(grouped_entries.keys(), key=lambda x: type_order.get(x, 999)):
                entries = grouped_entries[entry_type]
                type_config = custom_types.get(entry_type, {})

                # Write section header
                section_name = entry_type.upper() + 'S' if not entry_type.upper().endswith('S') else entry_type.upper()
                f.write(f"=== {section_name} ===\n")

                # Write entries for this type with indentation
                for entry in entries:
                    # Build the entry line
                    raw_name = entry.get('raw_name', '')
                    translated_name = entry.get('translated_name', '')

                    # Start with asterisk and name
                    line = f"* {translated_name} ({raw_name})"

                    # Add gender if applicable and not Unknown
                    if type_config.get('has_gender', False):
                        gender = entry.get('gender', '')
                        if gender and gender != 'Unknown':
                            line += f" [{gender}]"

                    # Add custom field values if they exist
                    custom_field_parts = []
                    for field in custom_fields:
                        value = entry.get(field, '').strip()
                        if value:
                            # For description fields, add as continuation
                            if field.lower() in ['description', 'notes', 'details']:
                                line += f": {value}"
                            else:
                                custom_field_parts.append(f"{field}: {value}")

                    # Add other custom fields in parentheses
                    if custom_field_parts:
                        line += f" ({', '.join(custom_field_parts)})"

                    # Write the line
                    f.write(line + "\n")

                # Add blank line between sections
                f.write("\n")

        print(f"✅ Saved token-efficient glossary: {csv_path}")

    # Print summary for both formats
    type_counts = {}
    for entry_type in grouped_entries:
        type_counts[entry_type] = len(grouped_entries[entry_type])
    total = sum(type_counts.values())
    print(f" Total entries: {total}")
    for entry_type, count in type_counts.items():
        print(f" - {entry_type}: {count} entries")

def extract_chapters_from_epub(epub_path: str) -> List[str]:
    chapters = []
    items = []

    # Add this helper function
    def is_html_document(item):
        """Check if an EPUB item is an HTML document"""
        if hasattr(item, 'media_type'):
            return item.media_type in [
                'application/xhtml+xml',
                'text/html',
                'application/html+xml',
                'text/xml'
            ]
        # Fallback for items that don't have media_type
        if hasattr(item, 'get_name'):
            name = item.get_name()
            return name.lower().endswith(('.html', '.xhtml', '.htm'))
        return False

    try:
        # Add stop check before reading
        if is_stop_requested():
            return []

        book = epub.read_epub(epub_path)
        # Replace the problematic line with media type checking
        items = [item for item in book.get_items() if is_html_document(item)]
    except Exception as e:
        print(f"[Warning] Manifest load failed, falling back to raw EPUB scan: {e}")
        try:
            with zipfile.ZipFile(epub_path, 'r') as zf:
                names = [n for n in zf.namelist() if n.lower().endswith(('.html', '.xhtml'))]
                for name in names:
                    # Add stop check in loop
                    if is_stop_requested():
                        return chapters

                    try:
                        data = zf.read(name)
                        items.append(type('X', (), {
                            'get_content': lambda self, data=data: data,
                            'get_name': lambda self, name=name: name,
                            'media_type': 'text/html'  # Add media_type for consistency
                        })())
                    except Exception:
                        print(f"[Warning] Could not read zip file entry: {name}")
        except Exception as ze:
            print(f"[Fatal] Cannot open EPUB as zip: {ze}")
            return chapters

    for item in items:
        # Add stop check before processing each chapter
        if is_stop_requested():
            return chapters

        try:
            raw = item.get_content()
            soup = BeautifulSoup(raw, 'html.parser')
            text = soup.get_text("\n", strip=True)
            if text:
                chapters.append(text)
        except Exception as e:
            name = item.get_name() if hasattr(item, 'get_name') else repr(item)
            print(f"[Warning] Skipped corrupted chapter {name}: {e}")

    return chapters

def trim_context_history(history: List[Dict], limit: int, rolling_window: bool = False) -> List[Dict]:
    """
    Handle context history with either reset or rolling window mode

    Args:
        history: List of conversation history
        limit: Maximum number of exchanges to keep
        rolling_window: Whether to use rolling window mode
    """
    # Count current exchanges
    current_exchanges = len(history)

    # Handle based on mode
    if limit > 0 and current_exchanges >= limit:
        if rolling_window:
            # Rolling window: keep the most recent exchanges
            print(f"🔄 Rolling glossary context window: keeping last {limit} chapters")
            # Keep only the most recent exchanges
            history = history[-(limit-1):] if limit > 1 else []
        else:
            # Reset mode (original behavior)
            print(f"🔄 Reset glossary context after {limit} chapters")
            return []  # Return empty to reset context

    # Convert to message format
    trimmed = []
    for entry in history:
        trimmed.append({"role": "user", "content": entry["user"]})
        trimmed.append({"role": "assistant", "content": entry["assistant"]})
    return trimmed

def load_progress() -> Dict:
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)
    return {"completed": [], "glossary": [], "context_history": []}

def parse_api_response(response_text: str) -> List[Dict]:
    """Parse API response to extract glossary entries - handles custom types"""
    entries = []

    # Get enabled types from custom configuration
    custom_types = get_custom_entry_types()
    enabled_types = [t for t, cfg in custom_types.items() if cfg.get('enabled', True)]

    # First try JSON parsing
    try:
        # Clean up response text
        cleaned_text = response_text.strip()

        # Remove markdown code blocks if present
        if '```json' in cleaned_text or '```' in cleaned_text:
            import re
            code_block_match = re.search(r'```(?:json)?\s*(.*?)\s*```', cleaned_text, re.DOTALL)
            if code_block_match:
                cleaned_text = code_block_match.group(1)

        # Try to find JSON array or object
        import re
        json_match = re.search(r'[\[\{].*[\]\}]', cleaned_text, re.DOTALL)
        if json_match:
            json_str = json_match.group(0)
            data = json.loads(json_str)

            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict):
                        # Check if entry type is enabled
                        entry_type = item.get('type', '').lower()

                        # Handle legacy format where type is the key
                        if not entry_type:
                            for type_name in enabled_types:
                                if type_name in item:
                                    entry_type = type_name
                                    fixed_entry = {
                                        'type': type_name,
                                        'raw_name': item.get(type_name, ''),
                                        'translated_name': item.get('translated_name', '')
                                    }

                                    # Add gender if type supports it
                                    if custom_types.get(type_name, {}).get('has_gender', False):
                                        fixed_entry['gender'] = item.get('gender', 'Unknown')

                                    # Copy other fields
                                    for k, v in item.items():
                                        if k not in [type_name, 'translated_name', 'gender', 'type', 'raw_name']:
                                            fixed_entry[k] = v

                                    entries.append(fixed_entry)
                                    break
                        else:
                            # Standard format with type field
                            if entry_type in enabled_types:
                                entries.append(item)

                return entries

            elif isinstance(data, dict):
                # Handle single entry
                entry_type = data.get('type', '').lower()
                if entry_type in enabled_types:
                    return [data]

                # Check for wrapper
                for key in ['entries', 'glossary', 'characters', 'terms', 'data']:
                    if key in data and isinstance(data[key], list):
                        return parse_api_response(json.dumps(data[key]))

                return []

    except (json.JSONDecodeError, AttributeError) as e:
        print(f"[Debug] JSON parsing failed: {e}")
        pass

    # CSV-like format parsing
    lines = response_text.strip().split('\n')

    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # Skip header lines
        if 'type' in line.lower() and 'raw_name' in line.lower():
            continue

        # Parse CSV
        parts = []
        current_part = []
        in_quotes = False

        for char in line + ',':
            if char == '"':
                in_quotes = not in_quotes
            elif char == ',' and not in_quotes:
                parts.append(''.join(current_part).strip())
                current_part = []
            else:
                current_part.append(char)

        if parts and parts[-1] == '':
            parts = parts[:-1]

        if len(parts) >= 3:
            entry_type = parts[0].lower()

            # Check if type is enabled
            if entry_type not in enabled_types:
                continue

            entry = {
                'type': entry_type,
                'raw_name': parts[1],
                'translated_name': parts[2]
            }

            # Add gender if type supports it and it's provided
            type_config = custom_types.get(entry_type, {})
            if type_config.get('has_gender', False) and len(parts) > 3 and parts[3]:
                entry['gender'] = parts[3]
            elif type_config.get('has_gender', False):
                entry['gender'] = 'Unknown'

            # Add any custom fields
+
custom_fields_json = os.getenv('GLOSSARY_CUSTOM_FIELDS', '[]')
|
| 761 |
+
try:
|
| 762 |
+
custom_fields = json.loads(custom_fields_json)
|
| 763 |
+
start_idx = 4 # Always 4, not conditional
|
| 764 |
+
for i, field in enumerate(custom_fields):
|
| 765 |
+
if len(parts) > start_idx + i:
|
| 766 |
+
field_value = parts[start_idx + i]
|
| 767 |
+
if field_value: # Only add if not empty
|
| 768 |
+
entry[field] = field_value
|
| 769 |
+
except:
|
| 770 |
+
pass
|
| 771 |
+
|
| 772 |
+
entries.append(entry)
|
| 773 |
+
|
| 774 |
+
return entries
|
| 775 |
+
|
| 776 |
+
def validate_extracted_entry(entry):
|
| 777 |
+
"""Validate that extracted entry has required fields and enabled type"""
|
| 778 |
+
if 'type' not in entry:
|
| 779 |
+
return False
|
| 780 |
+
|
| 781 |
+
# Check if type is enabled
|
| 782 |
+
custom_types = get_custom_entry_types()
|
| 783 |
+
entry_type = entry.get('type', '').lower()
|
| 784 |
+
|
| 785 |
+
if entry_type not in custom_types:
|
| 786 |
+
return False
|
| 787 |
+
|
| 788 |
+
if not custom_types[entry_type].get('enabled', True):
|
| 789 |
+
return False
|
| 790 |
+
|
| 791 |
+
# Must have raw_name and translated_name
|
| 792 |
+
if 'raw_name' not in entry or not entry['raw_name']:
|
| 793 |
+
return False
|
| 794 |
+
if 'translated_name' not in entry or not entry['translated_name']:
|
| 795 |
+
return False
|
| 796 |
+
|
| 797 |
+
return True
|
| 798 |
+
|
| 799 |
+
def build_prompt(chapter_text: str) -> tuple:
|
| 800 |
+
"""Build the extraction prompt with custom types - returns (system_prompt, user_prompt)"""
|
| 801 |
+
custom_prompt = os.getenv('GLOSSARY_SYSTEM_PROMPT', '').strip()
|
| 802 |
+
|
| 803 |
+
if not custom_prompt:
|
| 804 |
+
# If no custom prompt, create a default
|
| 805 |
+
custom_prompt = """Extract all character names and important terms from the text.
|
| 806 |
+
|
| 807 |
+
{fields}
|
| 808 |
+
|
| 809 |
+
Only include entries that appear in the text.
|
| 810 |
+
Return the data in the exact format specified above."""
|
| 811 |
+
|
| 812 |
+
# Check if the prompt contains {fields} placeholder
|
| 813 |
+
if '{fields}' in custom_prompt:
|
| 814 |
+
# Get enabled types
|
| 815 |
+
custom_types = get_custom_entry_types()
|
| 816 |
+
|
| 817 |
+
enabled_types = [(t, cfg) for t, cfg in custom_types.items() if cfg.get('enabled', True)]
|
| 818 |
+
|
| 819 |
+
# Get custom fields
|
| 820 |
+
custom_fields_json = os.getenv('GLOSSARY_CUSTOM_FIELDS', '[]')
|
| 821 |
+
try:
|
| 822 |
+
custom_fields = json.loads(custom_fields_json)
|
| 823 |
+
except:
|
| 824 |
+
custom_fields = []
|
| 825 |
+
|
| 826 |
+
# Build fields specification based on what the prompt expects
|
| 827 |
+
# Check if the prompt mentions CSV or JSON to determine format
|
| 828 |
+
if 'CSV' in custom_prompt.upper():
|
| 829 |
+
# CSV format
|
| 830 |
+
fields_spec = []
|
| 831 |
+
|
| 832 |
+
# Show the header format
|
| 833 |
+
header_parts = ['type', 'raw_name', 'translated_name', 'gender']
|
| 834 |
+
if custom_fields:
|
| 835 |
+
header_parts.extend(custom_fields)
|
| 836 |
+
fields_spec.append(','.join(header_parts))
|
| 837 |
+
|
| 838 |
+
# Show examples for each type
|
| 839 |
+
for type_name, type_config in enabled_types:
|
| 840 |
+
example_parts = [type_name, '<name in original language>', '<English translation>']
|
| 841 |
+
|
| 842 |
+
# Add gender field
|
| 843 |
+
if type_config.get('has_gender', False):
|
| 844 |
+
example_parts.append('<Male/Female/Unknown>')
|
| 845 |
+
else:
|
| 846 |
+
example_parts.append('') # Empty for non-character types
|
| 847 |
+
|
| 848 |
+
# Add custom field placeholders
|
| 849 |
+
for field in custom_fields:
|
| 850 |
+
example_parts.append(f'<{field} value>')
|
| 851 |
+
|
| 852 |
+
fields_spec.append(','.join(example_parts))
|
| 853 |
+
|
| 854 |
+
fields_str = '\n'.join(fields_spec)
|
| 855 |
+
else:
|
| 856 |
+
# JSON format (default)
|
| 857 |
+
fields_spec = []
|
| 858 |
+
fields_spec.append("Extract entities and return as a JSON array.")
|
| 859 |
+
fields_spec.append("Each entry must be a JSON object with these exact fields:")
|
| 860 |
+
fields_spec.append("")
|
| 861 |
+
|
| 862 |
+
for type_name, type_config in enabled_types:
|
| 863 |
+
fields_spec.append(f"For {type_name}s:")
|
| 864 |
+
fields_spec.append(f' "type": "{type_name}" (required)')
|
| 865 |
+
fields_spec.append(' "raw_name": the name in original language/script (required)')
|
| 866 |
+
fields_spec.append(' "translated_name": English translation or romanization (required)')
|
| 867 |
+
if type_config.get('has_gender', False):
|
| 868 |
+
fields_spec.append(' "gender": "Male", "Female", or "Unknown" (required for characters)')
|
| 869 |
+
fields_spec.append("")
|
| 870 |
+
|
| 871 |
+
# Add custom fields info
|
| 872 |
+
if custom_fields:
|
| 873 |
+
fields_spec.append("Additional custom fields to include:")
|
| 874 |
+
for field in custom_fields:
|
| 875 |
+
fields_spec.append(f' "{field}": appropriate value')
|
| 876 |
+
fields_spec.append("")
|
| 877 |
+
|
| 878 |
+
# Add example
|
| 879 |
+
if enabled_types:
|
| 880 |
+
fields_spec.append("Example output format:")
|
| 881 |
+
fields_spec.append('[')
|
| 882 |
+
examples = []
|
| 883 |
+
if 'character' in [t[0] for t in enabled_types]:
|
| 884 |
+
example = ' {"type": "character", "raw_name": "田中太郎", "translated_name": "Tanaka Taro", "gender": "Male"'
|
| 885 |
+
for field in custom_fields:
|
| 886 |
+
example += f', "{field}": "example value"'
|
| 887 |
+
example += '}'
|
| 888 |
+
examples.append(example)
|
| 889 |
+
if 'term' in [t[0] for t in enabled_types]:
|
| 890 |
+
example = ' {"type": "term", "raw_name": "東京駅", "translated_name": "Tokyo Station"'
|
| 891 |
+
for field in custom_fields:
|
| 892 |
+
example += f', "{field}": "example value"'
|
| 893 |
+
example += '}'
|
| 894 |
+
examples.append(example)
|
| 895 |
+
fields_spec.append(',\n'.join(examples))
|
| 896 |
+
fields_spec.append(']')
|
| 897 |
+
|
| 898 |
+
fields_str = '\n'.join(fields_spec)
|
| 899 |
+
|
| 900 |
+
# Replace {fields} placeholder
|
| 901 |
+
system_prompt = custom_prompt.replace('{fields}', fields_str)
|
| 902 |
+
else:
|
| 903 |
+
# No {fields} placeholder - use the prompt as-is
|
| 904 |
+
system_prompt = custom_prompt
|
| 905 |
+
|
| 906 |
+
# Remove any {chapter_text} placeholders from system prompt
|
| 907 |
+
system_prompt = system_prompt.replace('{chapter_text}', '')
|
| 908 |
+
system_prompt = system_prompt.replace('{{chapter_text}}', '')
|
| 909 |
+
system_prompt = system_prompt.replace('{text}', '')
|
| 910 |
+
system_prompt = system_prompt.replace('{{text}}', '')
|
| 911 |
+
|
| 912 |
+
# Strip any trailing "Text:" or similar
|
| 913 |
+
system_prompt = system_prompt.rstrip()
|
| 914 |
+
if system_prompt.endswith('Text:'):
|
| 915 |
+
system_prompt = system_prompt[:-5].rstrip()
|
| 916 |
+
|
| 917 |
+
# User prompt is just the chapter text
|
| 918 |
+
user_prompt = chapter_text
|
| 919 |
+
|
| 920 |
+
return (system_prompt, user_prompt)
|
| 921 |
+
|
| 922 |
+
|
| 923 |
+
def skip_duplicate_entries(glossary):
|
| 924 |
+
"""
|
| 925 |
+
Skip entries with duplicate raw names using fuzzy matching.
|
| 926 |
+
Returns deduplicated list maintaining first occurrence of each unique raw name.
|
| 927 |
+
"""
|
| 928 |
+
import difflib
|
| 929 |
+
|
| 930 |
+
# Get fuzzy threshold from environment
|
| 931 |
+
fuzzy_threshold = float(os.getenv('GLOSSARY_FUZZY_THRESHOLD', '0.9'))
|
| 932 |
+
|
| 933 |
+
seen_raw_names = [] # List of (cleaned_name, original_entry) tuples
|
| 934 |
+
deduplicated = []
|
| 935 |
+
skipped_count = 0
|
| 936 |
+
|
| 937 |
+
for entry in glossary:
|
| 938 |
+
# Get raw_name and clean it
|
| 939 |
+
raw_name = entry.get('raw_name', '')
|
| 940 |
+
if not raw_name:
|
| 941 |
+
continue
|
| 942 |
+
|
| 943 |
+
# Remove honorifics for comparison (unless disabled)
|
| 944 |
+
cleaned_name = remove_honorifics(raw_name)
|
| 945 |
+
|
| 946 |
+
# Check for fuzzy matches with seen names
|
| 947 |
+
is_duplicate = False
|
| 948 |
+
for seen_clean, seen_original in seen_raw_names:
|
| 949 |
+
similarity = difflib.SequenceMatcher(None, cleaned_name.lower(), seen_clean.lower()).ratio()
|
| 950 |
+
|
| 951 |
+
if similarity >= fuzzy_threshold:
|
| 952 |
+
skipped_count += 1
|
| 953 |
+
print(f"[Skip] Duplicate entry: {raw_name} (cleaned: {cleaned_name}) - {similarity*100:.1f}% match with {seen_original}")
|
| 954 |
+
is_duplicate = True
|
| 955 |
+
break
|
| 956 |
+
|
| 957 |
+
if not is_duplicate:
|
| 958 |
+
# Add to seen list and keep the entry
|
| 959 |
+
seen_raw_names.append((cleaned_name, entry.get('raw_name', '')))
|
| 960 |
+
deduplicated.append(entry)
|
| 961 |
+
|
| 962 |
+
if skipped_count > 0:
|
| 963 |
+
print(f"⏭️ Skipped {skipped_count} duplicate entries (threshold: {fuzzy_threshold:.2f})")
|
| 964 |
+
print(f"✅ Kept {len(deduplicated)} unique entries")
|
| 965 |
+
|
| 966 |
+
return deduplicated
|
| 967 |
+
|
| 968 |
+
# Batch processing functions
|
| 969 |
+
def process_chapter_batch(chapters_batch: List[Tuple[int, str]],
|
| 970 |
+
client: UnifiedClient,
|
| 971 |
+
config: Dict,
|
| 972 |
+
contextual_enabled: bool,
|
| 973 |
+
history: List[Dict],
|
| 974 |
+
ctx_limit: int,
|
| 975 |
+
rolling_window: bool,
|
| 976 |
+
check_stop,
|
| 977 |
+
chunk_timeout: int = None) -> List[Dict]:
|
| 978 |
+
"""
|
| 979 |
+
Process a batch of chapters in parallel with improved interrupt support
|
| 980 |
+
"""
|
| 981 |
+
temp = float(os.getenv("GLOSSARY_TEMPERATURE") or config.get('temperature', 0.1))
|
| 982 |
+
|
| 983 |
+
env_max_output = os.getenv("MAX_OUTPUT_TOKENS")
|
| 984 |
+
if env_max_output and env_max_output.isdigit():
|
| 985 |
+
mtoks = int(env_max_output)
|
| 986 |
+
else:
|
| 987 |
+
mtoks = config.get('max_tokens', 4196)
|
| 988 |
+
|
| 989 |
+
results = []
|
| 990 |
+
|
| 991 |
+
with ThreadPoolExecutor(max_workers=len(chapters_batch)) as executor:
|
| 992 |
+
futures = {}
|
| 993 |
+
|
| 994 |
+
for idx, chap in chapters_batch:
|
| 995 |
+
if check_stop():
|
| 996 |
+
break
|
| 997 |
+
|
| 998 |
+
# Get system and user prompts
|
| 999 |
+
system_prompt, user_prompt = build_prompt(chap)
|
| 1000 |
+
|
| 1001 |
+
# Build messages correctly with system and user prompts
|
| 1002 |
+
if not contextual_enabled:
|
| 1003 |
+
msgs = [
|
| 1004 |
+
{"role": "system", "content": system_prompt},
|
| 1005 |
+
{"role": "user", "content": user_prompt}
|
| 1006 |
+
]
|
| 1007 |
+
else:
|
| 1008 |
+
msgs = [{"role": "system", "content": system_prompt}] \
|
| 1009 |
+
+ trim_context_history(history, ctx_limit, rolling_window) \
|
| 1010 |
+
+ [{"role": "user", "content": user_prompt}]
|
| 1011 |
+
|
| 1012 |
+
|
| 1013 |
+
# Submit to thread pool
|
| 1014 |
+
future = executor.submit(
|
| 1015 |
+
process_single_chapter_api_call,
|
| 1016 |
+
idx, chap, msgs, client, temp, mtoks, check_stop, chunk_timeout
|
| 1017 |
+
)
|
| 1018 |
+
futures[future] = (idx, chap)
|
| 1019 |
+
|
| 1020 |
+
# Process results with better cancellation
|
| 1021 |
+
for future in as_completed(futures): # Removed timeout - let futures complete
|
| 1022 |
+
if check_stop():
|
| 1023 |
+
print("🛑 Stop detected - cancelling all pending operations...")
|
| 1024 |
+
# Cancel all pending futures immediately
|
| 1025 |
+
cancelled = cancel_all_futures(list(futures.keys()))
|
| 1026 |
+
if cancelled > 0:
|
| 1027 |
+
print(f"✅ Cancelled {cancelled} pending API calls")
|
| 1028 |
+
# Shutdown executor immediately
|
| 1029 |
+
executor.shutdown(wait=False)
|
| 1030 |
+
break
|
| 1031 |
+
|
| 1032 |
+
idx, chap = futures[future]
|
| 1033 |
+
try:
|
| 1034 |
+
result = future.result(timeout=0.5) # Short timeout on result retrieval
|
| 1035 |
+
# Ensure chap is added to result here if not already present
|
| 1036 |
+
if 'chap' not in result:
|
| 1037 |
+
result['chap'] = chap
|
| 1038 |
+
results.append(result)
|
| 1039 |
+
except Exception as e:
|
| 1040 |
+
if "stopped by user" in str(e).lower():
|
| 1041 |
+
print(f"✅ Chapter {idx+1} stopped by user")
|
| 1042 |
+
else:
|
| 1043 |
+
print(f"Error processing chapter {idx+1}: {e}")
|
| 1044 |
+
results.append({
|
| 1045 |
+
'idx': idx,
|
| 1046 |
+
'data': [],
|
| 1047 |
+
'resp': "",
|
| 1048 |
+
'chap': chap,
|
| 1049 |
+
'error': str(e)
|
| 1050 |
+
})
|
| 1051 |
+
|
| 1052 |
+
# Sort results by chapter index
|
| 1053 |
+
results.sort(key=lambda x: x['idx'])
|
| 1054 |
+
return results
|
| 1055 |
+
|
| 1056 |
+
def process_single_chapter_api_call(idx: int, chap: str, msgs: List[Dict],
|
| 1057 |
+
client: UnifiedClient, temp: float, mtoks: int,
|
| 1058 |
+
stop_check_fn, chunk_timeout: int = None) -> Dict:
|
| 1059 |
+
"""Process a single chapter API call with thread-safe payload handling"""
|
| 1060 |
+
|
| 1061 |
+
# APPLY INTERRUPTIBLE THREADING DELAY FIRST
|
| 1062 |
+
thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5"))
|
| 1063 |
+
if thread_delay > 0:
|
| 1064 |
+
# Check if we need to wait (same logic as unified_api_client)
|
| 1065 |
+
if hasattr(client, '_thread_submission_lock') and hasattr(client, '_last_thread_submission_time'):
|
| 1066 |
+
with client._thread_submission_lock:
|
| 1067 |
+
current_time = time.time()
|
| 1068 |
+
time_since_last = current_time - client._last_thread_submission_time
|
| 1069 |
+
|
| 1070 |
+
if time_since_last < thread_delay:
|
| 1071 |
+
sleep_time = thread_delay - time_since_last
|
| 1072 |
+
thread_name = threading.current_thread().name
|
| 1073 |
+
|
| 1074 |
+
# PRINT BEFORE THE DELAY STARTS
|
| 1075 |
+
print(f"🧵 [{thread_name}] Applying thread delay: {sleep_time:.1f}s for Chapter {idx+1}")
|
| 1076 |
+
|
| 1077 |
+
# Interruptible sleep - check stop flag every 0.1 seconds
|
| 1078 |
+
elapsed = 0
|
| 1079 |
+
check_interval = 0.1
|
| 1080 |
+
while elapsed < sleep_time:
|
| 1081 |
+
if stop_check_fn():
|
| 1082 |
+
print(f"🛑 Threading delay interrupted by stop flag")
|
| 1083 |
+
raise UnifiedClientError("Glossary extraction stopped by user during threading delay")
|
| 1084 |
+
|
| 1085 |
+
sleep_chunk = min(check_interval, sleep_time - elapsed)
|
| 1086 |
+
time.sleep(sleep_chunk)
|
| 1087 |
+
elapsed += sleep_chunk
|
| 1088 |
+
|
| 1089 |
+
client._last_thread_submission_time = time.time()
|
| 1090 |
+
if not hasattr(client, '_thread_submission_count'):
|
| 1091 |
+
client._thread_submission_count = 0
|
| 1092 |
+
client._thread_submission_count += 1
|
| 1093 |
+
start_time = time.time()
|
| 1094 |
+
print(f"[BATCH] Starting API call for Chapter {idx+1} at {time.strftime('%H:%M:%S')}")
|
| 1095 |
+
|
| 1096 |
+
# Thread-safe payload directory
|
| 1097 |
+
thread_name = threading.current_thread().name
|
| 1098 |
+
thread_id = threading.current_thread().ident
|
| 1099 |
+
thread_dir = os.path.join("Payloads", "glossary", f"{thread_name}_{thread_id}")
|
| 1100 |
+
os.makedirs(thread_dir, exist_ok=True)
|
| 1101 |
+
|
| 1102 |
+
try:
|
| 1103 |
+
# Save request payload before API call
|
| 1104 |
+
payload_file = os.path.join(thread_dir, f"chapter_{idx+1}_request.json")
|
| 1105 |
+
with open(payload_file, 'w', encoding='utf-8') as f:
|
| 1106 |
+
json.dump({
|
| 1107 |
+
'chapter': idx + 1,
|
| 1108 |
+
'messages': msgs,
|
| 1109 |
+
'temperature': temp,
|
| 1110 |
+
'max_tokens': mtoks,
|
| 1111 |
+
'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
|
| 1112 |
+
}, f, indent=2, ensure_ascii=False)
|
| 1113 |
+
|
| 1114 |
+
# Use send_with_interrupt for API call
|
| 1115 |
+
raw = send_with_interrupt(
|
| 1116 |
+
messages=msgs,
|
| 1117 |
+
client=client,
|
| 1118 |
+
temperature=temp,
|
| 1119 |
+
max_tokens=mtoks,
|
| 1120 |
+
stop_check_fn=stop_check_fn,
|
| 1121 |
+
chunk_timeout=chunk_timeout
|
| 1122 |
+
)
|
| 1123 |
+
|
| 1124 |
+
# Handle the response - it might be a tuple or a string
|
| 1125 |
+
if raw is None:
|
| 1126 |
+
print(f"⚠️ API returned None for chapter {idx+1}")
|
| 1127 |
+
return {
|
| 1128 |
+
'idx': idx,
|
| 1129 |
+
'data': [],
|
| 1130 |
+
'resp': "",
|
| 1131 |
+
'chap': chap,
|
| 1132 |
+
'error': "API returned None"
|
| 1133 |
+
}
|
| 1134 |
+
|
| 1135 |
+
if isinstance(raw, tuple):
|
| 1136 |
+
resp = raw[0] if raw[0] is not None else ""
|
| 1137 |
+
elif isinstance(raw, str):
|
| 1138 |
+
resp = raw
|
| 1139 |
+
elif hasattr(raw, 'content'):
|
| 1140 |
+
resp = raw.content if raw.content is not None else ""
|
| 1141 |
+
elif hasattr(raw, 'text'):
|
| 1142 |
+
resp = raw.text if raw.text is not None else ""
|
| 1143 |
+
else:
|
| 1144 |
+
resp = str(raw) if raw is not None else ""
|
| 1145 |
+
|
| 1146 |
+
# Ensure resp is never None
|
| 1147 |
+
if resp is None:
|
| 1148 |
+
resp = ""
|
| 1149 |
+
|
| 1150 |
+
# Save the raw response in thread-safe location
|
| 1151 |
+
response_file = os.path.join(thread_dir, f"chapter_{idx+1}_response.txt")
|
| 1152 |
+
with open(response_file, "w", encoding="utf-8", errors="replace") as f:
|
| 1153 |
+
f.write(resp)
|
| 1154 |
+
|
| 1155 |
+
# Parse response using the new parser
|
| 1156 |
+
data = parse_api_response(resp)
|
| 1157 |
+
|
| 1158 |
+
# More detailed debug logging
|
| 1159 |
+
print(f"[BATCH] Chapter {idx+1} - Raw response length: {len(resp)} chars")
|
| 1160 |
+
print(f"[BATCH] Chapter {idx+1} - Parsed {len(data)} entries before validation")
|
| 1161 |
+
|
| 1162 |
+
# Filter out invalid entries
|
| 1163 |
+
valid_data = []
|
| 1164 |
+
for entry in data:
|
| 1165 |
+
if validate_extracted_entry(entry):
|
| 1166 |
+
# Clean the raw_name
|
| 1167 |
+
if 'raw_name' in entry:
|
| 1168 |
+
entry['raw_name'] = entry['raw_name'].strip()
|
| 1169 |
+
valid_data.append(entry)
|
| 1170 |
+
else:
|
| 1171 |
+
print(f"[BATCH] Chapter {idx+1} - Invalid entry: {entry}")
|
| 1172 |
+
|
| 1173 |
+
elapsed = time.time() - start_time
|
| 1174 |
+
print(f"[BATCH] Completed Chapter {idx+1} in {elapsed:.1f}s at {time.strftime('%H:%M:%S')} - Extracted {len(valid_data)} valid entries")
|
| 1175 |
+
|
| 1176 |
+
return {
|
| 1177 |
+
'idx': idx,
|
| 1178 |
+
'data': valid_data,
|
| 1179 |
+
'resp': resp,
|
| 1180 |
+
'chap': chap, # Include the chapter text in the result
|
| 1181 |
+
'error': None
|
| 1182 |
+
}
|
| 1183 |
+
|
| 1184 |
+
except UnifiedClientError as e:
|
| 1185 |
+
print(f"[Error] API call interrupted/failed for chapter {idx+1}: {e}")
|
| 1186 |
+
return {
|
| 1187 |
+
'idx': idx,
|
| 1188 |
+
'data': [],
|
| 1189 |
+
'resp': "",
|
| 1190 |
+
'chap': chap, # Include chapter even on error
|
| 1191 |
+
'error': str(e)
|
| 1192 |
+
}
|
| 1193 |
+
except Exception as e:
|
| 1194 |
+
print(f"[Error] Unexpected error for chapter {idx+1}: {e}")
|
| 1195 |
+
import traceback
|
| 1196 |
+
print(f"[Error] Traceback: {traceback.format_exc()}")
|
| 1197 |
+
return {
|
| 1198 |
+
'idx': idx,
|
| 1199 |
+
'data': [],
|
| 1200 |
+
'resp': "",
|
| 1201 |
+
'chap': chap, # Include chapter even on error
|
| 1202 |
+
'error': str(e)
|
| 1203 |
+
}
|
| 1204 |
+
|
| 1205 |
+
# Update main function to support batch processing:
|
| 1206 |
+
def main(log_callback=None, stop_callback=None):
|
| 1207 |
+
"""Modified main function that can accept a logging callback and stop callback"""
|
| 1208 |
+
if log_callback:
|
| 1209 |
+
set_output_redirect(log_callback)
|
| 1210 |
+
|
| 1211 |
+
# Set up stop checking
|
| 1212 |
+
def check_stop():
|
| 1213 |
+
if stop_callback and stop_callback():
|
| 1214 |
+
print("❌ Glossary extraction stopped by user request.")
|
| 1215 |
+
return True
|
| 1216 |
+
return is_stop_requested()
|
| 1217 |
+
|
| 1218 |
+
start = time.time()
|
| 1219 |
+
|
| 1220 |
+
# Handle both command line and GUI calls
|
| 1221 |
+
if '--epub' in sys.argv:
|
| 1222 |
+
# Command line mode
|
| 1223 |
+
parser = argparse.ArgumentParser(description='Extract glossary from EPUB/TXT')
|
| 1224 |
+
parser.add_argument('--epub', required=True, help='Path to EPUB/TXT file')
|
| 1225 |
+
parser.add_argument('--output', required=True, help='Output glossary path')
|
| 1226 |
+
parser.add_argument('--config', help='Config file path')
|
| 1227 |
+
|
| 1228 |
+
args = parser.parse_args()
|
| 1229 |
+
epub_path = args.epub
|
| 1230 |
+
else:
|
| 1231 |
+
# GUI mode - get from environment
|
| 1232 |
+
epub_path = os.getenv("EPUB_PATH", "")
|
| 1233 |
+
if not epub_path and len(sys.argv) > 1:
|
| 1234 |
+
epub_path = sys.argv[1]
|
| 1235 |
+
|
| 1236 |
+
# Create args object for GUI mode
|
| 1237 |
+
import types
|
| 1238 |
+
args = types.SimpleNamespace()
|
| 1239 |
+
args.epub = epub_path
|
| 1240 |
+
args.output = os.getenv("OUTPUT_PATH", "glossary.json")
|
| 1241 |
+
args.config = os.getenv("CONFIG_PATH", "config.json")
|
| 1242 |
+
|
| 1243 |
+
is_text_file = epub_path.lower().endswith('.txt')
|
| 1244 |
+
|
| 1245 |
+
if is_text_file:
|
| 1246 |
+
# Import text processor
|
| 1247 |
+
from extract_glossary_from_txt import extract_chapters_from_txt
|
| 1248 |
+
chapters = extract_chapters_from_txt(epub_path)
|
| 1249 |
+
file_base = os.path.splitext(os.path.basename(epub_path))[0]
|
| 1250 |
+
else:
|
| 1251 |
+
# Existing EPUB code
|
| 1252 |
+
chapters = extract_chapters_from_epub(epub_path)
|
| 1253 |
+
epub_base = os.path.splitext(os.path.basename(epub_path))[0]
|
| 1254 |
+
file_base = epub_base
|
| 1255 |
+
|
| 1256 |
+
# If user didn't override --output, derive it from the EPUB filename:
|
| 1257 |
+
if args.output == 'glossary.json':
|
| 1258 |
+
args.output = f"{file_base}_glossary.json"
|
| 1259 |
+
|
| 1260 |
+
# ensure we have a Glossary subfolder next to the JSON/MD outputs
|
| 1261 |
+
glossary_dir = os.path.join(os.path.dirname(args.output), "Glossary")
|
| 1262 |
+
os.makedirs(glossary_dir, exist_ok=True)
|
| 1263 |
+
|
| 1264 |
+
# override the module‐level PROGRESS_FILE to include epub name
|
| 1265 |
+
global PROGRESS_FILE
|
| 1266 |
+
PROGRESS_FILE = os.path.join(
|
| 1267 |
+
glossary_dir,
|
| 1268 |
+
f"{file_base}_glossary_progress.json"
|
| 1269 |
+
)
|
| 1270 |
+
|
| 1271 |
+
config = load_config(args.config)
|
| 1272 |
+
|
| 1273 |
+
# Get API key from environment variables (set by GUI) or config file
|
| 1274 |
+
api_key = (os.getenv("API_KEY") or
|
| 1275 |
+
os.getenv("OPENAI_API_KEY") or
|
| 1276 |
+
os.getenv("OPENAI_OR_Gemini_API_KEY") or
|
| 1277 |
+
os.getenv("GEMINI_API_KEY") or
|
| 1278 |
+
config.get('api_key'))
|
| 1279 |
+
|
| 1280 |
+
# Get model from environment or config
|
| 1281 |
+
model = os.getenv("MODEL") or config.get('model', 'gemini-1.5-flash')
|
| 1282 |
+
|
| 1283 |
+
# Define output directory (use current directory as default)
|
| 1284 |
+
out = os.path.dirname(args.output) if hasattr(args, 'output') else os.getcwd()
|
| 1285 |
+
|
| 1286 |
+
# Use the variables we just retrieved
|
| 1287 |
+
client = create_client_with_multi_key_support(api_key, model, out, config)
|
| 1288 |
+
|
| 1289 |
+
# Check for batch mode
|
| 1290 |
+
batch_enabled = os.getenv("BATCH_TRANSLATION", "0") == "1"
|
| 1291 |
+
batch_size = int(os.getenv("BATCH_SIZE", "5"))
|
| 1292 |
+
conservative_batching = os.getenv("CONSERVATIVE_BATCHING", "0") == "1"
|
| 1293 |
+
|
| 1294 |
+
print(f"[DEBUG] BATCH_TRANSLATION = {os.getenv('BATCH_TRANSLATION')} (enabled: {batch_enabled})")
|
| 1295 |
+
print(f"[DEBUG] BATCH_SIZE = {batch_size}")
|
| 1296 |
+
print(f"[DEBUG] CONSERVATIVE_BATCHING = {os.getenv('CONSERVATIVE_BATCHING')} (enabled: {conservative_batching})")
|
| 1297 |
+
|
| 1298 |
+
if batch_enabled:
|
| 1299 |
+
print(f"🚀 Glossary batch mode enabled with size: {batch_size}")
|
| 1300 |
+
print(f"📑 Note: Glossary extraction uses direct batching (not affected by conservative batching setting)")
|
| 1301 |
+
|
| 1302 |
+
#API call delay
|
| 1303 |
+
api_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2"))
|
| 1304 |
+
print(f"⏱️ API call delay: {api_delay} seconds")
|
| 1305 |
+
|
| 1306 |
+
# Get compression factor from environment
|
| 1307 |
+
compression_factor = float(os.getenv("COMPRESSION_FACTOR", "1.0"))
|
| 1308 |
+
print(f"📐 Compression Factor: {compression_factor}")
|
| 1309 |
+
|
| 1310 |
+
# Initialize chapter splitter with compression factor
|
| 1311 |
+
chapter_splitter = ChapterSplitter(model_name=model, compression_factor=compression_factor)
|
| 1312 |
+
|
| 1313 |
+
# Get temperature from environment or config
|
| 1314 |
+
temp = float(os.getenv("GLOSSARY_TEMPERATURE") or config.get('temperature', 0.1))
|
| 1315 |
+
|
| 1316 |
+
env_max_output = os.getenv("MAX_OUTPUT_TOKENS")
|
| 1317 |
+
if env_max_output and env_max_output.isdigit():
|
| 1318 |
+
mtoks = int(env_max_output)
|
| 1319 |
+
print(f"[DEBUG] Output Token Limit: {mtoks} (from GUI)")
|
| 1320 |
+
else:
|
| 1321 |
+
mtoks = config.get('max_tokens', 4196)
|
| 1322 |
+
print(f"[DEBUG] Output Token Limit: {mtoks} (from config)")
|
| 1323 |
+
|
| 1324 |
+
# Get context limit from environment or config
|
| 1325 |
+
ctx_limit = int(os.getenv("GLOSSARY_CONTEXT_LIMIT") or config.get('context_limit_chapters', 3))
|
| 1326 |
+
|
| 1327 |
+
# Parse chapter range from environment
|
| 1328 |
+
chapter_range = os.getenv("CHAPTER_RANGE", "").strip()
|
| 1329 |
+
range_start = None
|
| 1330 |
+
range_end = None
|
| 1331 |
+
if chapter_range and re.match(r"^\d+\s*-\s*\d+$", chapter_range):
|
| 1332 |
+
range_start, range_end = map(int, chapter_range.split("-", 1))
|
| 1333 |
+
print(f"📊 Chapter Range Filter: {range_start} to {range_end}")
|
| 1334 |
+
elif chapter_range:
|
| 1335 |
+
print(f"⚠️ Invalid chapter range format: {chapter_range} (use format: 5-10)")
|
| 1336 |
+
|
| 1337 |
+
# Log settings
|
| 1338 |
+
format_parts = ["type", "raw_name", "translated_name", "gender"]
|
| 1339 |
+
custom_fields_json = os.getenv('GLOSSARY_CUSTOM_FIELDS', '[]')
|
| 1340 |
+
try:
|
| 1341 |
+
custom_fields = json.loads(custom_fields_json)
|
| 1342 |
+
if custom_fields:
|
| 1343 |
+
format_parts.extend(custom_fields)
|
| 1344 |
+
except:
|
| 1345 |
+
pass
|
| 1346 |
+
print(f"📑 Glossary Format: Simple ({', '.join(format_parts)})")
|
| 1347 |
+
|
| 1348 |
+
# Check honorifics filter toggle
|
| 1349 |
+
honorifics_disabled = os.getenv('GLOSSARY_DISABLE_HONORIFICS_FILTER', '0') == '1'
|
| 1350 |
+
if honorifics_disabled:
|
| 1351 |
+
print("📑 Honorifics Filtering: ❌ DISABLED")
|
| 1352 |
+
else:
|
| 1353 |
+
print("📑 Honorifics Filtering: ✅ ENABLED")
|
| 1354 |
+
|
| 1355 |
+
# Log custom fields
|
| 1356 |
+
custom_fields_json = os.getenv('GLOSSARY_CUSTOM_FIELDS', '[]')
|
| 1357 |
+
try:
|
| 1358 |
+
custom_fields = json.loads(custom_fields_json)
|
| 1359 |
+
if custom_fields:
|
| 1360 |
+
print(f"📑 Custom Fields: {', '.join(custom_fields)}")
|
| 1361 |
+
except:
|
| 1362 |
+
pass
|
| 1363 |
+
|
| 1364 |
+
# Check if custom prompt is being used
|
| 1365 |
+
if os.getenv('GLOSSARY_SYSTEM_PROMPT'):
|
| 1366 |
+
print("📑 Using custom extraction prompt")
|
| 1367 |
+
else:
|
| 1368 |
+
print("📑 Using default extraction prompt")
|
| 1369 |
+
|
| 1370 |
+
if is_text_file:
|
| 1371 |
+
from extract_glossary_from_txt import extract_chapters_from_txt
|
| 1372 |
+
chapters = extract_chapters_from_txt(args.epub)
|
| 1373 |
+
else:
|
| 1374 |
+
chapters = extract_chapters_from_epub(args.epub)
|
| 1375 |
+
|
| 1376 |
+
if not chapters:
|
| 1377 |
+
print("No chapters found. Exiting.")
|
| 1378 |
+
return
|
| 1379 |
+
|
| 1380 |
+
# Check for stop before starting processing
|
| 1381 |
+
if check_stop():
|
| 1382 |
+
return
|
| 1383 |
+
|
| 1384 |
+
prog = load_progress()
|
| 1385 |
+
completed = prog['completed']
|
| 1386 |
+
glossary = prog['glossary']
|
| 1387 |
+
history = prog['context_history']
|
| 1388 |
+
total_chapters = len(chapters)
|
| 1389 |
+
|
| 1390 |
+
# Get both settings
|
| 1391 |
+
contextual_enabled = os.getenv('CONTEXTUAL', '1') == '1'
|
| 1392 |
+
rolling_window = os.getenv('GLOSSARY_HISTORY_ROLLING', '0') == '1'
|
| 1393 |
+
|
| 1394 |
+
# Count chapters that will be processed with range filter
|
| 1395 |
+
chapters_to_process = []
|
| 1396 |
+
for idx, chap in enumerate(chapters):
|
| 1397 |
+
# Skip if chapter is outside the range
|
| 1398 |
+
if range_start is not None and range_end is not None:
|
| 1399 |
+
chapter_num = idx + 1 # 1-based chapter numbering
|
| 1400 |
+
if not (range_start <= chapter_num <= range_end):
|
| 1401 |
+
continue
|
| 1402 |
+
if idx not in completed:
|
| 1403 |
+
chapters_to_process.append((idx, chap))
|
| 1404 |
+
|
| 1405 |
+
if len(chapters_to_process) < total_chapters:
|
| 1406 |
+
print(f"📊 Processing {len(chapters_to_process)} out of {total_chapters} chapters")
|
| 1407 |
+
|
| 1408 |
+
# Get chunk timeout from environment
|
| 1409 |
+
chunk_timeout = int(os.getenv("CHUNK_TIMEOUT", "900")) # 15 minutes default
|
| 1410 |
+
|
| 1411 |
+
# Process chapters based on mode
|
| 1412 |
+
if batch_enabled and len(chapters_to_process) > 0:
|
| 1413 |
+
# BATCH MODE: Process in batches with per-entry saving
|
| 1414 |
+
total_batches = (len(chapters_to_process) + batch_size - 1) // batch_size
|
| 1415 |
+
|
| 1416 |
+
for batch_num in range(total_batches):
|
| 1417 |
+
# Check for stop at the beginning of each batch
|
| 1418 |
+
if check_stop():
|
| 1419 |
+
print(f"❌ Glossary extraction stopped at batch {batch_num+1}")
|
| 1420 |
+
# Apply deduplication before stopping
|
| 1421 |
+
if glossary:
|
| 1422 |
+
print("🔀 Applying deduplication and sorting before exit...")
|
| 1423 |
+
glossary[:] = skip_duplicate_entries(glossary)
|
| 1424 |
+
|
| 1425 |
+
# Sort glossary
|
| 1426 |
+
custom_types = get_custom_entry_types()
|
| 1427 |
+
type_order = {'character': 0, 'term': 1}
|
| 1428 |
+
other_types = sorted([t for t in custom_types.keys() if t not in ['character', 'term']])
|
| 1429 |
+
for i, t in enumerate(other_types):
|
| 1430 |
+
type_order[t] = i + 2
|
| 1431 |
+
glossary.sort(key=lambda x: (
|
| 1432 |
+
type_order.get(x.get('type', 'term'), 999),
|
| 1433 |
+
x.get('raw_name', '').lower()
|
| 1434 |
+
))
|
| 1435 |
+
|
| 1436 |
+
save_progress(completed, glossary, history)
|
| 1437 |
+
save_glossary_json(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1438 |
+
save_glossary_csv(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1439 |
+
print(f"✅ Saved {len(glossary)} deduplicated entries before exit")
|
| 1440 |
+
return
|
| 1441 |
+
|
| 1442 |
+
# Get current batch
|
| 1443 |
+
batch_start = batch_num * batch_size
|
| 1444 |
+
batch_end = min(batch_start + batch_size, len(chapters_to_process))
|
| 1445 |
+
current_batch = chapters_to_process[batch_start:batch_end]
|
| 1446 |
+
|
| 1447 |
+
print(f"\n🔄 Processing Batch {batch_num+1}/{total_batches} (Chapters: {[idx+1 for idx, _ in current_batch]})")
|
| 1448 |
+
print(f"[BATCH] Submitting {len(current_batch)} chapters for parallel processing...")
|
| 1449 |
+
batch_start_time = time.time()
|
| 1450 |
+
|
| 1451 |
+
# Process batch in parallel BUT handle results as they complete
|
| 1452 |
+
temp = float(os.getenv("GLOSSARY_TEMPERATURE") or config.get('temperature', 0.1))
|
| 1453 |
+
env_max_output = os.getenv("MAX_OUTPUT_TOKENS")
|
| 1454 |
+
if env_max_output and env_max_output.isdigit():
|
| 1455 |
+
mtoks = int(env_max_output)
|
| 1456 |
+
else:
|
| 1457 |
+
mtoks = config.get('max_tokens', 4196)
|
| 1458 |
+
|
| 1459 |
+
batch_entry_count = 0
|
| 1460 |
+
|
| 1461 |
+
with ThreadPoolExecutor(max_workers=len(current_batch)) as executor:
|
| 1462 |
+
futures = {}
|
| 1463 |
+
|
| 1464 |
+
# Submit all chapters in the batch
|
| 1465 |
+
for idx, chap in current_batch:
|
| 1466 |
+
if check_stop():
|
| 1467 |
+
# Apply deduplication before breaking
|
| 1468 |
+
if glossary:
|
| 1469 |
+
print("🔀 Applying deduplication before stopping...")
|
| 1470 |
+
glossary[:] = skip_duplicate_entries(glossary)
|
| 1471 |
+
save_glossary_json(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1472 |
+
save_glossary_csv(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1473 |
+
break
|
| 1474 |
+
|
| 1475 |
+
# Get system and user prompts
|
| 1476 |
+
system_prompt, user_prompt = build_prompt(chap)
|
| 1477 |
+
|
| 1478 |
+
# Build messages
|
| 1479 |
+
if not contextual_enabled:
|
| 1480 |
+
msgs = [
|
| 1481 |
+
{"role": "system", "content": system_prompt},
|
| 1482 |
+
{"role": "user", "content": user_prompt}
|
| 1483 |
+
]
|
| 1484 |
+
else:
|
| 1485 |
+
msgs = [{"role": "system", "content": system_prompt}] \
|
| 1486 |
+
+ trim_context_history(history, ctx_limit, rolling_window) \
|
| 1487 |
+
+ [{"role": "user", "content": user_prompt}]
|
| 1488 |
+
|
| 1489 |
+
# Submit to thread pool
|
| 1490 |
+
future = executor.submit(
|
| 1491 |
+
process_single_chapter_api_call,
|
| 1492 |
+
idx, chap, msgs, client, temp, mtoks, check_stop, chunk_timeout
|
| 1493 |
+
)
|
| 1494 |
+
futures[future] = (idx, chap)
|
| 1495 |
+
# Small yield to keep GUI responsive when submitting many tasks
|
| 1496 |
+
if idx % 5 == 0:
|
| 1497 |
+
time.sleep(0.001)
|
| 1498 |
+
# Small yield to keep GUI responsive when submitting many tasks
|
| 1499 |
+
if idx % 5 == 0:
|
| 1500 |
+
time.sleep(0.001)
|
| 1501 |
+
|
| 1502 |
+
# Process results AS THEY COMPLETE, not all at once
|
| 1503 |
+
for future in as_completed(futures):
|
| 1504 |
+
if check_stop():
|
| 1505 |
+
print("🛑 Stop detected - cancelling all pending operations...")
|
| 1506 |
+
cancelled = cancel_all_futures(list(futures.keys()))
|
| 1507 |
+
if cancelled > 0:
|
| 1508 |
+
print(f"✅ Cancelled {cancelled} pending API calls")
|
| 1509 |
+
|
| 1510 |
+
# Apply deduplication before stopping
|
| 1511 |
+
if glossary:
|
| 1512 |
+
print("🔀 Applying deduplication and sorting before exit...")
|
| 1513 |
+
glossary[:] = skip_duplicate_entries(glossary)
|
| 1514 |
+
|
| 1515 |
+
# Sort glossary
|
| 1516 |
+
custom_types = get_custom_entry_types()
|
| 1517 |
+
type_order = {'character': 0, 'term': 1}
|
| 1518 |
+
other_types = sorted([t for t in custom_types.keys() if t not in ['character', 'term']])
|
| 1519 |
+
for i, t in enumerate(other_types):
|
| 1520 |
+
type_order[t] = i + 2
|
| 1521 |
+
glossary.sort(key=lambda x: (
|
| 1522 |
+
type_order.get(x.get('type', 'term'), 999),
|
| 1523 |
+
x.get('raw_name', '').lower()
|
| 1524 |
+
))
|
| 1525 |
+
|
| 1526 |
+
save_progress(completed, glossary, history)
|
| 1527 |
+
save_glossary_json(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1528 |
+
save_glossary_csv(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1529 |
+
print(f"✅ Saved {len(glossary)} deduplicated entries before exit")
|
| 1530 |
+
|
| 1531 |
+
executor.shutdown(wait=False)
|
| 1532 |
+
break
|
| 1533 |
+
|
| 1534 |
+
idx, chap = futures[future]
|
| 1535 |
+
|
| 1536 |
+
try:
|
| 1537 |
+
result = future.result(timeout=0.5)
|
| 1538 |
+
|
| 1539 |
+
# Process this chapter's results immediately
|
| 1540 |
+
data = result.get('data', [])
|
| 1541 |
+
resp = result.get('resp', '')
|
| 1542 |
+
error = result.get('error')
|
| 1543 |
+
|
| 1544 |
+
if error:
|
| 1545 |
+
print(f"[Chapter {idx+1}] Error: {error}")
|
| 1546 |
+
completed.append(idx)
|
| 1547 |
+
continue
|
| 1548 |
+
|
| 1549 |
+
# Process and save entries IMMEDIATELY as each chapter completes
|
| 1550 |
+
if data and len(data) > 0:
|
| 1551 |
+
total_ent = len(data)
|
| 1552 |
+
batch_entry_count += total_ent
|
| 1553 |
+
|
| 1554 |
+
for eidx, entry in enumerate(data, start=1):
|
| 1555 |
+
elapsed = time.time() - start
|
| 1556 |
+
|
| 1557 |
+
# Get entry info
|
| 1558 |
+
entry_type = entry.get("type", "?")
|
| 1559 |
+
raw_name = entry.get("raw_name", "?")
|
| 1560 |
+
trans_name = entry.get("translated_name", "?")
|
| 1561 |
+
|
| 1562 |
+
print(f'[Chapter {idx+1}/{total_chapters}] [{eidx}/{total_ent}] ({elapsed:.1f}s elapsed) → {entry_type}: {raw_name} ({trans_name})')
|
| 1563 |
+
|
| 1564 |
+
# Add entry immediately WITHOUT deduplication
|
| 1565 |
+
glossary.append(entry)
|
| 1566 |
+
|
| 1567 |
+
# Save immediately after EACH entry
|
| 1568 |
+
save_progress(completed, glossary, history)
|
| 1569 |
+
save_glossary_json(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1570 |
+
save_glossary_csv(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1571 |
+
|
| 1572 |
+
completed.append(idx)
|
| 1573 |
+
|
| 1574 |
+
# Add to history if contextual is enabled
|
| 1575 |
+
if contextual_enabled and resp and chap:
|
| 1576 |
+
system_prompt, user_prompt = build_prompt(chap)
|
| 1577 |
+
history.append({"user": user_prompt, "assistant": resp})
|
| 1578 |
+
|
| 1579 |
+
except Exception as e:
|
| 1580 |
+
if "stopped by user" in str(e).lower():
|
| 1581 |
+
print(f"✅ Chapter {idx+1} stopped by user")
|
| 1582 |
+
else:
|
| 1583 |
+
print(f"Error processing chapter {idx+1}: {e}")
|
| 1584 |
+
completed.append(idx)
|
| 1585 |
+
|
| 1586 |
+
batch_elapsed = time.time() - batch_start_time
|
| 1587 |
+
print(f"[BATCH] Batch {batch_num+1} completed in {batch_elapsed:.1f}s total")
|
| 1588 |
+
|
| 1589 |
+
# After batch completes, apply deduplication and sorting
|
| 1590 |
+
if batch_entry_count > 0:
|
| 1591 |
+
print(f"\n🔀 Applying deduplication and sorting after batch {batch_num+1}/{total_batches}")
|
| 1592 |
+
original_size = len(glossary)
|
| 1593 |
+
|
| 1594 |
+
# Apply deduplication to entire glossary
|
| 1595 |
+
glossary[:] = skip_duplicate_entries(glossary)
|
| 1596 |
+
|
| 1597 |
+
# Sort glossary by type and name
|
| 1598 |
+
custom_types = get_custom_entry_types()
|
| 1599 |
+
type_order = {'character': 0, 'term': 1}
|
| 1600 |
+
other_types = sorted([t for t in custom_types.keys() if t not in ['character', 'term']])
|
| 1601 |
+
for i, t in enumerate(other_types):
|
| 1602 |
+
type_order[t] = i + 2
|
| 1603 |
+
|
| 1604 |
+
glossary.sort(key=lambda x: (
|
| 1605 |
+
type_order.get(x.get('type', 'term'), 999),
|
| 1606 |
+
x.get('raw_name', '').lower()
|
| 1607 |
+
))
|
| 1608 |
+
|
| 1609 |
+
deduplicated_size = len(glossary)
|
| 1610 |
+
removed = original_size - deduplicated_size
|
| 1611 |
+
|
| 1612 |
+
if removed > 0:
|
| 1613 |
+
print(f"✅ Removed {removed} duplicates (fuzzy threshold: {os.getenv('GLOSSARY_FUZZY_THRESHOLD', '0.90')})")
|
| 1614 |
+
print(f"📊 Glossary size: {deduplicated_size} unique entries")
|
| 1615 |
+
|
| 1616 |
+
# Save final deduplicated and sorted glossary
|
| 1617 |
+
save_progress(completed, glossary, history)
|
| 1618 |
+
save_glossary_json(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1619 |
+
save_glossary_csv(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1620 |
+
|
| 1621 |
+
# Print batch summary
|
| 1622 |
+
if batch_entry_count > 0:
|
| 1623 |
+
print(f"\n📊 Batch {batch_num+1}/{total_batches} Summary:")
|
| 1624 |
+
print(f" • Chapters processed: {len(current_batch)}")
|
| 1625 |
+
print(f" • Total entries extracted: {batch_entry_count}")
|
| 1626 |
+
print(f" • Glossary size: {len(glossary)} unique entries")
|
| 1627 |
+
|
| 1628 |
+
# Handle context history
|
| 1629 |
+
if contextual_enabled:
|
| 1630 |
+
if not rolling_window and len(history) >= ctx_limit and ctx_limit > 0:
|
| 1631 |
+
print(f"🔄 Resetting glossary context (reached {ctx_limit} chapter limit)")
|
| 1632 |
+
history = []
|
| 1633 |
+
prog['context_history'] = []
|
| 1634 |
+
|
| 1635 |
+
# Add delay between batches (but not after the last batch)
|
| 1636 |
+
if batch_num < total_batches - 1:
|
| 1637 |
+
print(f"\n⏱️ Waiting {api_delay}s before next batch...")
|
| 1638 |
+
if not interruptible_sleep(api_delay, check_stop, 0.1):
|
| 1639 |
+
print(f"❌ Glossary extraction stopped during delay")
|
| 1640 |
+
# Apply deduplication before stopping
|
| 1641 |
+
if glossary:
|
| 1642 |
+
print("🔀 Applying deduplication and sorting before exit...")
|
| 1643 |
+
glossary[:] = skip_duplicate_entries(glossary)
|
| 1644 |
+
|
| 1645 |
+
# Sort glossary
|
| 1646 |
+
custom_types = get_custom_entry_types()
|
| 1647 |
+
type_order = {'character': 0, 'term': 1}
|
| 1648 |
+
other_types = sorted([t for t in custom_types.keys() if t not in ['character', 'term']])
|
| 1649 |
+
for i, t in enumerate(other_types):
|
| 1650 |
+
type_order[t] = i + 2
|
| 1651 |
+
glossary.sort(key=lambda x: (
|
| 1652 |
+
type_order.get(x.get('type', 'term'), 999),
|
| 1653 |
+
x.get('raw_name', '').lower()
|
| 1654 |
+
))
|
| 1655 |
+
|
| 1656 |
+
save_progress(completed, glossary, history)
|
| 1657 |
+
save_glossary_json(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1658 |
+
save_glossary_csv(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
|
| 1659 |
+
print(f"✅ Saved {len(glossary)} deduplicated entries before exit")
|
| 1660 |
+
return
|
| 1661 |
+
|
| 1662 |
+
else:
|
| 1663 |
+
# SEQUENTIAL MODE: Original behavior
|
| 1664 |
+
for idx, chap in enumerate(chapters):
|
| 1665 |
+
# Check for stop at the beginning of each chapter
|
| 1666 |
+
if check_stop():
|
| 1667 |
+
print(f"❌ Glossary extraction stopped at chapter {idx+1}")
|
| 1668 |
+
return
|
| 1669 |
+
|
| 1670 |
+
# Apply chapter range filter
|
| 1671 |
+
if range_start is not None and range_end is not None:
|
| 1672 |
+
chapter_num = idx + 1 # 1-based chapter numbering
|
| 1673 |
+
if not (range_start <= chapter_num <= range_end):
|
| 1674 |
+
# Check if this is from a text file
|
| 1675 |
+
is_text_chapter = hasattr(chap, 'filename') and chap.get('filename', '').endswith('.txt')
|
| 1676 |
+
terminology = "Section" if is_text_chapter else "Chapter"
|
| 1677 |
+
print(f"[SKIP] {terminology} {chapter_num} - outside range filter")
|
| 1678 |
+
continue
|
| 1679 |
+
|
| 1680 |
+
if idx in completed:
|
| 1681 |
+
# Check if processing text file chapters
|
| 1682 |
+
is_text_chapter = hasattr(chap, 'filename') and chap.get('filename', '').endswith('.txt')
|
| 1683 |
+
terminology = "section" if is_text_chapter else "chapter"
|
| 1684 |
+
print(f"Skipping {terminology} {idx+1} (already processed)")
|
| 1685 |
+
continue
|
| 1686 |
+
|
| 1687 |
+
print(f"🔄 Processing Chapter {idx+1}/{total_chapters}")
|
| 1688 |
+
|
| 1689 |
+
# Check if history will reset on this chapter
|
| 1690 |
+
if contextual_enabled and len(history) >= ctx_limit and ctx_limit > 0 and not rolling_window:
|
| 1691 |
+
print(f" 📌 Glossary context will reset after this chapter (current: {len(history)}/{ctx_limit} chapters)")
|
| 1692 |
+
|
| 1693 |
+
try:
|
| 1694 |
+
# Get system and user prompts from build_prompt
|
| 1695 |
+
system_prompt, user_prompt = build_prompt(chap)
|
| 1696 |
+
|
| 1697 |
+
if not contextual_enabled:
|
| 1698 |
+
# No context at all
|
| 1699 |
+
msgs = [
|
| 1700 |
+
{"role": "system", "content": system_prompt},
|
| 1701 |
+
{"role": "user", "content": user_prompt}
|
| 1702 |
+
]
|
| 1703 |
+
else:
|
| 1704 |
+
# Use context with trim_context_history handling the mode
|
| 1705 |
+
msgs = [{"role": "system", "content": system_prompt}] \
|
| 1706 |
+
+ trim_context_history(history, ctx_limit, rolling_window) \
|
| 1707 |
+
+ [{"role": "user", "content": user_prompt}]
|
| 1708 |
+
|
| 1709 |
+
total_tokens = sum(count_tokens(m["content"]) for m in msgs)
|
| 1710 |
+
|
| 1711 |
+
# READ THE TOKEN LIMIT
|
| 1712 |
+
env_value = os.getenv("MAX_INPUT_TOKENS", "1000000").strip()
|
| 1713 |
+
if not env_value or env_value == "":
|
| 1714 |
+
token_limit = None
|
| 1715 |
+
limit_str = "unlimited"
|
| 1716 |
+
elif env_value.isdigit() and int(env_value) > 0:
|
| 1717 |
+
token_limit = int(env_value)
|
| 1718 |
+
limit_str = str(token_limit)
|
| 1719 |
+
else:
|
| 1720 |
+
token_limit = 1000000
|
| 1721 |
+
limit_str = "1000000 (default)"
|
| 1722 |
+
|
| 1723 |
+
print(f"[DEBUG] Glossary prompt tokens = {total_tokens} / {limit_str}")
|
| 1724 |
+
|
| 1725 |
+
# Check if we're over the token limit and need to split
|
| 1726 |
+
if token_limit is not None and total_tokens > token_limit:
|
| 1727 |
+
print(f"⚠️ Chapter {idx+1} exceeds token limit: {total_tokens} > {token_limit}")
|
| 1728 |
+
print(f"📄 Using ChapterSplitter to split into smaller chunks...")
|
| 1729 |
+
|
| 1730 |
+
# Calculate available tokens for content
|
| 1731 |
+
system_tokens = chapter_splitter.count_tokens(system_prompt)
|
| 1732 |
+
context_tokens = sum(chapter_splitter.count_tokens(m["content"]) for m in trim_context_history(history, ctx_limit, rolling_window))
|
| 1733 |
+
safety_margin = 1000
|
| 1734 |
+
available_tokens = token_limit - system_tokens - context_tokens - safety_margin
|
| 1735 |
+
|
| 1736 |
+
# Since glossary extraction works with plain text, wrap it in a simple HTML structure
|
| 1737 |
+
chapter_html = f"<html><body><p>{chap.replace(chr(10)+chr(10), '</p><p>')}</p></body></html>"
|
| 1738 |
+
|
| 1739 |
+
# Use ChapterSplitter to split the chapter
|
| 1740 |
+
chunks = chapter_splitter.split_chapter(chapter_html, available_tokens)
|
| 1741 |
+
print(f"📄 Chapter split into {len(chunks)} chunks")
|
| 1742 |
+
|
| 1743 |
+
# Process each chunk
|
| 1744 |
+
chapter_glossary_data = [] # Collect data from all chunks
|
| 1745 |
+
|
| 1746 |
+
for chunk_html, chunk_idx, total_chunks in chunks:
|
| 1747 |
+
if check_stop():
|
| 1748 |
+
print(f"❌ Glossary extraction stopped during chunk {chunk_idx} of chapter {idx+1}")
|
| 1749 |
+
return
|
| 1750 |
+
|
| 1751 |
+
print(f"🔄 Processing chunk {chunk_idx}/{total_chunks} of Chapter {idx+1}")
|
| 1752 |
+
|
| 1753 |
+
# Extract text from the chunk HTML
|
| 1754 |
+
from bs4 import BeautifulSoup
|
| 1755 |
+
soup = BeautifulSoup(chunk_html, 'html.parser')
|
| 1756 |
+
chunk_text = soup.get_text(strip=True)
|
| 1757 |
+
|
| 1758 |
+
# Get system and user prompts for chunk
|
| 1759 |
+
chunk_system_prompt, chunk_user_prompt = build_prompt(chunk_text)
|
| 1760 |
+
|
| 1761 |
+
# Build chunk messages
|
| 1762 |
+
if not contextual_enabled:
|
| 1763 |
+
chunk_msgs = [
|
| 1764 |
+
{"role": "system", "content": chunk_system_prompt},
|
| 1765 |
+
{"role": "user", "content": chunk_user_prompt}
|
| 1766 |
+
]
|
| 1767 |
+
else:
|
| 1768 |
+
chunk_msgs = [{"role": "system", "content": chunk_system_prompt}] \
|
| 1769 |
+
+ trim_context_history(history, ctx_limit, rolling_window) \
|
| 1770 |
+
+ [{"role": "user", "content": chunk_user_prompt}]
|
| 1771 |
+
|
| 1772 |
+
# API call for chunk
|
| 1773 |
+
try:
|
| 1774 |
+
chunk_raw = send_with_interrupt(
|
| 1775 |
+
messages=chunk_msgs,
|
| 1776 |
+
client=client,
|
| 1777 |
+
temperature=temp,
|
| 1778 |
+
max_tokens=mtoks,
|
| 1779 |
+
stop_check_fn=check_stop,
|
| 1780 |
+
chunk_timeout=chunk_timeout
|
| 1781 |
+
)
|
| 1782 |
+
except UnifiedClientError as e:
|
| 1783 |
+
if "stopped by user" in str(e).lower():
|
| 1784 |
+
print(f"❌ Glossary extraction stopped during chunk {chunk_idx} API call")
|
| 1785 |
+
return
|
| 1786 |
+
elif "timeout" in str(e).lower():
|
| 1787 |
+
print(f"⚠️ Chunk {chunk_idx} API call timed out: {e}")
|
| 1788 |
+
continue # Skip this chunk
|
| 1789 |
+
else:
|
| 1790 |
+
print(f"❌ Chunk {chunk_idx} API error: {e}")
|
| 1791 |
+
continue # Skip this chunk
|
| 1792 |
+
except Exception as e:
|
| 1793 |
+
print(f"❌ Unexpected error in chunk {chunk_idx}: {e}")
|
| 1794 |
+
continue # Skip this chunk
|
| 1795 |
+
|
| 1796 |
+
# Process chunk response
|
| 1797 |
+
if chunk_raw is None:
|
| 1798 |
+
print(f"❌ API returned None for chunk {chunk_idx}")
|
| 1799 |
+
continue
|
| 1800 |
+
|
| 1801 |
+
# Handle different response types
|
| 1802 |
+
if isinstance(chunk_raw, tuple):
|
| 1803 |
+
chunk_resp = chunk_raw[0] if chunk_raw[0] is not None else ""
|
| 1804 |
+
elif isinstance(chunk_raw, str):
|
| 1805 |
+
chunk_resp = chunk_raw
|
| 1806 |
+
elif hasattr(chunk_raw, 'content'):
|
| 1807 |
+
chunk_resp = chunk_raw.content if chunk_raw.content is not None else ""
|
| 1808 |
+
elif hasattr(chunk_raw, 'text'):
|
| 1809 |
+
chunk_resp = chunk_raw.text if chunk_raw.text is not None else ""
|
| 1810 |
+
else:
|
| 1811 |
+
print(f"❌ Unexpected response type for chunk {chunk_idx}: {type(chunk_raw)}")
|
| 1812 |
+
chunk_resp = str(chunk_raw) if chunk_raw is not None else ""
|
| 1813 |
+
|
| 1814 |
+
# Ensure resp is a string
|
| 1815 |
+
if not isinstance(chunk_resp, str):
|
| 1816 |
+
print(f"⚠️ Converting non-string response to string for chunk {chunk_idx}")
|
| 1817 |
+
chunk_resp = str(chunk_resp) if chunk_resp is not None else ""
|
| 1818 |
+
|
| 1819 |
+
# Check if response is empty
|
| 1820 |
+
if not chunk_resp or chunk_resp.strip() == "":
|
| 1821 |
+
print(f"⚠️ Empty response for chunk {chunk_idx}, skipping...")
|
| 1822 |
+
continue
|
| 1823 |
+
|
| 1824 |
+
# Save chunk response with thread-safe location
|
| 1825 |
+
thread_name = threading.current_thread().name
|
| 1826 |
+
thread_id = threading.current_thread().ident
|
| 1827 |
+
thread_dir = os.path.join("Payloads", "glossary", f"{thread_name}_{thread_id}")
|
| 1828 |
+
os.makedirs(thread_dir, exist_ok=True)
|
| 1829 |
+
|
| 1830 |
+
with open(os.path.join(thread_dir, f"chunk_response_chap{idx+1}_chunk{chunk_idx}.txt"), "w", encoding="utf-8", errors="replace") as f:
|
| 1831 |
+
f.write(chunk_resp)
|
| 1832 |
+
|
| 1833 |
+
# Extract data from chunk
|
| 1834 |
+
chunk_resp_data = parse_api_response(chunk_resp)
|
| 1835 |
+
|
| 1836 |
+
if not chunk_resp_data:
|
| 1837 |
+
print(f"[Warning] No data found in chunk {chunk_idx}, skipping...")
|
| 1838 |
+
continue
|
| 1839 |
+
|
| 1840 |
+
# The parse_api_response already returns parsed data, no need to parse again
|
| 1841 |
+
try:
|
| 1842 |
+
# Filter out invalid entries directly from chunk_resp_data
|
| 1843 |
+
valid_chunk_data = []
|
| 1844 |
+
for entry in chunk_resp_data:
|
| 1845 |
+
if validate_extracted_entry(entry):
|
| 1846 |
+
# Clean the raw_name
|
| 1847 |
+
if 'raw_name' in entry:
|
| 1848 |
+
entry['raw_name'] = entry['raw_name'].strip()
|
| 1849 |
+
valid_chunk_data.append(entry)
|
| 1850 |
+
else:
|
| 1851 |
+
print(f"[Debug] Skipped invalid entry in chunk {chunk_idx}: {entry}")
|
| 1852 |
+
|
| 1853 |
+
chapter_glossary_data.extend(valid_chunk_data)
|
| 1854 |
+
print(f"✅ Chunk {chunk_idx}/{total_chunks}: extracted {len(valid_chunk_data)} entries")
|
| 1855 |
+
|
| 1856 |
+
                        # Add chunk to history if contextual
                        if contextual_enabled:
                            history.append({"user": chunk_user_prompt, "assistant": chunk_resp})

                    except Exception as e:
                        print(f"[Warning] Error processing chunk {chunk_idx} data: {e}")
                        continue

                    # Add delay between chunks (but not after last chunk)
                    if chunk_idx < total_chunks:
                        print(f"⏱️ Waiting {api_delay}s before next chunk...")
                        if not interruptible_sleep(api_delay, check_stop, 0.1):
                            print(f"❌ Glossary extraction stopped during chunk delay")
                            return

                # Use the collected data from all chunks
                data = chapter_glossary_data
                resp = ""  # Combined response not needed for progress tracking
                print(f"✅ Chapter {idx+1} processed in {len(chunks)} chunks, total entries: {len(data)}")

            else:
                # Original single-chapter processing
                # Check for stop before API call
                if check_stop():
                    print(f"❌ Glossary extraction stopped before API call for chapter {idx+1}")
                    return

                try:
                    # Use send_with_interrupt for API call
                    raw = send_with_interrupt(
                        messages=msgs,
                        client=client,
                        temperature=temp,
                        max_tokens=mtoks,
                        stop_check_fn=check_stop,
                        chunk_timeout=chunk_timeout
                    )
                except UnifiedClientError as e:
                    if "stopped by user" in str(e).lower():
                        print(f"❌ Glossary extraction stopped during API call for chapter {idx+1}")
                        return
                    elif "timeout" in str(e).lower():
                        print(f"⚠️ API call timed out for chapter {idx+1}: {e}")
                        continue
                    else:
                        print(f"❌ API error for chapter {idx+1}: {e}")
                        continue
                except Exception as e:
                    print(f"❌ Unexpected error for chapter {idx+1}: {e}")
                    continue

                # Handle response
                if raw is None:
                    print(f"❌ API returned None for chapter {idx+1}")
                    continue

                # Handle different response types
                if isinstance(raw, tuple):
                    resp = raw[0] if raw[0] is not None else ""
                elif isinstance(raw, str):
                    resp = raw
                elif hasattr(raw, 'content'):
                    resp = raw.content if raw.content is not None else ""
                elif hasattr(raw, 'text'):
                    resp = raw.text if raw.text is not None else ""
                else:
                    print(f"❌ Unexpected response type for chapter {idx+1}: {type(raw)}")
                    resp = str(raw) if raw is not None else ""

                # Ensure resp is a string
                if not isinstance(resp, str):
                    print(f"⚠️ Converting non-string response to string for chapter {idx+1}")
                    resp = str(resp) if resp is not None else ""

                # NULL CHECK before checking if response is empty
                if resp is None:
                    print(f"⚠️ Response is None for chapter {idx+1}, skipping...")
                    continue

                # Check if response is empty
                if not resp or resp.strip() == "":
                    print(f"⚠️ Empty response for chapter {idx+1}, skipping...")
                    continue

                # Save the raw response with thread-safe location
                thread_name = threading.current_thread().name
                thread_id = threading.current_thread().ident
                thread_dir = os.path.join("Payloads", "glossary", f"{thread_name}_{thread_id}")
                os.makedirs(thread_dir, exist_ok=True)

                with open(os.path.join(thread_dir, f"response_chap{idx+1}.txt"), "w", encoding="utf-8", errors="replace") as f:
                    f.write(resp)

                # Parse response using the new parser
                try:
                    data = parse_api_response(resp)
                except Exception as e:
                    print(f"❌ Error parsing response for chapter {idx+1}: {e}")
                    print(f"   Response preview: {resp[:200] if resp else 'None'}...")
                    continue

                # Filter out invalid entries
                valid_data = []
                for entry in data:
                    if validate_extracted_entry(entry):
                        # Clean the raw_name
                        if 'raw_name' in entry:
                            entry['raw_name'] = entry['raw_name'].strip()
                        valid_data.append(entry)
                    else:
                        print(f"[Debug] Skipped invalid entry: {entry}")

                data = valid_data
                total_ent = len(data)

                # Log entries
                for eidx, entry in enumerate(data, start=1):
                    if check_stop():
                        print(f"❌ Glossary extraction stopped during entry processing for chapter {idx+1}")
                        return

                    elapsed = time.time() - start
                    if idx == 0 and eidx == 1:
                        eta = 0
                    else:
                        avg = elapsed / ((idx * 100) + eidx)
                        eta = avg * (total_chapters * 100 - ((idx * 100) + eidx))

                    # Get entry info based on new format
                    entry_type = entry.get("type", "?")
                    raw_name = entry.get("raw_name", "?")
                    trans_name = entry.get("translated_name", "?")

                    print(f'[Chapter {idx+1}/{total_chapters}] [{eidx}/{total_ent}] ({elapsed:.1f}s elapsed, ETA {eta:.1f}s) → {entry_type}: {raw_name} ({trans_name})')

            # Apply skip logic and save
            glossary.extend(data)
            glossary[:] = skip_duplicate_entries(glossary)
            completed.append(idx)

            # Only add to history if contextual is enabled
            if contextual_enabled and 'resp' in locals() and resp:
                history.append({"user": user_prompt, "assistant": resp})

                # Reset history when limit reached without rolling window
                if not rolling_window and len(history) >= ctx_limit and ctx_limit > 0:
                    print(f"🔄 Resetting glossary context (reached {ctx_limit} chapter limit)")
                    history = []
                    prog['context_history'] = []

            save_progress(completed, glossary, history)
            save_glossary_json(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
            save_glossary_csv(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))

            # Add delay before next API call (but not after the last chapter)
            if idx < len(chapters) - 1:
                # Check if we're within the range or if there are more chapters to process
                next_chapter_in_range = True
                if range_start is not None and range_end is not None:
                    next_chapter_num = idx + 2  # idx+1 is current, idx+2 is next
                    next_chapter_in_range = (range_start <= next_chapter_num <= range_end)
                else:
                    # No range filter, check if next chapter is already completed
                    next_chapter_in_range = (idx + 1) not in completed

                if next_chapter_in_range:
                    print(f"⏱️ Waiting {api_delay}s before next chapter...")
                    if not interruptible_sleep(api_delay, check_stop, 0.1):
                        print(f"❌ Glossary extraction stopped during delay")
                        return

            # Check for stop after processing chapter
            if check_stop():
                print(f"❌ Glossary extraction stopped after processing chapter {idx+1}")
                return

        except Exception as e:
            print(f"Error at chapter {idx+1}: {e}")
            import traceback
            print(f"Full traceback: {traceback.format_exc()}")
            # Check for stop even after error
            if check_stop():
                print(f"❌ Glossary extraction stopped after error in chapter {idx+1}")
                return

    print(f"Done. Glossary saved to {args.output}")

    # Also save as CSV format for compatibility
    try:
        csv_output = args.output.replace('.json', '.csv')
        csv_path = os.path.join(glossary_dir, os.path.basename(csv_output))
        save_glossary_csv(glossary, os.path.join(glossary_dir, os.path.basename(args.output)))
        print(f"Also saved as CSV: {csv_path}")
    except Exception as e:
        print(f"[Warning] Could not save CSV format: {e}")

def save_progress(completed: List[int], glossary: List[Dict], context_history: List[Dict]):
    """Save progress to JSON file"""
    progress_data = {
        "completed": completed,
        "glossary": glossary,
        "context_history": context_history
    }

    try:
        # Use atomic write to prevent corruption
        temp_file = PROGRESS_FILE + '.tmp'
        with open(temp_file, 'w', encoding='utf-8') as f:
            json.dump(progress_data, f, ensure_ascii=False, indent=2)

        # Replace the old file with the new one
        if os.path.exists(PROGRESS_FILE):
            os.remove(PROGRESS_FILE)
        os.rename(temp_file, PROGRESS_FILE)

    except Exception as e:
        print(f"[Warning] Failed to save progress: {e}")
        # Try direct write as fallback
        try:
            with open(PROGRESS_FILE, 'w', encoding='utf-8') as f:
                json.dump(progress_data, f, ensure_ascii=False, indent=2)
        except Exception as e2:
            print(f"[Error] Could not save progress: {e2}")

if __name__=='__main__':
    main()
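Note: interruptible_sleep and send_with_interrupt are defined earlier in this file and are not shown in this hunk. A minimal sketch of what an interruptible sleep helper matching the call sites above could look like (the actual implementation in extract_glossary_from_epub.py may differ):

# Hypothetical sketch only: interruptible_sleep(delay, check_stop, 0.1)
# returns False if a stop was requested during the wait, True otherwise.
import time

def interruptible_sleep_sketch(total_seconds, stop_check_fn, poll_interval=0.1):
    """Sleep in small increments so a stop request is noticed quickly."""
    waited = 0.0
    while waited < total_seconds:
        if stop_check_fn():
            return False  # caller treats False as "stopped during delay"
        step = min(poll_interval, total_seconds - waited)
        time.sleep(step)
        waited += step
    return True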
extract_glossary_from_txt.py
ADDED
@@ -0,0 +1,59 @@
# extract_glossary_from_txt.py
import os
import json
from typing import List
from txt_processor import TextFileProcessor
from chapter_splitter import ChapterSplitter
from bs4 import BeautifulSoup

def extract_chapters_from_txt(txt_path: str) -> List[str]:
    """Extract chapters from text file for glossary extraction"""
    processor = TextFileProcessor(txt_path, os.path.dirname(txt_path))
    chapters = processor.extract_chapters()

    # Initialize chapter splitter
    model_name = os.getenv("MODEL", "gpt-3.5-turbo")
    chapter_splitter = ChapterSplitter(model_name=model_name)

    # Get max tokens from environment
    max_input_tokens_str = os.getenv("MAX_INPUT_TOKENS", "1000000").strip()
    if not max_input_tokens_str or max_input_tokens_str == "":
        # Token limit disabled - use a very large number
        max_input_tokens = 10000000  # 10M tokens
    else:
        max_input_tokens = int(max_input_tokens_str)

    # Calculate available tokens (leaving room for system prompt and context)
    system_prompt_size = 2000  # Estimate for glossary system prompt
    context_size = 5000  # Estimate for context history
    safety_margin = 1000
    available_tokens = max_input_tokens - system_prompt_size - context_size - safety_margin

    text_chapters = []

    for idx, chapter in enumerate(chapters):
        # Check if chapter needs splitting
        chapter_tokens = chapter_splitter.count_tokens(chapter['body'])

        if chapter_tokens > available_tokens:
            print(f"Chapter {idx+1} has {chapter_tokens} tokens, splitting into smaller chunks...")

            # Use ChapterSplitter to split the HTML content
            chunks = chapter_splitter.split_chapter(chapter['body'], available_tokens)

            # Extract text from each chunk
            for chunk_html, chunk_idx, total_chunks in chunks:
                soup = BeautifulSoup(chunk_html, 'html.parser')
                text = soup.get_text(strip=True)
                if text:
                    text_chapters.append(text)
                    print(f"  Added chunk {chunk_idx}/{total_chunks} ({chapter_splitter.count_tokens(text)} tokens)")
        else:
            # Chapter is small enough, extract text as-is
            soup = BeautifulSoup(chapter['body'], 'html.parser')
            text = soup.get_text(strip=True)
            if text:
                text_chapters.append(text)

    print(f"Total text chunks for glossary extraction: {len(text_chapters)}")
    return text_chapters
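Note: a minimal usage sketch for the helper above (not part of the upload); the environment values and the sample path are illustrative only:

# Illustrative call, assuming a plain-text novel at the given (hypothetical) path.
import os
from extract_glossary_from_txt import extract_chapters_from_txt

os.environ.setdefault("MODEL", "gpt-4o-mini")
os.environ.setdefault("MAX_INPUT_TOKENS", "200000")

chunks = extract_chapters_from_txt("my_novel.txt")
print(f"{len(chunks)} text chunks ready for glossary extraction")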
glossarion_web.py
ADDED
The diff for this file is too large to render. See raw diff.
glossary_process_worker.py
ADDED
@@ -0,0 +1,198 @@
"""
Process-safe glossary generation worker
========================================
This module provides a pickleable function for glossary generation
that can be run in a separate process using ProcessPoolExecutor.
"""

import os
import sys
import json
import time
import concurrent.futures  # needed at module level by check_glossary_completion's TimeoutError handling

def generate_glossary_in_process(output_dir, chapters_data, instructions, env_vars, log_queue=None):
    """
    Generate glossary in a separate process to avoid GIL blocking.

    Args:
        output_dir: Output directory path
        chapters_data: Serialized chapters data
        instructions: Glossary instructions
        env_vars: Environment variables to set
        log_queue: Queue to send logs back to main process

    Returns:
        Dictionary with glossary results or error info
    """
    import io
    import sys
    from io import StringIO

    # Capture ALL output - both stdout and stderr
    captured_logs = []

    class LogCapture:
        def __init__(self, queue=None):
            self.queue = queue
            self.buffer = ""

        def write(self, text):
            if text:
                # Buffer text and send complete lines
                self.buffer += text
                while '\n' in self.buffer:
                    line, self.buffer = self.buffer.split('\n', 1)
                    if line:
                        captured_logs.append(line)
                        if self.queue:
                            try:
                                self.queue.put(line)
                            except:
                                pass

        def flush(self):
            if self.buffer:
                captured_logs.append(self.buffer)
                if self.queue:
                    try:
                        self.queue.put(self.buffer)
                    except:
                        pass
                self.buffer = ""

    try:
        # Redirect BOTH stdout and stderr to capture ALL output
        log_capture = LogCapture(log_queue)
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = log_capture
        sys.stderr = log_capture

        # Set environment variables from parent process
        for key, value in env_vars.items():
            os.environ[key] = str(value)

        # Import here to avoid circular imports
        from TransateKRtoEN import GlossaryManager

        # Create glossary manager instance
        glossary_manager = GlossaryManager()

        # Generate glossary
        print(f"📑 Starting glossary generation in subprocess...")
        result = glossary_manager.save_glossary(output_dir, chapters_data, instructions)

        print(f"📑 Glossary generation completed")

        # Flush any remaining output
        log_capture.flush()

        # Restore stdout and stderr
        sys.stdout = old_stdout
        sys.stderr = old_stderr

        return {
            'success': True,
            'result': result,
            'pid': os.getpid(),
            'logs': captured_logs
        }

    except Exception as e:
        import traceback

        # Restore stdout and stderr if needed
        if 'old_stdout' in locals():
            sys.stdout = old_stdout
        if 'old_stderr' in locals():
            sys.stderr = old_stderr

        error_msg = f"Glossary generation error: {str(e)}"
        captured_logs.append(f"📑 ❌ {error_msg}")

        return {
            'success': False,
            'error': error_msg,
            'traceback': traceback.format_exc(),
            'pid': os.getpid(),
            'logs': captured_logs
        }

def generate_glossary_async(output_dir, chapters, instructions, extraction_workers=None):
    """
    Generate glossary asynchronously using ProcessPoolExecutor.

    This function completely bypasses the GIL by running in a separate process,
    ensuring the GUI remains fully responsive.
    """
    import concurrent.futures
    import multiprocessing

    # Ensure freeze support for Windows frozen executables
    try:
        multiprocessing.freeze_support()
    except Exception:
        pass

    # Determine worker count
    if extraction_workers is None:
        extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))

    if extraction_workers == 1:
        # Auto-detect optimal workers
        extraction_workers = min(multiprocessing.cpu_count() or 4, 4)
        print(f"📑 Auto-detected {extraction_workers} CPU cores for glossary generation")

    # Collect relevant environment variables
    env_vars = {}
    important_vars = [
        'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES',
        'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS',
        'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'AUTO_GLOSSARY_PROMPT',
        'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED',
        'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION',
        'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS',
        'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY'
    ]

    for var in important_vars:
        if var in os.environ:
            env_vars[var] = os.environ[var]

    # Use ProcessPoolExecutor for true parallelism
    with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
        # Submit the task
        future = executor.submit(
            generate_glossary_in_process,
            output_dir,
            chapters,
            instructions,
            env_vars
        )

        # Return the future for the caller to monitor
        return future

def check_glossary_completion(future, timeout=0.01):
    """
    Check if glossary generation is complete without blocking.

    Args:
        future: Future object from generate_glossary_async
        timeout: Timeout in seconds for checking

    Returns:
        Tuple of (is_done, result_or_none)
    """
    try:
        if future.done():
            result = future.result(timeout=timeout)
            return True, result
        else:
            # Not done yet
            return False, None
    except concurrent.futures.TimeoutError:
        return False, None
    except Exception as e:
        # Error occurred
        return True, {'success': False, 'error': str(e)}
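Note: a usage sketch (not part of the upload) showing how a caller could submit glossary generation and poll the returned future; the output directory, chapters, and instructions values are placeholders:

# Illustrative polling loop; in the real GUI this check would run from a timer callback.
import time
from glossary_process_worker import generate_glossary_async, check_glossary_completion

future = generate_glossary_async("output_dir", chapters=[], instructions="")  # placeholder arguments

while True:
    done, result = check_glossary_completion(future)
    if done:
        if result and result.get('success'):
            print("Glossary generated:", result.get('result'))
        else:
            print("Glossary failed:", (result or {}).get('error'))
        break
    time.sleep(0.1)  # keep the caller responsive between checks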
history_manager.py
ADDED
@@ -0,0 +1,136 @@
import json
import os
import time
import tempfile
import shutil
from threading import Lock
from contextlib import contextmanager

class HistoryManager:
    """Thread-safe history management with file locking"""

    def __init__(self, payloads_dir):
        self.payloads_dir = payloads_dir
        self.hist_path = os.path.join(payloads_dir, "translation_history.json")
        self.lock = Lock()
        self._file_locks = {}

    @contextmanager
    def _file_lock(self, filepath):
        """Simple file locking mechanism"""
        lock_file = filepath + '.lock'
        acquired = False
        try:
            # Try to acquire lock with timeout
            start_time = time.time()
            while time.time() - start_time < 30:  # 30 second timeout
                try:
                    # Create lock file atomically
                    fd = os.open(lock_file, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
                    os.close(fd)
                    acquired = True
                    break
                except FileExistsError:
                    time.sleep(0.1)

            if not acquired:
                raise TimeoutError(f"Could not acquire lock for {filepath}")

            yield

        finally:
            if acquired and os.path.exists(lock_file):
                try:
                    os.remove(lock_file)
                except:
                    pass

    def load_history(self):
        """Load history with retry logic and file locking"""
        with self.lock:
            for attempt in range(3):
                try:
                    with self._file_lock(self.hist_path):
                        if os.path.exists(self.hist_path):
                            with open(self.hist_path, "r", encoding="utf-8") as f:
                                return json.load(f)
                        return []
                except (json.JSONDecodeError, IOError) as e:
                    print(f"[WARNING] Failed to load history (attempt {attempt + 1}): {e}")
                    if attempt < 2:
                        time.sleep(0.5)
                    else:
                        # Return empty history if all attempts fail
                        return []
            return []

    def save_history(self, history):
        """Save history atomically with file locking"""
        with self.lock:
            with self._file_lock(self.hist_path):
                # Write to temporary file first
                temp_fd, temp_path = tempfile.mkstemp(dir=self.payloads_dir, text=True)
                try:
                    with os.fdopen(temp_fd, 'w', encoding='utf-8') as f:
                        json.dump(history, f, ensure_ascii=False, indent=2)

                    # Atomically replace the old file
                    shutil.move(temp_path, self.hist_path)

                except Exception as e:
                    # Clean up temp file on error
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
                    raise e

    def append_to_history(self, user_content, assistant_content, hist_limit, reset_on_limit=True, rolling_window=False):
        """
        Append to history with automatic reset or rolling window when limit is reached

        Args:
            user_content: User message content
            assistant_content: Assistant message content
            hist_limit: Maximum number of exchanges to keep (0 = no history)
            reset_on_limit: Whether to reset when limit is reached (old behavior)
            rolling_window: Whether to use rolling window mode (new behavior)
        """
        # CRITICAL FIX: If hist_limit is 0 or negative, don't maintain any history
        if hist_limit <= 0:
            # Don't load, save, or maintain any history when contextual is disabled
            return []

        history = self.load_history()

        # Count current exchanges (each exchange = 2 messages: user + assistant)
        current_exchanges = len(history) // 2

        # Handle limit reached
        if current_exchanges >= hist_limit:
            if rolling_window:
                # Rolling window mode: keep only the most recent (limit-1) exchanges
                # We keep limit-1 to make room for the new exchange
                messages_to_keep = (hist_limit - 1) * 2
                if messages_to_keep > 0:
                    history = history[-messages_to_keep:]
                    print(f"🔄 Rolling history window: keeping last {hist_limit-1} exchanges")
                else:
                    history = []
            elif reset_on_limit:
                # Old behavior: complete reset
                history = []
                print(f"🔄 Reset history after reaching limit of {hist_limit} exchanges")

        # Append new entries
        history.append({"role": "user", "content": user_content})
        history.append({"role": "assistant", "content": assistant_content})

        self.save_history(history)
        return history

    def will_reset_on_next_append(self, hist_limit, rolling_window=False):
        """Check if the next append will trigger a reset or rolling window"""
        if hist_limit <= 0:
            return False
        history = self.load_history()
        current_exchanges = len(history) // 2
        return current_exchanges >= hist_limit
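Note: a small usage sketch for HistoryManager (not part of the upload); the Payloads directory name and messages are illustrative:

# Illustrative: keep at most 3 user/assistant exchanges using the rolling-window mode.
import os
from history_manager import HistoryManager

os.makedirs("Payloads", exist_ok=True)  # example directory
hm = HistoryManager("Payloads")
hm.append_to_history("Translate chapter 1", "...translation...", hist_limit=3, rolling_window=True)
if hm.will_reset_on_next_append(hist_limit=3, rolling_window=True):
    print("Next append will trim the history window")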
image_translator.py
ADDED
The diff for this file is too large to render. See raw diff.
individual_endpoint_dialog.py
ADDED
@@ -0,0 +1,229 @@
# individual_endpoint_dialog.py
"""
Individual Endpoint Configuration Dialog for Glossarion
- Uses the application's WindowManager for consistent UI
- Allows enabling/disabling per-key custom endpoint (e.g., Azure, Ollama/local OpenAI-compatible)
- Persists changes to the in-memory key object and refreshes the parent list
"""
import tkinter as tk
from tkinter import ttk, messagebox
import ttkbootstrap as tb
from typing import Callable

try:
    # For type hints only; not required at runtime
    from multi_api_key_manager import APIKeyEntry  # noqa: F401
except Exception:
    pass


class IndividualEndpointDialog:
    def __init__(self, parent, translator_gui, key, refresh_callback: Callable[[], None], status_callback: Callable[[str], None]):
        self.parent = parent
        self.translator_gui = translator_gui
        self.key = key
        self.refresh_callback = refresh_callback
        self.status_callback = status_callback
        self.dialog = None
        self.canvas = None

        self._build()

    def _build(self):
        title = f"Configure Individual Endpoint — {getattr(self.key, 'model', '')}"

        if hasattr(self.translator_gui, 'wm'):
            # Use WindowManager scrollable dialog for consistency
            self.dialog, scrollable_frame, self.canvas = self.translator_gui.wm.setup_scrollable(
                self.parent,
                title,
                width=700,
                height=420,
                max_width_ratio=0.85,
                max_height_ratio=0.45
            )
        else:
            self.dialog = tk.Toplevel(self.parent)
            self.dialog.title(title)
            self.dialog.geometry("700x420")
            scrollable_frame = self.dialog

        main = tk.Frame(scrollable_frame, padx=20, pady=16)
        main.pack(fill=tk.BOTH, expand=True)

        # Header
        header = tk.Frame(main)
        header.pack(fill=tk.X, pady=(0, 10))
        tk.Label(header, text="Per-Key Custom Endpoint", font=("TkDefaultFont", 14, "bold")).pack(side=tk.LEFT)

        # Enable toggle
        self.enable_var = tk.BooleanVar(value=bool(getattr(self.key, 'use_individual_endpoint', False)))
        tb.Checkbutton(header, text="Enable", variable=self.enable_var, bootstyle="round-toggle",
                       command=self._toggle_fields).pack(side=tk.RIGHT)

        # Description
        desc = (
            "Use a custom endpoint for this API key only. Works with OpenAI-compatible servers\n"
            "like Azure OpenAI or local providers (e.g., Ollama at http://localhost:11434/v1)."
        )
        tk.Label(main, text=desc, fg='gray', justify=tk.LEFT).pack(anchor=tk.W)

        # Form
        form = tk.LabelFrame(main, text="Endpoint Settings", padx=14, pady=12)
        form.pack(fill=tk.BOTH, expand=False, pady=(10, 0))

        # Endpoint URL
        tk.Label(form, text="Endpoint Base URL:").grid(row=0, column=0, sticky=tk.W, padx=(0, 10), pady=6)
        self.endpoint_var = tk.StringVar(value=getattr(self.key, 'azure_endpoint', '') or '')
        self.endpoint_entry = tb.Entry(form, textvariable=self.endpoint_var)
        self.endpoint_entry.grid(row=0, column=1, sticky=tk.EW, pady=6)

        # Azure API version (optional; required if using Azure)
        tk.Label(form, text="Azure API Version:").grid(row=1, column=0, sticky=tk.W, padx=(0, 10), pady=6)
        self.api_version_var = tk.StringVar(value=getattr(self.key, 'azure_api_version', '2025-01-01-preview') or '2025-01-01-preview')
        self.api_version_combo = ttk.Combobox(
            form,
            textvariable=self.api_version_var,
            values=[
                '2025-01-01-preview',
                '2024-12-01-preview',
                '2024-10-01-preview',
                '2024-08-01-preview',
                '2024-06-01',
                '2024-02-01',
                '2023-12-01-preview'
            ],
            width=24,
            state='readonly'
        )
        self.api_version_combo.grid(row=1, column=1, sticky=tk.W, pady=6)

        # Helper text
        hint = (
            "Hints:\n"
            "- Ollama: http://localhost:11434/v1\n"
            "- Azure OpenAI: https://<resource>.openai.azure.com/ (version required)\n"
            "- Other OpenAI-compatible: Provide the base URL ending with /v1 if applicable"
        )
        tk.Label(form, text=hint, fg='gray', justify=tk.LEFT, font=('TkDefaultFont', 9)).grid(
            row=2, column=0, columnspan=2, sticky=tk.W, pady=(4, 0)
        )

        # Grid weights
        form.columnconfigure(1, weight=1)

        # Buttons
        btns = tk.Frame(main)
        btns.pack(fill=tk.X, pady=(14, 0))

        tb.Button(btns, text="Save", bootstyle="success", command=self._on_save).pack(side=tk.RIGHT)
        tb.Button(btns, text="Cancel", bootstyle="secondary", command=self._on_close).pack(side=tk.RIGHT, padx=(0, 8))
        tb.Button(btns, text="Disable", bootstyle="danger-outline", command=self._on_disable).pack(side=tk.LEFT)

        # Initial toggle state
        self._toggle_fields()

        # Window close protocol
        self.dialog.protocol("WM_DELETE_WINDOW", self._on_close)

        # Auto-size with WM if available
        if hasattr(self.translator_gui, 'wm') and self.canvas is not None:
            self.translator_gui.wm.auto_resize_dialog(self.dialog, self.canvas, max_width_ratio=0.9, max_height_ratio=0.45)

    def _toggle_fields(self):
        enabled = self.enable_var.get()
        state = tk.NORMAL if enabled else tk.DISABLED
        self.endpoint_entry.config(state=state)
        # API version is only relevant for Azure but we leave it enabled while toggle is on
        self.api_version_combo.config(state='readonly' if enabled else 'disabled')

    def _is_azure_endpoint(self, url: str) -> bool:
        if not url:
            return False
        url_l = url.lower()
        return (".openai.azure.com" in url_l) or ("azure.com/openai" in url_l) or ("/openai/deployments/" in url_l)

    def _validate(self) -> bool:
        if not self.enable_var.get():
            return True
        url = (self.endpoint_var.get() or '').strip()
        if not url:
            messagebox.showerror("Validation Error", "Endpoint Base URL is required when Enable is ON.")
            return False
        if not (url.startswith("http://") or url.startswith("https://")):
            messagebox.showerror("Validation Error", "Endpoint URL must start with http:// or https://")
            return False
        if self._is_azure_endpoint(url):
            ver = (self.api_version_var.get() or '').strip()
            if not ver:
                messagebox.showerror("Validation Error", "Azure API Version is required for Azure endpoints.")
                return False
        return True

    def _persist_to_config_if_possible(self):
        """Best-effort persistence: update translator_gui.config['multi_api_keys'] for this key entry.
        We match by api_key and model to find the entry. If not found, skip silently.
        """
        try:
            cfg = getattr(self.translator_gui, 'config', None)
            if not isinstance(cfg, dict):
                return
            key_list = cfg.get('multi_api_keys', [])
            # Find by api_key AND model (best-effort)
            api_key = getattr(self.key, 'api_key', None)
            model = getattr(self.key, 'model', None)
            for entry in key_list:
                if entry.get('api_key') == api_key and entry.get('model') == model:
                    entry['use_individual_endpoint'] = bool(getattr(self.key, 'use_individual_endpoint', False))
                    entry['azure_endpoint'] = getattr(self.key, 'azure_endpoint', None)
                    entry['azure_api_version'] = getattr(self.key, 'azure_api_version', None)
                    break
            # Save without message
            if hasattr(self.translator_gui, 'save_config'):
                self.translator_gui.save_config(show_message=False)
        except Exception:
            # Non-fatal
            pass

    def _on_save(self):
        if not self._validate():
            return
        enabled = self.enable_var.get()
        url = (self.endpoint_var.get() or '').strip()
        ver = (self.api_version_var.get() or '').strip()

        # Apply to key object
        self.key.use_individual_endpoint = enabled
        self.key.azure_endpoint = url if enabled else None
        # Keep API version even if disabled, but it's only used when enabled
        self.key.azure_api_version = ver or getattr(self.key, 'azure_api_version', '2025-01-01-preview')

        # Notify parent UI
        if callable(self.refresh_callback):
            try:
                self.refresh_callback()
            except Exception:
                pass
        if callable(self.status_callback):
            try:
                if enabled and url:
                    self.status_callback(f"Individual endpoint set: {url}")
                else:
                    self.status_callback("Individual endpoint disabled")
            except Exception:
                pass

        # Best-effort persistence to config
        self._persist_to_config_if_possible()

        self.dialog.destroy()

    def _on_disable(self):
        # Disable quickly
        self.enable_var.set(False)
        self._toggle_fields()
        # Apply immediately and close
        self._on_save()

    def _on_close(self):
        self.dialog.destroy()
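Note: a hypothetical wiring sketch (not part of the upload); in practice the parent widget, translator_gui, and key entry come from the multi-API-key manager UI, and the callback names below are placeholders:

# Illustrative only: how the dialog could be opened from the API key list.
def open_endpoint_dialog(parent, translator_gui, key_entry, refresh_list, set_status):
    from individual_endpoint_dialog import IndividualEndpointDialog
    IndividualEndpointDialog(
        parent,
        translator_gui,
        key_entry,
        refresh_callback=refresh_list,   # e.g. re-render the key table
        status_callback=set_status,      # e.g. write to the status bar
    )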
launch_Glossarion.bat
ADDED
@@ -0,0 +1,11 @@
@echo off
REM ensure we’re in the script’s folder:
cd /d "%~dp0"

REM call the real python
python translator_gui.py

REM or, alternatively:
REM py -3 translator_gui.py

pause
launch_Glossarion.vbs
ADDED
@@ -0,0 +1,3 @@
Set WshShell = CreateObject("WScript.Shell")
WshShell.Run "pythonw.exe translator_gui.py", 0
Set WshShell = Nothing
launch_web.bat
ADDED
@@ -0,0 +1,37 @@
@echo off
title Glossarion Web Interface
echo.
echo ========================================
echo    Glossarion Web Interface Launcher
echo ========================================
echo.

REM Change to the script directory
cd /d "%~dp0"

REM Check if Python is available
python --version >nul 2>&1
if errorlevel 1 (
    echo ERROR: Python is not installed or not in PATH
    echo Please install Python 3.8 or higher
    pause
    exit /b 1
)

echo Starting Glossarion Web Interface...
echo.
echo The browser will open automatically once the server is ready.
echo Press Ctrl+C in the console to stop the server when done.
echo.

REM Start PowerShell script in background to wait for server and open browser
start "" /B powershell -ExecutionPolicy Bypass -File "%~dp0wait_and_open.ps1" -url "http://127.0.0.1:7860"

REM Start the web interface
python glossarion_web.py

echo.
echo ========================================
echo Server stopped. You can close this window.
echo ========================================
pause
launch_web_advanced.bat
ADDED
@@ -0,0 +1,107 @@
@echo off
title Glossarion Web Interface - Advanced Launcher
color 0A
echo.
echo ========================================
echo    Glossarion Web Interface
echo    Advanced Launcher
echo ========================================
echo.

REM Change to the script directory
cd /d "%~dp0"

REM Check if Python is available
python --version >nul 2>&1
if errorlevel 1 (
    color 0C
    echo ERROR: Python is not installed or not in PATH
    echo Please install Python 3.8 or higher
    pause
    exit /b 1
)

echo Select launch mode:
echo.
echo [1] Local Only (http://127.0.0.1:7860)
echo [2] Network Accessible (http://0.0.0.0:7860)
echo [3] Public Share Link (uses Gradio sharing)
echo [4] Custom Port (specify your own)
echo [5] Exit
echo.
set /p choice="Enter choice (1-5): "

if "%choice%"=="1" (
    set SERVER_NAME=127.0.0.1
    set SERVER_PORT=7860
    set SHARE=False
    goto :launch
)

if "%choice%"=="2" (
    set SERVER_NAME=0.0.0.0
    set SERVER_PORT=7860
    set SHARE=False
    echo.
    echo WARNING: This will make the server accessible to other devices on your network.
    echo.
    goto :launch
)

if "%choice%"=="3" (
    set SERVER_NAME=0.0.0.0
    set SERVER_PORT=7860
    set SHARE=True
    echo.
    echo NOTE: This will create a public link that expires in 72 hours.
    echo.
    goto :launch
)

if "%choice%"=="4" (
    set /p SERVER_PORT="Enter port number (default 7860): "
    if "%SERVER_PORT%"=="" set SERVER_PORT=7860
    set SERVER_NAME=127.0.0.1
    set SHARE=False
    goto :launch
)

if "%choice%"=="5" (
    exit /b 0
)

echo Invalid choice. Exiting.
pause
exit /b 1

:launch
echo.
echo ========================================
echo Starting Glossarion Web Interface...
echo ========================================
echo.
echo Configuration:
echo - Host: %SERVER_NAME%
echo - Port: %SERVER_PORT%
echo - Public Share: %SHARE%
echo.
echo The browser will open automatically once the server is ready.
echo Press Ctrl+C in the console to stop the server when done.
echo.

REM Set environment variables for Python script to use
set GRADIO_SERVER_NAME=%SERVER_NAME%
set GRADIO_SERVER_PORT=%SERVER_PORT%
set GRADIO_SHARE=%SHARE%

REM Start PowerShell script in background to wait for server and open browser
start "" /B powershell -ExecutionPolicy Bypass -File "%~dp0wait_and_open.ps1" -url "http://127.0.0.1:%SERVER_PORT%"

REM Start the web interface
python glossarion_web.py

echo.
echo ========================================
echo Server stopped. You can close this window.
echo ========================================
pause
local_inpainter.py
ADDED
The diff for this file is too large to render. See raw diff.
manga_integration.py
ADDED
The diff for this file is too large to render. See raw diff.
manga_settings_dialog.py
ADDED
The diff for this file is too large to render. See raw diff.
manga_translator.py
ADDED
The diff for this file is too large to render. See raw diff.
memory_usage_reporter.py
ADDED
@@ -0,0 +1,225 @@
# memory_usage_reporter.py
"""
Background memory usage reporter.
- Logs process RSS, VMS, peak (if available), GC counts, and optional tracemalloc stats
- Writes to logs/memory.log and also propagates to root logger (run.log) via a child logger
- Designed to be lightweight and safe in GUI apps
"""
import os
import sys
import time
import threading
import logging
import gc
from logging.handlers import RotatingFileHandler

try:
    import psutil
except Exception:
    psutil = None

# Global singletons
_GLOBAL_THREAD = None
_GLOBAL_STOP = threading.Event()


def _ensure_logs_dir() -> str:
    # Prefer explicit override from main app
    try:
        env_dir = os.environ.get("GLOSSARION_LOG_DIR")
        if env_dir:
            dir_path = os.path.expanduser(env_dir)
            os.makedirs(dir_path, exist_ok=True)
            return dir_path
    except Exception:
        pass

    def _can_write(p: str) -> bool:
        try:
            os.makedirs(p, exist_ok=True)
            test_file = os.path.join(p, ".write_test")
            with open(test_file, "w", encoding="utf-8") as f:
                f.write("ok")
            os.remove(test_file)
            return True
        except Exception:
            return False

    # Frozen exe: try next to the executable first
    try:
        if getattr(sys, 'frozen', False) and hasattr(sys, 'executable'):
            exe_dir = os.path.dirname(sys.executable)
            candidate = os.path.join(exe_dir, "logs")
            if _can_write(candidate):
                return candidate
    except Exception:
        pass

    # User-local app data (persistent and writable)
    try:
        base = os.environ.get('LOCALAPPDATA') or os.environ.get('APPDATA') or os.path.expanduser('~')
        candidate = os.path.join(base, 'Glossarion', 'logs')
        if _can_write(candidate):
            return candidate
    except Exception:
        pass

    # Development fallback: next to this file
    try:
        base_dir = os.path.abspath(os.path.dirname(__file__))
        candidate = os.path.join(base_dir, "logs")
        if _can_write(candidate):
            return candidate
    except Exception:
        pass

    # Final fallback: CWD
    fallback = os.path.join(os.getcwd(), "logs")
    os.makedirs(fallback, exist_ok=True)
    return fallback


def _make_logger() -> logging.Logger:
    logger = logging.getLogger("memory")
    logger.setLevel(logging.INFO)

    # Avoid duplicate handlers if called more than once
    if not any(isinstance(h, RotatingFileHandler) for h in logger.handlers):
        logs_dir = _ensure_logs_dir()
        file_path = os.path.join(logs_dir, "memory.log")
        fh = RotatingFileHandler(file_path, maxBytes=2 * 1024 * 1024, backupCount=3, encoding="utf-8")
        fmt = logging.Formatter(
            fmt="%(asctime)s %(levelname)s [%(process)d:%(threadName)s] %(name)s: %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        fh.setFormatter(fmt)
        logger.addHandler(fh)

    # Do NOT propagate to root; keep memory logs out of console and only in memory.log
    logger.propagate = False
    return logger


def _get_process() -> "psutil.Process | None":
    if psutil is None:
        return None
    try:
        return psutil.Process()
    except Exception:
        return None


def _format_bytes(num: int) -> str:
    try:
        for unit in ["B", "KB", "MB", "GB", "TB"]:
            if num < 1024.0:
                return f"{num:,.1f}{unit}"
            num /= 1024.0
        return f"{num:,.1f}PB"
    except Exception:
        return str(num)


def _collect_stats(proc) -> dict:
    stats = {}
    try:
        if proc is not None:
            mi = proc.memory_info()
            stats["rss"] = mi.rss
            stats["vms"] = getattr(mi, "vms", 0)
            # Peak RSS on Windows via psutil.Process.memory_info() may expose peak_wset in private API; skip for portability
        else:
            stats["rss"] = 0
            stats["vms"] = 0
    except Exception:
        stats["rss"] = stats.get("rss", 0)
        stats["vms"] = stats.get("vms", 0)

    # GC stats
    try:
        counts = gc.get_count()
        stats["gc"] = counts
    except Exception:
        stats["gc"] = (0, 0, 0)

    return stats


def _worker(interval_sec: float, include_tracemalloc: bool):
    log = _make_logger()
    proc = _get_process()

    # Optional tracemalloc
    if include_tracemalloc:
        try:
            import tracemalloc
            if not tracemalloc.is_tracing():
                tracemalloc.start()
            tm_enabled = True
        except Exception:
            tm_enabled = False
    else:
        tm_enabled = False

    while not _GLOBAL_STOP.is_set():
        try:
            st = _collect_stats(proc)
            rss = st.get("rss", 0)
            vms = st.get("vms", 0)
            gc0, gc1, gc2 = st.get("gc", (0, 0, 0))

            msg = (
                f"RSS={_format_bytes(rss)} VMS={_format_bytes(vms)} "
                f"GC={gc0}/{gc1}/{gc2}"
            )

            if tm_enabled:
                try:
                    import tracemalloc
                    cur, peak = tracemalloc.get_traced_memory()
                    msg += f" TM_CUR={_format_bytes(cur)} TM_PEAK={_format_bytes(peak)}"
                except Exception:
                    pass

            log.info(msg)
        except Exception as e:
            try:
                log.warning("memory reporter error: %s", e)
            except Exception:
                pass
        finally:
            # Sleep in small chunks to react faster to stop
            for _ in range(int(max(1, interval_sec * 10))):
                if _GLOBAL_STOP.is_set():
                    break
                time.sleep(0.1)


def start_global_memory_logger(interval_sec: float = 3.0, include_tracemalloc: bool = False) -> None:
    """Start the background memory logger once per process.

    interval_sec: how often to log
    include_tracemalloc: if True, also log tracemalloc current/peak
    """
    global _GLOBAL_THREAD
    if _GLOBAL_THREAD and _GLOBAL_THREAD.is_alive():
        return

    _GLOBAL_STOP.clear()
    t = threading.Thread(target=_worker, args=(interval_sec, include_tracemalloc), name="mem-logger", daemon=True)
    _GLOBAL_THREAD = t
    try:
        t.start()
    except Exception:
        # Do not raise to avoid breaking GUI startup
        pass


def stop_global_memory_logger() -> None:
    try:
        _GLOBAL_STOP.set()
        if _GLOBAL_THREAD and _GLOBAL_THREAD.is_alive():
            # Give it a moment to exit
            _GLOBAL_THREAD.join(timeout=2.0)
    except Exception:
        pass
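Note: a minimal usage sketch for the reporter above (not part of the upload); the interval value is illustrative:

# Illustrative: start the background reporter at app startup and stop it on exit.
from memory_usage_reporter import start_global_memory_logger, stop_global_memory_logger

start_global_memory_logger(interval_sec=5.0, include_tracemalloc=False)
try:
    pass  # run the GUI main loop here
finally:
    stop_global_memory_logger()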
metadata_batch_translator.py
ADDED
The diff for this file is too large to render. See raw diff.
model_options.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# model_options.py
|
| 2 |
+
"""
|
| 3 |
+
Centralized model catalog for Glossarion UIs.
|
| 4 |
+
Returned list should mirror the main GUI model dropdown.
|
| 5 |
+
"""
|
| 6 |
+
from typing import List
|
| 7 |
+
|
| 8 |
+
def get_model_options() -> List[str]:
|
| 9 |
+
return [
|
| 10 |
+
|
| 11 |
+
# OpenAI Models
|
| 12 |
+
"gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4.1-nano", "gpt-4.1-mini", "gpt-4.1",
|
| 13 |
+
"gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k",
|
| 14 |
+
"gpt-5-mini","gpt-5","gpt-5-nano",
|
| 15 |
+
"o1-preview", "o1-mini", "o3", "o4-mini",
|
| 16 |
+
|
| 17 |
+
# Google Gemini Models
|
| 18 |
+
"gemini-2.0-flash","gemini-2.0-flash-lite",
|
| 19 |
+
"gemini-2.5-flash","gemini-2.5-flash-lite", "gemini-2.5-pro", "gemini-pro", "gemini-pro-vision",
|
| 20 |
+
|
| 21 |
+
# Anthropic Claude Models
|
| 22 |
+
"claude-opus-4-20250514", "claude-sonnet-4-20250514",
|
| 23 |
+
"claude-3-5-sonnet-20241022", "claude-3-7-sonnet-20250219",
|
| 24 |
+
"claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307",
|
| 25 |
+
"claude-2.1", "claude-2", "claude-instant-1.2",
|
| 26 |
+
|
| 27 |
+
# Grok Models
|
| 28 |
+
"grok-grok-4-0709", "grok-3", "grok-3-mini",
|
| 29 |
+
|
| 30 |
+
# Vertex AI Model Garden - Claude models (confirmed)
|
| 31 |
+
"claude-4-opus@20250514",
|
| 32 |
+
"claude-4-sonnet@20250514",
|
| 33 |
+
"claude-opus-4@20250514",
|
| 34 |
+
"claude-sonnet-4@20250514",
|
| 35 |
+
"claude-3-7-sonnet@20250219",
|
| 36 |
+
"claude-3-5-sonnet@20240620",
|
| 37 |
+
"claude-3-5-sonnet-v2@20241022",
|
| 38 |
+
"claude-3-opus@20240229",
|
| 39 |
+
"claude-3-sonnet@20240229",
|
| 40 |
+
"claude-3-haiku@20240307",
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Alternative format with vertex_ai prefix
|
| 44 |
+
"vertex/claude-3-7-sonnet@20250219",
|
| 45 |
+
"vertex/claude-3-5-sonnet@20240620",
|
| 46 |
+
"vertex/claude-3-opus@20240229",
|
| 47 |
+
"vertex/claude-4-opus@20250514",
|
| 48 |
+
"vertex/claude-4-sonnet@20250514",
|
| 49 |
+
"vertex/gemini-1.5-pro",
|
| 50 |
+
"vertex/gemini-1.5-flash",
|
| 51 |
+
"vertex/gemini-2.0-flash",
|
| 52 |
+
"vertex/gemini-2.5-pro",
|
| 53 |
+
"vertex/gemini-2.5-flash",
|
| 54 |
+
"vertex/gemini-2.5-flash-lite",
|
| 55 |
+
|
| 56 |
+
# Chute AI
|
| 57 |
+
"chutes/openai/gpt-oss-120b",
|
| 58 |
+
"chutes/deepseek-ai/DeepSeek-V3.1",
|
| 59 |
+
|
| 60 |
+
# DeepSeek Models
|
| 61 |
+
"deepseek-chat", "deepseek-coder", "deepseek-coder-33b-instruct",
|
| 62 |
+
|
| 63 |
+
# Mistral Models
|
| 64 |
+
"mistral-large", "mistral-medium", "mistral-small", "mistral-tiny",
|
| 65 |
+
"mixtral-8x7b-instruct", "mixtral-8x22b", "codestral-latest",
|
| 66 |
+
|
| 67 |
+
# Meta Llama Models (via Together/other providers)
|
| 68 |
+
"llama-2-7b-chat", "llama-2-13b-chat", "llama-2-70b-chat",
|
| 69 |
+
"llama-3-8b-instruct", "llama-3-70b-instruct", "codellama-34b-instruct",
|
| 70 |
+
|
| 71 |
+
# Yi Models
|
| 72 |
+
"yi-34b-chat", "yi-34b-chat-200k", "yi-6b-chat",
|
| 73 |
+
|
| 74 |
+
# Qwen Models
|
| 75 |
+
"qwen-72b-chat", "qwen-14b-chat", "qwen-7b-chat", "qwen-plus", "qwen-turbo",
|
| 76 |
+
|
| 77 |
+
# Cohere Models
|
| 78 |
+
"command", "command-light", "command-nightly", "command-r", "command-r-plus",
|
| 79 |
+
|
| 80 |
+
# AI21 Models
|
| 81 |
+
"j2-ultra", "j2-mid", "j2-light", "jamba-instruct",
|
| 82 |
+
|
| 83 |
+
# Perplexity Models
|
| 84 |
+
"perplexity-70b-online", "perplexity-7b-online", "pplx-70b-online", "pplx-7b-online",
|
| 85 |
+
|
| 86 |
+
# Groq Models (usually with suffix)
|
| 87 |
+
"llama-3-70b-groq", "llama-3-8b-groq", "mixtral-8x7b-groq",
|
| 88 |
+
|
| 89 |
+
# Chinese Models
|
| 90 |
+
"glm-4", "glm-3-turbo", "chatglm-6b", "chatglm2-6b", "chatglm3-6b",
|
| 91 |
+
"baichuan-13b-chat", "baichuan2-13b-chat",
|
| 92 |
+
"moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k",
|
| 93 |
+
|
| 94 |
+
# Other Models
|
| 95 |
+
"falcon-40b-instruct", "falcon-7b-instruct",
|
| 96 |
+
"phi-2", "phi-3-mini", "phi-3-small", "phi-3-medium",
|
| 97 |
+
"orca-2-13b", "orca-2-7b",
|
| 98 |
+
"vicuna-13b", "vicuna-7b",
|
| 99 |
+
"alpaca-7b",
|
| 100 |
+
"wizardlm-70b", "wizardlm-13b",
|
| 101 |
+
"openchat-3.5",
|
| 102 |
+
|
| 103 |
+
# For POE, prefix with 'poe/'
|
| 104 |
+
"poe/gpt-4", "poe/gpt-4o", "poe/gpt-4.5", "poe/gpt-4.1",
|
| 105 |
+
"poe/claude-3-opus", "poe/claude-4-opus", "poe/claude-3-sonnet", "poe/claude-4-sonnet",
|
| 106 |
+
"poe/claude", "poe/Assistant",
|
| 107 |
+
"poe/gemini-2.5-flash", "poe/gemini-2.5-pro",
|
| 108 |
+
|
| 109 |
+
# For OpenRouter, prefix with 'or/'
|
| 110 |
+
"or/google/gemini-2.5-pro",
|
| 111 |
+
"or/google/gemini-2.5-flash",
|
| 112 |
+
"or/google/gemini-2.5-flash-lite",
|
| 113 |
+
"or/openai/gpt-5",
|
| 114 |
+
"or/openai/gpt-5-mini",
|
| 115 |
+
"or/openai/gpt-5-nano",
|
| 116 |
+
"or/openai/chatgpt-4o-latest",
|
| 117 |
+
"or/deepseek/deepseek-r1-0528:free",
|
| 118 |
+
"or/google/gemma-3-27b-it:free",
|
| 119 |
+
|
| 120 |
+
# For ElectronHub, prefix with 'eh/'
|
| 121 |
+
"eh/gpt-4", "eh/gpt-3.5-turbo", "eh/claude-3-opus", "eh/claude-3-sonnet",
|
| 122 |
+
"eh/llama-2-70b-chat", "eh/yi-34b-chat-200k", "eh/mistral-large",
|
| 123 |
+
"eh/gemini-pro", "eh/deepseek-coder-33b",
|
| 124 |
+
|
| 125 |
+
# Last Resort
|
| 126 |
+
"deepl", # Will use DeepL API
|
| 127 |
+
"google-translate", # Will use Google Cloud Translate
|
| 128 |
+
]
|
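Note (illustrative sketch, not part of the uploaded file): the list above is meant to feed the model dropdown directly. A minimal example of wiring it up, assuming a tkinter-based GUI and a "gpt-4o-mini" default (both assumptions, not taken from this repo):

# Hypothetical dropdown wiring; the tkinter widget and default model are assumptions.
import tkinter as tk
from tkinter import ttk
from model_options import get_model_options

root = tk.Tk()
model_var = tk.StringVar(value="gpt-4o-mini")  # placeholder default
combo = ttk.Combobox(root, textvariable=model_var, values=get_model_options(), state="readonly")
combo.pack(padx=10, pady=10)
# Prefixed entries such as 'poe/', 'or/', 'eh/', 'vertex/' and 'chutes/' are kept
# verbatim here; routing to the matching provider happens in the API client, not in this list.
root.mainloop()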
multi_api_key_manager.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
ocr_manager.py
ADDED
|
@@ -0,0 +1,1879 @@
|
| 1 |
+
# ocr_manager.py
|
| 2 |
+
"""
|
| 3 |
+
OCR Manager for handling multiple OCR providers
|
| 4 |
+
Handles installation, model downloading, and OCR processing
|
| 5 |
+
Updated with HuggingFace donut model and proper bubble detection integration
|
| 6 |
+
"""
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import cv2
|
| 10 |
+
import json
|
| 11 |
+
import subprocess
|
| 12 |
+
import threading
|
| 13 |
+
import traceback
|
| 14 |
+
from typing import List, Dict, Optional, Tuple, Any
|
| 15 |
+
import numpy as np
|
| 16 |
+
from dataclasses import dataclass
|
| 17 |
+
from PIL import Image
|
| 18 |
+
import logging
|
| 19 |
+
import time
|
| 20 |
+
import random
|
| 21 |
+
import base64
|
| 22 |
+
import io
|
| 23 |
+
import requests
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
import gptqmodel
|
| 27 |
+
HAS_GPTQ = True
|
| 28 |
+
except ImportError:
|
| 29 |
+
try:
|
| 30 |
+
import auto_gptq
|
| 31 |
+
HAS_GPTQ = True
|
| 32 |
+
except ImportError:
|
| 33 |
+
HAS_GPTQ = False
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
import optimum
|
| 37 |
+
HAS_OPTIMUM = True
|
| 38 |
+
except ImportError:
|
| 39 |
+
HAS_OPTIMUM = False
|
| 40 |
+
|
| 41 |
+
try:
|
| 42 |
+
import accelerate
|
| 43 |
+
HAS_ACCELERATE = True
|
| 44 |
+
except ImportError:
|
| 45 |
+
HAS_ACCELERATE = False
|
| 46 |
+
|
| 47 |
+
logger = logging.getLogger(__name__)
|
| 48 |
+
|
| 49 |
+
@dataclass
|
| 50 |
+
class OCRResult:
|
| 51 |
+
"""Unified OCR result format with built-in sanitization to prevent data corruption."""
|
| 52 |
+
text: str
|
| 53 |
+
bbox: Tuple[int, int, int, int] # x, y, w, h
|
| 54 |
+
confidence: float
|
| 55 |
+
vertices: Optional[List[Tuple[int, int]]] = None
|
| 56 |
+
|
| 57 |
+
def __post_init__(self):
|
| 58 |
+
"""
|
| 59 |
+
This special method is called automatically after the object is created.
|
| 60 |
+
It acts as a final safeguard to ensure the 'text' attribute is ALWAYS a clean string.
|
| 61 |
+
"""
|
| 62 |
+
# --- THIS IS THE DEFINITIVE FIX ---
|
| 63 |
+
# If the text we received is a tuple, we extract the first element.
|
| 64 |
+
# This makes it impossible for a tuple to exist in a finished object.
|
| 65 |
+
if isinstance(self.text, tuple):
|
| 66 |
+
# Log that we are fixing a critical data error.
|
| 67 |
+
print(f"CRITICAL WARNING: Corrupted tuple detected in OCRResult. Sanitizing '{self.text}' to '{self.text[0]}'.")
|
| 68 |
+
self.text = self.text[0]
|
| 69 |
+
|
| 70 |
+
# Ensure the final result is always a stripped string.
|
| 71 |
+
self.text = str(self.text).strip()
|
| 72 |
+
|
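A short illustration of the sanitization above (assumed usage, not part of the uploaded file): constructing an OCRResult with a corrupted (text, score) tuple still yields a plain string.

# Illustrative only: exercises OCRResult.__post_init__ sanitization.
from ocr_manager import OCRResult

r = OCRResult(text=("こんにちは", 0.92), bbox=(10, 20, 100, 40), confidence=0.92)
print(r.text)        # "こんにちは" - the tuple is collapsed to its first element
print(type(r.text))  # <class 'str'>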
| 73 |
+
class OCRProvider:
|
| 74 |
+
"""Base class for OCR providers"""
|
| 75 |
+
|
| 76 |
+
def __init__(self, log_callback=None):
|
| 77 |
+
self.log_callback = log_callback
|
| 78 |
+
self.is_installed = False
|
| 79 |
+
self.is_loaded = False
|
| 80 |
+
self.model = None
|
| 81 |
+
self.stop_flag = None
|
| 82 |
+
self._stopped = False
|
| 83 |
+
|
| 84 |
+
def _log(self, message: str, level: str = "info"):
|
| 85 |
+
"""Log message with stop suppression"""
|
| 86 |
+
# Suppress logs when stopped (allow only essential stop confirmation messages)
|
| 87 |
+
if self._check_stop():
|
| 88 |
+
essential_stop_keywords = [
|
| 89 |
+
"⏹️ Translation stopped by user",
|
| 90 |
+
"⏹️ OCR processing stopped",
|
| 91 |
+
"cleanup", "🧹"
|
| 92 |
+
]
|
| 93 |
+
if not any(keyword in message for keyword in essential_stop_keywords):
|
| 94 |
+
return
|
| 95 |
+
|
| 96 |
+
if self.log_callback:
|
| 97 |
+
self.log_callback(message, level)
|
| 98 |
+
else:
|
| 99 |
+
print(f"[{level.upper()}] {message}")
|
| 100 |
+
|
| 101 |
+
def set_stop_flag(self, stop_flag):
|
| 102 |
+
"""Set the stop flag for checking interruptions"""
|
| 103 |
+
self.stop_flag = stop_flag
|
| 104 |
+
self._stopped = False
|
| 105 |
+
|
| 106 |
+
def _check_stop(self) -> bool:
|
| 107 |
+
"""Check if stop has been requested"""
|
| 108 |
+
if self._stopped:
|
| 109 |
+
return True
|
| 110 |
+
if self.stop_flag and self.stop_flag.is_set():
|
| 111 |
+
self._stopped = True
|
| 112 |
+
return True
|
| 113 |
+
# Check global manga translator cancellation
|
| 114 |
+
try:
|
| 115 |
+
from manga_translator import MangaTranslator
|
| 116 |
+
if MangaTranslator.is_globally_cancelled():
|
| 117 |
+
self._stopped = True
|
| 118 |
+
return True
|
| 119 |
+
except Exception:
|
| 120 |
+
pass
|
| 121 |
+
return False
|
| 122 |
+
|
| 123 |
+
def reset_stop_flags(self):
|
| 124 |
+
"""Reset stop flags when starting new processing"""
|
| 125 |
+
self._stopped = False
|
| 126 |
+
|
| 127 |
+
def check_installation(self) -> bool:
|
| 128 |
+
"""Check if provider is installed"""
|
| 129 |
+
raise NotImplementedError
|
| 130 |
+
|
| 131 |
+
def install(self, progress_callback=None) -> bool:
|
| 132 |
+
"""Install the provider"""
|
| 133 |
+
raise NotImplementedError
|
| 134 |
+
|
| 135 |
+
def load_model(self, **kwargs) -> bool:
|
| 136 |
+
"""Load the OCR model"""
|
| 137 |
+
raise NotImplementedError
|
| 138 |
+
|
| 139 |
+
def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
|
| 140 |
+
"""Detect text in image"""
|
| 141 |
+
raise NotImplementedError
|
| 142 |
+
|
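A minimal subclass sketch (assumed, not part of the uploaded file) showing the four hooks a provider fills in while reusing _log() and _check_stop() from the base class above:

# Hypothetical provider stub; the class name and dummy result are illustrative only.
import numpy as np
from ocr_manager import OCRProvider, OCRResult

class EchoOCRProvider(OCRProvider):
    def check_installation(self) -> bool:
        self.is_installed = True   # nothing external required
        return True

    def install(self, progress_callback=None) -> bool:
        return True                # no-op install

    def load_model(self, **kwargs) -> bool:
        self.is_loaded = True      # no model to load
        return True

    def detect_text(self, image: np.ndarray, **kwargs):
        if self._check_stop():     # honor the shared stop flag
            return []
        h, w = image.shape[:2]
        self._log("EchoOCRProvider: returning one dummy region")
        return [OCRResult(text="(no text)", bbox=(0, 0, w, h), confidence=0.0)]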
| 143 |
+
class CustomAPIProvider(OCRProvider):
|
| 144 |
+
"""Custom API OCR provider that uses existing GUI variables"""
|
| 145 |
+
|
| 146 |
+
def __init__(self, log_callback=None):
|
| 147 |
+
super().__init__(log_callback)
|
| 148 |
+
|
| 149 |
+
# Use EXISTING environment variables from TranslatorGUI
|
| 150 |
+
self.api_url = os.environ.get('OPENAI_CUSTOM_BASE_URL', '')
|
| 151 |
+
self.api_key = os.environ.get('API_KEY', '') or os.environ.get('OPENAI_API_KEY', '')
|
| 152 |
+
self.model_name = os.environ.get('MODEL', 'gpt-4o-mini')
|
| 153 |
+
|
| 154 |
+
# OCR prompt - use system prompt or a dedicated OCR prompt variable
|
| 155 |
+
self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT',
|
| 156 |
+
os.environ.get('SYSTEM_PROMPT',
|
| 157 |
+
"YOU ARE AN OCR SYSTEM. YOUR ONLY JOB IS TEXT EXTRACTION.\n\n"
|
| 158 |
+
"CRITICAL RULES:\n"
|
| 159 |
+
"1. DO NOT TRANSLATE ANYTHING\n"
|
| 160 |
+
"2. DO NOT MODIFY THE TEXT\n"
|
| 161 |
+
"3. DO NOT EXPLAIN OR COMMENT\n"
|
| 162 |
+
"4. ONLY OUTPUT THE EXACT TEXT YOU SEE\n"
|
| 163 |
+
"5. PRESERVE NATURAL TEXT FLOW - DO NOT ADD UNNECESSARY LINE BREAKS\n\n"
|
| 164 |
+
"If you see Korean text, output it in Korean.\n"
|
| 165 |
+
"If you see Japanese text, output it in Japanese.\n"
|
| 166 |
+
"If you see Chinese text, output it in Chinese.\n"
|
| 167 |
+
"If you see English text, output it in English.\n\n"
|
| 168 |
+
"IMPORTANT: Only use line breaks where they naturally occur in the original text "
|
| 169 |
+
"(e.g., between dialogue lines or paragraphs). Do not break text mid-sentence or "
|
| 170 |
+
"between every word/character.\n\n"
|
| 171 |
+
"For vertical text common in manga/comics, transcribe it as a continuous line unless "
|
| 172 |
+
"there are clear visual breaks.\n\n"
|
| 173 |
+
"NEVER translate. ONLY extract exactly what is written.\n"
|
| 174 |
+
"Output ONLY the raw text, nothing else."
|
| 175 |
+
))
|
| 176 |
+
|
| 177 |
+
# Use existing temperature and token settings
|
| 178 |
+
self.temperature = float(os.environ.get('TRANSLATION_TEMPERATURE', '0.01'))
|
| 179 |
+
# Don't hardcode to 8192 - get fresh value when actually used
|
| 180 |
+
self.max_tokens = int(os.environ.get('MAX_OUTPUT_TOKENS', '4096'))
|
| 181 |
+
|
| 182 |
+
# Image settings from existing compression variables
|
| 183 |
+
self.image_format = 'jpeg' if os.environ.get('IMAGE_COMPRESSION_FORMAT', 'auto') != 'png' else 'png'
|
| 184 |
+
self.image_quality = int(os.environ.get('JPEG_QUALITY', '100'))
|
| 185 |
+
|
| 186 |
+
# Simple defaults
|
| 187 |
+
self.api_format = 'openai' # Most custom endpoints are OpenAI-compatible
|
| 188 |
+
self.timeout = int(os.environ.get('CHUNK_TIMEOUT', '30'))
|
| 189 |
+
self.api_headers = {} # Additional custom headers
|
| 190 |
+
|
| 191 |
+
# Retry configuration for Custom API OCR calls
|
| 192 |
+
self.max_retries = int(os.environ.get('CUSTOM_OCR_MAX_RETRIES', '3'))
|
| 193 |
+
self.retry_initial_delay = float(os.environ.get('CUSTOM_OCR_RETRY_INITIAL_DELAY', '0.8'))
|
| 194 |
+
self.retry_backoff = float(os.environ.get('CUSTOM_OCR_RETRY_BACKOFF', '1.8'))
|
| 195 |
+
self.retry_jitter = float(os.environ.get('CUSTOM_OCR_RETRY_JITTER', '0.4'))
|
| 196 |
+
self.retry_on_empty = os.environ.get('CUSTOM_OCR_RETRY_ON_EMPTY', '1') == '1'
|
| 197 |
+
|
| 198 |
+
def check_installation(self) -> bool:
|
| 199 |
+
"""Always installed - uses UnifiedClient"""
|
| 200 |
+
self.is_installed = True
|
| 201 |
+
return True
|
| 202 |
+
|
| 203 |
+
def install(self, progress_callback=None) -> bool:
|
| 204 |
+
"""No installation needed for API-based provider"""
|
| 205 |
+
return self.check_installation()
|
| 206 |
+
|
| 207 |
+
def load_model(self, **kwargs) -> bool:
|
| 208 |
+
"""Initialize UnifiedClient with current settings"""
|
| 209 |
+
try:
|
| 210 |
+
from unified_api_client import UnifiedClient
|
| 211 |
+
|
| 212 |
+
# Support passing API key from GUI if available
|
| 213 |
+
if 'api_key' in kwargs:
|
| 214 |
+
api_key = kwargs['api_key']
|
| 215 |
+
else:
|
| 216 |
+
api_key = os.environ.get('API_KEY', '') or os.environ.get('OPENAI_API_KEY', '')
|
| 217 |
+
|
| 218 |
+
if 'model' in kwargs:
|
| 219 |
+
model = kwargs['model']
|
| 220 |
+
else:
|
| 221 |
+
model = os.environ.get('MODEL', 'gpt-4o-mini')
|
| 222 |
+
|
| 223 |
+
if not api_key:
|
| 224 |
+
self._log("❌ No API key configured", "error")
|
| 225 |
+
return False
|
| 226 |
+
|
| 227 |
+
# Create UnifiedClient just like translations do
|
| 228 |
+
self.client = UnifiedClient(model=model, api_key=api_key)
|
| 229 |
+
|
| 230 |
+
#self._log(f"✅ Using {model} for OCR via UnifiedClient")
|
| 231 |
+
self.is_loaded = True
|
| 232 |
+
return True
|
| 233 |
+
|
| 234 |
+
except Exception as e:
|
| 235 |
+
self._log(f"❌ Failed to initialize UnifiedClient: {str(e)}", "error")
|
| 236 |
+
return False
|
| 237 |
+
|
| 238 |
+
def _test_connection(self) -> bool:
|
| 239 |
+
"""Test API connection with a simple request"""
|
| 240 |
+
try:
|
| 241 |
+
# Create a small test image
|
| 242 |
+
test_image = np.ones((100, 100, 3), dtype=np.uint8) * 255
|
| 243 |
+
cv2.putText(test_image, "TEST", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
|
| 244 |
+
|
| 245 |
+
# Encode image
|
| 246 |
+
image_base64 = self._encode_image(test_image)
|
| 247 |
+
|
| 248 |
+
# Prepare test request based on API format
|
| 249 |
+
if self.api_format == 'openai':
|
| 250 |
+
test_payload = {
|
| 251 |
+
"model": self.model_name,
|
| 252 |
+
"messages": [
|
| 253 |
+
{
|
| 254 |
+
"role": "user",
|
| 255 |
+
"content": [
|
| 256 |
+
{"type": "text", "text": "What text do you see?"},
|
| 257 |
+
{"type": "image_url", "image_url": {"url": f"data:image/{self.image_format};base64,{image_base64}"}}
|
| 258 |
+
]
|
| 259 |
+
}
|
| 260 |
+
],
|
| 261 |
+
"max_tokens": 50
|
| 262 |
+
}
|
| 263 |
+
else:
|
| 264 |
+
# For other formats, just try a basic health check
|
| 265 |
+
return True
|
| 266 |
+
|
| 267 |
+
headers = self._prepare_headers()
|
| 268 |
+
response = requests.post(
|
| 269 |
+
self.api_url,
|
| 270 |
+
headers=headers,
|
| 271 |
+
json=test_payload,
|
| 272 |
+
timeout=10
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
return response.status_code == 200
|
| 276 |
+
|
| 277 |
+
except Exception:
|
| 278 |
+
return False
|
| 279 |
+
|
| 280 |
+
def _encode_image(self, image: np.ndarray) -> str:
|
| 281 |
+
"""Encode numpy array to base64 string"""
|
| 282 |
+
# Convert BGR to RGB if needed
|
| 283 |
+
if len(image.shape) == 3 and image.shape[2] == 3:
|
| 284 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 285 |
+
else:
|
| 286 |
+
image_rgb = image
|
| 287 |
+
|
| 288 |
+
# Convert to PIL Image
|
| 289 |
+
pil_image = Image.fromarray(image_rgb)
|
| 290 |
+
|
| 291 |
+
# Save to bytes buffer
|
| 292 |
+
buffer = io.BytesIO()
|
| 293 |
+
if self.image_format.lower() == 'png':
|
| 294 |
+
pil_image.save(buffer, format='PNG')
|
| 295 |
+
else:
|
| 296 |
+
pil_image.save(buffer, format='JPEG', quality=self.image_quality)
|
| 297 |
+
|
| 298 |
+
# Encode to base64
|
| 299 |
+
buffer.seek(0)
|
| 300 |
+
image_base64 = base64.b64encode(buffer.read()).decode('utf-8')
|
| 301 |
+
|
| 302 |
+
return image_base64
|
| 303 |
+
|
| 304 |
+
def _prepare_headers(self) -> dict:
|
| 305 |
+
"""Prepare request headers"""
|
| 306 |
+
headers = {
|
| 307 |
+
"Content-Type": "application/json"
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
# Add API key if configured
|
| 311 |
+
if self.api_key:
|
| 312 |
+
if self.api_format == 'anthropic':
|
| 313 |
+
headers["x-api-key"] = self.api_key
|
| 314 |
+
else:
|
| 315 |
+
headers["Authorization"] = f"Bearer {self.api_key}"
|
| 316 |
+
|
| 317 |
+
# Add any custom headers
|
| 318 |
+
headers.update(self.api_headers)
|
| 319 |
+
|
| 320 |
+
return headers
|
| 321 |
+
|
| 322 |
+
def _prepare_request_payload(self, image_base64: str) -> dict:
|
| 323 |
+
"""Prepare request payload based on API format"""
|
| 324 |
+
if self.api_format == 'openai':
|
| 325 |
+
return {
|
| 326 |
+
"model": self.model_name,
|
| 327 |
+
"messages": [
|
| 328 |
+
{
|
| 329 |
+
"role": "user",
|
| 330 |
+
"content": [
|
| 331 |
+
{"type": "text", "text": self.ocr_prompt},
|
| 332 |
+
{
|
| 333 |
+
"type": "image_url",
|
| 334 |
+
"image_url": {
|
| 335 |
+
"url": f"data:image/{self.image_format};base64,{image_base64}"
|
| 336 |
+
}
|
| 337 |
+
}
|
| 338 |
+
]
|
| 339 |
+
}
|
| 340 |
+
],
|
| 341 |
+
"max_tokens": self.max_tokens,
|
| 342 |
+
"temperature": self.temperature
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
elif self.api_format == 'anthropic':
|
| 346 |
+
return {
|
| 347 |
+
"model": self.model_name,
|
| 348 |
+
"max_tokens": self.max_tokens,
|
| 349 |
+
"temperature": self.temperature,
|
| 350 |
+
"messages": [
|
| 351 |
+
{
|
| 352 |
+
"role": "user",
|
| 353 |
+
"content": [
|
| 354 |
+
{
|
| 355 |
+
"type": "text",
|
| 356 |
+
"text": self.ocr_prompt
|
| 357 |
+
},
|
| 358 |
+
{
|
| 359 |
+
"type": "image",
|
| 360 |
+
"source": {
|
| 361 |
+
"type": "base64",
|
| 362 |
+
"media_type": f"image/{self.image_format}",
|
| 363 |
+
"data": image_base64
|
| 364 |
+
}
|
| 365 |
+
}
|
| 366 |
+
]
|
| 367 |
+
}
|
| 368 |
+
]
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
else:
|
| 372 |
+
# Custom format - use environment variable for template
|
| 373 |
+
template = os.environ.get('CUSTOM_OCR_REQUEST_TEMPLATE', '{}')
|
| 374 |
+
payload = json.loads(template)
|
| 375 |
+
|
| 376 |
+
# Replace placeholders
|
| 377 |
+
payload_str = json.dumps(payload)
|
| 378 |
+
payload_str = payload_str.replace('{{IMAGE_BASE64}}', image_base64)
|
| 379 |
+
payload_str = payload_str.replace('{{PROMPT}}', self.ocr_prompt)
|
| 380 |
+
payload_str = payload_str.replace('{{MODEL}}', self.model_name)
|
| 381 |
+
payload_str = payload_str.replace('{{MAX_TOKENS}}', str(self.max_tokens))
|
| 382 |
+
payload_str = payload_str.replace('{{TEMPERATURE}}', str(self.temperature))
|
| 383 |
+
|
| 384 |
+
return json.loads(payload_str)
|
| 385 |
+
|
| 386 |
+
def _extract_text_from_response(self, response_data: dict) -> str:
|
| 387 |
+
"""Extract text from API response based on format"""
|
| 388 |
+
try:
|
| 389 |
+
if self.api_format == 'openai':
|
| 390 |
+
# OpenAI format: response.choices[0].message.content
|
| 391 |
+
return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
|
| 392 |
+
|
| 393 |
+
elif self.api_format == 'anthropic':
|
| 394 |
+
# Anthropic format: response.content[0].text
|
| 395 |
+
content = response_data.get('content', [])
|
| 396 |
+
if content and isinstance(content, list):
|
| 397 |
+
return content[0].get('text', '')
|
| 398 |
+
return ''
|
| 399 |
+
|
| 400 |
+
else:
|
| 401 |
+
# Custom format - use environment variable for path
|
| 402 |
+
response_path = os.environ.get('CUSTOM_OCR_RESPONSE_PATH', 'text')
|
| 403 |
+
|
| 404 |
+
# Navigate through the response using the path
|
| 405 |
+
result = response_data
|
| 406 |
+
for key in response_path.split('.'):
|
| 407 |
+
if isinstance(result, dict):
|
| 408 |
+
result = result.get(key, '')
|
| 409 |
+
elif isinstance(result, list) and key.isdigit():
|
| 410 |
+
idx = int(key)
|
| 411 |
+
result = result[idx] if idx < len(result) else ''
|
| 412 |
+
else:
|
| 413 |
+
result = ''
|
| 414 |
+
break
|
| 415 |
+
|
| 416 |
+
return str(result)
|
| 417 |
+
|
| 418 |
+
except Exception as e:
|
| 419 |
+
self._log(f"Failed to extract text from response: {e}", "error")
|
| 420 |
+
return ''
|
| 421 |
+
|
| 422 |
+
def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
|
| 423 |
+
"""Process image using UnifiedClient.send_image()"""
|
| 424 |
+
results = []
|
| 425 |
+
|
| 426 |
+
try:
|
| 427 |
+
# Get fresh max_tokens from environment - GUI will have set this
|
| 428 |
+
max_tokens = int(os.environ.get('MAX_OUTPUT_TOKENS', '4096'))
|
| 429 |
+
if not self.is_loaded:
|
| 430 |
+
if not self.load_model():
|
| 431 |
+
return results
|
| 432 |
+
|
| 433 |
+
import cv2
|
| 434 |
+
from PIL import Image
|
| 435 |
+
import base64
|
| 436 |
+
import io
|
| 437 |
+
|
| 438 |
+
# Convert numpy array to PIL Image
|
| 439 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 440 |
+
pil_image = Image.fromarray(image_rgb)
|
| 441 |
+
h, w = image.shape[:2]
|
| 442 |
+
|
| 443 |
+
# Convert PIL Image to base64 string
|
| 444 |
+
buffer = io.BytesIO()
|
| 445 |
+
|
| 446 |
+
# Use the image format from settings
|
| 447 |
+
if self.image_format.lower() == 'png':
|
| 448 |
+
pil_image.save(buffer, format='PNG')
|
| 449 |
+
else:
|
| 450 |
+
pil_image.save(buffer, format='JPEG', quality=self.image_quality)
|
| 451 |
+
|
| 452 |
+
buffer.seek(0)
|
| 453 |
+
image_base64 = base64.b64encode(buffer.read()).decode('utf-8')
|
| 454 |
+
|
| 455 |
+
# For OpenAI vision models, we need BOTH:
|
| 456 |
+
# 1. System prompt with instructions
|
| 457 |
+
# 2. User message that includes the image
|
| 458 |
+
messages = [
|
| 459 |
+
{
|
| 460 |
+
"role": "system",
|
| 461 |
+
"content": self.ocr_prompt # The OCR instruction as system prompt
|
| 462 |
+
},
|
| 463 |
+
{
|
| 464 |
+
"role": "user",
|
| 465 |
+
"content": [
|
| 466 |
+
{
|
| 467 |
+
"type": "text",
|
| 468 |
+
"text": "Image:" # Minimal text, just to have something
|
| 469 |
+
},
|
| 470 |
+
{
|
| 471 |
+
"type": "image_url",
|
| 472 |
+
"image_url": {
|
| 473 |
+
"url": f"data:image/jpeg;base64,{image_base64}"
|
| 474 |
+
}
|
| 475 |
+
}
|
| 476 |
+
]
|
| 477 |
+
}
|
| 478 |
+
]
|
| 479 |
+
|
| 480 |
+
# Now send this properly formatted message
|
| 481 |
+
# The UnifiedClient should handle this correctly
|
| 482 |
+
# But we're NOT using send_image, we're using regular send
|
| 483 |
+
|
| 484 |
+
# Retry-aware call
|
| 485 |
+
from unified_api_client import UnifiedClientError # local import to avoid hard dependency at module import time
|
| 486 |
+
max_attempts = max(1, self.max_retries)
|
| 487 |
+
attempt = 0
|
| 488 |
+
last_error = None
|
| 489 |
+
|
| 490 |
+
# Common refusal/error phrases that indicate a non-OCR response
|
| 491 |
+
refusal_phrases = [
|
| 492 |
+
"I can't extract", "I cannot extract",
|
| 493 |
+
"I'm sorry", "I am sorry",
|
| 494 |
+
"I'm unable", "I am unable",
|
| 495 |
+
"cannot process images",
|
| 496 |
+
"I can't help with that",
|
| 497 |
+
"cannot view images",
|
| 498 |
+
"no text in the image"
|
| 499 |
+
]
|
| 500 |
+
|
| 501 |
+
while attempt < max_attempts:
|
| 502 |
+
# Check for stop before each attempt
|
| 503 |
+
if self._check_stop():
|
| 504 |
+
self._log("⏹️ OCR processing stopped by user", "warning")
|
| 505 |
+
return results
|
| 506 |
+
|
| 507 |
+
try:
|
| 508 |
+
response = self.client.send(
|
| 509 |
+
messages=messages,
|
| 510 |
+
temperature=self.temperature,
|
| 511 |
+
max_tokens=max_tokens
|
| 512 |
+
)
|
| 513 |
+
|
| 514 |
+
# Extract content from response object
|
| 515 |
+
content, finish_reason = response
|
| 516 |
+
|
| 517 |
+
# Validate content
|
| 518 |
+
has_content = bool(content and str(content).strip())
|
| 519 |
+
refused = False
|
| 520 |
+
if has_content:
|
| 521 |
+
# Filter out explicit failure markers
|
| 522 |
+
if "[" in content and "FAILED]" in content:
|
| 523 |
+
refused = True
|
| 524 |
+
elif any(phrase.lower() in content.lower() for phrase in refusal_phrases):
|
| 525 |
+
refused = True
|
| 526 |
+
|
| 527 |
+
# Decide success or retry
|
| 528 |
+
if has_content and not refused:
|
| 529 |
+
text = str(content).strip()
|
| 530 |
+
results.append(OCRResult(
|
| 531 |
+
text=text,
|
| 532 |
+
bbox=(0, 0, w, h),
|
| 533 |
+
confidence=kwargs.get('confidence', 0.85),
|
| 534 |
+
vertices=[(0, 0), (w, 0), (w, h), (0, h)]
|
| 535 |
+
))
|
| 536 |
+
self._log(f"✅ Detected: {text[:50]}...")
|
| 537 |
+
break # success
|
| 538 |
+
else:
|
| 539 |
+
reason = "empty result" if not has_content else "refusal/non-OCR response"
|
| 540 |
+
last_error = f"{reason} (finish_reason: {finish_reason})"
|
| 541 |
+
# Check if we should retry on empty or refusal
|
| 542 |
+
should_retry = (not has_content and self.retry_on_empty) or refused
|
| 543 |
+
attempt += 1
|
| 544 |
+
if attempt >= max_attempts or not should_retry:
|
| 545 |
+
# No more retries or shouldn't retry
|
| 546 |
+
if not has_content:
|
| 547 |
+
self._log(f"⚠️ No text detected (finish_reason: {finish_reason})")
|
| 548 |
+
else:
|
| 549 |
+
self._log(f"❌ Model returned non-OCR response: {str(content)[:120]}", "warning")
|
| 550 |
+
break
|
| 551 |
+
# Backoff before retrying
|
| 552 |
+
delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
|
| 553 |
+
self._log(f"🔄 Retry {attempt}/{max_attempts - 1} after {delay:.1f}s due to {reason}...", "warning")
|
| 554 |
+
time.sleep(delay)
|
| 555 |
+
time.sleep(0.1) # Brief pause for stability
|
| 556 |
+
self._log("💤 OCR retry pausing briefly for stability", "debug")
|
| 557 |
+
continue
|
| 558 |
+
|
| 559 |
+
except UnifiedClientError as ue:
|
| 560 |
+
msg = str(ue)
|
| 561 |
+
last_error = msg
|
| 562 |
+
# Do not retry on explicit user cancellation
|
| 563 |
+
if 'cancelled' in msg.lower() or 'stopped by user' in msg.lower():
|
| 564 |
+
self._log(f"❌ OCR cancelled: {msg}", "error")
|
| 565 |
+
break
|
| 566 |
+
attempt += 1
|
| 567 |
+
if attempt >= max_attempts:
|
| 568 |
+
self._log(f"❌ OCR failed after {attempt} attempts: {msg}", "error")
|
| 569 |
+
break
|
| 570 |
+
delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
|
| 571 |
+
self._log(f"🔄 API error, retry {attempt}/{max_attempts - 1} after {delay:.1f}s: {msg}", "warning")
|
| 572 |
+
time.sleep(delay)
|
| 573 |
+
time.sleep(0.1) # Brief pause for stability
|
| 574 |
+
self._log("💤 OCR API error retry pausing briefly for stability", "debug")
|
| 575 |
+
continue
|
| 576 |
+
except Exception as e_inner:
|
| 577 |
+
last_error = str(e_inner)
|
| 578 |
+
attempt += 1
|
| 579 |
+
if attempt >= max_attempts:
|
| 580 |
+
self._log(f"❌ OCR exception after {attempt} attempts: {last_error}", "error")
|
| 581 |
+
break
|
| 582 |
+
delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
|
| 583 |
+
self._log(f"🔄 Exception, retry {attempt}/{max_attempts - 1} after {delay:.1f}s: {last_error}", "warning")
|
| 584 |
+
time.sleep(delay)
|
| 585 |
+
time.sleep(0.1) # Brief pause for stability
|
| 586 |
+
self._log("💤 OCR exception retry pausing briefly for stability", "debug")
|
| 587 |
+
continue
|
| 588 |
+
|
| 589 |
+
except Exception as e:
|
| 590 |
+
self._log(f"❌ Error: {str(e)}", "error")
|
| 591 |
+
import traceback
|
| 592 |
+
self._log(traceback.format_exc(), "debug")
|
| 593 |
+
|
| 594 |
+
return results
|
| 595 |
+
|
| 596 |
+
class MangaOCRProvider(OCRProvider):
|
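A minimal sketch of driving the provider above through its environment variables (model, key and file names below are placeholders, not values from this repo): the constructor reads MODEL/API_KEY plus the CUSTOM_OCR_* retry knobs, and detect_text() expects a BGR numpy array.

# Sketch only: all values are placeholders.
import os
import cv2
from ocr_manager import CustomAPIProvider

os.environ["MODEL"] = "gpt-4o-mini"
os.environ["API_KEY"] = "YOUR_API_KEY"
os.environ["CUSTOM_OCR_MAX_RETRIES"] = "3"       # retry attempts on empty/refusal responses
os.environ["CUSTOM_OCR_RETRY_ON_EMPTY"] = "1"    # retry when the model returns nothing

provider = CustomAPIProvider(log_callback=lambda msg, level="info": print(f"[{level}] {msg}"))
image = cv2.imread("bubble.png")                 # BGR array, as detect_text() expects
for result in provider.detect_text(image):
    print(result.confidence, result.text)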
| 597 |
+
"""Manga OCR provider using HuggingFace model directly"""
|
| 598 |
+
|
| 599 |
+
def __init__(self, log_callback=None):
|
| 600 |
+
super().__init__(log_callback)
|
| 601 |
+
self.processor = None
|
| 602 |
+
self.model = None
|
| 603 |
+
self.tokenizer = None
|
| 604 |
+
|
| 605 |
+
def check_installation(self) -> bool:
|
| 606 |
+
"""Check if transformers is installed"""
|
| 607 |
+
try:
|
| 608 |
+
import transformers
|
| 609 |
+
import torch
|
| 610 |
+
self.is_installed = True
|
| 611 |
+
return True
|
| 612 |
+
except ImportError:
|
| 613 |
+
return False
|
| 614 |
+
|
| 615 |
+
def install(self, progress_callback=None) -> bool:
|
| 616 |
+
"""Install transformers and torch"""
|
| 617 |
+
pass
|
| 618 |
+
|
| 619 |
+
def _is_valid_local_model_dir(self, path: str) -> bool:
|
| 620 |
+
"""Check that a local HF model directory has required files."""
|
| 621 |
+
try:
|
| 622 |
+
if not path or not os.path.isdir(path):
|
| 623 |
+
return False
|
| 624 |
+
needed_any_weights = any(
|
| 625 |
+
os.path.exists(os.path.join(path, name)) for name in (
|
| 626 |
+
'pytorch_model.bin',
|
| 627 |
+
'model.safetensors'
|
| 628 |
+
)
|
| 629 |
+
)
|
| 630 |
+
has_config = os.path.exists(os.path.join(path, 'config.json'))
|
| 631 |
+
has_processor = (
|
| 632 |
+
os.path.exists(os.path.join(path, 'preprocessor_config.json')) or
|
| 633 |
+
os.path.exists(os.path.join(path, 'processor_config.json'))
|
| 634 |
+
)
|
| 635 |
+
has_tokenizer = (
|
| 636 |
+
os.path.exists(os.path.join(path, 'tokenizer.json')) or
|
| 637 |
+
os.path.exists(os.path.join(path, 'tokenizer_config.json'))
|
| 638 |
+
)
|
| 639 |
+
return has_config and needed_any_weights and has_processor and has_tokenizer
|
| 640 |
+
except Exception:
|
| 641 |
+
return False
|
| 642 |
+
|
| 643 |
+
def load_model(self, **kwargs) -> bool:
|
| 644 |
+
"""Load the manga-ocr model, preferring a local directory to avoid re-downloading"""
|
| 645 |
+
try:
|
| 646 |
+
if not self.is_installed and not self.check_installation():
|
| 647 |
+
self._log("❌ Transformers not installed", "error")
|
| 648 |
+
return False
|
| 649 |
+
|
| 650 |
+
# Always disable progress bars to avoid tqdm issues in some environments
|
| 651 |
+
import os
|
| 652 |
+
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
|
| 653 |
+
|
| 654 |
+
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoImageProcessor
|
| 655 |
+
import torch
|
| 656 |
+
|
| 657 |
+
# Prefer a local model directory if present to avoid any Hub access
|
| 658 |
+
candidates = []
|
| 659 |
+
env_local = os.environ.get("MANGA_OCR_LOCAL_DIR")
|
| 660 |
+
if env_local:
|
| 661 |
+
candidates.append(env_local)
|
| 662 |
+
|
| 663 |
+
# Project root one level up from this file
|
| 664 |
+
root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
| 665 |
+
candidates.append(os.path.join(root_dir, 'models', 'manga-ocr-base'))
|
| 666 |
+
candidates.append(os.path.join(root_dir, 'models', 'kha-white', 'manga-ocr-base'))
|
| 667 |
+
|
| 668 |
+
model_source = None
|
| 669 |
+
local_only = False
|
| 670 |
+
# Find a valid local dir
|
| 671 |
+
for cand in candidates:
|
| 672 |
+
if self._is_valid_local_model_dir(cand):
|
| 673 |
+
model_source = cand
|
| 674 |
+
local_only = True
|
| 675 |
+
break
|
| 676 |
+
|
| 677 |
+
# If no valid local dir, use Hub
|
| 678 |
+
if not model_source:
|
| 679 |
+
model_source = "kha-white/manga-ocr-base"
|
| 680 |
+
# Make sure we are not forcing offline mode
|
| 681 |
+
if os.environ.get("HF_HUB_OFFLINE") == "1":
|
| 682 |
+
try:
|
| 683 |
+
del os.environ["HF_HUB_OFFLINE"]
|
| 684 |
+
except Exception:
|
| 685 |
+
pass
|
| 686 |
+
self._log("🔥 Loading manga-ocr model from Hugging Face Hub")
|
| 687 |
+
self._log(f" Repo: {model_source}")
|
| 688 |
+
else:
|
| 689 |
+
# Only set offline when local dir is fully valid
|
| 690 |
+
os.environ.setdefault("HF_HUB_OFFLINE", "1")
|
| 691 |
+
self._log("🔥 Loading manga-ocr model from local directory")
|
| 692 |
+
self._log(f" Local path: {model_source}")
|
| 693 |
+
|
| 694 |
+
# Decide target device once; we will move after full CPU load to avoid meta tensors
|
| 695 |
+
use_cuda = torch.cuda.is_available()
|
| 696 |
+
|
| 697 |
+
# Try loading components, falling back to Hub if local-only fails
|
| 698 |
+
def _load_components(source: str, local_flag: bool):
|
| 699 |
+
self._log(" Loading tokenizer...")
|
| 700 |
+
tok = AutoTokenizer.from_pretrained(source, local_files_only=local_flag)
|
| 701 |
+
|
| 702 |
+
self._log(" Loading image processor...")
|
| 703 |
+
try:
|
| 704 |
+
from transformers import AutoProcessor
|
| 705 |
+
except Exception:
|
| 706 |
+
AutoProcessor = None
|
| 707 |
+
try:
|
| 708 |
+
proc = AutoImageProcessor.from_pretrained(source, local_files_only=local_flag)
|
| 709 |
+
except Exception as e_proc:
|
| 710 |
+
if AutoProcessor is not None:
|
| 711 |
+
self._log(f" ⚠️ AutoImageProcessor failed: {e_proc}. Trying AutoProcessor...", "warning")
|
| 712 |
+
proc = AutoProcessor.from_pretrained(source, local_files_only=local_flag)
|
| 713 |
+
else:
|
| 714 |
+
raise
|
| 715 |
+
|
| 716 |
+
self._log(" Loading model...")
|
| 717 |
+
# Prevent meta tensors by forcing full materialization on CPU at load time
|
| 718 |
+
os.environ.setdefault('TORCHDYNAMO_DISABLE', '1')
|
| 719 |
+
mdl = VisionEncoderDecoderModel.from_pretrained(
|
| 720 |
+
source,
|
| 721 |
+
local_files_only=local_flag,
|
| 722 |
+
low_cpu_mem_usage=False,
|
| 723 |
+
dtype=torch.float32,
|
| 724 |
+
device_map=None
|
| 725 |
+
)
|
| 726 |
+
return tok, proc, mdl
|
| 727 |
+
|
| 728 |
+
try:
|
| 729 |
+
self.tokenizer, self.processor, self.model = _load_components(model_source, local_only)
|
| 730 |
+
except Exception as e_local:
|
| 731 |
+
if local_only:
|
| 732 |
+
# Fallback to Hub once if local fails
|
| 733 |
+
self._log(f" ⚠️ Local model load failed: {e_local}", "warning")
|
| 734 |
+
try:
|
| 735 |
+
if os.environ.get("HF_HUB_OFFLINE") == "1":
|
| 736 |
+
del os.environ["HF_HUB_OFFLINE"]
|
| 737 |
+
except Exception:
|
| 738 |
+
pass
|
| 739 |
+
model_source = "kha-white/manga-ocr-base"
|
| 740 |
+
local_only = False
|
| 741 |
+
self._log(" Retrying from Hugging Face Hub...")
|
| 742 |
+
self.tokenizer, self.processor, self.model = _load_components(model_source, local_only)
|
| 743 |
+
else:
|
| 744 |
+
raise
|
| 745 |
+
|
| 746 |
+
# Move to CUDA only after full CPU materialization
|
| 747 |
+
target_device = 'cpu'
|
| 748 |
+
if use_cuda:
|
| 749 |
+
try:
|
| 750 |
+
self.model = self.model.to('cuda')
|
| 751 |
+
target_device = 'cuda'
|
| 752 |
+
except Exception as move_err:
|
| 753 |
+
self._log(f" ⚠️ Could not move model to CUDA: {move_err}", "warning")
|
| 754 |
+
target_device = 'cpu'
|
| 755 |
+
|
| 756 |
+
# Finalize eval mode
|
| 757 |
+
self.model.eval()
|
| 758 |
+
|
| 759 |
+
# Sanity-check: ensure no parameter remains on 'meta' device
|
| 760 |
+
try:
|
| 761 |
+
for n, p in self.model.named_parameters():
|
| 762 |
+
dev = getattr(p, 'device', None)
|
| 763 |
+
if dev is not None and getattr(dev, 'type', '') == 'meta':
|
| 764 |
+
raise RuntimeError(f"Parameter {n} is on 'meta' after load")
|
| 765 |
+
except Exception as sanity_err:
|
| 766 |
+
self._log(f"❌ Manga-OCR model load sanity check failed: {sanity_err}", "error")
|
| 767 |
+
return False
|
| 768 |
+
|
| 769 |
+
self._log(f" ✅ Model loaded on {target_device.upper()}")
|
| 770 |
+
self.is_loaded = True
|
| 771 |
+
self._log("✅ Manga OCR model ready")
|
| 772 |
+
return True
|
| 773 |
+
|
| 774 |
+
except Exception as e:
|
| 775 |
+
self._log(f"❌ Failed to load manga-ocr model: {str(e)}", "error")
|
| 776 |
+
import traceback
|
| 777 |
+
self._log(traceback.format_exc(), "error")
|
| 778 |
+
try:
|
| 779 |
+
if 'local_only' in locals() and local_only:
|
| 780 |
+
self._log("Hint: Local load failed. Ensure your models/manga-ocr-base contains required files (config.json, preprocessor_config.json, tokenizer.json or tokenizer_config.json, and model weights).", "warning")
|
| 781 |
+
except Exception:
|
| 782 |
+
pass
|
| 783 |
+
return False
|
| 784 |
+
|
| 785 |
+
def _run_ocr(self, pil_image):
|
| 786 |
+
"""Run OCR on a PIL image using the HuggingFace model"""
|
| 787 |
+
import torch
|
| 788 |
+
|
| 789 |
+
# Process image (keyword arg for broader compatibility across transformers versions)
|
| 790 |
+
inputs = self.processor(images=pil_image, return_tensors="pt")
|
| 791 |
+
pixel_values = inputs["pixel_values"]
|
| 792 |
+
|
| 793 |
+
# Move to same device as model
|
| 794 |
+
try:
|
| 795 |
+
model_device = next(self.model.parameters()).device
|
| 796 |
+
except StopIteration:
|
| 797 |
+
model_device = torch.device('cpu')
|
| 798 |
+
pixel_values = pixel_values.to(model_device)
|
| 799 |
+
|
| 800 |
+
# Generate text
|
| 801 |
+
with torch.no_grad():
|
| 802 |
+
generated_ids = self.model.generate(pixel_values)
|
| 803 |
+
|
| 804 |
+
# Decode
|
| 805 |
+
generated_text = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 806 |
+
|
| 807 |
+
return generated_text
|
| 808 |
+
|
| 809 |
+
def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
|
| 810 |
+
"""
|
| 811 |
+
Process the image region passed to it.
|
| 812 |
+
This could be a bubble region or the full image.
|
| 813 |
+
"""
|
| 814 |
+
results = []
|
| 815 |
+
|
| 816 |
+
# Check for stop at start
|
| 817 |
+
if self._check_stop():
|
| 818 |
+
self._log("⏹️ Manga-OCR processing stopped by user", "warning")
|
| 819 |
+
return results
|
| 820 |
+
|
| 821 |
+
try:
|
| 822 |
+
if not self.is_loaded:
|
| 823 |
+
if not self.load_model():
|
| 824 |
+
return results
|
| 825 |
+
|
| 826 |
+
import cv2
|
| 827 |
+
from PIL import Image
|
| 828 |
+
|
| 829 |
+
# Get confidence from kwargs
|
| 830 |
+
confidence = kwargs.get('confidence', 0.7)
|
| 831 |
+
|
| 832 |
+
# Convert numpy array to PIL
|
| 833 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 834 |
+
pil_image = Image.fromarray(image_rgb)
|
| 835 |
+
h, w = image.shape[:2]
|
| 836 |
+
|
| 837 |
+
self._log("🔍 Processing region with manga-ocr...")
|
| 838 |
+
|
| 839 |
+
# Check for stop before inference
|
| 840 |
+
if self._check_stop():
|
| 841 |
+
self._log("⏹️ Manga-OCR inference stopped by user", "warning")
|
| 842 |
+
return results
|
| 843 |
+
|
| 844 |
+
# Run OCR on the image region
|
| 845 |
+
text = self._run_ocr(pil_image)
|
| 846 |
+
|
| 847 |
+
if text and text.strip():
|
| 848 |
+
# Return result for this region with its actual bbox
|
| 849 |
+
results.append(OCRResult(
|
| 850 |
+
text=text.strip(),
|
| 851 |
+
bbox=(0, 0, w, h), # Relative to the region passed in
|
| 852 |
+
confidence=confidence,
|
| 853 |
+
vertices=[(0, 0), (w, 0), (w, h), (0, h)]
|
| 854 |
+
))
|
| 855 |
+
self._log(f"✅ Detected text: {text[:50]}...")
|
| 856 |
+
|
| 857 |
+
except Exception as e:
|
| 858 |
+
self._log(f"❌ Error in manga-ocr: {str(e)}", "error")
|
| 859 |
+
|
| 860 |
+
return results
|
| 861 |
+
|
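A small sketch of running the manga-ocr provider above on one cropped bubble (paths and crop box are placeholders): pointing MANGA_OCR_LOCAL_DIR at a local copy of kha-white/manga-ocr-base lets load_model() skip the Hub download, as handled in the loading logic above.

# Sketch only: paths and the crop rectangle are placeholders.
import os
import cv2
from ocr_manager import MangaOCRProvider

os.environ["MANGA_OCR_LOCAL_DIR"] = "models/manga-ocr-base"  # optional local model dir

ocr = MangaOCRProvider(log_callback=lambda msg, level="info": print(msg))
if ocr.load_model():
    page = cv2.imread("page.png")
    bubble = page[100:300, 50:250]                 # one speech-bubble region (BGR)
    for res in ocr.detect_text(bubble, confidence=0.7):
        print(res.text, res.bbox)                  # bbox is relative to the cropped region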
| 862 |
+
class Qwen2VL(OCRProvider):
|
| 863 |
+
"""OCR using Qwen2-VL - Vision Language Model that can read Korean text"""
|
| 864 |
+
|
| 865 |
+
def __init__(self, log_callback=None):
|
| 866 |
+
super().__init__(log_callback)
|
| 867 |
+
self.processor = None
|
| 868 |
+
self.model = None
|
| 869 |
+
self.tokenizer = None
|
| 870 |
+
|
| 871 |
+
# Get OCR prompt from environment or use default
|
| 872 |
+
self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT',
|
| 873 |
+
"YOU ARE AN OCR SYSTEM. YOUR ONLY JOB IS TEXT EXTRACTION.\n\n"
|
| 874 |
+
"CRITICAL RULES:\n"
|
| 875 |
+
"1. DO NOT TRANSLATE ANYTHING\n"
|
| 876 |
+
"2. DO NOT MODIFY THE TEXT\n"
|
| 877 |
+
"3. DO NOT EXPLAIN OR COMMENT\n"
|
| 878 |
+
"4. ONLY OUTPUT THE EXACT TEXT YOU SEE\n"
|
| 879 |
+
"5. PRESERVE NATURAL TEXT FLOW - DO NOT ADD UNNECESSARY LINE BREAKS\n\n"
|
| 880 |
+
"If you see Korean text, output it in Korean.\n"
|
| 881 |
+
"If you see Japanese text, output it in Japanese.\n"
|
| 882 |
+
"If you see Chinese text, output it in Chinese.\n"
|
| 883 |
+
"If you see English text, output it in English.\n\n"
|
| 884 |
+
"IMPORTANT: Only use line breaks where they naturally occur in the original text "
|
| 885 |
+
"(e.g., between dialogue lines or paragraphs). Do not break text mid-sentence or "
|
| 886 |
+
"between every word/character.\n\n"
|
| 887 |
+
"For vertical text common in manga/comics, transcribe it as a continuous line unless "
|
| 888 |
+
"there are clear visual breaks.\n\n"
|
| 889 |
+
"NEVER translate. ONLY extract exactly what is written.\n"
|
| 890 |
+
"Output ONLY the raw text, nothing else."
|
| 891 |
+
)
|
| 892 |
+
|
| 893 |
+
def set_ocr_prompt(self, prompt: str):
|
| 894 |
+
"""Allow setting the OCR prompt dynamically"""
|
| 895 |
+
self.ocr_prompt = prompt
|
| 896 |
+
|
| 897 |
+
def check_installation(self) -> bool:
|
| 898 |
+
"""Check if required packages are installed"""
|
| 899 |
+
try:
|
| 900 |
+
import transformers
|
| 901 |
+
import torch
|
| 902 |
+
self.is_installed = True
|
| 903 |
+
return True
|
| 904 |
+
except ImportError:
|
| 905 |
+
return False
|
| 906 |
+
|
| 907 |
+
def install(self, progress_callback=None) -> bool:
|
| 908 |
+
"""Install requirements for Qwen2-VL"""
|
| 909 |
+
pass
|
| 910 |
+
|
| 911 |
+
def load_model(self, model_size=None, **kwargs) -> bool:
|
| 912 |
+
"""Load Qwen2-VL model with size selection"""
|
| 913 |
+
self._log(f"DEBUG: load_model called with model_size={model_size}")
|
| 914 |
+
|
| 915 |
+
try:
|
| 916 |
+
if not self.is_installed and not self.check_installation():
|
| 917 |
+
self._log("❌ Not installed", "error")
|
| 918 |
+
return False
|
| 919 |
+
|
| 920 |
+
self._log("🔥 Loading Qwen2-VL for Advanced OCR...")
|
| 921 |
+
|
| 922 |
+
|
| 923 |
+
|
| 924 |
+
from transformers import AutoProcessor, AutoTokenizer
|
| 925 |
+
import torch
|
| 926 |
+
|
| 927 |
+
# Model options
|
| 928 |
+
model_options = {
|
| 929 |
+
"1": "Qwen/Qwen2-VL-2B-Instruct",
|
| 930 |
+
"2": "Qwen/Qwen2-VL-7B-Instruct",
|
| 931 |
+
"3": "Qwen/Qwen2-VL-72B-Instruct",
|
| 932 |
+
"4": "custom"
|
| 933 |
+
}
|
| 934 |
+
# Default comes from QWEN2VL_MODEL_SIZE; option "1" (the 2B model) is the fallback
|
| 935 |
+
# Check for saved preference first
|
| 936 |
+
if model_size is None:
|
| 937 |
+
# Try to get from environment or config
|
| 938 |
+
import os
|
| 939 |
+
model_size = os.environ.get('QWEN2VL_MODEL_SIZE', '1')
|
| 940 |
+
|
| 941 |
+
# Determine which model to load
|
| 942 |
+
if model_size and str(model_size).startswith("custom:"):
|
| 943 |
+
# Custom model passed with ID
|
| 944 |
+
model_id = str(model_size).replace("custom:", "")
|
| 945 |
+
self.loaded_model_size = "Custom"
|
| 946 |
+
self.model_id = model_id
|
| 947 |
+
self._log(f"Loading custom model: {model_id}")
|
| 948 |
+
elif model_size == "4":
|
| 949 |
+
# Custom option selected but no ID - shouldn't happen
|
| 950 |
+
self._log("❌ Custom model selected but no ID provided", "error")
|
| 951 |
+
return False
|
| 952 |
+
elif model_size and str(model_size) in model_options:
|
| 953 |
+
# Standard model option
|
| 954 |
+
option = model_options[str(model_size)]
|
| 955 |
+
if option == "custom":
|
| 956 |
+
self._log("❌ Custom model needs an ID", "error")
|
| 957 |
+
return False
|
| 958 |
+
model_id = option
|
| 959 |
+
# Set loaded_model_size for status display
|
| 960 |
+
if model_size == "1":
|
| 961 |
+
self.loaded_model_size = "2B"
|
| 962 |
+
elif model_size == "2":
|
| 963 |
+
self.loaded_model_size = "7B"
|
| 964 |
+
elif model_size == "3":
|
| 965 |
+
self.loaded_model_size = "72B"
|
| 966 |
+
else:
|
| 967 |
+
# Fall back to option "1" (the 2B model) when no size is specified
|
| 968 |
+
model_id = model_options["1"]  # default model
|
| 969 |
+
self.loaded_model_size = "2B"
|
| 970 |
+
self._log("No model size specified, defaulting to 2B")
|
| 971 |
+
|
| 972 |
+
self._log(f"Loading model: {model_id}")
|
| 973 |
+
|
| 974 |
+
# Load processor and tokenizer
|
| 975 |
+
self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
|
| 976 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
|
| 977 |
+
|
| 978 |
+
# Load the model - let it figure out the class dynamically
|
| 979 |
+
if torch.cuda.is_available():
|
| 980 |
+
self._log(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 981 |
+
# Use auto model class
|
| 982 |
+
from transformers import AutoModelForVision2Seq
|
| 983 |
+
self.model = AutoModelForVision2Seq.from_pretrained(
|
| 984 |
+
model_id,
|
| 985 |
+
dtype=torch.float16,
|
| 986 |
+
device_map="auto",
|
| 987 |
+
trust_remote_code=True
|
| 988 |
+
)
|
| 989 |
+
self._log("✅ Model loaded on GPU")
|
| 990 |
+
else:
|
| 991 |
+
self._log("Loading on CPU...")
|
| 992 |
+
from transformers import AutoModelForVision2Seq
|
| 993 |
+
self.model = AutoModelForVision2Seq.from_pretrained(
|
| 994 |
+
model_id,
|
| 995 |
+
dtype=torch.float32,
|
| 996 |
+
trust_remote_code=True
|
| 997 |
+
)
|
| 998 |
+
self._log("✅ Model loaded on CPU")
|
| 999 |
+
|
| 1000 |
+
self.model.eval()
|
| 1001 |
+
self.is_loaded = True
|
| 1002 |
+
self._log("✅ Qwen2-VL ready for Advanced OCR!")
|
| 1003 |
+
return True
|
| 1004 |
+
|
| 1005 |
+
except Exception as e:
|
| 1006 |
+
self._log(f"❌ Failed to load: {str(e)}", "error")
|
| 1007 |
+
import traceback
|
| 1008 |
+
self._log(traceback.format_exc(), "debug")
|
| 1009 |
+
return False
|
| 1010 |
+
|
| 1011 |
+
def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
|
| 1012 |
+
"""Process image with Qwen2-VL for Korean text extraction"""
|
| 1013 |
+
results = []
|
| 1014 |
+
if hasattr(self, 'model_id'):
|
| 1015 |
+
self._log(f"DEBUG: Using model: {self.model_id}", "debug")
|
| 1016 |
+
|
| 1017 |
+
# Check if OCR prompt was passed in kwargs (for dynamic updates)
|
| 1018 |
+
if 'ocr_prompt' in kwargs:
|
| 1019 |
+
self.ocr_prompt = kwargs['ocr_prompt']
|
| 1020 |
+
|
| 1021 |
+
try:
|
| 1022 |
+
if not self.is_loaded:
|
| 1023 |
+
if not self.load_model():
|
| 1024 |
+
return results
|
| 1025 |
+
|
| 1026 |
+
import cv2
|
| 1027 |
+
from PIL import Image
|
| 1028 |
+
import torch
|
| 1029 |
+
|
| 1030 |
+
# Convert to PIL
|
| 1031 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 1032 |
+
pil_image = Image.fromarray(image_rgb)
|
| 1033 |
+
h, w = image.shape[:2]
|
| 1034 |
+
|
| 1035 |
+
self._log(f"🔍 Processing with Qwen2-VL ({w}x{h} pixels)...")
|
| 1036 |
+
|
| 1037 |
+
# Use the configurable OCR prompt
|
| 1038 |
+
messages = [
|
| 1039 |
+
{
|
| 1040 |
+
"role": "user",
|
| 1041 |
+
"content": [
|
| 1042 |
+
{
|
| 1043 |
+
"type": "image",
|
| 1044 |
+
"image": pil_image,
|
| 1045 |
+
},
|
| 1046 |
+
{
|
| 1047 |
+
"type": "text",
|
| 1048 |
+
"text": self.ocr_prompt # Use the configurable prompt
|
| 1049 |
+
}
|
| 1050 |
+
]
|
| 1051 |
+
}
|
| 1052 |
+
]
|
| 1053 |
+
|
| 1054 |
+
# Alternative simpler prompt if the above still causes issues:
|
| 1055 |
+
# "text": "OCR: Extract text as-is"
|
| 1056 |
+
|
| 1057 |
+
# Process with Qwen2-VL
|
| 1058 |
+
text = self.processor.apply_chat_template(
|
| 1059 |
+
messages,
|
| 1060 |
+
tokenize=False,
|
| 1061 |
+
add_generation_prompt=True
|
| 1062 |
+
)
|
| 1063 |
+
|
| 1064 |
+
inputs = self.processor(
|
| 1065 |
+
text=[text],
|
| 1066 |
+
images=[pil_image],
|
| 1067 |
+
padding=True,
|
| 1068 |
+
return_tensors="pt"
|
| 1069 |
+
)
|
| 1070 |
+
|
| 1071 |
+
# Get the device and dtype the model is currently on
|
| 1072 |
+
model_device = next(self.model.parameters()).device
|
| 1073 |
+
model_dtype = next(self.model.parameters()).dtype
|
| 1074 |
+
|
| 1075 |
+
# Move inputs to the same device as the model and cast float tensors to model dtype
|
| 1076 |
+
try:
|
| 1077 |
+
# Move first
|
| 1078 |
+
inputs = inputs.to(model_device)
|
| 1079 |
+
# Then align dtypes only for floating tensors (e.g., pixel_values)
|
| 1080 |
+
for k, v in inputs.items():
|
| 1081 |
+
if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
|
| 1082 |
+
inputs[k] = v.to(model_dtype)
|
| 1083 |
+
except Exception:
|
| 1084 |
+
# Fallback: ensure at least pixel_values is correct if present
|
| 1085 |
+
try:
|
| 1086 |
+
if isinstance(inputs, dict) and "pixel_values" in inputs:
|
| 1087 |
+
pv = inputs["pixel_values"].to(model_device)
|
| 1088 |
+
if torch.is_floating_point(pv):
|
| 1089 |
+
inputs["pixel_values"] = pv.to(model_dtype)
|
| 1090 |
+
except Exception:
|
| 1091 |
+
pass
|
| 1092 |
+
|
| 1093 |
+
# Ensure pixel_values explicitly matches model dtype if present
|
| 1094 |
+
try:
|
| 1095 |
+
if isinstance(inputs, dict) and "pixel_values" in inputs:
|
| 1096 |
+
inputs["pixel_values"] = inputs["pixel_values"].to(device=model_device, dtype=model_dtype)
|
| 1097 |
+
except Exception:
|
| 1098 |
+
pass
|
| 1099 |
+
|
| 1100 |
+
# Generate text with stricter parameters to avoid creative responses
|
| 1101 |
+
use_amp = (hasattr(torch, 'cuda') and model_device.type == 'cuda' and model_dtype in (torch.float16, torch.bfloat16))
|
| 1102 |
+
autocast_dev = 'cuda' if model_device.type == 'cuda' else 'cpu'
|
| 1103 |
+
autocast_dtype = model_dtype if model_dtype in (torch.float16, torch.bfloat16) else None
|
| 1104 |
+
|
| 1105 |
+
with torch.no_grad():
|
| 1106 |
+
if use_amp and autocast_dtype is not None:
|
| 1107 |
+
with torch.autocast(autocast_dev, dtype=autocast_dtype):
|
| 1108 |
+
generated_ids = self.model.generate(
|
| 1109 |
+
**inputs,
|
| 1110 |
+
max_new_tokens=128, # Reduced from 512 - manga bubbles are typically short
|
| 1111 |
+
do_sample=False, # Keep deterministic
|
| 1112 |
+
temperature=0.01, # Keep your very low temperature
|
| 1113 |
+
top_p=1.0, # Keep no nucleus sampling
|
| 1114 |
+
repetition_penalty=1.0, # Keep no repetition penalty
|
| 1115 |
+
num_beams=1, # Ensure greedy decoding (faster than beam search)
|
| 1116 |
+
use_cache=True, # Enable KV cache for speed
|
| 1117 |
+
early_stopping=True, # Stop at EOS token
|
| 1118 |
+
pad_token_id=self.tokenizer.pad_token_id, # Proper padding
|
| 1119 |
+
eos_token_id=self.tokenizer.eos_token_id, # Proper stopping
|
| 1120 |
+
)
|
| 1121 |
+
else:
|
| 1122 |
+
generated_ids = self.model.generate(
|
| 1123 |
+
**inputs,
|
| 1124 |
+
max_new_tokens=128, # Reduced from 512 - manga bubbles are typically short
|
| 1125 |
+
do_sample=False, # Keep deterministic
|
| 1126 |
+
temperature=0.01, # Keep your very low temperature
|
| 1127 |
+
top_p=1.0, # Keep no nucleus sampling
|
| 1128 |
+
repetition_penalty=1.0, # Keep no repetition penalty
|
| 1129 |
+
num_beams=1, # Ensure greedy decoding (faster than beam search)
|
| 1130 |
+
use_cache=True, # Enable KV cache for speed
|
| 1131 |
+
early_stopping=True, # Stop at EOS token
|
| 1132 |
+
pad_token_id=self.tokenizer.pad_token_id, # Proper padding
|
| 1133 |
+
eos_token_id=self.tokenizer.eos_token_id, # Proper stopping
|
| 1134 |
+
)
|
| 1135 |
+
|
| 1136 |
+
# Decode the output
|
| 1137 |
+
generated_ids_trimmed = [
|
| 1138 |
+
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
| 1139 |
+
]
|
| 1140 |
+
output_text = self.processor.batch_decode(
|
| 1141 |
+
generated_ids_trimmed,
|
| 1142 |
+
skip_special_tokens=True,
|
| 1143 |
+
clean_up_tokenization_spaces=False
|
| 1144 |
+
)[0]
|
| 1145 |
+
|
| 1146 |
+
if output_text and output_text.strip():
|
| 1147 |
+
text = output_text.strip()
|
| 1148 |
+
|
| 1149 |
+
# ADDED: Filter out any response that looks like an explanation or apology
|
| 1150 |
+
# Common patterns that indicate the model is being "helpful" instead of just extracting
|
| 1151 |
+
unwanted_patterns = [
|
| 1152 |
+
"죄송합니다", # "I apologize"
|
| 1153 |
+
"sorry",
|
| 1154 |
+
"apologize",
|
| 1155 |
+
"이미지에는", # "in this image"
|
| 1156 |
+
"텍스트가 없습니다", # "there is no text"
|
| 1157 |
+
"I cannot",
|
| 1158 |
+
"I don't see",
|
| 1159 |
+
"There is no",
|
| 1160 |
+
"질문이 있으시면", # "if you have questions"
|
| 1161 |
+
]
|
| 1162 |
+
|
| 1163 |
+
# Check if response contains unwanted patterns
|
| 1164 |
+
text_lower = text.lower()
|
| 1165 |
+
is_explanation = any(pattern.lower() in text_lower for pattern in unwanted_patterns)
|
| 1166 |
+
|
| 1167 |
+
# Also check if the response is suspiciously long for a bubble
|
| 1168 |
+
# Most manga bubbles are short; 100+ chars with sentence punctuation likely means an explanation
|
| 1169 |
+
is_too_long = len(text) > 100 and ('.' in text or ',' in text or '!' in text)
|
| 1170 |
+
|
| 1171 |
+
if is_explanation or is_too_long:
|
| 1172 |
+
self._log(f"⚠️ Model returned explanation instead of text, ignoring", "warning")
|
| 1173 |
+
# Return empty result or just skip this region
|
| 1174 |
+
return results
|
| 1175 |
+
|
| 1176 |
+
# Check language
|
| 1177 |
+
has_korean = any('\uAC00' <= c <= '\uD7AF' for c in text)
|
| 1178 |
+
has_japanese = any('\u3040' <= c <= '\u309F' or '\u30A0' <= c <= '\u30FF' for c in text)
|
| 1179 |
+
has_chinese = any('\u4E00' <= c <= '\u9FFF' for c in text)
|
| 1180 |
+
|
| 1181 |
+
if has_korean:
|
| 1182 |
+
self._log(f"✅ Korean detected: {text[:50]}...")
|
| 1183 |
+
elif has_japanese:
|
| 1184 |
+
self._log(f"✅ Japanese detected: {text[:50]}...")
|
| 1185 |
+
elif has_chinese:
|
| 1186 |
+
self._log(f"✅ Chinese detected: {text[:50]}...")
|
| 1187 |
+
else:
|
| 1188 |
+
self._log(f"✅ Text: {text[:50]}...")
|
| 1189 |
+
|
| 1190 |
+
results.append(OCRResult(
|
| 1191 |
+
text=text,
|
| 1192 |
+
bbox=(0, 0, w, h),
|
| 1193 |
+
confidence=0.9,
|
| 1194 |
+
vertices=[(0, 0), (w, 0), (w, h), (0, h)]
|
| 1195 |
+
))
|
| 1196 |
+
else:
|
| 1197 |
+
self._log("⚠️ No text detected", "warning")
|
| 1198 |
+
|
| 1199 |
+
except Exception as e:
|
| 1200 |
+
self._log(f"❌ Error: {str(e)}", "error")
|
| 1201 |
+
import traceback
|
| 1202 |
+
self._log(traceback.format_exc(), "debug")
|
| 1203 |
+
|
| 1204 |
+
return results
|
| 1205 |
+
|
| 1206 |
+
class EasyOCRProvider(OCRProvider):
|
| 1207 |
+
"""EasyOCR provider for multiple languages"""
|
| 1208 |
+
|
| 1209 |
+
def __init__(self, log_callback=None, languages=None):
|
| 1210 |
+
super().__init__(log_callback)
|
| 1211 |
+
# Default to safe language combination
|
| 1212 |
+
self.languages = languages or ['ja', 'en'] # Safe default
|
| 1213 |
+
self._validate_language_combination()
|
| 1214 |
+
|
| 1215 |
+
def _validate_language_combination(self):
|
| 1216 |
+
"""Validate and fix EasyOCR language combinations"""
|
| 1217 |
+
# EasyOCR language compatibility rules
|
| 1218 |
+
incompatible_pairs = [
|
| 1219 |
+
(['ja', 'ko'], 'Japanese and Korean cannot be used together'),
|
| 1220 |
+
(['ja', 'zh'], 'Japanese and Chinese cannot be used together'),
|
| 1221 |
+
(['ko', 'zh'], 'Korean and Chinese cannot be used together')
|
| 1222 |
+
]
|
| 1223 |
+
|
| 1224 |
+
for incompatible, reason in incompatible_pairs:
|
| 1225 |
+
if all(lang in self.languages for lang in incompatible):
|
| 1226 |
+
self._log(f"⚠️ EasyOCR: {reason}", "warning")
|
| 1227 |
+
# Keep first language + English
|
| 1228 |
+
self.languages = [self.languages[0], 'en']
|
| 1229 |
+
self._log(f"🔧 Auto-adjusted to: {self.languages}", "info")
|
| 1230 |
+
break
|
| 1231 |
+
|
| 1232 |
+
def check_installation(self) -> bool:
|
| 1233 |
+
"""Check if easyocr is installed"""
|
| 1234 |
+
try:
|
| 1235 |
+
import easyocr
|
| 1236 |
+
self.is_installed = True
|
| 1237 |
+
return True
|
| 1238 |
+
except ImportError:
|
| 1239 |
+
return False
|
| 1240 |
+
|
| 1241 |
+
def install(self, progress_callback=None) -> bool:
|
| 1242 |
+
"""Install easyocr"""
|
| 1243 |
+
pass
|
| 1244 |
+
|
| 1245 |
+
def load_model(self, **kwargs) -> bool:
|
| 1246 |
+
"""Load easyocr model"""
|
| 1247 |
+
try:
|
| 1248 |
+
if not self.is_installed and not self.check_installation():
|
| 1249 |
+
self._log("❌ easyocr not installed", "error")
|
| 1250 |
+
return False
|
| 1251 |
+
|
| 1252 |
+
self._log(f"🔥 Loading easyocr model for languages: {self.languages}...")
|
| 1253 |
+
import easyocr
|
| 1254 |
+
|
| 1255 |
+
# This will download models on first run
|
| 1256 |
+
self.model = easyocr.Reader(self.languages, gpu=True)
|
| 1257 |
+
self.is_loaded = True
|
| 1258 |
+
|
| 1259 |
+
self._log("✅ easyocr model loaded successfully")
|
| 1260 |
+
return True
|
| 1261 |
+
|
| 1262 |
+
except Exception as e:
|
| 1263 |
+
self._log(f"❌ Failed to load easyocr: {str(e)}", "error")
|
| 1264 |
+
# Try CPU mode if GPU fails
|
| 1265 |
+
try:
|
| 1266 |
+
import easyocr
|
| 1267 |
+
self.model = easyocr.Reader(self.languages, gpu=False)
|
| 1268 |
+
self.is_loaded = True
|
| 1269 |
+
self._log("✅ easyocr loaded in CPU mode")
|
| 1270 |
+
return True
|
| 1271 |
+
except:
|
| 1272 |
+
return False
|
| 1273 |
+
|
| 1274 |
+
def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
|
| 1275 |
+
"""Detect text using easyocr"""
|
| 1276 |
+
results = []
|
| 1277 |
+
|
| 1278 |
+
try:
|
| 1279 |
+
if not self.is_loaded:
|
| 1280 |
+
if not self.load_model():
|
| 1281 |
+
return results
|
| 1282 |
+
|
| 1283 |
+
# EasyOCR can work directly with numpy arrays
|
| 1284 |
+
ocr_results = self.model.readtext(image, detail=1)
|
| 1285 |
+
|
| 1286 |
+
# Parse results
|
| 1287 |
+
for (bbox, text, confidence) in ocr_results:
|
| 1288 |
+
# bbox is a list of 4 points
|
| 1289 |
+
xs = [point[0] for point in bbox]
|
| 1290 |
+
ys = [point[1] for point in bbox]
|
| 1291 |
+
x_min, x_max = min(xs), max(xs)
|
| 1292 |
+
y_min, y_max = min(ys), max(ys)
|
| 1293 |
+
|
| 1294 |
+
results.append(OCRResult(
|
| 1295 |
+
text=text,
|
| 1296 |
+
bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
|
| 1297 |
+
confidence=confidence,
|
| 1298 |
+
vertices=[(int(p[0]), int(p[1])) for p in bbox]
|
| 1299 |
+
))
|
| 1300 |
+
|
| 1301 |
+
self._log(f"✅ Detected {len(results)} text regions")
|
| 1302 |
+
|
| 1303 |
+
except Exception as e:
|
| 1304 |
+
self._log(f"❌ Error in easyocr detection: {str(e)}", "error")
|
| 1305 |
+
|
| 1306 |
+
return results
|
| 1307 |
+
|
| 1308 |
+
|
| 1309 |
+
class PaddleOCRProvider(OCRProvider):
|
| 1310 |
+
"""PaddleOCR provider with memory safety measures"""
|
| 1311 |
+
|
| 1312 |
+
def check_installation(self) -> bool:
|
| 1313 |
+
"""Check if paddleocr is installed"""
|
| 1314 |
+
try:
|
| 1315 |
+
from paddleocr import PaddleOCR
|
| 1316 |
+
self.is_installed = True
|
| 1317 |
+
return True
|
| 1318 |
+
except ImportError:
|
| 1319 |
+
return False
|
| 1320 |
+
|
| 1321 |
+
def install(self, progress_callback=None) -> bool:
|
| 1322 |
+
"""Install paddleocr"""
|
| 1323 |
+
pass
|
| 1324 |
+
|
| 1325 |
+
def load_model(self, **kwargs) -> bool:
|
| 1326 |
+
"""Load paddleocr model with memory-safe configurations"""
|
| 1327 |
+
try:
|
| 1328 |
+
if not self.is_installed and not self.check_installation():
|
| 1329 |
+
self._log("❌ paddleocr not installed", "error")
|
| 1330 |
+
return False
|
| 1331 |
+
|
| 1332 |
+
self._log("🔥 Loading PaddleOCR model...")
|
| 1333 |
+
|
| 1334 |
+
# Set memory-safe environment variables BEFORE importing
|
| 1335 |
+
import os
|
| 1336 |
+
os.environ['OMP_NUM_THREADS'] = '1' # Prevent OpenMP conflicts
|
| 1337 |
+
os.environ['MKL_NUM_THREADS'] = '1' # Prevent MKL conflicts
|
| 1338 |
+
os.environ['OPENBLAS_NUM_THREADS'] = '1' # Prevent OpenBLAS conflicts
|
| 1339 |
+
os.environ['FLAGS_use_mkldnn'] = '0' # Disable MKL-DNN
|
| 1340 |
+
|
| 1341 |
+
from paddleocr import PaddleOCR
|
| 1342 |
+
|
| 1343 |
+
# Try memory-safe configurations
|
| 1344 |
+
configs_to_try = [
|
| 1345 |
+
# Config 1: Most memory-safe configuration
|
| 1346 |
+
{
|
| 1347 |
+
'use_angle_cls': False, # Disable angle to save memory
|
| 1348 |
+
'lang': 'ch',
|
| 1349 |
+
'rec_batch_num': 1, # Process one at a time
|
| 1350 |
+
'max_text_length': 100, # Limit text length
|
| 1351 |
+
'drop_score': 0.5, # Higher threshold to reduce detections
|
| 1352 |
+
'cpu_threads': 1, # Single thread to avoid conflicts
|
| 1353 |
+
},
|
| 1354 |
+
# Config 2: Minimal memory footprint
|
| 1355 |
+
{
|
| 1356 |
+
'lang': 'ch',
|
| 1357 |
+
'rec_batch_num': 1,
|
| 1358 |
+
'cpu_threads': 1,
|
| 1359 |
+
},
|
| 1360 |
+
# Config 3: Absolute minimal
|
| 1361 |
+
{
|
| 1362 |
+
'lang': 'ch'
|
| 1363 |
+
},
|
| 1364 |
+
# Config 4: Empty config
|
| 1365 |
+
{}
|
| 1366 |
+
]
|
| 1367 |
+
|
| 1368 |
+
for i, config in enumerate(configs_to_try):
|
| 1369 |
+
try:
|
| 1370 |
+
self._log(f" Trying configuration {i+1}/{len(configs_to_try)}: {config}")
|
| 1371 |
+
|
| 1372 |
+
# Force garbage collection before loading
|
| 1373 |
+
import gc
|
| 1374 |
+
gc.collect()
|
| 1375 |
+
|
| 1376 |
+
self.model = PaddleOCR(**config)
|
| 1377 |
+
self.is_loaded = True
|
| 1378 |
+
self.current_config = config
|
| 1379 |
+
self._log(f"✅ PaddleOCR loaded successfully with config: {config}")
|
| 1380 |
+
return True
|
| 1381 |
+
except Exception as e:
|
| 1382 |
+
error_str = str(e)
|
| 1383 |
+
self._log(f" Config {i+1} failed: {error_str}", "debug")
|
| 1384 |
+
|
| 1385 |
+
# Clean up on failure
|
| 1386 |
+
if hasattr(self, 'model'):
|
| 1387 |
+
del self.model
|
| 1388 |
+
gc.collect()
|
| 1389 |
+
continue
|
| 1390 |
+
|
| 1391 |
+
self._log(f"❌ PaddleOCR failed to load with any configuration", "error")
|
| 1392 |
+
return False
|
| 1393 |
+
|
| 1394 |
+
except Exception as e:
|
| 1395 |
+
self._log(f"❌ Failed to load paddleocr: {str(e)}", "error")
|
| 1396 |
+
import traceback
|
| 1397 |
+
self._log(traceback.format_exc(), "debug")
|
| 1398 |
+
return False
|
| 1399 |
+
|
| 1400 |
+
def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
|
| 1401 |
+
"""Detect text with memory safety measures"""
|
| 1402 |
+
results = []
|
| 1403 |
+
|
| 1404 |
+
try:
|
| 1405 |
+
if not self.is_loaded:
|
| 1406 |
+
if not self.load_model():
|
| 1407 |
+
return results
|
| 1408 |
+
|
| 1409 |
+
import cv2
|
| 1410 |
+
import numpy as np
|
| 1411 |
+
import gc
|
| 1412 |
+
|
| 1413 |
+
# Memory safety: Ensure image isn't too large
|
| 1414 |
+
h, w = image.shape[:2] if len(image.shape) >= 2 else (0, 0)
|
| 1415 |
+
|
| 1416 |
+
# Limit image size to prevent memory issues
|
| 1417 |
+
MAX_DIMENSION = 1500
|
| 1418 |
+
if h > MAX_DIMENSION or w > MAX_DIMENSION:
|
| 1419 |
+
scale = min(MAX_DIMENSION/h, MAX_DIMENSION/w)
|
| 1420 |
+
new_h, new_w = int(h*scale), int(w*scale)
|
| 1421 |
+
self._log(f"⚠️ Resizing large image from {w}x{h} to {new_w}x{new_h} for memory safety", "warning")
|
| 1422 |
+
image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)
|
| 1423 |
+
scale_factor = 1/scale
|
| 1424 |
+
else:
|
| 1425 |
+
scale_factor = 1.0
|
| 1426 |
+
|
| 1427 |
+
# Ensure correct format
|
| 1428 |
+
if len(image.shape) == 2: # Grayscale
|
| 1429 |
+
image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
|
| 1430 |
+
elif len(image.shape) == 4: # Batch
|
| 1431 |
+
image = image[0]
|
| 1432 |
+
|
| 1433 |
+
# Ensure uint8 type
|
| 1434 |
+
if image.dtype != np.uint8:
|
| 1435 |
+
if image.max() <= 1.0:
|
| 1436 |
+
image = (image * 255).astype(np.uint8)
|
| 1437 |
+
else:
|
| 1438 |
+
image = image.astype(np.uint8)
|
| 1439 |
+
|
| 1440 |
+
# Make a copy to avoid memory corruption
|
| 1441 |
+
image_copy = image.copy()
|
| 1442 |
+
|
| 1443 |
+
# Force garbage collection before OCR
|
| 1444 |
+
gc.collect()
|
| 1445 |
+
|
| 1446 |
+
# Process with timeout protection
|
| 1447 |
+
import signal
|
| 1448 |
+
import threading
|
| 1449 |
+
|
| 1450 |
+
ocr_results = None
|
| 1451 |
+
ocr_error = None
|
| 1452 |
+
|
| 1453 |
+
def run_ocr():
|
| 1454 |
+
nonlocal ocr_results, ocr_error
|
| 1455 |
+
try:
|
| 1456 |
+
ocr_results = self.model.ocr(image_copy)
|
| 1457 |
+
except Exception as e:
|
| 1458 |
+
ocr_error = e
|
| 1459 |
+
|
| 1460 |
+
# Run OCR in a separate thread with timeout
|
| 1461 |
+
ocr_thread = threading.Thread(target=run_ocr)
|
| 1462 |
+
ocr_thread.daemon = True
|
| 1463 |
+
ocr_thread.start()
|
| 1464 |
+
ocr_thread.join(timeout=30) # 30 second timeout
|
| 1465 |
+
|
| 1466 |
+
if ocr_thread.is_alive():
|
| 1467 |
+
self._log("❌ PaddleOCR timeout - taking too long", "error")
|
| 1468 |
+
return results
|
| 1469 |
+
|
| 1470 |
+
if ocr_error:
|
| 1471 |
+
raise ocr_error
|
| 1472 |
+
|
| 1473 |
+
# Parse results
|
| 1474 |
+
results = self._parse_ocr_results(ocr_results)
|
| 1475 |
+
|
| 1476 |
+
# Scale coordinates back if image was resized
|
| 1477 |
+
if scale_factor != 1.0 and results:
|
| 1478 |
+
for r in results:
|
| 1479 |
+
x, y, width, height = r.bbox
|
| 1480 |
+
r.bbox = (int(x*scale_factor), int(y*scale_factor),
|
| 1481 |
+
int(width*scale_factor), int(height*scale_factor))
|
| 1482 |
+
r.vertices = [(int(v[0]*scale_factor), int(v[1]*scale_factor))
|
| 1483 |
+
for v in r.vertices]
|
| 1484 |
+
|
| 1485 |
+
if results:
|
| 1486 |
+
self._log(f"✅ Detected {len(results)} text regions", "info")
|
| 1487 |
+
else:
|
| 1488 |
+
self._log("No text regions found", "debug")
|
| 1489 |
+
|
| 1490 |
+
# Clean up
|
| 1491 |
+
del image_copy
|
| 1492 |
+
gc.collect()
|
| 1493 |
+
|
| 1494 |
+
except Exception as e:
|
| 1495 |
+
error_msg = str(e) if str(e) else type(e).__name__
|
| 1496 |
+
|
| 1497 |
+
if "memory" in error_msg.lower() or "0x" in error_msg:
|
| 1498 |
+
self._log("❌ Memory access violation in PaddleOCR", "error")
|
| 1499 |
+
self._log(" This is a known Windows issue with PaddleOCR", "info")
|
| 1500 |
+
self._log(" Please switch to EasyOCR or manga-ocr instead", "warning")
|
| 1501 |
+
elif "trace_order.size()" in error_msg:
|
| 1502 |
+
self._log("❌ PaddleOCR internal error", "error")
|
| 1503 |
+
self._log(" Please switch to EasyOCR or manga-ocr", "warning")
|
| 1504 |
+
else:
|
| 1505 |
+
self._log(f"❌ Error in paddleocr detection: {error_msg}", "error")
|
| 1506 |
+
|
| 1507 |
+
import traceback
|
| 1508 |
+
self._log(traceback.format_exc(), "debug")
|
| 1509 |
+
|
| 1510 |
+
return results
|
| 1511 |
+
|
| 1512 |
+
def _parse_ocr_results(self, ocr_results) -> List[OCRResult]:
|
| 1513 |
+
"""Parse OCR results safely"""
|
| 1514 |
+
results = []
|
| 1515 |
+
|
| 1516 |
+
if isinstance(ocr_results, bool) and ocr_results == False:
|
| 1517 |
+
return results
|
| 1518 |
+
|
| 1519 |
+
if ocr_results is None or not isinstance(ocr_results, list):
|
| 1520 |
+
return results
|
| 1521 |
+
|
| 1522 |
+
if len(ocr_results) == 0:
|
| 1523 |
+
return results
|
| 1524 |
+
|
| 1525 |
+
# Handle batch format
|
| 1526 |
+
if isinstance(ocr_results[0], list) and len(ocr_results[0]) > 0:
|
| 1527 |
+
first_item = ocr_results[0][0]
|
| 1528 |
+
if isinstance(first_item, list) and len(first_item) > 0:
|
| 1529 |
+
if isinstance(first_item[0], (list, tuple)) and len(first_item[0]) == 2:
|
| 1530 |
+
ocr_results = ocr_results[0]
|
| 1531 |
+
|
| 1532 |
+
# Parse detections
|
| 1533 |
+
for detection in ocr_results:
|
| 1534 |
+
if not detection or isinstance(detection, bool):
|
| 1535 |
+
continue
|
| 1536 |
+
|
| 1537 |
+
if not isinstance(detection, (list, tuple)) or len(detection) < 2:
|
| 1538 |
+
continue
|
| 1539 |
+
|
| 1540 |
+
try:
|
| 1541 |
+
bbox_points = detection[0]
|
| 1542 |
+
text_data = detection[1]
|
| 1543 |
+
|
| 1544 |
+
if not isinstance(bbox_points, (list, tuple)) or len(bbox_points) != 4:
|
| 1545 |
+
continue
|
| 1546 |
+
|
| 1547 |
+
if not isinstance(text_data, (tuple, list)) or len(text_data) < 2:
|
| 1548 |
+
continue
|
| 1549 |
+
|
| 1550 |
+
text = str(text_data[0]).strip()
|
| 1551 |
+
confidence = float(text_data[1])
|
| 1552 |
+
|
| 1553 |
+
if not text or confidence < 0.3:
|
| 1554 |
+
continue
|
| 1555 |
+
|
| 1556 |
+
xs = [float(p[0]) for p in bbox_points]
|
| 1557 |
+
ys = [float(p[1]) for p in bbox_points]
|
| 1558 |
+
x_min, x_max = min(xs), max(xs)
|
| 1559 |
+
y_min, y_max = min(ys), max(ys)
|
| 1560 |
+
|
| 1561 |
+
if (x_max - x_min) < 5 or (y_max - y_min) < 5:
|
| 1562 |
+
continue
|
| 1563 |
+
|
| 1564 |
+
results.append(OCRResult(
|
| 1565 |
+
text=text,
|
| 1566 |
+
bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
|
| 1567 |
+
confidence=confidence,
|
| 1568 |
+
vertices=[(int(p[0]), int(p[1])) for p in bbox_points]
|
| 1569 |
+
))
|
| 1570 |
+
|
| 1571 |
+
except Exception:
|
| 1572 |
+
continue
|
| 1573 |
+
|
| 1574 |
+
return results
|
| 1575 |
+
|
| 1576 |
+
class DocTROCRProvider(OCRProvider):
|
| 1577 |
+
"""DocTR OCR provider"""
|
| 1578 |
+
|
| 1579 |
+
def check_installation(self) -> bool:
|
| 1580 |
+
"""Check if doctr is installed"""
|
| 1581 |
+
try:
|
| 1582 |
+
from doctr.models import ocr_predictor
|
| 1583 |
+
self.is_installed = True
|
| 1584 |
+
return True
|
| 1585 |
+
except ImportError:
|
| 1586 |
+
return False
|
| 1587 |
+
|
| 1588 |
+
def install(self, progress_callback=None) -> bool:
|
| 1589 |
+
"""Install doctr"""
|
| 1590 |
+
pass
|
| 1591 |
+
|
| 1592 |
+
def load_model(self, **kwargs) -> bool:
|
| 1593 |
+
"""Load doctr model"""
|
| 1594 |
+
try:
|
| 1595 |
+
if not self.is_installed and not self.check_installation():
|
| 1596 |
+
self._log("❌ doctr not installed", "error")
|
| 1597 |
+
return False
|
| 1598 |
+
|
| 1599 |
+
self._log("🔥 Loading DocTR model...")
|
| 1600 |
+
from doctr.models import ocr_predictor
|
| 1601 |
+
|
| 1602 |
+
# Load pretrained model
|
| 1603 |
+
self.model = ocr_predictor(pretrained=True)
|
| 1604 |
+
self.is_loaded = True
|
| 1605 |
+
|
| 1606 |
+
self._log("✅ DocTR model loaded successfully")
|
| 1607 |
+
return True
|
| 1608 |
+
|
| 1609 |
+
except Exception as e:
|
| 1610 |
+
self._log(f"❌ Failed to load doctr: {str(e)}", "error")
|
| 1611 |
+
return False
|
| 1612 |
+
|
| 1613 |
+
def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
|
| 1614 |
+
"""Detect text using doctr"""
|
| 1615 |
+
results = []
|
| 1616 |
+
|
| 1617 |
+
try:
|
| 1618 |
+
if not self.is_loaded:
|
| 1619 |
+
if not self.load_model():
|
| 1620 |
+
return results
|
| 1621 |
+
|
| 1622 |
+
from doctr.io import DocumentFile
|
| 1623 |
+
|
| 1624 |
+
# DocTR expects document format
|
| 1625 |
+
# Convert numpy array to PIL and save temporarily
|
| 1626 |
+
import tempfile
|
| 1627 |
+
import cv2
|
| 1628 |
+
|
| 1629 |
+
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
|
| 1630 |
+
cv2.imwrite(tmp.name, image)
|
| 1631 |
+
doc = DocumentFile.from_images(tmp.name)
|
| 1632 |
+
|
| 1633 |
+
# Run OCR
|
| 1634 |
+
result = self.model(doc)
|
| 1635 |
+
|
| 1636 |
+
# Parse results
|
| 1637 |
+
h, w = image.shape[:2]
|
| 1638 |
+
for page in result.pages:
|
| 1639 |
+
for block in page.blocks:
|
| 1640 |
+
for line in block.lines:
|
| 1641 |
+
for word in line.words:
|
| 1642 |
+
# Handle different geometry formats
|
| 1643 |
+
geometry = word.geometry
|
| 1644 |
+
|
| 1645 |
+
if len(geometry) == 4:
|
| 1646 |
+
# Standard format: (x1, y1, x2, y2)
|
| 1647 |
+
x1, y1, x2, y2 = geometry
|
| 1648 |
+
elif len(geometry) == 2:
|
| 1649 |
+
# Alternative format: ((x1, y1), (x2, y2))
|
| 1650 |
+
(x1, y1), (x2, y2) = geometry
|
| 1651 |
+
else:
|
| 1652 |
+
self._log(f"Unexpected geometry format: {geometry}", "warning")
|
| 1653 |
+
continue
|
| 1654 |
+
|
| 1655 |
+
# Convert relative coordinates to absolute
|
| 1656 |
+
x1, x2 = int(x1 * w), int(x2 * w)
|
| 1657 |
+
y1, y2 = int(y1 * h), int(y2 * h)
|
| 1658 |
+
|
| 1659 |
+
results.append(OCRResult(
|
| 1660 |
+
text=word.value,
|
| 1661 |
+
bbox=(x1, y1, x2 - x1, y2 - y1),
|
| 1662 |
+
confidence=word.confidence,
|
| 1663 |
+
vertices=[(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
|
| 1664 |
+
))
|
| 1665 |
+
|
| 1666 |
+
# Clean up temp file
|
| 1667 |
+
try:
|
| 1668 |
+
os.unlink(tmp.name)
|
| 1669 |
+
except:
|
| 1670 |
+
pass
|
| 1671 |
+
|
| 1672 |
+
self._log(f"DocTR detected {len(results)} text regions")
|
| 1673 |
+
|
| 1674 |
+
except Exception as e:
|
| 1675 |
+
self._log(f"Error in doctr detection: {str(e)}", "error")
|
| 1676 |
+
import traceback
|
| 1677 |
+
self._log(traceback.format_exc(), "error")
|
| 1678 |
+
|
| 1679 |
+
return results
|
| 1680 |
+
|
| 1681 |
+
|
| 1682 |
+
class RapidOCRProvider(OCRProvider):
|
| 1683 |
+
"""RapidOCR provider for fast local OCR"""
|
| 1684 |
+
|
| 1685 |
+
def check_installation(self) -> bool:
|
| 1686 |
+
"""Check if rapidocr is installed"""
|
| 1687 |
+
try:
|
| 1688 |
+
import rapidocr_onnxruntime
|
| 1689 |
+
self.is_installed = True
|
| 1690 |
+
return True
|
| 1691 |
+
except ImportError:
|
| 1692 |
+
return False
|
| 1693 |
+
|
| 1694 |
+
def install(self, progress_callback=None) -> bool:
|
| 1695 |
+
"""Install rapidocr (requires manual pip install)"""
|
| 1696 |
+
# RapidOCR requires manual installation
|
| 1697 |
+
if progress_callback:
|
| 1698 |
+
progress_callback("RapidOCR requires manual pip installation")
|
| 1699 |
+
self._log("Run: pip install rapidocr-onnxruntime", "info")
|
| 1700 |
+
return False # Always return False since we can't auto-install
|
| 1701 |
+
|
| 1702 |
+
def load_model(self, **kwargs) -> bool:
|
| 1703 |
+
"""Load RapidOCR model"""
|
| 1704 |
+
try:
|
| 1705 |
+
if not self.is_installed and not self.check_installation():
|
| 1706 |
+
self._log("RapidOCR not installed", "error")
|
| 1707 |
+
return False
|
| 1708 |
+
|
| 1709 |
+
self._log("Loading RapidOCR...")
|
| 1710 |
+
from rapidocr_onnxruntime import RapidOCR
|
| 1711 |
+
|
| 1712 |
+
self.model = RapidOCR()
|
| 1713 |
+
self.is_loaded = True
|
| 1714 |
+
|
| 1715 |
+
self._log("RapidOCR model loaded successfully")
|
| 1716 |
+
return True
|
| 1717 |
+
|
| 1718 |
+
except Exception as e:
|
| 1719 |
+
self._log(f"Failed to load RapidOCR: {str(e)}", "error")
|
| 1720 |
+
return False
|
| 1721 |
+
|
| 1722 |
+
def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
|
| 1723 |
+
"""Detect text using RapidOCR"""
|
| 1724 |
+
if not self.is_loaded:
|
| 1725 |
+
self._log("RapidOCR model not loaded", "error")
|
| 1726 |
+
return []
|
| 1727 |
+
|
| 1728 |
+
results = []
|
| 1729 |
+
|
| 1730 |
+
try:
|
| 1731 |
+
# Convert numpy array to PIL Image for RapidOCR
|
| 1732 |
+
if len(image.shape) == 3:
|
| 1733 |
+
# BGR to RGB
|
| 1734 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 1735 |
+
else:
|
| 1736 |
+
image_rgb = image
|
| 1737 |
+
|
| 1738 |
+
# RapidOCR expects PIL Image or numpy array
|
| 1739 |
+
ocr_results, _ = self.model(image_rgb)
|
| 1740 |
+
|
| 1741 |
+
if ocr_results:
|
| 1742 |
+
for result in ocr_results:
|
| 1743 |
+
# RapidOCR returns [bbox, text, confidence]
|
| 1744 |
+
bbox_points = result[0] # 4 corner points
|
| 1745 |
+
text = result[1]
|
| 1746 |
+
confidence = float(result[2])
|
| 1747 |
+
|
| 1748 |
+
if not text or not text.strip():
|
| 1749 |
+
continue
|
| 1750 |
+
|
| 1751 |
+
# Convert 4-point bbox to x,y,w,h format
|
| 1752 |
+
xs = [point[0] for point in bbox_points]
|
| 1753 |
+
ys = [point[1] for point in bbox_points]
|
| 1754 |
+
x_min, x_max = min(xs), max(xs)
|
| 1755 |
+
y_min, y_max = min(ys), max(ys)
|
| 1756 |
+
|
| 1757 |
+
results.append(OCRResult(
|
| 1758 |
+
text=text.strip(),
|
| 1759 |
+
bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
|
| 1760 |
+
confidence=confidence,
|
| 1761 |
+
vertices=[(int(p[0]), int(p[1])) for p in bbox_points]
|
| 1762 |
+
))
|
| 1763 |
+
|
| 1764 |
+
self._log(f"Detected {len(results)} text regions")
|
| 1765 |
+
|
| 1766 |
+
except Exception as e:
|
| 1767 |
+
self._log(f"Error in RapidOCR detection: {str(e)}", "error")
|
| 1768 |
+
|
| 1769 |
+
return results
|
| 1770 |
+
|
| 1771 |
+
class OCRManager:
|
| 1772 |
+
"""Manager for multiple OCR providers"""
|
| 1773 |
+
|
| 1774 |
+
def __init__(self, log_callback=None):
|
| 1775 |
+
self.log_callback = log_callback
|
| 1776 |
+
self.providers = {
|
| 1777 |
+
'custom-api': CustomAPIProvider(log_callback),
|
| 1778 |
+
'manga-ocr': MangaOCRProvider(log_callback),
|
| 1779 |
+
'easyocr': EasyOCRProvider(log_callback),
|
| 1780 |
+
'paddleocr': PaddleOCRProvider(log_callback),
|
| 1781 |
+
'doctr': DocTROCRProvider(log_callback),
|
| 1782 |
+
'rapidocr': RapidOCRProvider(log_callback),
|
| 1783 |
+
'Qwen2-VL': Qwen2VL(log_callback)
|
| 1784 |
+
}
|
| 1785 |
+
self.current_provider = None
|
| 1786 |
+
self.stop_flag = None
|
| 1787 |
+
|
| 1788 |
+
def get_provider(self, name: str) -> Optional[OCRProvider]:
|
| 1789 |
+
"""Get OCR provider by name"""
|
| 1790 |
+
return self.providers.get(name)
|
| 1791 |
+
|
| 1792 |
+
def set_current_provider(self, name: str):
|
| 1793 |
+
"""Set current active provider"""
|
| 1794 |
+
if name in self.providers:
|
| 1795 |
+
self.current_provider = name
|
| 1796 |
+
return True
|
| 1797 |
+
return False
|
| 1798 |
+
|
| 1799 |
+
def check_provider_status(self, name: str) -> Dict[str, bool]:
|
| 1800 |
+
"""Check installation and loading status of provider"""
|
| 1801 |
+
provider = self.providers.get(name)
|
| 1802 |
+
if not provider:
|
| 1803 |
+
return {'installed': False, 'loaded': False}
|
| 1804 |
+
|
| 1805 |
+
result = {
|
| 1806 |
+
'installed': provider.check_installation(),
|
| 1807 |
+
'loaded': provider.is_loaded
|
| 1808 |
+
}
|
| 1809 |
+
if self.log_callback:
|
| 1810 |
+
self.log_callback(f"DEBUG: check_provider_status({name}) returning loaded={result['loaded']}", "debug")
|
| 1811 |
+
return result
|
| 1812 |
+
|
| 1813 |
+
def install_provider(self, name: str, progress_callback=None) -> bool:
|
| 1814 |
+
"""Install a provider"""
|
| 1815 |
+
provider = self.providers.get(name)
|
| 1816 |
+
if not provider:
|
| 1817 |
+
return False
|
| 1818 |
+
|
| 1819 |
+
return provider.install(progress_callback)
|
| 1820 |
+
|
| 1821 |
+
def load_provider(self, name: str, **kwargs) -> bool:
|
| 1822 |
+
"""Load a provider's model with optional parameters"""
|
| 1823 |
+
provider = self.providers.get(name)
|
| 1824 |
+
if not provider:
|
| 1825 |
+
return False
|
| 1826 |
+
|
| 1827 |
+
return provider.load_model(**kwargs) # <-- Passes model_size and any other kwargs
|
| 1828 |
+
|
| 1829 |
+
def shutdown(self):
|
| 1830 |
+
"""Release models/processors/tokenizers for all providers and clear caches."""
|
| 1831 |
+
try:
|
| 1832 |
+
import gc
|
| 1833 |
+
for name, provider in list(self.providers.items()):
|
| 1834 |
+
try:
|
| 1835 |
+
if hasattr(provider, 'model'):
|
| 1836 |
+
provider.model = None
|
| 1837 |
+
if hasattr(provider, 'processor'):
|
| 1838 |
+
provider.processor = None
|
| 1839 |
+
if hasattr(provider, 'tokenizer'):
|
| 1840 |
+
provider.tokenizer = None
|
| 1841 |
+
if hasattr(provider, 'reader'):
|
| 1842 |
+
provider.reader = None
|
| 1843 |
+
if hasattr(provider, 'is_loaded'):
|
| 1844 |
+
provider.is_loaded = False
|
| 1845 |
+
except Exception:
|
| 1846 |
+
pass
|
| 1847 |
+
gc.collect()
|
| 1848 |
+
try:
|
| 1849 |
+
import torch
|
| 1850 |
+
torch.cuda.empty_cache()
|
| 1851 |
+
except Exception:
|
| 1852 |
+
pass
|
| 1853 |
+
except Exception:
|
| 1854 |
+
pass
|
| 1855 |
+
|
| 1856 |
+
def detect_text(self, image: np.ndarray, provider_name: str = None, **kwargs) -> List[OCRResult]:
|
| 1857 |
+
"""Detect text using specified or current provider"""
|
| 1858 |
+
provider_name = provider_name or self.current_provider
|
| 1859 |
+
if not provider_name:
|
| 1860 |
+
return []
|
| 1861 |
+
|
| 1862 |
+
provider = self.providers.get(provider_name)
|
| 1863 |
+
if not provider:
|
| 1864 |
+
return []
|
| 1865 |
+
|
| 1866 |
+
return provider.detect_text(image, **kwargs)
|
| 1867 |
+
|
| 1868 |
+
def set_stop_flag(self, stop_flag):
|
| 1869 |
+
"""Set stop flag for all providers"""
|
| 1870 |
+
self.stop_flag = stop_flag
|
| 1871 |
+
for provider in self.providers.values():
|
| 1872 |
+
if hasattr(provider, 'set_stop_flag'):
|
| 1873 |
+
provider.set_stop_flag(stop_flag)
|
| 1874 |
+
|
| 1875 |
+
def reset_stop_flags(self):
|
| 1876 |
+
"""Reset stop flags for all providers"""
|
| 1877 |
+
for provider in self.providers.values():
|
| 1878 |
+
if hasattr(provider, 'reset_stop_flags'):
|
| 1879 |
+
provider.reset_stop_flags()
|
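
A minimal usage sketch (not part of any uploaded file) for the OCRManager defined above. It assumes ocr_manager.py exposes OCRManager, that OpenCV is available, and that 'page.png' is a placeholder image path:

import cv2
from ocr_manager import OCRManager

# log_callback receives (message, level), matching how the providers above call it
manager = OCRManager(log_callback=lambda msg, level="info": print(f"[{level}] {msg}"))

status = manager.check_provider_status('easyocr')
if not status['installed']:
    print("easyocr is not installed; run: pip install easyocr")
else:
    if manager.load_provider('easyocr'):
        manager.set_current_provider('easyocr')
        image = cv2.imread('page.png')  # placeholder path; detect_text() expects a BGR ndarray
        for result in manager.detect_text(image):
            x, y, w, h = result.bbox
            print(f"{result.text!r} at ({x}, {y}, {w}, {h}) conf={result.confidence:.2f}")
    manager.shutdown()  # release models/processors and clear the CUDA cache
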
scan_html_folder.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
splash_utils.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
| 1 |
+
# splash_utils.py
|
| 2 |
+
import time
|
| 3 |
+
import atexit
|
| 4 |
+
|
| 5 |
+
class SplashManager:
|
| 6 |
+
"""Simple splash screen manager that works with main thread"""
|
| 7 |
+
|
| 8 |
+
def __init__(self):
|
| 9 |
+
self.splash_window = None
|
| 10 |
+
self._status_text = "Initializing..."
|
| 11 |
+
self.progress_value = 0 # Track actual progress 0-100
|
| 12 |
+
self.canvas_width = 320 # Progress bar dimensions (increased from 300)
|
| 13 |
+
self.canvas_height = 36 # Increased from 30
|
| 14 |
+
self._after_id = None
|
| 15 |
+
|
| 16 |
+
def start_splash(self):
|
| 17 |
+
"""Create splash window on main thread"""
|
| 18 |
+
try:
|
| 19 |
+
import tkinter as tk
|
| 20 |
+
|
| 21 |
+
print("🎨 Starting splash screen...")
|
| 22 |
+
|
| 23 |
+
# Create splash window on main thread
|
| 24 |
+
self.splash_window = tk.Tk()
|
| 25 |
+
self.splash_window.title("Loading Glossarion...")
|
| 26 |
+
self.splash_window.geometry("450x350")
|
| 27 |
+
self.splash_window.configure(bg='#2b2b2b')
|
| 28 |
+
self.splash_window.resizable(False, False)
|
| 29 |
+
self.splash_window.overrideredirect(True)
|
| 30 |
+
|
| 31 |
+
# Center the window
|
| 32 |
+
self.splash_window.update_idletasks()
|
| 33 |
+
x = (self.splash_window.winfo_screenwidth() // 2) - 225
|
| 34 |
+
y = (self.splash_window.winfo_screenheight() // 2) - 175
|
| 35 |
+
self.splash_window.geometry(f"450x350+{x}+{y}")
|
| 36 |
+
|
| 37 |
+
# Add content
|
| 38 |
+
main_frame = tk.Frame(self.splash_window, bg='#2b2b2b', relief='raised', bd=2)
|
| 39 |
+
main_frame.pack(fill='both', expand=True, padx=2, pady=2)
|
| 40 |
+
|
| 41 |
+
# Load the actual Halgakos.ico icon
|
| 42 |
+
self._load_icon(main_frame)
|
| 43 |
+
|
| 44 |
+
# Title
|
| 45 |
+
title_label = tk.Label(main_frame, text="Glossarion v4.8.5",
|
| 46 |
+
bg='#2b2b2b', fg='#4a9eff', font=('Arial', 20, 'bold'))
|
| 47 |
+
title_label.pack(pady=(10, 5))
|
| 48 |
+
|
| 49 |
+
# Subtitle
|
| 50 |
+
subtitle_label = tk.Label(main_frame, text="Advanced AI Translation Suite",
|
| 51 |
+
bg='#2b2b2b', fg='#cccccc', font=('Arial', 12))
|
| 52 |
+
subtitle_label.pack(pady=(0, 15))
|
| 53 |
+
|
| 54 |
+
# Status
|
| 55 |
+
self.status_label = tk.Label(main_frame, text=self._status_text,
|
| 56 |
+
bg='#2b2b2b', fg='#ffffff', font=('Arial', 11))
|
| 57 |
+
self.status_label.pack(pady=(10, 10))
|
| 58 |
+
|
| 59 |
+
# Progress bar container
|
| 60 |
+
progress_frame = tk.Frame(main_frame, bg='#2b2b2b')
|
| 61 |
+
progress_frame.pack(pady=(5, 15)) # Adjusted padding for larger bar
|
| 62 |
+
|
| 63 |
+
# Progress bar background
|
| 64 |
+
self.progress_bg = tk.Canvas(progress_frame, width=self.canvas_width, height=self.canvas_height,
|
| 65 |
+
bg='#2b2b2b', highlightthickness=0)
|
| 66 |
+
self.progress_bg.pack()
|
| 67 |
+
|
| 68 |
+
# Create border
|
| 69 |
+
self.progress_bg.create_rectangle(1, 1, self.canvas_width-1, self.canvas_height-1,
|
| 70 |
+
outline='#666666', width=2)
|
| 71 |
+
|
| 72 |
+
# Create background
|
| 73 |
+
self.progress_bg.create_rectangle(3, 3, self.canvas_width-3, self.canvas_height-3,
|
| 74 |
+
fill='#1a1a1a', outline='')
|
| 75 |
+
|
| 76 |
+
# Progress bar fill (will be updated)
|
| 77 |
+
self.progress_fill = None
|
| 78 |
+
|
| 79 |
+
# Progress percentage text - moved up and with better font
|
| 80 |
+
text_x = self.canvas_width // 2 # 160 for 320px width
|
| 81 |
+
text_y = 13.5 # Positioned slightly above center for visual balance
|
| 82 |
+
|
| 83 |
+
# Use a cleaner, more modern font
|
| 84 |
+
progress_font = ('Montserrat', 12, 'bold') # Increased size to 12
|
| 85 |
+
|
| 86 |
+
# Create outline for better readability
|
| 87 |
+
for dx in [-1, 0, 1]:
|
| 88 |
+
for dy in [-1, 0, 1]:
|
| 89 |
+
if dx != 0 or dy != 0:
|
| 90 |
+
self.progress_bg.create_text(text_x + dx, text_y + dy, text="0%",
|
| 91 |
+
fill='#000000', font=progress_font,
|
| 92 |
+
tags="outline", anchor='center')
|
| 93 |
+
|
| 94 |
+
# Main text on top (white)
|
| 95 |
+
self.progress_text = self.progress_bg.create_text(text_x, text_y, text="0%",
|
| 96 |
+
fill='#ffffff', font=progress_font,
|
| 97 |
+
anchor='center')
|
| 98 |
+
|
| 99 |
+
# Version info
|
| 100 |
+
version_label = tk.Label(main_frame, text="Starting up...",
|
| 101 |
+
bg='#2b2b2b', fg='#888888', font=('Arial', 9))
|
| 102 |
+
version_label.pack(side='bottom', pady=(0, 15))
|
| 103 |
+
|
| 104 |
+
# Start progress animation
|
| 105 |
+
self._animate_progress()
|
| 106 |
+
|
| 107 |
+
# Update the display
|
| 108 |
+
self.splash_window.update()
|
| 109 |
+
|
| 110 |
+
# Register cleanup
|
| 111 |
+
atexit.register(self.close_splash)
|
| 112 |
+
return True
|
| 113 |
+
|
| 114 |
+
except Exception as e:
|
| 115 |
+
print(f"⚠️ Could not start splash: {e}")
|
| 116 |
+
return False
|
| 117 |
+
|
| 118 |
+
def _load_icon(self, parent):
|
| 119 |
+
"""Load the Halgakos.ico icon"""
|
| 120 |
+
try:
|
| 121 |
+
# Get icon path - handle both development and packaged modes
|
| 122 |
+
import os
|
| 123 |
+
import sys
|
| 124 |
+
import tkinter as tk
|
| 125 |
+
|
| 126 |
+
if getattr(sys, 'frozen', False):
|
| 127 |
+
# Running as .exe
|
| 128 |
+
base_dir = sys._MEIPASS
|
| 129 |
+
else:
|
| 130 |
+
# Running as .py files
|
| 131 |
+
base_dir = os.path.dirname(os.path.abspath(__file__))
|
| 132 |
+
|
| 133 |
+
ico_path = os.path.join(base_dir, 'Halgakos.ico')
|
| 134 |
+
|
| 135 |
+
if os.path.isfile(ico_path):
|
| 136 |
+
try:
|
| 137 |
+
# Try PIL first for better quality
|
| 138 |
+
from PIL import Image, ImageTk
|
| 139 |
+
pil_image = Image.open(ico_path)
|
| 140 |
+
pil_image = pil_image.resize((128, 128), Image.Resampling.LANCZOS)
|
| 141 |
+
icon_photo = ImageTk.PhotoImage(pil_image, master=self.splash_window)
|
| 142 |
+
icon_label = tk.Label(parent, image=icon_photo, bg='#2b2b2b')
|
| 143 |
+
icon_label.image = icon_photo # Keep reference
|
| 144 |
+
icon_label.pack(pady=(20, 10))
|
| 145 |
+
return
|
| 146 |
+
except ImportError:
|
| 147 |
+
# Fallback to basic tkinter
|
| 148 |
+
try:
|
| 149 |
+
icon_image = tk.PhotoImage(file=ico_path)
|
| 150 |
+
icon_label = tk.Label(parent, image=icon_image, bg='#2b2b2b')
|
| 151 |
+
icon_label.image = icon_image
|
| 152 |
+
icon_label.pack(pady=(20, 10))
|
| 153 |
+
return
|
| 154 |
+
except tk.TclError:
|
| 155 |
+
pass
|
| 156 |
+
except Exception:
|
| 157 |
+
pass
|
| 158 |
+
|
| 159 |
+
# Fallback emoji if icon loading fails
|
| 160 |
+
import tkinter as tk
|
| 161 |
+
icon_frame = tk.Frame(parent, bg='#4a9eff', width=128, height=128)
|
| 162 |
+
icon_frame.pack(pady=(20, 10))
|
| 163 |
+
icon_frame.pack_propagate(False)
|
| 164 |
+
|
| 165 |
+
icon_label = tk.Label(icon_frame, text="📚", font=('Arial', 64),
|
| 166 |
+
bg='#4a9eff', fg='white')
|
| 167 |
+
icon_label.pack(expand=True)
|
| 168 |
+
|
| 169 |
+
def _animate_progress(self):
|
| 170 |
+
"""Animate progress bar filling up"""
|
| 171 |
+
# Cancel any existing after callback first
|
| 172 |
+
if self._after_id:
|
| 173 |
+
try:
|
| 174 |
+
self.splash_window.after_cancel(self._after_id)
|
| 175 |
+
except:
|
| 176 |
+
pass
|
| 177 |
+
self._after_id = None
|
| 178 |
+
|
| 179 |
+
if self.splash_window and self.splash_window.winfo_exists():
|
| 180 |
+
try:
|
| 181 |
+
# Auto-increment progress for visual effect during startup
|
| 182 |
+
if self.progress_value < 100:
|
| 183 |
+
# Increment at different rates for different phases
|
| 184 |
+
if self.progress_value < 30:
|
| 185 |
+
self.progress_value += 8 # Fast initial progress
|
| 186 |
+
elif self.progress_value < 70:
|
| 187 |
+
self.progress_value += 4 # Medium progress
|
| 188 |
+
elif self.progress_value < 90:
|
| 189 |
+
self.progress_value += 2 # Slow progress
|
| 190 |
+
else:
|
| 191 |
+
self.progress_value += 1 # Very slow final progress
|
| 192 |
+
|
| 193 |
+
# Cap at 99% until explicitly set to 100%
|
| 194 |
+
if self.progress_value >= 99:
|
| 195 |
+
self.progress_value = 99
|
| 196 |
+
|
| 197 |
+
# Update progress bar fill
|
| 198 |
+
if self.progress_fill:
|
| 199 |
+
self.progress_bg.delete(self.progress_fill)
|
| 200 |
+
# Also delete old highlight
|
| 201 |
+
self.progress_bg.delete("highlight")
|
| 202 |
+
|
| 203 |
+
# Calculate fill width (3 to canvas_width-3)
|
| 204 |
+
fill_width = int((self.progress_value / 100) * (self.canvas_width - 6)) # -6 for borders
|
| 205 |
+
if fill_width > 0:
|
| 206 |
+
# Create gradient effect
|
| 207 |
+
self.progress_fill = self.progress_bg.create_rectangle(
|
| 208 |
+
3, 3, 3 + fill_width, self.canvas_height - 3,
|
| 209 |
+
fill='#4a9eff', outline=''
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
# Add a highlight effect (adjusted for new height)
|
| 213 |
+
if fill_width > 10:
|
| 214 |
+
self.progress_bg.create_rectangle(
|
| 215 |
+
3, 3, min(13, 3 + fill_width), 12,
|
| 216 |
+
fill='#6bb6ff', outline='', tags="highlight"
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
# Update percentage text without changing position
|
| 220 |
+
percent_text = f"{self.progress_value}%"
|
| 221 |
+
|
| 222 |
+
# Update main text
|
| 223 |
+
self.progress_bg.itemconfig(self.progress_text, text=percent_text)
|
| 224 |
+
|
| 225 |
+
# Update all outline layers
|
| 226 |
+
for item in self.progress_bg.find_withtag("outline"):
|
| 227 |
+
self.progress_bg.itemconfig(item, text=percent_text)
|
| 228 |
+
|
| 229 |
+
# Ensure text stays on top of progress fill
|
| 230 |
+
self.progress_bg.tag_raise("outline")
|
| 231 |
+
self.progress_bg.tag_raise(self.progress_text)
|
| 232 |
+
|
| 233 |
+
# Store the after ID so we can cancel it later
|
| 234 |
+
self._after_id = self.splash_window.after(100, self._animate_progress)
|
| 235 |
+
|
| 236 |
+
except Exception:
|
| 237 |
+
self._after_id = None
|
| 238 |
+
pass
|
| 239 |
+
|
| 240 |
+
def update_status(self, message):
|
| 241 |
+
"""Update splash status and progress with enhanced module loading support"""
|
| 242 |
+
self._status_text = message
|
| 243 |
+
try:
|
| 244 |
+
if self.splash_window and hasattr(self, 'status_label'):
|
| 245 |
+
self.status_label.config(text=message)
|
| 246 |
+
|
| 247 |
+
# Enhanced progress mapping starting module loading at 10%
|
| 248 |
+
progress_map = {
|
| 249 |
+
"Loading theme framework...": 5,
|
| 250 |
+
"Loading UI framework...": 8,
|
| 251 |
+
|
| 252 |
+
# Module loading phase - starts at 10% and goes to 85%
|
| 253 |
+
"Loading translation modules...": 10,
|
| 254 |
+
"Initializing module system...": 15,
|
| 255 |
+
"Loading translation engine...": 20,
|
| 256 |
+
"Validating translation engine...": 30,
|
| 257 |
+
"✅ translation engine loaded": 40,
|
| 258 |
+
"Loading glossary extractor...": 45,
|
| 259 |
+
"Validating glossary extractor...": 55,
|
| 260 |
+
"✅ glossary extractor loaded": 65,
|
| 261 |
+
"Loading EPUB converter...": 70,
|
| 262 |
+
"✅ EPUB converter loaded": 75,
|
| 263 |
+
"Loading QA scanner...": 78,
|
| 264 |
+
"✅ QA scanner loaded": 82,
|
| 265 |
+
"Finalizing module initialization...": 85,
|
| 266 |
+
"✅ All modules loaded successfully": 88,
|
| 267 |
+
|
| 268 |
+
"Creating main window...": 92,
|
| 269 |
+
"Ready!": 100
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
# Check for exact matches first
|
| 273 |
+
if message in progress_map:
|
| 274 |
+
self.set_progress(progress_map[message])
|
| 275 |
+
else:
|
| 276 |
+
# Check for partial matches
|
| 277 |
+
for key, value in progress_map.items():
|
| 278 |
+
if key in message:
|
| 279 |
+
self.set_progress(value)
|
| 280 |
+
break
|
| 281 |
+
|
| 282 |
+
self.splash_window.update()
|
| 283 |
+
except:
|
| 284 |
+
pass
|
| 285 |
+
|
| 286 |
+
def set_progress(self, value):
|
| 287 |
+
"""Manually set progress value (0-100)"""
|
| 288 |
+
self.progress_value = max(0, min(100, value))
|
| 289 |
+
|
| 290 |
+
def close_splash(self):
|
| 291 |
+
"""Close the splash screen with proper text visibility"""
|
| 292 |
+
try:
|
| 293 |
+
# IMPORTANT: Cancel the animation first
|
| 294 |
+
if self._after_id and self.splash_window:
|
| 295 |
+
try:
|
| 296 |
+
self.splash_window.after_cancel(self._after_id)
|
| 297 |
+
except:
|
| 298 |
+
pass
|
| 299 |
+
self._after_id = None
|
| 300 |
+
|
| 301 |
+
if self.splash_window and self.splash_window.winfo_exists():
|
| 302 |
+
# Set to 100% and ensure text is visible
|
| 303 |
+
self.progress_value = 100
|
| 304 |
+
|
| 305 |
+
# Update display one last time without scheduling another callback
|
| 306 |
+
if hasattr(self, 'progress_fill') and self.progress_fill:
|
| 307 |
+
self.progress_bg.delete(self.progress_fill)
|
| 308 |
+
self.progress_bg.delete("highlight")
|
| 309 |
+
|
| 310 |
+
# Create the 100% progress bar (but leave space for text)
|
| 311 |
+
fill_width = int((self.progress_value / 100) * (self.canvas_width - 6))
|
| 312 |
+
if fill_width > 0:
|
| 313 |
+
# Create progress fill that doesn't cover the text area
|
| 314 |
+
self.progress_fill = self.progress_bg.create_rectangle(
|
| 315 |
+
3, 3, 3 + fill_width, self.canvas_height - 3,
|
| 316 |
+
fill='#4a9eff', outline=''
|
| 317 |
+
)
|
| 318 |
+
|
| 319 |
+
# Add highlight effect
|
| 320 |
+
if fill_width > 10:
|
| 321 |
+
self.progress_bg.create_rectangle(
|
| 322 |
+
3, 3, min(13, 3 + fill_width), 12,
|
| 323 |
+
fill='#6bb6ff', outline='', tags="highlight"
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
# CRITICAL: Make sure text stays on top and is visible
|
| 327 |
+
if hasattr(self, 'progress_text'):
|
| 328 |
+
self.progress_bg.itemconfig(self.progress_text, text="100%", fill='#ffffff')
|
| 329 |
+
|
| 330 |
+
# Update all outline layers for better visibility
|
| 331 |
+
for item in self.progress_bg.find_withtag("outline"):
|
| 332 |
+
self.progress_bg.itemconfig(item, text="100%", fill='#000000')
|
| 333 |
+
|
| 334 |
+
# Ensure text layers are on top of progress fill
|
| 335 |
+
self.progress_bg.tag_raise("outline")
|
| 336 |
+
if hasattr(self, 'progress_text'):
|
| 337 |
+
self.progress_bg.tag_raise(self.progress_text)
|
| 338 |
+
|
| 339 |
+
self.splash_window.update()
|
| 340 |
+
time.sleep(0.1)
|
| 341 |
+
|
| 342 |
+
self.splash_window.destroy()
|
| 343 |
+
self.splash_window = None
|
| 344 |
+
except:
|
| 345 |
+
# Ensure cleanup even on error
|
| 346 |
+
self._after_id = None
|
| 347 |
+
self.splash_window = None
|
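
A usage sketch (not part of the uploaded file) of the SplashManager lifecycle during startup; the status strings come from the progress_map keys in update_status():

from splash_utils import SplashManager

splash = SplashManager()
if splash.start_splash():
    splash.update_status("Loading UI framework...")
    # ... import heavy modules here ...
    splash.update_status("Loading translation modules...")
    splash.update_status("✅ All modules loaded successfully")
    splash.update_status("Creating main window...")
    splash.set_progress(100)
    splash.close_splash()
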
tqdm_safety.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
| 1 |
+
# tqdm_safety.py
|
| 2 |
+
"""
|
| 3 |
+
A defensive patch for tqdm to prevent AttributeError at interpreter shutdown:
|
| 4 |
+
AttributeError: type object 'tqdm' has no attribute '_lock'
|
| 5 |
+
|
| 6 |
+
Root cause
|
| 7 |
+
- During interpreter shutdown, module globals/class attributes may be cleared before tqdm.__del__ runs.
|
| 8 |
+
- tqdm.close() calls a class method that uses cls._lock; if it's already deleted, AttributeError is raised.
|
| 9 |
+
|
| 10 |
+
Fix
|
| 11 |
+
- Ensure a class-level _lock exists and is a threading.RLock().
|
| 12 |
+
- Wrap __del__ and close() to guard against shutdown-time attribute loss.
|
| 13 |
+
- No-ops if core attributes are missing, preserving normal behavior during runtime.
|
| 14 |
+
|
| 15 |
+
This keeps tqdm enabled and visible; it only avoids the noisy traceback on exit.
|
| 16 |
+
"""
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import threading
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def apply_tqdm_safety_patch() -> None:
|
| 23 |
+
try:
|
| 24 |
+
import tqdm as _tqdm_mod
|
| 25 |
+
# Prefer the tqdm.tqdm class
|
| 26 |
+
tqdm_cls = getattr(_tqdm_mod, 'tqdm', None)
|
| 27 |
+
if tqdm_cls is None:
|
| 28 |
+
# Some variants might expose TqdmExperimentalWarning only; bail quietly
|
| 29 |
+
return
|
| 30 |
+
|
| 31 |
+
# Ensure a class-level lock exists
|
| 32 |
+
if not hasattr(tqdm_cls, '_lock') or getattr(tqdm_cls, '_lock') is None:
|
| 33 |
+
try:
|
| 34 |
+
tqdm_cls._lock = threading.RLock()
|
| 35 |
+
except Exception:
|
| 36 |
+
# As last resort, set a dummy object with context manager protocol
|
| 37 |
+
class _DummyLock:
|
| 38 |
+
def __enter__(self):
|
| 39 |
+
return self
|
| 40 |
+
def __exit__(self, exc_type, exc, tb):
|
| 41 |
+
return False
|
| 42 |
+
tqdm_cls._lock = _DummyLock()
|
| 43 |
+
|
| 44 |
+
# Patch the class method used during close to guard missing attributes
|
| 45 |
+
_orig_decr = getattr(tqdm_cls, '_decr_instances', None)
|
| 46 |
+
if callable(_orig_decr):
|
| 47 |
+
def _safe_decr_instances(*args, **kwargs):
|
| 48 |
+
try:
|
| 49 |
+
# cls._lock might be gone at shutdown
|
| 50 |
+
if not hasattr(tqdm_cls, '_lock') or tqdm_cls._lock is None:
|
| 51 |
+
return
|
| 52 |
+
return _orig_decr(*args, **kwargs)
|
| 53 |
+
except Exception:
|
| 54 |
+
# Swallow shutdown-time errors only
|
| 55 |
+
return
|
| 56 |
+
try:
|
| 57 |
+
_safe_decr_instances.__name__ = _orig_decr.__name__
|
| 58 |
+
except Exception:
|
| 59 |
+
pass
|
| 60 |
+
setattr(tqdm_cls, '_decr_instances', staticmethod(_safe_decr_instances))
|
| 61 |
+
|
| 62 |
+
# Wrap instance .close() to be defensive
|
| 63 |
+
_orig_close = getattr(tqdm_cls, 'close', None)
|
| 64 |
+
if callable(_orig_close):
|
| 65 |
+
def _safe_close(self, *args, **kwargs):
|
| 66 |
+
try:
|
| 67 |
+
return _orig_close(self, *args, **kwargs)
|
| 68 |
+
except AttributeError:
|
| 69 |
+
# Happens if class attrs are missing at shutdown
|
| 70 |
+
return
|
| 71 |
+
except Exception:
|
| 72 |
+
# Avoid raising during shutdown
|
| 73 |
+
try:
|
| 74 |
+
# Best effort: clear display without relying on internals
|
| 75 |
+
fp = getattr(self, 'fp', None)
|
| 76 |
+
if fp and hasattr(fp, 'flush'):
|
| 77 |
+
fp.flush()
|
| 78 |
+
except Exception:
|
| 79 |
+
pass
|
| 80 |
+
return
|
| 81 |
+
setattr(tqdm_cls, 'close', _safe_close)
|
| 82 |
+
|
| 83 |
+
# Wrap destructor to ignore shutdown-time errors
|
| 84 |
+
_orig_del = getattr(tqdm_cls, '__del__', None)
|
| 85 |
+
if callable(_orig_del):
|
| 86 |
+
def _safe_del(self):
|
| 87 |
+
try:
|
| 88 |
+
_orig_del(self)
|
| 89 |
+
except Exception:
|
| 90 |
+
# Ignore any errors during interpreter shutdown
|
| 91 |
+
return
|
| 92 |
+
setattr(tqdm_cls, '__del__', _safe_del)
|
| 93 |
+
|
| 94 |
+
except Exception:
|
| 95 |
+
# Never let the safety patch break startup
|
| 96 |
+
return
|
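
A usage sketch (not part of the uploaded file): apply the patch once, early in startup, before any tqdm bars are created, so close()/__del__ are already guarded at interpreter shutdown:

from tqdm_safety import apply_tqdm_safety_patch

apply_tqdm_safety_patch()

from tqdm import tqdm
for _ in tqdm(range(3), desc="demo"):
    pass
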
translator_gui.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
txt_processor.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
# txt_processor.py
import os
import re
import json
from typing import List, Tuple, Dict
from bs4 import BeautifulSoup
from chapter_splitter import ChapterSplitter
from decimal import Decimal
import hashlib

class TextFileProcessor:
    """Process plain text files for translation"""

    def __init__(self, file_path: str, output_dir: str):
        self.file_path = file_path
        self.output_dir = output_dir
        self.file_base = os.path.splitext(os.path.basename(file_path))[0]

        # Initialize chapter splitter
        model_name = os.getenv("MODEL", "gpt-3.5-turbo")
        self.chapter_splitter = ChapterSplitter(model_name=model_name)

    def extract_chapters(self) -> List[Dict]:
        """Extract chapters from text file"""
        with open(self.file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # First, detect chapters in the content
        raw_chapters = self._detect_chapters(content)

        # Then, process each chapter for splitting if needed
        final_chapters = self._process_chapters_for_splitting(raw_chapters)

        print(f"📚 Extracted {len(final_chapters)} total chunks from {len(raw_chapters)} detected chapters")
        return final_chapters

    def _detect_chapters(self, content: str) -> List[Dict]:
        """Detect chapter boundaries in the text"""
        chapters = []

        # Chapter detection patterns
        chapter_patterns = [
            # English patterns
            (r'^Chapter\s+(\d+).*$', 'chapter'),
            (r'^CHAPTER\s+(\d+).*$', 'chapter'),
            (r'^Ch\.\s*(\d+).*$', 'chapter'),
            # Numbered sections
            (r'^(\d+)\.\s+(.*)$', 'numbered'),
            (r'^Part\s+(\d+).*$', 'part'),
            # Scene breaks (these don't have numbers)
            (r'^\*\s*\*\s*\*.*$', 'break'),
            (r'^---+.*$', 'break'),
            (r'^===+.*$', 'break'),
        ]

        # Find all chapter markers and their positions
        chapter_breaks = []
        lines = content.split('\n')

        for line_num, line in enumerate(lines):
            for pattern, pattern_type in chapter_patterns:
                match = re.match(pattern, line.strip())
                if match:
                    chapter_breaks.append({
                        'line_num': line_num,
                        'line': line,
                        'type': pattern_type,
                        'match': match
                    })
                    break

        if not chapter_breaks:
            # No chapter markers found, treat as single chapter
            print(f"No chapter markers found in {self.file_base}, treating as single document")
            # FIX: Use "Section 1" instead of filename to avoid number extraction issues
            chapters = [{
                'num': 1,
                'title': 'Section 1',  # Changed from self.file_base
                'content': content
            }]
        else:
            # Split content by chapter markers
            print(f"Found {len(chapter_breaks)} chapter markers in {self.file_base}")

            for i, chapter_break in enumerate(chapter_breaks):
                # Determine chapter number and title
                chapter_num, chapter_title = self._extract_chapter_info(chapter_break, i)

                # Get content for this chapter
                start_line = chapter_break['line_num'] + 1  # Start after the chapter marker

                # Find where this chapter ends
                if i < len(chapter_breaks) - 1:
                    end_line = chapter_breaks[i + 1]['line_num']
                else:
                    end_line = len(lines)

                # Extract chapter content
                chapter_lines = lines[start_line:end_line]
                chapter_content = '\n'.join(chapter_lines).strip()

                if chapter_content:  # Only add if there's actual content
                    chapters.append({
                        'num': chapter_num,
                        'title': chapter_title,
                        'content': chapter_content
                    })

        return chapters

    def _extract_chapter_info(self, chapter_break: Dict, index: int) -> Tuple[int, str]:
        """Extract chapter number and title from a chapter break"""
        if chapter_break['type'] == 'break':
            # Scene breaks don't have numbers
            chapter_num = index + 1
            chapter_title = f"Section {chapter_num}"
        else:
            # Try to extract number from match
            match_groups = chapter_break['match'].groups()
            if match_groups and match_groups[0]:  # Check if group exists AND is not empty
                try:
                    # Strip whitespace and check if it's a valid number
                    num_str = match_groups[0].strip()
                    if num_str:  # Only try to convert if not empty
                        chapter_num = int(num_str)
                        chapter_title = chapter_break['line'].strip()
                    else:
                        # Empty match group, use index
                        chapter_num = index + 1
                        chapter_title = chapter_break['line'].strip()
                except (ValueError, IndexError):
                    # Failed to convert to int, use index
                    chapter_num = index + 1
                    chapter_title = chapter_break['line'].strip()
            else:
                # No match groups or empty match
                chapter_num = index + 1
                chapter_title = chapter_break['line'].strip()

        return chapter_num, chapter_title

    def _process_chapters_for_splitting(self, raw_chapters: List[Dict]) -> List[Dict]:
        """Process chapters and split them if they exceed token limits"""
        final_chapters = []

        # Calculate based on OUTPUT token limits
        max_output_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
        compression_factor = float(os.getenv("COMPRESSION_FACTOR", "0.8"))
        safety_margin_output = 500

        # Calculate chunk size based on output limit
        available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
        available_tokens = max(available_tokens, 1000)

        print(f"📊 Text file chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")

        for chapter_data in raw_chapters:
            # Convert chapter content to HTML format
            chapter_html = self._text_to_html(chapter_data['content'])
            chapter_tokens = self.chapter_splitter.count_tokens(chapter_html)

            if chapter_tokens > available_tokens:
                # Chapter needs splitting
                print(f"Chapter {chapter_data['num']} ({chapter_data['title']}) has {chapter_tokens} tokens, splitting...")

                chunks = self.chapter_splitter.split_chapter(chapter_html, available_tokens)

                # Add each chunk as a separate chapter
                for chunk_html, chunk_idx, total_chunks in chunks:
                    chunk_title = chapter_data['title']
                    if total_chunks > 1:
                        chunk_title = f"{chapter_data['title']} (Part {chunk_idx}/{total_chunks})"

                    # Create float chapter numbers for chunks: 1.0, 1.1, 1.2, etc.
                    chunk_num = round(chapter_data['num'] + (chunk_idx - 1) * 0.1, 1)

                    final_chapters.append({
                        'num': chunk_num,
                        'title': chunk_title,
                        'body': chunk_html,
                        'filename': f"section_{int(chapter_data['num'])}_part{chunk_idx}.txt",  # Changed to avoid using file_base
                        'content_hash': self._generate_hash(chunk_html),
                        'file_size': len(chunk_html),
                        'has_images': False,
                        'is_chunk': True,
                        'chunk_info': {
                            'chunk_idx': chunk_idx,
                            'total_chunks': total_chunks,
                            'original_chapter': chapter_data['num']
                        }
                    })
            else:
                # Chapter is small enough, add as-is
                final_chapters.append({
                    'num': chapter_data['num'],  # Keep as integer for non-split chapters
                    'title': chapter_data['title'],
                    'body': chapter_html,
                    'filename': f"section_{chapter_data['num']}.txt",  # Changed to avoid using file_base
                    'content_hash': self._generate_hash(chapter_html),
                    'file_size': len(chapter_html),
                    'has_images': False,
                    'is_chunk': False
                })

        # Ensure we have at least one chapter
        if not final_chapters:
            # Fallback: create a single chapter with all content
            all_content = '\n\n'.join(ch['content'] for ch in raw_chapters if ch.get('content'))
            if not all_content and raw_chapters:
                all_content = raw_chapters[0].get('content', '')

            final_chapters.append({
                'num': 1,
                'title': 'Section 1',  # Changed from self.file_base
                'body': self._text_to_html(all_content or 'Empty file'),
                'filename': 'section_1.txt',  # Changed to avoid using file_base
                'content_hash': self._generate_hash(all_content or ''),
                'file_size': len(all_content or ''),
                'has_images': False,
                'is_chunk': False
            })

        return final_chapters

    def _text_to_html(self, text: str) -> str:
        """Convert plain text to HTML format"""
        # Escape HTML characters
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')

        # Split into paragraphs
        paragraphs = text.split('\n\n')

        # Wrap each paragraph in <p> tags
        html_parts = []
        for para in paragraphs:
            para = para.strip()
            if para:
                # Check if it's a chapter heading
                if re.match(r'^(Chapter|CHAPTER|Ch\.|Part)\s+\d+', para):
                    html_parts.append(f'<h1>{para}</h1>')
                else:
                    # Replace single newlines with <br> within paragraphs
                    para = para.replace('\n', '<br>\n')
                    html_parts.append(f'<p>{para}</p>')

        # Create a simple HTML structure
        html = f"""<html>
<head>
<title>{self.file_base}</title>
<meta charset="utf-8"/>
</head>
<body>
{''.join(html_parts)}
</body>
</html>"""

        return html

    def _generate_hash(self, content: str) -> str:
        """Generate hash for content"""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def save_original_structure(self):
        """Save original text file structure info"""
        metadata = {
            'source_file': os.path.basename(self.file_path),
            'type': 'text',
            'encoding': 'utf-8'
        }

        metadata_path = os.path.join(self.output_dir, 'metadata.json')
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

    def create_output_structure(self, translated_chapters: List[Tuple[str, str]]) -> str:
        """Create output text file from translated chapters"""
        # Sort chapters by filename to ensure correct order
        sorted_chapters = sorted(translated_chapters, key=lambda x: x[0])

        # Combine all content
        all_content = []
        for filename, content in sorted_chapters:
            # Extract text from HTML
            soup = BeautifulSoup(content, 'html.parser')
            text_content = soup.get_text()

            # Add chapter separator if needed
            if len(all_content) > 0:
                all_content.append('\n\n' + '='*50 + '\n\n')

            all_content.append(text_content)

        # Create output filename
        output_filename = f"{self.file_base}_translated.txt"
        output_path = os.path.join(self.output_dir, output_filename)

        # Write the translated text
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(''.join(all_content))

        print(f"✅ Created translated text file: {output_filename}")
        return output_path
unified_api_client.py
ADDED
The diff for this file is too large to render.
See raw diff
update_manager.py
ADDED
@@ -0,0 +1,826 @@
# update_manager.py - Auto-update functionality for Glossarion
import os
import sys
import json
import requests
import threading
import concurrent.futures
import time
import re
from typing import Optional, Dict, Tuple, List
from packaging import version
import tkinter as tk
from tkinter import ttk, messagebox, font
import ttkbootstrap as tb
from datetime import datetime

class UpdateManager:
    """Handles automatic update checking and installation for Glossarion"""

    GITHUB_API_URL = "https://api.github.com/repos/Shirochi-stack/Glossarion/releases"
    GITHUB_LATEST_URL = "https://api.github.com/repos/Shirochi-stack/Glossarion/releases/latest"

    def __init__(self, main_gui, base_dir):
        self.main_gui = main_gui
        self.base_dir = base_dir
        self.update_available = False
        # Use shared executor from main GUI if available
        try:
            if hasattr(self.main_gui, '_ensure_executor'):
                self.main_gui._ensure_executor()
            self.executor = getattr(self.main_gui, 'executor', None)
        except Exception:
            self.executor = None
        self.latest_release = None
        self.all_releases = []  # Store all fetched releases
        self.download_progress = 0
        self.is_downloading = False
        # Load persistent check time from config
        self._last_check_time = self.main_gui.config.get('last_update_check_time', 0)
        self._check_cache_duration = 1800  # Cache for 30 minutes
        self.selected_asset = None  # Store selected asset for download

        # Get version from the main GUI's __version__ variable
        if hasattr(main_gui, '__version__'):
            self.CURRENT_VERSION = main_gui.__version__
        else:
            # Extract from window title as fallback
            title = self.main_gui.master.title()
            if 'v' in title:
                self.CURRENT_VERSION = title.split('v')[-1].strip()
            else:
                self.CURRENT_VERSION = "0.0.0"

    def fetch_multiple_releases(self, count=10) -> List[Dict]:
        """Fetch multiple releases from GitHub

        Args:
            count: Number of releases to fetch

        Returns:
            List of release data dictionaries
        """
        try:
            headers = {
                'Accept': 'application/vnd.github.v3+json',
                'User-Agent': 'Glossarion-Updater'
            }

            # Fetch multiple releases with retry logic
            max_retries = 2
            timeout = 10  # Reduced timeout

            for attempt in range(max_retries + 1):
                try:
                    response = requests.get(
                        f"{self.GITHUB_API_URL}?per_page={count}",
                        headers=headers,
                        timeout=timeout
                    )
                    response.raise_for_status()
                    break  # Success
                except (requests.Timeout, requests.ConnectionError) as e:
                    if attempt == max_retries:
                        raise  # Re-raise after final attempt
                    time.sleep(1)

            releases = response.json()

            # Process each release's notes
            for release in releases:
                if 'body' in release and release['body']:
                    # Clean up but don't truncate for history viewing
                    body = release['body']
                    # Just clean up excessive newlines
                    body = re.sub(r'\n{3,}', '\n\n', body)
                    release['body'] = body

            return releases

        except Exception as e:
            print(f"Error fetching releases: {e}")
            return []

    def check_for_updates_async(self, silent=True, force_show=False):
        """Run check_for_updates in the background using the shared executor.
        Returns a Future if an executor is available, else runs in a thread.
        """
        try:
            # Ensure shared executor
            if hasattr(self.main_gui, '_ensure_executor'):
                self.main_gui._ensure_executor()
            execu = getattr(self, 'executor', None) or getattr(self.main_gui, 'executor', None)
            if execu:
                future = execu.submit(self.check_for_updates, silent, force_show)
                return future
        except Exception:
            pass

        # Fallback to thread if executor not available
        def _worker():
            try:
                self.check_for_updates(silent=silent, force_show=force_show)
            except Exception:
                pass
        t = threading.Thread(target=_worker, daemon=True)
        t.start()
        return None

    def check_for_updates(self, silent=True, force_show=False) -> Tuple[bool, Optional[Dict]]:
        """Check GitHub for newer releases

        Args:
            silent: If True, don't show error messages
            force_show: If True, show the dialog even when up to date

        Returns:
            Tuple of (update_available, release_info)
        """
        try:
            # Check if we need to skip the check due to cache
            current_time = time.time()
            if not force_show and (current_time - self._last_check_time) < self._check_cache_duration:
                print(f"[DEBUG] Skipping update check - cache still valid for {int(self._check_cache_duration - (current_time - self._last_check_time))} seconds")
                return False, None

            # Check if this version was previously skipped
            skipped_versions = self.main_gui.config.get('skipped_versions', [])

            headers = {
                'Accept': 'application/vnd.github.v3+json',
                'User-Agent': 'Glossarion-Updater'
            }

            # Try with shorter timeout and retry logic
            max_retries = 2
            timeout = 10  # Reduced from 30 seconds

            for attempt in range(max_retries + 1):
                try:
                    print(f"[DEBUG] Update check attempt {attempt + 1}/{max_retries + 1}")
                    response = requests.get(self.GITHUB_LATEST_URL, headers=headers, timeout=timeout)
                    response.raise_for_status()
                    break  # Success, exit retry loop
                except (requests.Timeout, requests.ConnectionError) as e:
                    if attempt == max_retries:
                        # Last attempt failed, save check time and re-raise
                        self._save_last_check_time()
                        raise
                    print(f"[DEBUG] Network error on attempt {attempt + 1}: {e}")
                    time.sleep(1)  # Short delay before retry

            release_data = response.json()
            latest_version = release_data['tag_name'].lstrip('v')

            # Save successful check time
            self._save_last_check_time()

            # Fetch all releases for history regardless
            self.all_releases = self.fetch_multiple_releases(count=10)
            self.latest_release = release_data

            # Check if this version was skipped by user
            if release_data['tag_name'] in skipped_versions and not force_show:
                return False, None

            # Compare versions
            if version.parse(latest_version) > version.parse(self.CURRENT_VERSION):
                self.update_available = True

                # Show update dialog when update is available
                print(f"[DEBUG] Showing update dialog for version {latest_version}")
                self.main_gui.master.after(100, self.show_update_dialog)

                return True, release_data
            else:
                # We're up to date
                self.update_available = False

                # Show dialog if explicitly requested (from menu)
                if force_show or not silent:
                    self.main_gui.master.after(100, self.show_update_dialog)

                return False, None

        except requests.Timeout:
            if not silent:
                messagebox.showerror("Update Check Failed",
                    "Connection timed out while checking for updates.\n\n"
                    "This is usually due to network connectivity issues.\n"
                    "The next update check will be in 1 hour.")
            return False, None

        except requests.ConnectionError as e:
            if not silent:
                if 'api.github.com' in str(e):
                    messagebox.showerror("Update Check Failed",
                        "Cannot reach GitHub servers for update check.\n\n"
                        "This may be due to:\n"
                        "• Internet connectivity issues\n"
                        "• Firewall blocking GitHub API\n"
                        "• GitHub API temporarily unavailable\n\n"
                        "The next update check will be in 1 hour.")
                else:
                    messagebox.showerror("Update Check Failed",
                        f"Network error: {str(e)}\n\n"
                        "The next update check will be in 1 hour.")
            return False, None

        except requests.HTTPError as e:
            if not silent:
                if e.response.status_code == 403:
                    messagebox.showerror("Update Check Failed",
                        "GitHub API rate limit exceeded. Please try again later.")
                else:
                    messagebox.showerror("Update Check Failed",
                        f"GitHub returned error: {e.response.status_code}")
            return False, None

        except ValueError as e:
            if not silent:
                messagebox.showerror("Update Check Failed",
                    "Invalid response from GitHub. The update service may be temporarily unavailable.")
            return False, None

        except Exception as e:
            if not silent:
                messagebox.showerror("Update Check Failed",
                    f"An unexpected error occurred:\n{str(e)}")
            return False, None

    def check_for_updates_manual(self):
        """Manual update check from menu - always shows dialog (async)"""
        return self.check_for_updates_async(silent=False, force_show=True)

    def _save_last_check_time(self):
        """Save the last update check time to config"""
        try:
            current_time = time.time()
            self._last_check_time = current_time
            self.main_gui.config['last_update_check_time'] = current_time
            # Save config without showing message
            self.main_gui.save_config(show_message=False)
        except Exception as e:
            print(f"[DEBUG] Failed to save last check time: {e}")

    def format_markdown_to_tkinter(self, text_widget, markdown_text):
        """Convert GitHub markdown to formatted tkinter text - simplified version

        Args:
            text_widget: The Text widget to insert formatted text into
            markdown_text: The markdown source text
        """
        # Configure minimal tags
        text_widget.tag_config("heading", font=('TkDefaultFont', 12, 'bold'))
        text_widget.tag_config("bold", font=('TkDefaultFont', 10, 'bold'))

        # Process text line by line with minimal formatting
        lines = markdown_text.split('\n')

        for line in lines:
            # Strip any weird unicode characters that might cause display issues
            line = ''.join(char for char in line if ord(char) < 65536)

            # Handle headings
            if line.startswith('#'):
                # Remove all # symbols and get the heading text
                heading_text = line.lstrip('#').strip()
                if heading_text:
                    text_widget.insert('end', heading_text + '\n', 'heading')

            # Handle bullet points
            elif line.strip().startswith(('- ', '* ')):
                # Get the text after the bullet
                bullet_text = line.strip()[2:].strip()
                # Clean the text of markdown formatting
                bullet_text = self._clean_markdown_text(bullet_text)
                text_widget.insert('end', '  • ' + bullet_text + '\n')

            # Handle numbered lists
            elif re.match(r'^\s*\d+\.\s', line):
                # Extract number and text
                match = re.match(r'^(\s*)(\d+)\.\s(.+)', line)
                if match:
                    indent, num, text = match.groups()
                    clean_text = self._clean_markdown_text(text.strip())
                    text_widget.insert('end', f'  {num}. {clean_text}\n')

            # Handle separator lines
            elif line.strip() in ['---', '***', '___']:
                text_widget.insert('end', '─' * 40 + '\n')

            # Handle code blocks - just skip the markers
            elif line.strip().startswith('```'):
                continue  # Skip code fence markers

            # Regular text
            elif line.strip():
                # Clean and insert the line
                clean_text = self._clean_markdown_text(line)
                # Check if this looks like it should be bold (common pattern)
                if clean_text.endswith(':') and len(clean_text) < 50:
                    text_widget.insert('end', clean_text + '\n', 'bold')
                else:
                    text_widget.insert('end', clean_text + '\n')

            # Empty lines
            else:
                text_widget.insert('end', '\n')

    def _clean_markdown_text(self, text):
        """Remove markdown formatting from text

        Args:
            text: Text with markdown formatting

        Returns:
            Clean text without markdown symbols
        """
        # Remove inline code backticks
        text = re.sub(r'`([^`]+)`', r'\1', text)

        # Remove bold markers
        text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
        text = re.sub(r'__([^_]+)__', r'\1', text)

        # Remove italic markers
        text = re.sub(r'\*([^*]+)\*', r'\1', text)
        text = re.sub(r'_([^_]+)_', r'\1', text)

        # Remove links but keep link text
        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)

        # Remove any remaining special characters that might cause issues
        text = text.replace('\u200b', '')  # Remove zero-width spaces
        text = text.replace('\ufeff', '')  # Remove BOM

        return text.strip()

    def show_update_dialog(self):
        """Show update dialog (for updates or version history)"""
        if not self.latest_release and not self.all_releases:
            # Try to fetch releases if we don't have them
            self.all_releases = self.fetch_multiple_releases(count=10)
            if self.all_releases:
                self.latest_release = self.all_releases[0]
            else:
                messagebox.showerror("Error", "Unable to fetch version information from GitHub.")
                return

        # Set appropriate title
        if self.update_available:
            title = "Update Available"
        else:
            title = "Version History"

        # Create dialog first without content
        dialog, scrollable_frame, canvas = self.main_gui.wm.setup_scrollable(
            self.main_gui.master,
            title,
            width=None,
            height=None,
            max_width_ratio=0.5,
            max_height_ratio=0.8
        )

        # Show dialog immediately
        dialog.update_idletasks()

        # Then populate content
        self.main_gui.master.after(10, lambda: self._populate_update_dialog(dialog, scrollable_frame, canvas))

    def _populate_update_dialog(self, dialog, scrollable_frame, canvas):
        """Populate the update dialog content"""
        # Main container
        main_frame = ttk.Frame(scrollable_frame)
        main_frame.pack(fill='both', expand=True, padx=20, pady=20)

        # Initialize selected_asset to None
        self.selected_asset = None

        # Version info
        version_frame = ttk.LabelFrame(main_frame, text="Version Information", padding=10)
        version_frame.pack(fill='x', pady=(0, 10))

        ttk.Label(version_frame,
                  text=f"Current Version: {self.CURRENT_VERSION}").pack(anchor='w')

        if self.latest_release:
            latest_version = self.latest_release['tag_name']
            if self.update_available:
                ttk.Label(version_frame,
                          text=f"Latest Version: {latest_version}",
                          font=('TkDefaultFont', 10, 'bold')).pack(anchor='w')
            else:
                ttk.Label(version_frame,
                          text=f"Latest Version: {latest_version} ✓ You are up to date!",
                          foreground='green',
                          font=('TkDefaultFont', 10, 'bold')).pack(anchor='w')

        # ALWAYS show asset selection when we have the first release data (current or latest)
        release_to_check = self.all_releases[0] if self.all_releases else self.latest_release

        if release_to_check:
            # Get exe files from the first/latest release
            exe_assets = [a for a in release_to_check.get('assets', [])
                          if a['name'].lower().endswith('.exe')]

            print(f"[DEBUG] Found {len(exe_assets)} exe files in release {release_to_check.get('tag_name')}")

            # Show selection UI if there are exe files
            if exe_assets:
                # Determine the title based on whether there are multiple variants
                if len(exe_assets) > 1:
                    frame_title = "Select Version to Download"
                else:
                    frame_title = "Available Download"

                asset_frame = ttk.LabelFrame(main_frame, text=frame_title, padding=10)
                asset_frame.pack(fill='x', pady=(0, 10))

                if len(exe_assets) > 1:
                    # Multiple exe files - show radio buttons to choose
                    self.asset_var = tk.StringVar()
                    for i, asset in enumerate(exe_assets):
                        filename = asset['name']
                        size_mb = asset['size'] / (1024 * 1024)

                        # Try to identify variant type from filename
                        if 'full' in filename.lower():
                            variant_label = f"Full Version - {filename} ({size_mb:.1f} MB)"
                        else:
                            variant_label = f"Standard Version - {filename} ({size_mb:.1f} MB)"

                        rb = ttk.Radiobutton(asset_frame, text=variant_label,
                                             variable=self.asset_var,
                                             value=str(i))
                        rb.pack(anchor='w', pady=2)

                        # Select first option by default
                        if i == 0:
                            self.asset_var.set(str(i))
                            self.selected_asset = asset

                    # Add listener for selection changes
                    def on_asset_change(*args):
                        idx = int(self.asset_var.get())
                        self.selected_asset = exe_assets[idx]

                    self.asset_var.trace_add('write', on_asset_change)
                else:
                    # Only one exe file - just show it and set it as selected
                    self.selected_asset = exe_assets[0]
                    filename = exe_assets[0]['name']
                    size_mb = exe_assets[0]['size'] / (1024 * 1024)
                    ttk.Label(asset_frame,
                              text=f"{filename} ({size_mb:.1f} MB)").pack(anchor='w')

        # Create notebook for version history
        notebook = ttk.Notebook(main_frame)
        notebook.pack(fill='both', expand=True, pady=(0, 10))

        # Add tabs for different versions
        if self.all_releases:
            for i, release in enumerate(self.all_releases[:5]):  # Show up to 5 versions
                version_tag = release['tag_name']
                version_num = version_tag.lstrip('v')
                is_current = version_num == self.CURRENT_VERSION
                is_latest = i == 0

                # Create tab label
                tab_label = version_tag
                if is_current and is_latest:
                    tab_label += " (Current)"
                elif is_current:
                    tab_label += " (Current)"
                elif is_latest:
                    tab_label += " (Latest)"

                # Create frame for this version
                tab_frame = ttk.Frame(notebook)
                notebook.add(tab_frame, text=tab_label)

                # Add release date
                if 'published_at' in release:
                    date_str = release['published_at'][:10]  # Get YYYY-MM-DD
                    date_label = ttk.Label(tab_frame, text=f"Released: {date_str}",
                                           font=('TkDefaultFont', 9, 'italic'))
                    date_label.pack(anchor='w', padx=10, pady=(10, 5))

                # Create text widget for release notes
                text_frame = ttk.Frame(tab_frame)
                text_frame.pack(fill='both', expand=True, padx=10, pady=(0, 10))

                notes_text = tk.Text(text_frame, height=12, wrap='word', width=60)
                notes_scroll = ttk.Scrollbar(text_frame, command=notes_text.yview)
                notes_text.config(yscrollcommand=notes_scroll.set)

                notes_text.pack(side='left', fill='both', expand=True)
                notes_scroll.pack(side='right', fill='y')

                # Format and insert release notes with markdown support
                release_notes = release.get('body', 'No release notes available')
                self.format_markdown_to_tkinter(notes_text, release_notes)

                notes_text.config(state='disabled')  # Make read-only

                # Don't set background color as it causes rendering artifacts
        else:
            # Fallback to simple display if no releases fetched
            notes_frame = ttk.LabelFrame(main_frame, text="Release Notes", padding=10)
            notes_frame.pack(fill='both', expand=True, pady=(0, 10))

            notes_text = tk.Text(notes_frame, height=10, wrap='word')
            notes_scroll = ttk.Scrollbar(notes_frame, command=notes_text.yview)
            notes_text.config(yscrollcommand=notes_scroll.set)

            notes_text.pack(side='left', fill='both', expand=True)
            notes_scroll.pack(side='right', fill='y')

            if self.latest_release:
                release_notes = self.latest_release.get('body', 'No release notes available')
                self.format_markdown_to_tkinter(notes_text, release_notes)
            else:
                notes_text.insert('1.0', 'Unable to fetch release notes.')

            notes_text.config(state='disabled')

        # Download progress (initially hidden)
        self.progress_frame = ttk.Frame(main_frame)
        self.progress_label = ttk.Label(self.progress_frame, text="Downloading update...")
        self.progress_label.pack(anchor='w')
        self.progress_bar = ttk.Progressbar(self.progress_frame, mode='determinate', length=400)
        self.progress_bar.pack(fill='x', pady=5)

        # Add status label for download details
        self.status_label = ttk.Label(self.progress_frame, text="", font=('TkDefaultFont', 8))
        self.status_label.pack(anchor='w')

        # Buttons
        button_frame = ttk.Frame(main_frame)
        button_frame.pack(fill='x', pady=(10, 0))

        def start_download():
            if not self.selected_asset:
                messagebox.showerror("No File Selected",
                                     "Please select a version to download.")
                return

            self.progress_frame.pack(fill='x', pady=(0, 10), before=button_frame)
            download_btn.config(state='disabled')
            if 'remind_btn' in locals():
                remind_btn.config(state='disabled')
            if 'skip_btn' in locals():
                skip_btn.config(state='disabled')
            if 'close_btn' in locals():
                close_btn.config(state='disabled')

            # Reset progress
            self.progress_bar['value'] = 0
            self.download_progress = 0

            # Start download using shared executor if available
            try:
                if hasattr(self.main_gui, '_ensure_executor'):
                    self.main_gui._ensure_executor()
                execu = getattr(self, 'executor', None) or getattr(self.main_gui, 'executor', None)
                if execu:
                    execu.submit(self.download_update, dialog)
                else:
                    thread = threading.Thread(target=self.download_update, args=(dialog,), daemon=True)
                    thread.start()
            except Exception:
                thread = threading.Thread(target=self.download_update, args=(dialog,), daemon=True)
                thread.start()

        # Always show download button if we have exe files
        has_exe_files = self.selected_asset is not None

        if self.update_available:
            # Show update-specific buttons
            download_btn = tb.Button(button_frame, text="Download Update",
                                     command=start_download, bootstyle="success")
            download_btn.pack(side='left', padx=(0, 5))

            remind_btn = tb.Button(button_frame, text="Remind Me Later",
                                   command=dialog.destroy, bootstyle="secondary")
            remind_btn.pack(side='left', padx=5)

            skip_btn = tb.Button(button_frame, text="Skip This Version",
                                 command=lambda: self.skip_version(dialog),
                                 bootstyle="link")
            skip_btn.pack(side='left', padx=5)
        elif has_exe_files:
            # We're up to date but have downloadable files
            # Check if there are multiple exe files
            release_to_check = self.all_releases[0] if self.all_releases else self.latest_release
            exe_count = 0
            if release_to_check:
                exe_count = len([a for a in release_to_check.get('assets', [])
                                 if a['name'].lower().endswith('.exe')])

            if exe_count > 1:
                # Multiple versions available
                download_btn = tb.Button(button_frame, text="Download Different Path",
                                         command=start_download, bootstyle="info")
            else:
                # Single version available
                download_btn = tb.Button(button_frame, text="Re-download",
                                         command=start_download, bootstyle="secondary")
            download_btn.pack(side='left', padx=(0, 5))

            close_btn = tb.Button(button_frame, text="Close",
                                  command=dialog.destroy,
                                  bootstyle="secondary")
            close_btn.pack(side='left', padx=(0, 5))
        else:
            # No downloadable files
            close_btn = tb.Button(button_frame, text="Close",
                                  command=dialog.destroy,
                                  bootstyle="primary")
            close_btn.pack(side='left', padx=(0, 5))

        # Add "View All Releases" link button
        def open_releases_page():
            import webbrowser
            webbrowser.open("https://github.com/Shirochi-stack/Glossarion/releases")

        tb.Button(button_frame, text="View All Releases",
                  command=open_releases_page,
                  bootstyle="link").pack(side='right', padx=5)

        # Auto-resize at the end
        dialog.after(100, lambda: self.main_gui.wm.auto_resize_dialog(dialog, canvas, max_width_ratio=0.5, max_height_ratio=0.8))

        # Handle window close
        dialog.protocol("WM_DELETE_WINDOW", lambda: [dialog._cleanup_scrolling(), dialog.destroy()])

    def skip_version(self, dialog):
        """Mark this version as skipped and close dialog"""
        if not self.latest_release:
            dialog.destroy()
            return

        # Get current skipped versions list
        if 'skipped_versions' not in self.main_gui.config:
            self.main_gui.config['skipped_versions'] = []

        # Add this version to skipped list
        version_tag = self.latest_release['tag_name']
        if version_tag not in self.main_gui.config['skipped_versions']:
            self.main_gui.config['skipped_versions'].append(version_tag)

        # Save config
        self.main_gui.save_config(show_message=False)

        # Close dialog
        dialog.destroy()

        # Show confirmation
        messagebox.showinfo("Version Skipped",
                            f"Version {version_tag} will be skipped in future update checks.\n"
                            "You can manually check for updates from the Help menu.")

    def download_update(self, dialog):
        """Download the update file"""
        try:
            # Use the selected asset
            asset = self.selected_asset

            if not asset:
                dialog.after(0, lambda: messagebox.showerror("Download Error",
                                                             "No file selected for download."))
                return

            # Get the current executable path
            if getattr(sys, 'frozen', False):
                # Running as compiled executable
                current_exe = sys.executable
                download_dir = os.path.dirname(current_exe)
            else:
                # Running as script
                current_exe = None
                download_dir = self.base_dir

            # Use the exact filename from GitHub
            original_filename = asset['name']  # e.g., "Glossarion v3.1.3.exe"
            new_exe_path = os.path.join(download_dir, original_filename)

            # If new file would overwrite current executable, download to temp name first
            if current_exe and os.path.normpath(new_exe_path) == os.path.normpath(current_exe):
                temp_path = new_exe_path + ".new"
                download_path = temp_path
            else:
                download_path = new_exe_path

            # Download with progress tracking and shorter timeout
            response = requests.get(asset['browser_download_url'], stream=True, timeout=15)
            total_size = int(response.headers.get('content-length', 0))

            downloaded = 0
            chunk_size = 8192

            with open(download_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)

                        # Update progress bar
                        if total_size > 0:
                            progress = int((downloaded / total_size) * 100)
                            size_mb = downloaded / (1024 * 1024)
                            total_mb = total_size / (1024 * 1024)

                            # Use after_idle for smoother updates
                            def update_progress(p=progress, d=size_mb, t=total_mb):
                                try:
                                    self.progress_bar['value'] = p
                                    self.progress_label.config(text=f"Downloading update... {p}%")
                                    self.status_label.config(text=f"{d:.1f} MB / {t:.1f} MB")
                                except:
                                    pass  # Dialog might have been closed

                            dialog.after_idle(update_progress)

            # Download complete
            dialog.after(0, lambda: self.download_complete(dialog, download_path))

        except Exception as e:
            # Capture the error message immediately
            error_msg = str(e)
            dialog.after(0, lambda: messagebox.showerror("Download Failed", error_msg))

    def download_complete(self, dialog, file_path):
        """Handle completed download"""
        dialog.destroy()

        result = messagebox.askyesno(
            "Download Complete",
            "Update downloaded successfully.\n\n"
            "Would you like to install it now?\n"
            "(The application will need to restart)"
        )

        if result:
            self.install_update(file_path)

    def install_update(self, update_file):
        """Launch the update installer and exit current app"""
        try:
            # Save current state/config if needed
            self.main_gui.save_config(show_message=False)

            # Get current executable path
            if getattr(sys, 'frozen', False):
                current_exe = sys.executable
                current_dir = os.path.dirname(current_exe)

                # Create a batch file to handle the update
                batch_content = f"""@echo off
echo Updating Glossarion...
echo Waiting for current version to close...
timeout /t 3 /nobreak > nul

:: Delete the old executable
echo Deleting old version...
if exist "{current_exe}" (
    del /f /q "{current_exe}"
    if exist "{current_exe}" (
        echo Failed to delete old version, retrying...
        timeout /t 2 /nobreak > nul
        del /f /q "{current_exe}"
    )
)

:: Start the new version
echo Starting new version...
start "" "{update_file}"

:: Clean up this batch file
del "%~f0"
"""
                batch_path = os.path.join(current_dir, "update_glossarion.bat")
                with open(batch_path, 'w') as f:
                    f.write(batch_content)

                # Run the batch file
                import subprocess
                subprocess.Popen([batch_path], shell=True, creationflags=subprocess.CREATE_NO_WINDOW)

                print(f"[DEBUG] Update batch file created: {batch_path}")
                print(f"[DEBUG] Will delete: {current_exe}")
                print(f"[DEBUG] Will start: {update_file}")
            else:
                # Running as script, just start the new exe
                import subprocess
                subprocess.Popen([update_file], shell=True)

            # Exit current application
            print("[DEBUG] Closing application for update...")
            self.main_gui.master.quit()
            sys.exit(0)

        except Exception as e:
            messagebox.showerror("Installation Error",
                                 f"Could not start update process:\n{str(e)}")
wait_and_open.ps1
ADDED
@@ -0,0 +1,31 @@
# Wait for Gradio server to be ready and then open browser
param(
    [string]$url = "http://127.0.0.1:7860",
    [int]$maxWaitSeconds = 60
)

Write-Host "Waiting for server to be ready at $url..." -ForegroundColor Cyan

$startTime = Get-Date
$ready = $false

while (-not $ready -and ((Get-Date) - $startTime).TotalSeconds -lt $maxWaitSeconds) {
    try {
        $response = Invoke-WebRequest -Uri $url -Method Head -TimeoutSec 2 -UseBasicParsing -ErrorAction SilentlyContinue
        if ($response.StatusCode -eq 200) {
            $ready = $true
            Write-Host "Server is ready!" -ForegroundColor Green
        }
    }
    catch {
        # Server not ready yet, wait a bit
        Start-Sleep -Milliseconds 500
    }
}

if ($ready) {
    Write-Host "Opening browser..." -ForegroundColor Green
    Start-Process $url
} else {
    Write-Host "Timeout waiting for server. Please open $url manually." -ForegroundColor Yellow
}
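
For reference, the script would typically be launched from a batch wrapper along these lines (an assumed invocation example, not taken from the uploaded launcher files):

powershell -ExecutionPolicy Bypass -File wait_and_open.ps1 -url "http://127.0.0.1:7860" -maxWaitSeconds 120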