Shirochi committed on
Commit ea97ae9 · verified · 1 Parent(s): 55fcc46

Upload 6 files

Files changed (6)
  1. app.py +0 -0
  2. manga_integration.py +0 -0
  3. manga_translator.py +0 -0
  4. model_options.py +129 -0
  5. ocr_manager.py +1970 -0
  6. translator_gui.py +0 -0
app.py ADDED
The diff for this file is too large to render. See raw diff
 
manga_integration.py ADDED
The diff for this file is too large to render. See raw diff
 
manga_translator.py ADDED
The diff for this file is too large to render. See raw diff
 
model_options.py ADDED
@@ -0,0 +1,129 @@
1
+ # model_options.py
2
+ """
3
+ Centralized model catalog for Glossarion UIs.
4
+ Returned list should mirror the main GUI model dropdown.
5
+ """
6
+ from typing import List
7
+
8
+ def get_model_options() -> List[str]:
9
+ return [
10
+
11
+ # OpenAI Models
12
+ "gpt-4o", "gpt-4o-mini", "gpt-4-turbo", "gpt-4.1-nano", "gpt-4.1-mini", "gpt-4.1",
13
+ "gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k",
14
+ "gpt-5-mini","gpt-5","gpt-5-nano",
15
+ "o1-preview", "o1-mini", "o3", "o4-mini",
16
+
17
+ # Google Gemini Models
18
+ "gemini-2.0-flash","gemini-2.0-flash-lite",
19
+ "gemini-2.5-flash","gemini-2.5-flash-lite", "gemini-2.5-pro", "gemini-pro", "gemini-pro-vision",
20
+
21
+ # Anthropic Claude Models
22
+ "claude-opus-4-20250514", "claude-sonnet-4-20250514",
23
+ "claude-3-5-sonnet-20241022", "claude-3-7-sonnet-20250219",
24
+ "claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307",
25
+ "claude-2.1", "claude-2", "claude-instant-1.2",
26
+
27
+ # Grok Models
28
+ "grok-4-0709", "grok-4-fast", "grok-4-fast-reasoning", "grok-4-fast-reasoning-latest", "grok-3", "grok-3-mini",
29
+
30
+ # Vertex AI Model Garden - Claude models (confirmed)
31
+ "claude-4-opus@20250514",
32
+ "claude-4-sonnet@20250514",
33
+ "claude-opus-4@20250514",
34
+ "claude-sonnet-4@20250514",
35
+ "claude-3-7-sonnet@20250219",
36
+ "claude-3-5-sonnet@20240620",
37
+ "claude-3-5-sonnet-v2@20241022",
38
+ "claude-3-opus@20240229",
39
+ "claude-3-sonnet@20240229",
40
+ "claude-3-haiku@20240307",
41
+
42
+
43
+ # Alternative format with vertex_ai prefix
44
+ "vertex/claude-3-7-sonnet@20250219",
45
+ "vertex/claude-3-5-sonnet@20240620",
46
+ "vertex/claude-3-opus@20240229",
47
+ "vertex/claude-4-opus@20250514",
48
+ "vertex/claude-4-sonnet@20250514",
49
+ "vertex/gemini-1.5-pro",
50
+ "vertex/gemini-1.5-flash",
51
+ "vertex/gemini-2.0-flash",
52
+ "vertex/gemini-2.5-pro",
53
+ "vertex/gemini-2.5-flash",
54
+ "vertex/gemini-2.5-flash-lite",
55
+
56
+ # Chute AI
57
+ "chutes/openai/gpt-oss-120b",
58
+ "chutes/deepseek-ai/DeepSeek-V3.1",
59
+
60
+ # DeepSeek Models
61
+ "deepseek-chat", "deepseek-coder", "deepseek-coder-33b-instruct",
62
+
63
+ # Mistral Models
64
+ "mistral-large", "mistral-medium", "mistral-small", "mistral-tiny",
65
+ "mixtral-8x7b-instruct", "mixtral-8x22b", "codestral-latest",
66
+
67
+ # Meta Llama Models (via Together/other providers)
68
+ "llama-2-7b-chat", "llama-2-13b-chat", "llama-2-70b-chat",
69
+ "llama-3-8b-instruct", "llama-3-70b-instruct", "codellama-34b-instruct",
70
+
71
+ # Yi Models
72
+ "yi-34b-chat", "yi-34b-chat-200k", "yi-6b-chat",
73
+
74
+ # Qwen Models
75
+ "qwen-72b-chat", "qwen-14b-chat", "qwen-7b-chat", "qwen-plus", "qwen-turbo",
76
+
77
+ # Cohere Models
78
+ "command", "command-light", "command-nightly", "command-r", "command-r-plus",
79
+
80
+ # AI21 Models
81
+ "j2-ultra", "j2-mid", "j2-light", "jamba-instruct",
82
+
83
+ # Perplexity Models
84
+ "perplexity-70b-online", "perplexity-7b-online", "pplx-70b-online", "pplx-7b-online",
85
+
86
+ # Groq Models (usually with suffix)
87
+ "llama-3-70b-groq", "llama-3-8b-groq", "mixtral-8x7b-groq",
88
+
89
+ # Chinese Models
90
+ "glm-4", "glm-3-turbo", "chatglm-6b", "chatglm2-6b", "chatglm3-6b",
91
+ "baichuan-13b-chat", "baichuan2-13b-chat",
92
+ "moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k",
93
+
94
+ # Other Models
95
+ "falcon-40b-instruct", "falcon-7b-instruct",
96
+ "phi-2", "phi-3-mini", "phi-3-small", "phi-3-medium",
97
+ "orca-2-13b", "orca-2-7b",
98
+ "vicuna-13b", "vicuna-7b",
99
+ "alpaca-7b",
100
+ "wizardlm-70b", "wizardlm-13b",
101
+ "openchat-3.5",
102
+
103
+ # For POE, prefix with 'poe/'
104
+ "poe/gpt-4", "poe/gpt-4o", "poe/gpt-4.5", "poe/gpt-4.1",
105
+ "poe/claude-3-opus", "poe/claude-4-opus", "poe/claude-3-sonnet", "poe/claude-4-sonnet",
106
+ "poe/claude", "poe/Assistant",
107
+ "poe/gemini-2.5-flash", "poe/gemini-2.5-pro",
108
+
109
+ # For OpenRouter (OR), prefix with 'or/'
110
+ "or/google/gemini-2.5-pro",
111
+ "or/google/gemini-2.5-flash",
112
+ "or/google/gemini-2.5-flash-lite",
113
+ "or/openai/gpt-5",
114
+ "or/openai/gpt-5-mini",
115
+ "or/openai/gpt-5-nano",
116
+ "or/openai/chatgpt-4o-latest",
117
+ "or/deepseek/deepseek-r1-0528:free",
118
+ "or/google/gemma-3-27b-it:free",
119
+
120
+ # For ElectronHub, prefix with 'eh/'
121
+ "eh/gpt-4", "eh/gpt-3.5-turbo", "eh/claude-3-opus", "eh/claude-3-sonnet",
122
+ "eh/llama-2-70b-chat", "eh/yi-34b-chat-200k", "eh/mistral-large",
123
+ "eh/gemini-pro", "eh/deepseek-coder-33b",
124
+
125
+ # Last Resort
126
+ "deepl", # Will use DeepL API
127
+ "google-translate-free", # Uses free web endpoint (no key)
128
+ "google-translate", # Will use Google Cloud Translate
129
+ ]
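For reference, a minimal sketch of how a GUI could consume this catalog; the function name and the ttk.Combobox wiring below are illustrative assumptions, not part of the uploaded files:

    # Hypothetical usage of model_options.get_model_options() to fill a dropdown
    from model_options import get_model_options

    def populate_model_dropdown(combobox, default="gpt-4o-mini"):
        models = get_model_options()
        combobox["values"] = models                          # e.g. a ttk.Combobox
        combobox.set(default if default in models else models[0])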
ocr_manager.py ADDED
@@ -0,0 +1,1970 @@
1
+ # ocr_manager.py
2
+ """
3
+ OCR Manager for handling multiple OCR providers
4
+ Handles installation, model downloading, and OCR processing
5
+ Updated with HuggingFace donut model and proper bubble detection integration
6
+ """
7
+ import os
8
+ import sys
9
+ import cv2
10
+ import json
11
+ import subprocess
12
+ import threading
13
+ import traceback
14
+ from typing import List, Dict, Optional, Tuple, Any
15
+ import numpy as np
16
+ from dataclasses import dataclass
17
+ from PIL import Image
18
+ import logging
19
+ import time
20
+ import random
21
+ import base64
22
+ import io
23
+ import requests
24
+
25
+ try:
26
+ import gptqmodel
27
+ HAS_GPTQ = True
28
+ except ImportError:
29
+ try:
30
+ import auto_gptq
31
+ HAS_GPTQ = True
32
+ except ImportError:
33
+ HAS_GPTQ = False
34
+
35
+ try:
36
+ import optimum
37
+ HAS_OPTIMUM = True
38
+ except ImportError:
39
+ HAS_OPTIMUM = False
40
+
41
+ try:
42
+ import accelerate
43
+ HAS_ACCELERATE = True
44
+ except ImportError:
45
+ HAS_ACCELERATE = False
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+ @dataclass
50
+ class OCRResult:
51
+ """Unified OCR result format with built-in sanitization to prevent data corruption."""
52
+ text: str
53
+ bbox: Tuple[int, int, int, int] # x, y, w, h
54
+ confidence: float
55
+ vertices: Optional[List[Tuple[int, int]]] = None
56
+
57
+ def __post_init__(self):
58
+ """
59
+ This special method is called automatically after the object is created.
60
+ It acts as a final safeguard to ensure the 'text' attribute is ALWAYS a clean string.
61
+ """
62
+ # --- THIS IS THE DEFINITIVE FIX ---
63
+ # If the text we received is a tuple, we extract the first element.
64
+ # This makes it impossible for a tuple to exist in a finished object.
65
+ if isinstance(self.text, tuple):
66
+ # Log that we are fixing a critical data error.
67
+ print(f"CRITICAL WARNING: Corrupted tuple detected in OCRResult. Sanitizing '{self.text}' to '{self.text[0]}'.")
68
+ self.text = self.text[0]
69
+
70
+ # Ensure the final result is always a stripped string.
71
+ self.text = str(self.text).strip()
72
+
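+ # Illustrative note (not part of the uploaded file): the __post_init__ sanitization above
+ # means a corrupted (text, confidence) tuple collapses to its first element, e.g.:
+ #   r = OCRResult(text=("こんにちは", 0.93), bbox=(0, 0, 120, 40), confidence=0.93)
+ #   assert r.text == "こんにちは"  # tuple unwrapped, then str() + strip() applied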
73
+ class OCRProvider:
74
+ """Base class for OCR providers"""
75
+
76
+ def __init__(self, log_callback=None):
77
+ # Set thread limits early if environment indicates single-threaded mode
78
+ try:
79
+ if os.environ.get('OMP_NUM_THREADS') == '1':
80
+ # Already in single-threaded mode, ensure it's applied to this process
81
+ try:
82
+ import sys
83
+ if 'torch' in sys.modules:
84
+ import torch
85
+ torch.set_num_threads(1)
86
+ except (ImportError, RuntimeError, AttributeError):
87
+ pass
88
+ try:
89
+ import cv2
90
+ cv2.setNumThreads(1)
91
+ except (ImportError, AttributeError):
92
+ pass
93
+ except Exception:
94
+ pass
95
+
96
+ self.log_callback = log_callback
97
+ self.is_installed = False
98
+ self.is_loaded = False
99
+ self.model = None
100
+ self.stop_flag = None
101
+ self._stopped = False
102
+
103
+ def _log(self, message: str, level: str = "info"):
104
+ """Log message with stop suppression"""
105
+ # Suppress logs when stopped (allow only essential stop confirmation messages)
106
+ if self._check_stop():
107
+ essential_stop_keywords = [
108
+ "⏹️ Translation stopped by user",
109
+ "⏹️ OCR processing stopped",
110
+ "cleanup", "🧹"
111
+ ]
112
+ if not any(keyword in message for keyword in essential_stop_keywords):
113
+ return
114
+
115
+ if self.log_callback:
116
+ self.log_callback(message, level)
117
+ else:
118
+ print(f"[{level.upper()}] {message}")
119
+
120
+ def set_stop_flag(self, stop_flag):
121
+ """Set the stop flag for checking interruptions"""
122
+ self.stop_flag = stop_flag
123
+ self._stopped = False
124
+
125
+ def _check_stop(self) -> bool:
126
+ """Check if stop has been requested"""
127
+ if self._stopped:
128
+ return True
129
+ if self.stop_flag and self.stop_flag.is_set():
130
+ self._stopped = True
131
+ return True
132
+ # Check global manga translator cancellation
133
+ try:
134
+ from manga_translator import MangaTranslator
135
+ if MangaTranslator.is_globally_cancelled():
136
+ self._stopped = True
137
+ return True
138
+ except Exception:
139
+ pass
140
+ return False
141
+
142
+ def reset_stop_flags(self):
143
+ """Reset stop flags when starting new processing"""
144
+ self._stopped = False
145
+
146
+ def check_installation(self) -> bool:
147
+ """Check if provider is installed"""
148
+ raise NotImplementedError
149
+
150
+ def install(self, progress_callback=None) -> bool:
151
+ """Install the provider"""
152
+ raise NotImplementedError
153
+
154
+ def load_model(self, **kwargs) -> bool:
155
+ """Load the OCR model"""
156
+ raise NotImplementedError
157
+
158
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
159
+ """Detect text in image"""
160
+ raise NotImplementedError
161
+
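+ # Illustrative note (not part of the uploaded file): the stop flag is expected to be a
+ # threading.Event shared with the GUI and checked via _check_stop() between steps:
+ #   stop_event = threading.Event()
+ #   provider.set_stop_flag(stop_event)  # provider suppresses work/logs once the event is set
+ #   stop_event.set()                    # called from the GUI to abort OCR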
162
+ class CustomAPIProvider(OCRProvider):
163
+ """Custom API OCR provider that uses existing GUI variables"""
164
+
165
+ def __init__(self, log_callback=None):
166
+ super().__init__(log_callback)
167
+
168
+ # Use EXISTING environment variables from TranslatorGUI
169
+ self.api_url = os.environ.get('OPENAI_CUSTOM_BASE_URL', '')
170
+ self.api_key = os.environ.get('API_KEY', '') or os.environ.get('OPENAI_API_KEY', '')
171
+ self.model_name = os.environ.get('MODEL', 'gpt-4o-mini')
172
+
173
+ # OCR prompt - use system prompt or a dedicated OCR prompt variable
174
+ self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT',
175
+ os.environ.get('SYSTEM_PROMPT',
176
+ "YOU ARE A TEXT EXTRACTION MACHINE. EXTRACT EXACTLY WHAT YOU SEE.\n\n"
177
+ "ABSOLUTE RULES:\n"
178
+ "1. OUTPUT ONLY THE VISIBLE TEXT/SYMBOLS - NOTHING ELSE\n"
179
+ "2. NEVER TRANSLATE OR MODIFY\n"
180
+ "3. NEVER EXPLAIN, DESCRIBE, OR COMMENT\n"
181
+ "4. NEVER SAY \"I can't\" or \"I cannot\" or \"no text\" or \"blank image\"\n"
182
+ "5. IF YOU SEE DOTS, OUTPUT THE DOTS: .\n"
183
+ "6. IF YOU SEE PUNCTUATION, OUTPUT THE PUNCTUATION\n"
184
+ "7. IF YOU SEE A SINGLE CHARACTER, OUTPUT THAT CHARACTER\n"
185
+ "8. IF YOU SEE NOTHING, OUTPUT NOTHING (empty response)\n\n"
186
+ "LANGUAGE PRESERVATION:\n"
187
+ "- Korean text → Output in Korean\n"
188
+ "- Japanese text → Output in Japanese\n"
189
+ "- Chinese text → Output in Chinese\n"
190
+ "- English text → Output in English\n"
191
+ "- CJK quotation marks (「」『』【】《》〈〉) → Preserve exactly as shown\n\n"
192
+ "FORMATTING:\n"
193
+ "- OUTPUT ALL TEXT ON A SINGLE LINE WITH NO LINE BREAKS\n"
194
+ "- NEVER use \\n or line breaks in your output\n\n"
195
+ "FORBIDDEN RESPONSES:\n"
196
+ "- \"I can see this appears to be...\"\n"
197
+ "- \"I cannot make out any clear text...\"\n"
198
+ "- \"This appears to be blank...\"\n"
199
+ "- \"If there is text present...\"\n"
200
+ "- ANY explanatory text\n\n"
201
+ "YOUR ONLY OUTPUT: The exact visible text. Nothing more. Nothing less.\n"
202
+ "If image has a dot → Output: .\n"
203
+ "If image has two dots → Output: . .\n"
204
+ "If image has text → Output: [that text]\n"
205
+ "If image is truly blank → Output: [empty/no response]"
206
+ ))
207
+
208
+ # Use existing temperature and token settings
209
+ self.temperature = float(os.environ.get('TRANSLATION_TEMPERATURE', '0.01'))
210
+ # NOTE: max_tokens is NOT cached here - it's read fresh from environment in detect_text()
211
+ # to ensure we always get the latest value from the GUI
212
+
213
+ # Image settings from existing compression variables
214
+ self.image_format = 'jpeg' if os.environ.get('IMAGE_COMPRESSION_FORMAT', 'auto') != 'png' else 'png'
215
+ self.image_quality = int(os.environ.get('JPEG_QUALITY', '100'))
216
+
217
+ # Simple defaults
218
+ self.api_format = 'openai' # Most custom endpoints are OpenAI-compatible
219
+ self.timeout = int(os.environ.get('CHUNK_TIMEOUT', '30'))
220
+ self.api_headers = {} # Additional custom headers
221
+
222
+ # Retry configuration for Custom API OCR calls
223
+ self.max_retries = int(os.environ.get('CUSTOM_OCR_MAX_RETRIES', '3'))
224
+ self.retry_initial_delay = float(os.environ.get('CUSTOM_OCR_RETRY_INITIAL_DELAY', '0.8'))
225
+ self.retry_backoff = float(os.environ.get('CUSTOM_OCR_RETRY_BACKOFF', '1.8'))
226
+ self.retry_jitter = float(os.environ.get('CUSTOM_OCR_RETRY_JITTER', '0.4'))
227
+ self.retry_on_empty = os.environ.get('CUSTOM_OCR_RETRY_ON_EMPTY', '1') == '1'
228
+
229
+ def check_installation(self) -> bool:
230
+ """Always installed - uses UnifiedClient"""
231
+ self.is_installed = True
232
+ return True
233
+
234
+ def install(self, progress_callback=None) -> bool:
235
+ """No installation needed for API-based provider"""
236
+ return self.check_installation()
237
+
238
+ def load_model(self, **kwargs) -> bool:
239
+ """Initialize UnifiedClient with current settings"""
240
+ try:
241
+ from unified_api_client import UnifiedClient
242
+
243
+ # Support passing API key from GUI if available
244
+ if 'api_key' in kwargs:
245
+ api_key = kwargs['api_key']
246
+ else:
247
+ api_key = os.environ.get('API_KEY', '') or os.environ.get('OPENAI_API_KEY', '')
248
+
249
+ if 'model' in kwargs:
250
+ model = kwargs['model']
251
+ else:
252
+ model = os.environ.get('MODEL', 'gpt-4o-mini')
253
+
254
+ if not api_key:
255
+ self._log("❌ No API key configured", "error")
256
+ return False
257
+
258
+ # Create UnifiedClient just like translations do
259
+ self.client = UnifiedClient(model=model, api_key=api_key)
260
+
261
+ #self._log(f"✅ Using {model} for OCR via UnifiedClient")
262
+ self.is_loaded = True
263
+ return True
264
+
265
+ except Exception as e:
266
+ self._log(f"❌ Failed to initialize UnifiedClient: {str(e)}", "error")
267
+ return False
268
+
269
+ def _test_connection(self) -> bool:
270
+ """Test API connection with a simple request"""
271
+ try:
272
+ # Create a small test image
273
+ test_image = np.ones((100, 100, 3), dtype=np.uint8) * 255
274
+ cv2.putText(test_image, "TEST", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
275
+
276
+ # Encode image
277
+ image_base64 = self._encode_image(test_image)
278
+
279
+ # Prepare test request based on API format
280
+ if self.api_format == 'openai':
281
+ test_payload = {
282
+ "model": self.model_name,
283
+ "messages": [
284
+ {
285
+ "role": "user",
286
+ "content": [
287
+ {"type": "text", "text": "What text do you see?"},
288
+ {"type": "image_url", "image_url": {"url": f"data:image/{self.image_format};base64,{image_base64}"}}
289
+ ]
290
+ }
291
+ ],
292
+ "max_tokens": 50
293
+ }
294
+ else:
295
+ # For other formats, just try a basic health check
296
+ return True
297
+
298
+ headers = self._prepare_headers()
299
+ response = requests.post(
300
+ self.api_url,
301
+ headers=headers,
302
+ json=test_payload,
303
+ timeout=10
304
+ )
305
+
306
+ return response.status_code == 200
307
+
308
+ except Exception:
309
+ return False
310
+
311
+ def _encode_image(self, image: np.ndarray) -> str:
312
+ """Encode numpy array to base64 string"""
313
+ # Convert BGR to RGB if needed
314
+ if len(image.shape) == 3 and image.shape[2] == 3:
315
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
316
+ else:
317
+ image_rgb = image
318
+
319
+ # Convert to PIL Image
320
+ pil_image = Image.fromarray(image_rgb)
321
+
322
+ # Save to bytes buffer
323
+ buffer = io.BytesIO()
324
+ if self.image_format.lower() == 'png':
325
+ pil_image.save(buffer, format='PNG')
326
+ else:
327
+ pil_image.save(buffer, format='JPEG', quality=self.image_quality)
328
+
329
+ # Encode to base64
330
+ buffer.seek(0)
331
+ image_base64 = base64.b64encode(buffer.read()).decode('utf-8')
332
+
333
+ return image_base64
334
+
335
+ def _prepare_headers(self) -> dict:
336
+ """Prepare request headers"""
337
+ headers = {
338
+ "Content-Type": "application/json"
339
+ }
340
+
341
+ # Add API key if configured
342
+ if self.api_key:
343
+ if self.api_format == 'anthropic':
344
+ headers["x-api-key"] = self.api_key
345
+ else:
346
+ headers["Authorization"] = f"Bearer {self.api_key}"
347
+
348
+ # Add any custom headers
349
+ headers.update(self.api_headers)
350
+
351
+ return headers
352
+
353
+ def _prepare_request_payload(self, image_base64: str) -> dict:
354
+ """Prepare request payload based on API format"""
355
+ if self.api_format == 'openai':
356
+ return {
357
+ "model": self.model_name,
358
+ "messages": [
359
+ {
360
+ "role": "user",
361
+ "content": [
362
+ {"type": "text", "text": self.ocr_prompt},
363
+ {
364
+ "type": "image_url",
365
+ "image_url": {
366
+ "url": f"data:image/{self.image_format};base64,{image_base64}"
367
+ }
368
+ }
369
+ ]
370
+ }
371
+ ],
372
+ "max_tokens": self.max_tokens,
373
+ "temperature": self.temperature
374
+ }
375
+
376
+ elif self.api_format == 'anthropic':
377
+ return {
378
+ "model": self.model_name,
379
+ "max_tokens": self.max_tokens,
380
+ "temperature": self.temperature,
381
+ "messages": [
382
+ {
383
+ "role": "user",
384
+ "content": [
385
+ {
386
+ "type": "text",
387
+ "text": self.ocr_prompt
388
+ },
389
+ {
390
+ "type": "image",
391
+ "source": {
392
+ "type": "base64",
393
+ "media_type": f"image/{self.image_format}",
394
+ "data": image_base64
395
+ }
396
+ }
397
+ ]
398
+ }
399
+ ]
400
+ }
401
+
402
+ else:
403
+ # Custom format - use environment variable for template
404
+ template = os.environ.get('CUSTOM_OCR_REQUEST_TEMPLATE', '{}')
405
+ payload = json.loads(template)
406
+
407
+ # Replace placeholders
408
+ payload_str = json.dumps(payload)
409
+ payload_str = payload_str.replace('{{IMAGE_BASE64}}', image_base64)
410
+ payload_str = payload_str.replace('{{PROMPT}}', self.ocr_prompt)
411
+ payload_str = payload_str.replace('{{MODEL}}', self.model_name)
412
+ payload_str = payload_str.replace('{{MAX_TOKENS}}', str(max_tokens))
413
+ payload_str = payload_str.replace('{{TEMPERATURE}}', str(self.temperature))
414
+
415
+ return json.loads(payload_str)
416
+
417
+ def _extract_text_from_response(self, response_data: dict) -> str:
418
+ """Extract text from API response based on format"""
419
+ try:
420
+ if self.api_format == 'openai':
421
+ # OpenAI format: response.choices[0].message.content
422
+ return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
423
+
424
+ elif self.api_format == 'anthropic':
425
+ # Anthropic format: response.content[0].text
426
+ content = response_data.get('content', [])
427
+ if content and isinstance(content, list):
428
+ return content[0].get('text', '')
429
+ return ''
430
+
431
+ else:
432
+ # Custom format - use environment variable for path
433
+ response_path = os.environ.get('CUSTOM_OCR_RESPONSE_PATH', 'text')
434
+
435
+ # Navigate through the response using the path
436
+ result = response_data
437
+ for key in response_path.split('.'):
438
+ if isinstance(result, dict):
439
+ result = result.get(key, '')
440
+ elif isinstance(result, list) and key.isdigit():
441
+ idx = int(key)
442
+ result = result[idx] if idx < len(result) else ''
443
+ else:
444
+ result = ''
445
+ break
446
+
447
+ return str(result)
448
+
449
+ except Exception as e:
450
+ self._log(f"Failed to extract text from response: {e}", "error")
451
+ return ''
452
+
453
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
454
+ """Process image using UnifiedClient.send_image()"""
455
+ results = []
456
+
457
+ try:
458
+ # CRITICAL: Reload OCR prompt from environment before each detection
459
+ # This ensures we use the latest prompt set by manga_integration.py
460
+ self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT', self.ocr_prompt)
461
+
462
+ # Get fresh max_tokens from environment - GUI will have set this
463
+ max_tokens = int(os.environ.get('MAX_OUTPUT_TOKENS', '8192'))
464
+ if not self.is_loaded:
465
+ if not self.load_model():
466
+ return results
467
+
468
+ import cv2
469
+ from PIL import Image
470
+ import base64
471
+ import io
472
+
473
+ # Validate and resize image if too small (consistent with Google/Azure logic)
474
+ h, w = image.shape[:2]
475
+ MIN_SIZE = 50 # Minimum dimension for good OCR quality
476
+ MIN_AREA = 2500 # Minimum area (50x50)
477
+
478
+ # Skip completely invalid/corrupted images (0 or negative dimensions)
479
+ if h <= 0 or w <= 0:
480
+ self._log(f"⚠️ Invalid image dimensions ({w}x{h}px), skipping", "warning")
481
+ return results
482
+
483
+ if h < MIN_SIZE or w < MIN_SIZE or h * w < MIN_AREA:
484
+ # Image too small - resize it
485
+ scale_w = MIN_SIZE / w if w < MIN_SIZE else 1.0
486
+ scale_h = MIN_SIZE / h if h < MIN_SIZE else 1.0
487
+ scale = max(scale_w, scale_h)
488
+
489
+ if scale > 1.0:
490
+ new_w = int(w * scale)
491
+ new_h = int(h * scale)
492
+ image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
493
+ self._log(f"🔍 Image resized from {w}x{h}px to {new_w}x{new_h}px for Custom API OCR", "debug")
494
+ h, w = new_h, new_w
495
+
496
+ # Convert numpy array to PIL Image
497
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
498
+ pil_image = Image.fromarray(image_rgb)
499
+
500
+ # Convert PIL Image to base64 string
501
+ buffer = io.BytesIO()
502
+
503
+ # Use the image format from settings
504
+ if self.image_format.lower() == 'png':
505
+ pil_image.save(buffer, format='PNG')
506
+ else:
507
+ pil_image.save(buffer, format='JPEG', quality=self.image_quality)
508
+
509
+ buffer.seek(0)
510
+ image_base64 = base64.b64encode(buffer.read()).decode('utf-8')
511
+
512
+ # For OpenAI vision models, we need BOTH:
513
+ # 1. System prompt with instructions
514
+ # 2. User message that includes the image
515
+ messages = [
516
+ {
517
+ "role": "system",
518
+ "content": self.ocr_prompt # The OCR instruction as system prompt
519
+ },
520
+ {
521
+ "role": "user",
522
+ "content": [
523
+ {
524
+ "type": "text",
525
+ "text": "Image:" # Minimal text, just to have something
526
+ },
527
+ {
528
+ "type": "image_url",
529
+ "image_url": {
530
+ "url": f"data:image/jpeg;base64,{image_base64}"
531
+ }
532
+ }
533
+ ]
534
+ }
535
+ ]
536
+
537
+ # Now send this properly formatted message
538
+ # The UnifiedClient should handle this correctly
539
+ # But we're NOT using send_image, we're using regular send
540
+
541
+ # Retry-aware call
542
+ from unified_api_client import UnifiedClientError # local import to avoid hard dependency at module import time
543
+ max_attempts = max(1, self.max_retries)
544
+ attempt = 0
545
+ last_error = None
546
+
547
+ # Common refusal/error phrases that indicate a non-OCR response (expanded list)
548
+ refusal_phrases = [
549
+ "I can't extract", "I cannot extract",
550
+ "I'm sorry", "I am sorry",
551
+ "I'm unable", "I am unable",
552
+ "cannot process images",
553
+ "I can't help with that",
554
+ "cannot view images",
555
+ "no text in the image",
556
+ "I can see this appears",
557
+ "I cannot make out",
558
+ "appears to be blank",
559
+ "appears to be a mostly blank",
560
+ "mostly blank or white image",
561
+ "If there is text present",
562
+ "too small, faint, or unclear",
563
+ "cannot accurately extract",
564
+ "may be too",
565
+ "However, I cannot",
566
+ "I don't see any",
567
+ "no clear text",
568
+ "no visible text",
569
+ "does not contain",
570
+ "doesn't contain",
571
+ "I do not see"
572
+ ]
573
+
574
+ while attempt < max_attempts:
575
+ # Check for stop before each attempt
576
+ if self._check_stop():
577
+ self._log("⏹️ OCR processing stopped by user", "warning")
578
+ return results
579
+
580
+ try:
581
+ response = self.client.send(
582
+ messages=messages,
583
+ temperature=self.temperature,
584
+ max_tokens=max_tokens
585
+ )
586
+
587
+ # Extract content from response object
588
+ content, finish_reason = response
589
+
590
+ # Validate content
591
+ has_content = bool(content and str(content).strip())
592
+ refused = False
593
+ if has_content:
594
+ # Filter out explicit failure markers
595
+ if "[" in content and "FAILED]" in content:
596
+ refused = True
597
+ elif any(phrase.lower() in content.lower() for phrase in refusal_phrases):
598
+ refused = True
599
+
600
+ # Decide success or retry
601
+ if has_content and not refused:
602
+ text = str(content).strip()
603
+ results.append(OCRResult(
604
+ text=text,
605
+ bbox=(0, 0, w, h),
606
+ confidence=kwargs.get('confidence', 0.85),
607
+ vertices=[(0, 0), (w, 0), (w, h), (0, h)]
608
+ ))
609
+ self._log(f"✅ Detected: {text[:50]}...")
610
+ break # success
611
+ else:
612
+ reason = "empty result" if not has_content else "refusal/non-OCR response"
613
+ last_error = f"{reason} (finish_reason: {finish_reason})"
614
+ # Check if we should retry on empty or refusal
615
+ should_retry = (not has_content and self.retry_on_empty) or refused
616
+ attempt += 1
617
+ if attempt >= max_attempts or not should_retry:
618
+ # No more retries or shouldn't retry
619
+ if not has_content:
620
+ self._log(f"⚠️ No text detected (finish_reason: {finish_reason})")
621
+ else:
622
+ self._log(f"❌ Model returned non-OCR response: {str(content)[:120]}", "warning")
623
+ break
624
+ # Backoff before retrying
625
+ delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
626
+ self._log(f"🔄 Retry {attempt}/{max_attempts - 1} after {delay:.1f}s due to {reason}...", "warning")
627
+ time.sleep(delay)
628
+ time.sleep(0.1) # Brief pause for stability
629
+ self._log("💤 OCR retry pausing briefly for stability", "debug")
630
+ continue
631
+
632
+ except UnifiedClientError as ue:
633
+ msg = str(ue)
634
+ last_error = msg
635
+ # Do not retry on explicit user cancellation
636
+ if 'cancelled' in msg.lower() or 'stopped by user' in msg.lower():
637
+ self._log(f"❌ OCR cancelled: {msg}", "error")
638
+ break
639
+ attempt += 1
640
+ if attempt >= max_attempts:
641
+ self._log(f"❌ OCR failed after {attempt} attempts: {msg}", "error")
642
+ break
643
+ delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
644
+ self._log(f"🔄 API error, retry {attempt}/{max_attempts - 1} after {delay:.1f}s: {msg}", "warning")
645
+ time.sleep(delay)
646
+ time.sleep(0.1) # Brief pause for stability
647
+ self._log("💤 OCR API error retry pausing briefly for stability", "debug")
648
+ continue
649
+ except Exception as e_inner:
650
+ last_error = str(e_inner)
651
+ attempt += 1
652
+ if attempt >= max_attempts:
653
+ self._log(f"❌ OCR exception after {attempt} attempts: {last_error}", "error")
654
+ break
655
+ delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
656
+ self._log(f"🔄 Exception, retry {attempt}/{max_attempts - 1} after {delay:.1f}s: {last_error}", "warning")
657
+ time.sleep(delay)
658
+ time.sleep(0.1) # Brief pause for stability
659
+ self._log("💤 OCR exception retry pausing briefly for stability", "debug")
660
+ continue
661
+
662
+ except Exception as e:
663
+ self._log(f"❌ Error: {str(e)}", "error")
664
+ import traceback
665
+ self._log(traceback.format_exc(), "debug")
666
+
667
+ return results
668
+
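+ # Illustrative note (not part of the uploaded file): with the default retry settings above
+ # (initial delay 0.8s, backoff 1.8, jitter 0.4), the wait before retry n is roughly:
+ #   delay(n) = 0.8 * 1.8 ** (n - 1) + uniform(0, 0.4)
+ #   # n=1 -> 0.8-1.2s, n=2 -> 1.44-1.84s, n=3 -> 2.59-2.99s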
669
+ class MangaOCRProvider(OCRProvider):
670
+ """Manga OCR provider using HuggingFace model directly"""
671
+
672
+ def __init__(self, log_callback=None):
673
+ super().__init__(log_callback)
674
+ self.processor = None
675
+ self.model = None
676
+ self.tokenizer = None
677
+
678
+ def check_installation(self) -> bool:
679
+ """Check if transformers is installed"""
680
+ try:
681
+ import transformers
682
+ import torch
683
+ self.is_installed = True
684
+ return True
685
+ except ImportError:
686
+ return False
687
+
688
+ def install(self, progress_callback=None) -> bool:
689
+ """Install transformers and torch"""
690
+ # Dependencies are installed separately; report availability instead of returning None
+ return self.check_installation()
691
+
692
+ def _is_valid_local_model_dir(self, path: str) -> bool:
693
+ """Check that a local HF model directory has required files."""
694
+ try:
695
+ if not path or not os.path.isdir(path):
696
+ return False
697
+ needed_any_weights = any(
698
+ os.path.exists(os.path.join(path, name)) for name in (
699
+ 'pytorch_model.bin',
700
+ 'model.safetensors'
701
+ )
702
+ )
703
+ has_config = os.path.exists(os.path.join(path, 'config.json'))
704
+ has_processor = (
705
+ os.path.exists(os.path.join(path, 'preprocessor_config.json')) or
706
+ os.path.exists(os.path.join(path, 'processor_config.json'))
707
+ )
708
+ has_tokenizer = (
709
+ os.path.exists(os.path.join(path, 'tokenizer.json')) or
710
+ os.path.exists(os.path.join(path, 'tokenizer_config.json'))
711
+ )
712
+ return has_config and needed_any_weights and has_processor and has_tokenizer
713
+ except Exception:
714
+ return False
715
+
716
+ def load_model(self, **kwargs) -> bool:
717
+ """Load the manga-ocr model, preferring a local directory to avoid re-downloading"""
718
+ print("\n>>> MangaOCRProvider.load_model() called")
719
+ try:
720
+ if not self.is_installed and not self.check_installation():
721
+ print("ERROR: Transformers not installed")
722
+ self._log("❌ Transformers not installed", "error")
723
+ return False
724
+
725
+ # Always disable progress bars to avoid tqdm issues in some environments
726
+ import os
727
+ os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
728
+
729
+ from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoImageProcessor
730
+ import torch
731
+
732
+ # Prefer a local model directory if present to avoid any Hub access
733
+ candidates = []
734
+ env_local = os.environ.get("MANGA_OCR_LOCAL_DIR")
735
+ if env_local:
736
+ candidates.append(env_local)
737
+
738
+ # Project root one level up from this file
739
+ root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
740
+ candidates.append(os.path.join(root_dir, 'models', 'manga-ocr-base'))
741
+ candidates.append(os.path.join(root_dir, 'models', 'kha-white', 'manga-ocr-base'))
742
+
743
+ model_source = None
744
+ local_only = False
745
+ # Find a valid local dir
746
+ for cand in candidates:
747
+ if self._is_valid_local_model_dir(cand):
748
+ model_source = cand
749
+ local_only = True
750
+ break
751
+
752
+ # If no valid local dir, use Hub
753
+ if not model_source:
754
+ model_source = "kha-white/manga-ocr-base"
755
+ # Make sure we are not forcing offline mode
756
+ if os.environ.get("HF_HUB_OFFLINE") == "1":
757
+ try:
758
+ del os.environ["HF_HUB_OFFLINE"]
759
+ except Exception:
760
+ pass
761
+ self._log("🔥 Loading manga-ocr model from Hugging Face Hub")
762
+ self._log(f" Repo: {model_source}")
763
+ else:
764
+ # Only set offline when local dir is fully valid
765
+ os.environ.setdefault("HF_HUB_OFFLINE", "1")
766
+ self._log("🔥 Loading manga-ocr model from local directory")
767
+ self._log(f" Local path: {model_source}")
768
+
769
+ # Decide target device once; we will move after full CPU load to avoid meta tensors
770
+ use_cuda = torch.cuda.is_available()
771
+
772
+ # Try loading components, falling back to Hub if local-only fails
773
+ def _load_components(source: str, local_flag: bool):
774
+ self._log(" Loading tokenizer...")
775
+ tok = AutoTokenizer.from_pretrained(source, local_files_only=local_flag)
776
+
777
+ self._log(" Loading image processor...")
778
+ try:
779
+ from transformers import AutoProcessor
780
+ except Exception:
781
+ AutoProcessor = None
782
+ try:
783
+ proc = AutoImageProcessor.from_pretrained(source, local_files_only=local_flag)
784
+ except Exception as e_proc:
785
+ if AutoProcessor is not None:
786
+ self._log(f" ⚠️ AutoImageProcessor failed: {e_proc}. Trying AutoProcessor...", "warning")
787
+ proc = AutoProcessor.from_pretrained(source, local_files_only=local_flag)
788
+ else:
789
+ raise
790
+
791
+ self._log(" Loading model...")
792
+ # Prevent meta tensors by forcing full materialization on CPU at load time
793
+ os.environ.setdefault('TORCHDYNAMO_DISABLE', '1')
794
+ mdl = VisionEncoderDecoderModel.from_pretrained(
795
+ source,
796
+ local_files_only=local_flag,
797
+ low_cpu_mem_usage=False,
798
+ device_map=None,
799
+ torch_dtype=torch.float32 # Use torch_dtype instead of dtype
800
+ )
801
+ return tok, proc, mdl
802
+
803
+ try:
804
+ self.tokenizer, self.processor, self.model = _load_components(model_source, local_only)
805
+ except Exception as e_local:
806
+ if local_only:
807
+ # Fallback to Hub once if local fails
808
+ self._log(f" ⚠️ Local model load failed: {e_local}", "warning")
809
+ try:
810
+ if os.environ.get("HF_HUB_OFFLINE") == "1":
811
+ del os.environ["HF_HUB_OFFLINE"]
812
+ except Exception:
813
+ pass
814
+ model_source = "kha-white/manga-ocr-base"
815
+ local_only = False
816
+ self._log(" Retrying from Hugging Face Hub...")
817
+ self.tokenizer, self.processor, self.model = _load_components(model_source, local_only)
818
+ else:
819
+ raise
820
+
821
+ # Move to CUDA only after full CPU materialization
822
+ target_device = 'cpu'
823
+ if use_cuda:
824
+ try:
825
+ self.model = self.model.to('cuda')
826
+ target_device = 'cuda'
827
+ except Exception as move_err:
828
+ self._log(f" ⚠️ Could not move model to CUDA: {move_err}", "warning")
829
+ target_device = 'cpu'
830
+
831
+ # Finalize eval mode
832
+ self.model.eval()
833
+
834
+ # Sanity-check: ensure no parameter remains on 'meta' device
835
+ try:
836
+ for n, p in self.model.named_parameters():
837
+ dev = getattr(p, 'device', None)
838
+ if dev is not None and getattr(dev, 'type', '') == 'meta':
839
+ raise RuntimeError(f"Parameter {n} is on 'meta' after load")
840
+ except Exception as sanity_err:
841
+ self._log(f"❌ Manga-OCR model load sanity check failed: {sanity_err}", "error")
842
+ return False
843
+
844
+ print(f"SUCCESS: Model loaded on {target_device.upper()}")
845
+ self._log(f" ✅ Model loaded on {target_device.upper()}")
846
+ self.is_loaded = True
847
+ self._log("✅ Manga OCR model ready")
848
+ print(">>> Returning True from load_model()")
849
+ return True
850
+
851
+ except Exception as e:
852
+ print(f"\nEXCEPTION in load_model: {e}")
853
+ import traceback
854
+ print(traceback.format_exc())
855
+ self._log(f"❌ Failed to load manga-ocr model: {str(e)}", "error")
856
+ self._log(traceback.format_exc(), "error")
857
+ try:
858
+ if 'local_only' in locals() and local_only:
859
+ self._log("Hint: Local load failed. Ensure your models/manga-ocr-base contains required files (config.json, preprocessor_config.json, tokenizer.json or tokenizer_config.json, and model weights).", "warning")
860
+ except Exception:
861
+ pass
862
+ return False
863
+
864
+ def _run_ocr(self, pil_image):
865
+ """Run OCR on a PIL image using the HuggingFace model"""
866
+ import torch
867
+
868
+ # Process image (keyword arg for broader compatibility across transformers versions)
869
+ inputs = self.processor(images=pil_image, return_tensors="pt")
870
+ pixel_values = inputs["pixel_values"]
871
+
872
+ # Move to same device as model
873
+ try:
874
+ model_device = next(self.model.parameters()).device
875
+ except StopIteration:
876
+ model_device = torch.device('cpu')
877
+ pixel_values = pixel_values.to(model_device)
878
+
879
+ # Generate text
880
+ with torch.no_grad():
881
+ generated_ids = self.model.generate(pixel_values)
882
+
883
+ # Decode
884
+ generated_text = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
885
+
886
+ return generated_text
887
+
888
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
889
+ """
890
+ Process the image region passed to it.
891
+ This could be a bubble region or the full image.
892
+ """
893
+ results = []
894
+
895
+ # Check for stop at start
896
+ if self._check_stop():
897
+ self._log("⏹️ Manga-OCR processing stopped by user", "warning")
898
+ return results
899
+
900
+ try:
901
+ if not self.is_loaded:
902
+ if not self.load_model():
903
+ return results
904
+
905
+ import cv2
906
+ from PIL import Image
907
+
908
+ # Get confidence from kwargs
909
+ confidence = kwargs.get('confidence', 0.7)
910
+
911
+ # Convert numpy array to PIL
912
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
913
+ pil_image = Image.fromarray(image_rgb)
914
+ h, w = image.shape[:2]
915
+
916
+ self._log("🔍 Processing region with manga-ocr...")
917
+
918
+ # Check for stop before inference
919
+ if self._check_stop():
920
+ self._log("⏹️ Manga-OCR inference stopped by user", "warning")
921
+ return results
922
+
923
+ # Run OCR on the image region
924
+ text = self._run_ocr(pil_image)
925
+
926
+ if text and text.strip():
927
+ # Return result for this region with its actual bbox
928
+ results.append(OCRResult(
929
+ text=text.strip(),
930
+ bbox=(0, 0, w, h), # Relative to the region passed in
931
+ confidence=confidence,
932
+ vertices=[(0, 0), (w, 0), (w, h), (0, h)]
933
+ ))
934
+ self._log(f"✅ Detected text: {text[:50]}...")
935
+
936
+ except Exception as e:
937
+ self._log(f"❌ Error in manga-ocr: {str(e)}", "error")
938
+
939
+ return results
940
+
941
+ class Qwen2VL(OCRProvider):
942
+ """OCR using Qwen2-VL - Vision Language Model that can read Korean text"""
943
+
944
+ def __init__(self, log_callback=None):
945
+ super().__init__(log_callback)
946
+ self.processor = None
947
+ self.model = None
948
+ self.tokenizer = None
949
+
950
+ # Get OCR prompt from environment or use default (UPDATED: Improved prompt)
951
+ self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT',
952
+ "YOU ARE A TEXT EXTRACTION MACHINE. EXTRACT EXACTLY WHAT YOU SEE.\n\n"
953
+ "ABSOLUTE RULES:\n"
954
+ "1. OUTPUT ONLY THE VISIBLE TEXT/SYMBOLS - NOTHING ELSE\n"
955
+ "2. NEVER TRANSLATE OR MODIFY\n"
956
+ "3. NEVER EXPLAIN, DESCRIBE, OR COMMENT\n"
957
+ "4. NEVER SAY \"I can't\" or \"I cannot\" or \"no text\" or \"blank image\"\n"
958
+ "5. IF YOU SEE DOTS, OUTPUT THE DOTS: .\n"
959
+ "6. IF YOU SEE PUNCTUATION, OUTPUT THE PUNCTUATION\n"
960
+ "7. IF YOU SEE A SINGLE CHARACTER, OUTPUT THAT CHARACTER\n"
961
+ "8. IF YOU SEE NOTHING, OUTPUT NOTHING (empty response)\n\n"
962
+ "LANGUAGE PRESERVATION:\n"
963
+ "- Korean text → Output in Korean\n"
964
+ "- Japanese text → Output in Japanese\n"
965
+ "- Chinese text → Output in Chinese\n"
966
+ "- English text → Output in English\n"
967
+ "- CJK quotation marks (「」『』【】《》〈〉) → Preserve exactly as shown\n\n"
968
+ "FORMATTING:\n"
969
+ "- OUTPUT ALL TEXT ON A SINGLE LINE WITH NO LINE BREAKS\n"
970
+ "- NEVER use \\n or line breaks in your output\n\n"
971
+ "FORBIDDEN RESPONSES:\n"
972
+ "- \"I can see this appears to be...\"\n"
973
+ "- \"I cannot make out any clear text...\"\n"
974
+ "- \"This appears to be blank...\"\n"
975
+ "- \"If there is text present...\"\n"
976
+ "- ANY explanatory text\n\n"
977
+ "YOUR ONLY OUTPUT: The exact visible text. Nothing more. Nothing less.\n"
978
+ "If image has a dot → Output: .\n"
979
+ "If image has two dots → Output: . .\n"
980
+ "If image has text → Output: [that text]\n"
981
+ "If image is truly blank → Output: [empty/no response]"
982
+ )
983
+
984
+ def set_ocr_prompt(self, prompt: str):
985
+ """Allow setting the OCR prompt dynamically"""
986
+ self.ocr_prompt = prompt
987
+
988
+ def check_installation(self) -> bool:
989
+ """Check if required packages are installed"""
990
+ try:
991
+ import transformers
992
+ import torch
993
+ self.is_installed = True
994
+ return True
995
+ except ImportError:
996
+ return False
997
+
998
+ def install(self, progress_callback=None) -> bool:
999
+ """Install requirements for Qwen2-VL"""
1000
+ # Dependencies are installed separately; report availability instead of returning None
+ return self.check_installation()
1001
+
1002
+ def load_model(self, model_size=None, **kwargs) -> bool:
1003
+ """Load Qwen2-VL model with size selection"""
1004
+ self._log(f"DEBUG: load_model called with model_size={model_size}")
1005
+
1006
+ try:
1007
+ if not self.is_installed and not self.check_installation():
1008
+ self._log("❌ Not installed", "error")
1009
+ return False
1010
+
1011
+ self._log("🔥 Loading Qwen2-VL for Advanced OCR...")
1012
+
1013
+
1014
+
1015
+ from transformers import AutoProcessor, AutoTokenizer
1016
+ import torch
1017
+
1018
+ # Model options
1019
+ model_options = {
1020
+ "1": "Qwen/Qwen2-VL-2B-Instruct",
1021
+ "2": "Qwen/Qwen2-VL-7B-Instruct",
1022
+ "3": "Qwen/Qwen2-VL-72B-Instruct",
1023
+ "4": "custom"
1024
+ }
1025
+ # Default model size is option "1" (2B) unless a saved preference or argument overrides it
1026
+ # Check for saved preference first
1027
+ if model_size is None:
1028
+ # Try to get from environment or config
1029
+ import os
1030
+ model_size = os.environ.get('QWEN2VL_MODEL_SIZE', '1')
1031
+
1032
+ # Determine which model to load
1033
+ if model_size and str(model_size).startswith("custom:"):
1034
+ # Custom model passed with ID
1035
+ model_id = str(model_size).replace("custom:", "")
1036
+ self.loaded_model_size = "Custom"
1037
+ self.model_id = model_id
1038
+ self._log(f"Loading custom model: {model_id}")
1039
+ elif model_size == "4":
1040
+ # Custom option selected but no ID - shouldn't happen
1041
+ self._log("❌ Custom model selected but no ID provided", "error")
1042
+ return False
1043
+ elif model_size and str(model_size) in model_options:
1044
+ # Standard model option
1045
+ option = model_options[str(model_size)]
1046
+ if option == "custom":
1047
+ self._log("❌ Custom model needs an ID", "error")
1048
+ return False
1049
+ model_id = option
1050
+ # Set loaded_model_size for status display
1051
+ if model_size == "1":
1052
+ self.loaded_model_size = "2B"
1053
+ elif model_size == "2":
1054
+ self.loaded_model_size = "7B"
1055
+ elif model_size == "3":
1056
+ self.loaded_model_size = "72B"
1057
+ else:
1058
+ # Fall back to the 2B model (option "1") when no valid size is specified
1059
+ model_id = model_options["1"]
1060
+ self.loaded_model_size = "2B"
1061
+ self._log("No model size specified, defaulting to 2B")
1062
+
1063
+ self._log(f"Loading model: {model_id}")
1064
+
1065
+ # Load processor and tokenizer
1066
+ self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
1067
+ self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1068
+
1069
+ # Load the model - let it figure out the class dynamically
1070
+ if torch.cuda.is_available():
1071
+ self._log(f"GPU: {torch.cuda.get_device_name(0)}")
1072
+ # Use auto model class
1073
+ from transformers import AutoModelForVision2Seq
1074
+ self.model = AutoModelForVision2Seq.from_pretrained(
1075
+ model_id,
1076
+ dtype=torch.float16,
1077
+ device_map="auto",
1078
+ trust_remote_code=True
1079
+ )
1080
+ self._log("✅ Model loaded on GPU")
1081
+ else:
1082
+ self._log("Loading on CPU...")
1083
+ from transformers import AutoModelForVision2Seq
1084
+ self.model = AutoModelForVision2Seq.from_pretrained(
1085
+ model_id,
1086
+ dtype=torch.float32,
1087
+ trust_remote_code=True
1088
+ )
1089
+ self._log("✅ Model loaded on CPU")
1090
+
1091
+ self.model.eval()
1092
+ self.is_loaded = True
1093
+ self._log("✅ Qwen2-VL ready for Advanced OCR!")
1094
+ return True
1095
+
1096
+ except Exception as e:
1097
+ self._log(f"❌ Failed to load: {str(e)}", "error")
1098
+ import traceback
1099
+ self._log(traceback.format_exc(), "debug")
1100
+ return False
1101
+
1102
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1103
+ """Process image with Qwen2-VL for Korean text extraction"""
1104
+ results = []
1105
+ if hasattr(self, 'model_id'):
1106
+ self._log(f"DEBUG: Using model: {self.model_id}", "debug")
1107
+
1108
+ # Check if OCR prompt was passed in kwargs (for dynamic updates)
1109
+ if 'ocr_prompt' in kwargs:
1110
+ self.ocr_prompt = kwargs['ocr_prompt']
1111
+
1112
+ try:
1113
+ if not self.is_loaded:
1114
+ if not self.load_model():
1115
+ return results
1116
+
1117
+ import cv2
1118
+ from PIL import Image
1119
+ import torch
1120
+
1121
+ # Convert to PIL
1122
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
1123
+ pil_image = Image.fromarray(image_rgb)
1124
+ h, w = image.shape[:2]
1125
+
1126
+ self._log(f"🔍 Processing with Qwen2-VL ({w}x{h} pixels)...")
1127
+
1128
+ # Use the configurable OCR prompt
1129
+ messages = [
1130
+ {
1131
+ "role": "user",
1132
+ "content": [
1133
+ {
1134
+ "type": "image",
1135
+ "image": pil_image,
1136
+ },
1137
+ {
1138
+ "type": "text",
1139
+ "text": self.ocr_prompt # Use the configurable prompt
1140
+ }
1141
+ ]
1142
+ }
1143
+ ]
1144
+
1145
+ # Alternative simpler prompt if the above still causes issues:
1146
+ # "text": "OCR: Extract text as-is"
1147
+
1148
+ # Process with Qwen2-VL
1149
+ text = self.processor.apply_chat_template(
1150
+ messages,
1151
+ tokenize=False,
1152
+ add_generation_prompt=True
1153
+ )
1154
+
1155
+ inputs = self.processor(
1156
+ text=[text],
1157
+ images=[pil_image],
1158
+ padding=True,
1159
+ return_tensors="pt"
1160
+ )
1161
+
1162
+ # Get the device and dtype the model is currently on
1163
+ model_device = next(self.model.parameters()).device
1164
+ model_dtype = next(self.model.parameters()).dtype
1165
+
1166
+ # Move inputs to the same device as the model and cast float tensors to model dtype
1167
+ try:
1168
+ # Move first
1169
+ inputs = inputs.to(model_device)
1170
+ # Then align dtypes only for floating tensors (e.g., pixel_values)
1171
+ for k, v in inputs.items():
1172
+ if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
1173
+ inputs[k] = v.to(model_dtype)
1174
+ except Exception:
1175
+ # Fallback: ensure at least pixel_values is correct if present
1176
+ try:
1177
+ if isinstance(inputs, dict) and "pixel_values" in inputs:
1178
+ pv = inputs["pixel_values"].to(model_device)
1179
+ if torch.is_floating_point(pv):
1180
+ inputs["pixel_values"] = pv.to(model_dtype)
1181
+ except Exception:
1182
+ pass
1183
+
1184
+ # Ensure pixel_values explicitly matches model dtype if present
1185
+ try:
1186
+ if isinstance(inputs, dict) and "pixel_values" in inputs:
1187
+ inputs["pixel_values"] = inputs["pixel_values"].to(device=model_device, dtype=model_dtype)
1188
+ except Exception:
1189
+ pass
1190
+
1191
+ # Generate text with stricter parameters to avoid creative responses
1192
+ use_amp = (hasattr(torch, 'cuda') and model_device.type == 'cuda' and model_dtype in (torch.float16, torch.bfloat16))
1193
+ autocast_dev = 'cuda' if model_device.type == 'cuda' else 'cpu'
1194
+ autocast_dtype = model_dtype if model_dtype in (torch.float16, torch.bfloat16) else None
1195
+
1196
+ with torch.no_grad():
1197
+ if use_amp and autocast_dtype is not None:
1198
+ with torch.autocast(autocast_dev, dtype=autocast_dtype):
1199
+ generated_ids = self.model.generate(
1200
+ **inputs,
1201
+ max_new_tokens=128, # Reduced from 512 - manga bubbles are typically short
1202
+ do_sample=False, # Keep deterministic
1203
+ temperature=0.01, # Keep your very low temperature
1204
+ top_p=1.0, # Keep no nucleus sampling
1205
+ repetition_penalty=1.0, # Keep no repetition penalty
1206
+ num_beams=1, # Ensure greedy decoding (faster than beam search)
1207
+ use_cache=True, # Enable KV cache for speed
1208
+ early_stopping=True, # Stop at EOS token
1209
+ pad_token_id=self.tokenizer.pad_token_id, # Proper padding
1210
+ eos_token_id=self.tokenizer.eos_token_id, # Proper stopping
1211
+ )
1212
+ else:
1213
+ generated_ids = self.model.generate(
1214
+ **inputs,
1215
+ max_new_tokens=128, # Reduced from 512 - manga bubbles are typically short
1216
+ do_sample=False, # Keep deterministic
1217
+ temperature=0.01, # Keep your very low temperature
1218
+ top_p=1.0, # Keep no nucleus sampling
1219
+ repetition_penalty=1.0, # Keep no repetition penalty
1220
+ num_beams=1, # Ensure greedy decoding (faster than beam search)
1221
+ use_cache=True, # Enable KV cache for speed
1222
+ early_stopping=True, # Stop at EOS token
1223
+ pad_token_id=self.tokenizer.pad_token_id, # Proper padding
1224
+ eos_token_id=self.tokenizer.eos_token_id, # Proper stopping
1225
+ )
1226
+
1227
+ # Decode the output
1228
+ generated_ids_trimmed = [
1229
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
1230
+ ]
1231
+ output_text = self.processor.batch_decode(
1232
+ generated_ids_trimmed,
1233
+ skip_special_tokens=True,
1234
+ clean_up_tokenization_spaces=False
1235
+ )[0]
1236
+
1237
+ if output_text and output_text.strip():
1238
+ text = output_text.strip()
1239
+
1240
+ # ADDED: Filter out any response that looks like an explanation or apology
1241
+ # Common patterns that indicate the model is being "helpful" instead of just extracting
1242
+ unwanted_patterns = [
1243
+ "죄송합니다", # "I apologize"
1244
+ "sorry",
1245
+ "apologize",
1246
+ "이미지에는", # "in this image"
1247
+ "텍스트가 없습니다", # "there is no text"
1248
+ "I cannot",
1249
+ "I don't see",
1250
+ "There is no",
1251
+ "질문이 있으시면", # "if you have questions"
1252
+ ]
1253
+
1254
+ # Check if response contains unwanted patterns
1255
+ text_lower = text.lower()
1256
+ is_explanation = any(pattern.lower() in text_lower for pattern in unwanted_patterns)
1257
+
1258
+ # Also check if the response is suspiciously long for a bubble
1259
+ # Most manga bubbles are short, if we get 50+ chars it might be an explanation
1260
+ is_too_long = len(text) > 100 and ('.' in text or ',' in text or '!' in text)
1261
+
1262
+ if is_explanation or is_too_long:
1263
+ self._log(f"⚠️ Model returned explanation instead of text, ignoring", "warning")
1264
+ # Return empty result or just skip this region
1265
+ return results
1266
+
1267
+ # Check language
1268
+ has_korean = any('\uAC00' <= c <= '\uD7AF' for c in text)
1269
+ has_japanese = any('\u3040' <= c <= '\u309F' or '\u30A0' <= c <= '\u30FF' for c in text)
1270
+ has_chinese = any('\u4E00' <= c <= '\u9FFF' for c in text)
1271
+
1272
+ if has_korean:
1273
+ self._log(f"✅ Korean detected: {text[:50]}...")
1274
+ elif has_japanese:
1275
+ self._log(f"✅ Japanese detected: {text[:50]}...")
1276
+ elif has_chinese:
1277
+ self._log(f"✅ Chinese detected: {text[:50]}...")
1278
+ else:
1279
+ self._log(f"✅ Text: {text[:50]}...")
1280
+
1281
+ results.append(OCRResult(
1282
+ text=text,
1283
+ bbox=(0, 0, w, h),
1284
+ confidence=0.9,
1285
+ vertices=[(0, 0), (w, 0), (w, h), (0, h)]
1286
+ ))
1287
+ else:
1288
+ self._log("⚠️ No text detected", "warning")
1289
+
1290
+ except Exception as e:
1291
+ self._log(f"❌ Error: {str(e)}", "error")
1292
+ import traceback
1293
+ self._log(traceback.format_exc(), "debug")
1294
+
1295
+ return results
1296
+
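+ # Illustrative note (not part of the uploaded file): the Qwen2-VL size is chosen via
+ # QWEN2VL_MODEL_SIZE ("1" = 2B, "2" = 7B, "3" = 72B) or passed explicitly, e.g.:
+ #   os.environ["QWEN2VL_MODEL_SIZE"] = "2"      # next load_model() picks Qwen/Qwen2-VL-7B-Instruct
+ #   provider.load_model(model_size="custom:my-org/my-qwen2vl-finetune")  # hypothetical custom repo id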
1297
+ class EasyOCRProvider(OCRProvider):
1298
+ """EasyOCR provider for multiple languages"""
1299
+
1300
+ def __init__(self, log_callback=None, languages=None):
1301
+ super().__init__(log_callback)
1302
+ # Default to safe language combination
1303
+ self.languages = languages or ['ja', 'en'] # Safe default
1304
+ self._validate_language_combination()
1305
+
1306
+ def _validate_language_combination(self):
1307
+ """Validate and fix EasyOCR language combinations"""
1308
+ # EasyOCR language compatibility rules
1309
+ incompatible_pairs = [
1310
+ (['ja', 'ko'], 'Japanese and Korean cannot be used together'),
1311
+ (['ja', 'zh'], 'Japanese and Chinese cannot be used together'),
1312
+ (['ko', 'zh'], 'Korean and Chinese cannot be used together')
1313
+ ]
1314
+
1315
+ for incompatible, reason in incompatible_pairs:
1316
+ if all(lang in self.languages for lang in incompatible):
1317
+ self._log(f"⚠️ EasyOCR: {reason}", "warning")
1318
+ # Keep first language + English
1319
+ self.languages = [self.languages[0], 'en']
1320
+ self._log(f"🔧 Auto-adjusted to: {self.languages}", "info")
1321
+ break
1322
+
1323
+ def check_installation(self) -> bool:
1324
+ """Check if easyocr is installed"""
1325
+ try:
1326
+ import easyocr
1327
+ self.is_installed = True
1328
+ return True
1329
+ except ImportError:
1330
+ return False
1331
+
1332
+     def install(self, progress_callback=None) -> bool:
+         """Install easyocr (requires manual pip install)"""
+         if progress_callback:
+             progress_callback("EasyOCR requires manual pip installation")
+         self._log("Run: pip install easyocr", "info")
+         return False
1335
+
1336
+ def load_model(self, **kwargs) -> bool:
1337
+ """Load easyocr model"""
1338
+ try:
1339
+ if not self.is_installed and not self.check_installation():
1340
+ self._log("❌ easyocr not installed", "error")
1341
+ return False
1342
+
1343
+ self._log(f"🔥 Loading easyocr model for languages: {self.languages}...")
1344
+ import easyocr
1345
+
1346
+ # This will download models on first run
1347
+ self.model = easyocr.Reader(self.languages, gpu=True)
1348
+ self.is_loaded = True
1349
+
1350
+ self._log("✅ easyocr model loaded successfully")
1351
+ return True
1352
+
1353
+ except Exception as e:
1354
+ self._log(f"❌ Failed to load easyocr: {str(e)}", "error")
1355
+ # Try CPU mode if GPU fails
1356
+ try:
1357
+ import easyocr
1358
+ self.model = easyocr.Reader(self.languages, gpu=False)
1359
+ self.is_loaded = True
1360
+ self._log("✅ easyocr loaded in CPU mode")
1361
+ return True
1362
+             except Exception as cpu_error:
+                 self._log(f"❌ easyocr CPU fallback also failed: {str(cpu_error)}", "error")
+                 return False
1364
+
1365
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1366
+ """Detect text using easyocr"""
1367
+ results = []
1368
+
1369
+ try:
1370
+ if not self.is_loaded:
1371
+ if not self.load_model():
1372
+ return results
1373
+
1374
+ # EasyOCR can work directly with numpy arrays
1375
+ ocr_results = self.model.readtext(image, detail=1)
1376
+
1377
+ # Parse results
1378
+ for (bbox, text, confidence) in ocr_results:
1379
+ # bbox is a list of 4 points
1380
+ xs = [point[0] for point in bbox]
1381
+ ys = [point[1] for point in bbox]
1382
+ x_min, x_max = min(xs), max(xs)
1383
+ y_min, y_max = min(ys), max(ys)
1384
+
1385
+ results.append(OCRResult(
1386
+ text=text,
1387
+ bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
1388
+ confidence=confidence,
1389
+ vertices=[(int(p[0]), int(p[1])) for p in bbox]
1390
+ ))
1391
+
1392
+ self._log(f"✅ Detected {len(results)} text regions")
1393
+
1394
+ except Exception as e:
1395
+ self._log(f"❌ Error in easyocr detection: {str(e)}", "error")
1396
+
1397
+ return results
1398
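+     # Shape of the data parsed above (assumed from easyocr's readtext(detail=1) output):
+     # a list of (bbox, text, confidence) tuples, where bbox is four (x, y) corner points.
+     # Illustrative conversion to (x, y, w, h):
+     #
+     #   bbox = [(10, 20), (110, 20), (110, 60), (10, 60)]
+     #   xs, ys = zip(*bbox)
+     #   x, y, w, h = min(xs), min(ys), max(xs) - min(xs), max(ys) - min(ys)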
+
1399
+
1400
+ class PaddleOCRProvider(OCRProvider):
1401
+ """PaddleOCR provider with memory safety measures"""
1402
+
1403
+ def check_installation(self) -> bool:
1404
+ """Check if paddleocr is installed"""
1405
+ try:
1406
+ from paddleocr import PaddleOCR
1407
+ self.is_installed = True
1408
+ return True
1409
+ except ImportError:
1410
+ return False
1411
+
1412
+     def install(self, progress_callback=None) -> bool:
+         """Install paddleocr (requires manual pip install)"""
+         if progress_callback:
+             progress_callback("PaddleOCR requires manual pip installation")
+         self._log("Run: pip install paddlepaddle paddleocr", "info")
+         return False
1415
+
1416
+ def load_model(self, **kwargs) -> bool:
1417
+ """Load paddleocr model with memory-safe configurations"""
1418
+ try:
1419
+ if not self.is_installed and not self.check_installation():
1420
+ self._log("❌ paddleocr not installed", "error")
1421
+ return False
1422
+
1423
+ self._log("🔥 Loading PaddleOCR model...")
1424
+
1425
+ # Set memory-safe environment variables BEFORE importing
1426
+ import os
1427
+ os.environ['OMP_NUM_THREADS'] = '1' # Prevent OpenMP conflicts
1428
+ os.environ['MKL_NUM_THREADS'] = '1' # Prevent MKL conflicts
1429
+ os.environ['OPENBLAS_NUM_THREADS'] = '1' # Prevent OpenBLAS conflicts
1430
+ os.environ['FLAGS_use_mkldnn'] = '0' # Disable MKL-DNN
1431
+
1432
+ from paddleocr import PaddleOCR
1433
+
1434
+ # Try memory-safe configurations
1435
+ configs_to_try = [
1436
+ # Config 1: Most memory-safe configuration
1437
+ {
1438
+ 'use_angle_cls': False, # Disable angle to save memory
1439
+ 'lang': 'ch',
1440
+ 'rec_batch_num': 1, # Process one at a time
1441
+ 'max_text_length': 100, # Limit text length
1442
+ 'drop_score': 0.5, # Higher threshold to reduce detections
1443
+ 'cpu_threads': 1, # Single thread to avoid conflicts
1444
+ },
1445
+ # Config 2: Minimal memory footprint
1446
+ {
1447
+ 'lang': 'ch',
1448
+ 'rec_batch_num': 1,
1449
+ 'cpu_threads': 1,
1450
+ },
1451
+ # Config 3: Absolute minimal
1452
+ {
1453
+ 'lang': 'ch'
1454
+ },
1455
+ # Config 4: Empty config
1456
+ {}
1457
+ ]
1458
+
1459
+ for i, config in enumerate(configs_to_try):
1460
+ try:
1461
+ self._log(f" Trying configuration {i+1}/{len(configs_to_try)}: {config}")
1462
+
1463
+ # Force garbage collection before loading
1464
+ import gc
1465
+ gc.collect()
1466
+
1467
+ self.model = PaddleOCR(**config)
1468
+ self.is_loaded = True
1469
+ self.current_config = config
1470
+ self._log(f"✅ PaddleOCR loaded successfully with config: {config}")
1471
+ return True
1472
+ except Exception as e:
1473
+ error_str = str(e)
1474
+ self._log(f" Config {i+1} failed: {error_str}", "debug")
1475
+
1476
+ # Clean up on failure
1477
+ if hasattr(self, 'model'):
1478
+ del self.model
1479
+ gc.collect()
1480
+ continue
1481
+
1482
+ self._log(f"❌ PaddleOCR failed to load with any configuration", "error")
1483
+ return False
1484
+
1485
+ except Exception as e:
1486
+ self._log(f"❌ Failed to load paddleocr: {str(e)}", "error")
1487
+ import traceback
1488
+ self._log(traceback.format_exc(), "debug")
1489
+ return False
1490
+
1491
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1492
+ """Detect text with memory safety measures"""
1493
+ results = []
1494
+
1495
+ try:
1496
+ if not self.is_loaded:
1497
+ if not self.load_model():
1498
+ return results
1499
+
1500
+ import cv2
1501
+ import numpy as np
1502
+ import gc
1503
+
1504
+ # Memory safety: Ensure image isn't too large
1505
+ h, w = image.shape[:2] if len(image.shape) >= 2 else (0, 0)
1506
+
1507
+ # Limit image size to prevent memory issues
1508
+ MAX_DIMENSION = 1500
1509
+ if h > MAX_DIMENSION or w > MAX_DIMENSION:
1510
+ scale = min(MAX_DIMENSION/h, MAX_DIMENSION/w)
1511
+ new_h, new_w = int(h*scale), int(w*scale)
1512
+ self._log(f"⚠️ Resizing large image from {w}x{h} to {new_w}x{new_h} for memory safety", "warning")
1513
+ image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)
1514
+ scale_factor = 1/scale
1515
+ else:
1516
+ scale_factor = 1.0
1517
+
1518
+ # Ensure correct format
1519
+ if len(image.shape) == 2: # Grayscale
1520
+ image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
1521
+ elif len(image.shape) == 4: # Batch
1522
+ image = image[0]
1523
+
1524
+ # Ensure uint8 type
1525
+ if image.dtype != np.uint8:
1526
+ if image.max() <= 1.0:
1527
+ image = (image * 255).astype(np.uint8)
1528
+ else:
1529
+ image = image.astype(np.uint8)
1530
+
1531
+ # Make a copy to avoid memory corruption
1532
+ image_copy = image.copy()
1533
+
1534
+ # Force garbage collection before OCR
1535
+ gc.collect()
1536
+
1537
+ # Process with timeout protection
1538
+             import threading
1540
+
1541
+ ocr_results = None
1542
+ ocr_error = None
1543
+
1544
+ def run_ocr():
1545
+ nonlocal ocr_results, ocr_error
1546
+ try:
1547
+ ocr_results = self.model.ocr(image_copy)
1548
+ except Exception as e:
1549
+ ocr_error = e
1550
+
1551
+ # Run OCR in a separate thread with timeout
1552
+ ocr_thread = threading.Thread(target=run_ocr)
1553
+ ocr_thread.daemon = True
1554
+ ocr_thread.start()
1555
+ ocr_thread.join(timeout=30) # 30 second timeout
1556
+
1557
+ if ocr_thread.is_alive():
1558
+ self._log("❌ PaddleOCR timeout - taking too long", "error")
1559
+ return results
1560
+
1561
+ if ocr_error:
1562
+ raise ocr_error
1563
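+             # The pattern above, as a generic sketch (hypothetical helper, shown for
+             # clarity only): run a callable in a daemon thread and give up after a
+             # timeout instead of blocking the caller forever.
+             #
+             #   def run_with_timeout(fn, timeout_s):
+             #       box = {}
+             #       t = threading.Thread(target=lambda: box.update(value=fn()), daemon=True)
+             #       t.start()
+             #       t.join(timeout_s)
+             #       return None if t.is_alive() else box.get('value')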
+
1564
+ # Parse results
1565
+ results = self._parse_ocr_results(ocr_results)
1566
+
1567
+ # Scale coordinates back if image was resized
1568
+ if scale_factor != 1.0 and results:
1569
+ for r in results:
1570
+ x, y, width, height = r.bbox
1571
+ r.bbox = (int(x*scale_factor), int(y*scale_factor),
1572
+ int(width*scale_factor), int(height*scale_factor))
1573
+ r.vertices = [(int(v[0]*scale_factor), int(v[1]*scale_factor))
1574
+ for v in r.vertices]
1575
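+             # Worked example of the scale-back above (illustrative): a 3000x2000 page is
+             # resized by scale = 0.5 to fit MAX_DIMENSION, so scale_factor = 2.0 and a
+             # detected bbox (100, 50, 40, 20) maps back to (200, 100, 80, 40) in
+             # original-image coordinates.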
+
1576
+ if results:
1577
+ self._log(f"✅ Detected {len(results)} text regions", "info")
1578
+ else:
1579
+ self._log("No text regions found", "debug")
1580
+
1581
+ # Clean up
1582
+ del image_copy
1583
+ gc.collect()
1584
+
1585
+ except Exception as e:
1586
+ error_msg = str(e) if str(e) else type(e).__name__
1587
+
1588
+ if "memory" in error_msg.lower() or "0x" in error_msg:
1589
+ self._log("❌ Memory access violation in PaddleOCR", "error")
1590
+ self._log(" This is a known Windows issue with PaddleOCR", "info")
1591
+ self._log(" Please switch to EasyOCR or manga-ocr instead", "warning")
1592
+ elif "trace_order.size()" in error_msg:
1593
+ self._log("❌ PaddleOCR internal error", "error")
1594
+ self._log(" Please switch to EasyOCR or manga-ocr", "warning")
1595
+ else:
1596
+ self._log(f"❌ Error in paddleocr detection: {error_msg}", "error")
1597
+
1598
+ import traceback
1599
+ self._log(traceback.format_exc(), "debug")
1600
+
1601
+ return results
1602
+
1603
+ def _parse_ocr_results(self, ocr_results) -> List[OCRResult]:
1604
+ """Parse OCR results safely"""
1605
+ results = []
1606
+
1607
+ if isinstance(ocr_results, bool) and ocr_results == False:
1608
+ return results
1609
+
1610
+ if ocr_results is None or not isinstance(ocr_results, list):
1611
+ return results
1612
+
1613
+ if len(ocr_results) == 0:
1614
+ return results
1615
+
1616
+ # Handle batch format
1617
+ if isinstance(ocr_results[0], list) and len(ocr_results[0]) > 0:
1618
+ first_item = ocr_results[0][0]
1619
+ if isinstance(first_item, list) and len(first_item) > 0:
1620
+ if isinstance(first_item[0], (list, tuple)) and len(first_item[0]) == 2:
1621
+ ocr_results = ocr_results[0]
1622
+
1623
+ # Parse detections
1624
+ for detection in ocr_results:
1625
+ if not detection or isinstance(detection, bool):
1626
+ continue
1627
+
1628
+ if not isinstance(detection, (list, tuple)) or len(detection) < 2:
1629
+ continue
1630
+
1631
+ try:
1632
+ bbox_points = detection[0]
1633
+ text_data = detection[1]
1634
+
1635
+ if not isinstance(bbox_points, (list, tuple)) or len(bbox_points) != 4:
1636
+ continue
1637
+
1638
+ if not isinstance(text_data, (tuple, list)) or len(text_data) < 2:
1639
+ continue
1640
+
1641
+ text = str(text_data[0]).strip()
1642
+ confidence = float(text_data[1])
1643
+
1644
+ if not text or confidence < 0.3:
1645
+ continue
1646
+
1647
+ xs = [float(p[0]) for p in bbox_points]
1648
+ ys = [float(p[1]) for p in bbox_points]
1649
+ x_min, x_max = min(xs), max(xs)
1650
+ y_min, y_max = min(ys), max(ys)
1651
+
1652
+ if (x_max - x_min) < 5 or (y_max - y_min) < 5:
1653
+ continue
1654
+
1655
+ results.append(OCRResult(
1656
+ text=text,
1657
+ bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
1658
+ confidence=confidence,
1659
+ vertices=[(int(p[0]), int(p[1])) for p in bbox_points]
1660
+ ))
1661
+
1662
+ except Exception:
1663
+ continue
1664
+
1665
+ return results
1666
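+     # Shape of the data handled by _parse_ocr_results (assumed from the classic
+     # PaddleOCR .ocr() output; newer releases may differ):
+     #
+     #   [
+     #       [ [[x1, y1], [x2, y2], [x3, y3], [x4, y4]], ("recognized text", 0.98) ],
+     #       ...
+     #   ]
+     #
+     # possibly wrapped in one extra list per image, which is why the parser unwraps a
+     # batch level before iterating detections.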
+
1667
+ class DocTROCRProvider(OCRProvider):
1668
+ """DocTR OCR provider"""
1669
+
1670
+ def check_installation(self) -> bool:
1671
+ """Check if doctr is installed"""
1672
+ try:
1673
+ from doctr.models import ocr_predictor
1674
+ self.is_installed = True
1675
+ return True
1676
+ except ImportError:
1677
+ return False
1678
+
1679
+     def install(self, progress_callback=None) -> bool:
+         """Install doctr (requires manual pip install)"""
+         if progress_callback:
+             progress_callback("DocTR requires manual pip installation")
+         self._log('Run: pip install "python-doctr[torch]"', "info")
+         return False
1682
+
1683
+ def load_model(self, **kwargs) -> bool:
1684
+ """Load doctr model"""
1685
+ try:
1686
+ if not self.is_installed and not self.check_installation():
1687
+ self._log("❌ doctr not installed", "error")
1688
+ return False
1689
+
1690
+ self._log("🔥 Loading DocTR model...")
1691
+ from doctr.models import ocr_predictor
1692
+
1693
+ # Load pretrained model
1694
+ self.model = ocr_predictor(pretrained=True)
1695
+ self.is_loaded = True
1696
+
1697
+ self._log("✅ DocTR model loaded successfully")
1698
+ return True
1699
+
1700
+ except Exception as e:
1701
+ self._log(f"❌ Failed to load doctr: {str(e)}", "error")
1702
+ return False
1703
+
1704
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1705
+ """Detect text using doctr"""
1706
+ results = []
1707
+
1708
+ try:
1709
+ if not self.is_loaded:
1710
+ if not self.load_model():
1711
+ return results
1712
+
1713
+ from doctr.io import DocumentFile
1714
+
1715
+ # DocTR expects document format
1716
+ # Convert numpy array to PIL and save temporarily
1717
+ import tempfile
1718
+ import cv2
1719
+
1720
+ with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
1721
+ cv2.imwrite(tmp.name, image)
1722
+ doc = DocumentFile.from_images(tmp.name)
1723
+
1724
+ # Run OCR
1725
+ result = self.model(doc)
1726
+
1727
+ # Parse results
1728
+ h, w = image.shape[:2]
1729
+ for page in result.pages:
1730
+ for block in page.blocks:
1731
+ for line in block.lines:
1732
+ for word in line.words:
1733
+ # Handle different geometry formats
1734
+ geometry = word.geometry
1735
+
1736
+ if len(geometry) == 4:
1737
+ # Standard format: (x1, y1, x2, y2)
1738
+ x1, y1, x2, y2 = geometry
1739
+ elif len(geometry) == 2:
1740
+ # Alternative format: ((x1, y1), (x2, y2))
1741
+ (x1, y1), (x2, y2) = geometry
1742
+ else:
1743
+ self._log(f"Unexpected geometry format: {geometry}", "warning")
1744
+ continue
1745
+
1746
+ # Convert relative coordinates to absolute
1747
+ x1, x2 = int(x1 * w), int(x2 * w)
1748
+ y1, y2 = int(y1 * h), int(y2 * h)
1749
+
1750
+ results.append(OCRResult(
1751
+ text=word.value,
1752
+ bbox=(x1, y1, x2 - x1, y2 - y1),
1753
+ confidence=word.confidence,
1754
+ vertices=[(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
1755
+ ))
1756
+
1757
+ # Clean up temp file
1758
+ try:
1759
+ os.unlink(tmp.name)
1760
+             except OSError:
+                 pass
1762
+
1763
+ self._log(f"DocTR detected {len(results)} text regions")
1764
+
1765
+ except Exception as e:
1766
+ self._log(f"Error in doctr detection: {str(e)}", "error")
1767
+ import traceback
1768
+ self._log(traceback.format_exc(), "error")
1769
+
1770
+ return results
1771
+
1772
+
1773
+ class RapidOCRProvider(OCRProvider):
1774
+ """RapidOCR provider for fast local OCR"""
1775
+
1776
+ def check_installation(self) -> bool:
1777
+ """Check if rapidocr is installed"""
1778
+ try:
1779
+ import rapidocr_onnxruntime
1780
+ self.is_installed = True
1781
+ return True
1782
+ except ImportError:
1783
+ return False
1784
+
1785
+ def install(self, progress_callback=None) -> bool:
1786
+ """Install rapidocr (requires manual pip install)"""
1787
+ # RapidOCR requires manual installation
1788
+ if progress_callback:
1789
+ progress_callback("RapidOCR requires manual pip installation")
1790
+ self._log("Run: pip install rapidocr-onnxruntime", "info")
1791
+ return False # Always return False since we can't auto-install
1792
+
1793
+ def load_model(self, **kwargs) -> bool:
1794
+ """Load RapidOCR model"""
1795
+ try:
1796
+ if not self.is_installed and not self.check_installation():
1797
+ self._log("RapidOCR not installed", "error")
1798
+ return False
1799
+
1800
+ self._log("Loading RapidOCR...")
1801
+ from rapidocr_onnxruntime import RapidOCR
1802
+
1803
+ self.model = RapidOCR()
1804
+ self.is_loaded = True
1805
+
1806
+ self._log("RapidOCR model loaded successfully")
1807
+ return True
1808
+
1809
+ except Exception as e:
1810
+ self._log(f"Failed to load RapidOCR: {str(e)}", "error")
1811
+ return False
1812
+
1813
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1814
+ """Detect text using RapidOCR"""
1815
+ if not self.is_loaded:
1816
+ self._log("RapidOCR model not loaded", "error")
1817
+ return []
1818
+
1819
+ results = []
1820
+
1821
+ try:
1822
+             # Convert BGR to RGB; RapidOCR works directly with numpy arrays
1823
+ if len(image.shape) == 3:
1824
+ # BGR to RGB
1825
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
1826
+ else:
1827
+ image_rgb = image
1828
+
1829
+ # RapidOCR expects PIL Image or numpy array
1830
+ ocr_results, _ = self.model(image_rgb)
1831
+
1832
+ if ocr_results:
1833
+ for result in ocr_results:
1834
+ # RapidOCR returns [bbox, text, confidence]
1835
+ bbox_points = result[0] # 4 corner points
1836
+ text = result[1]
1837
+ confidence = float(result[2])
1838
+
1839
+ if not text or not text.strip():
1840
+ continue
1841
+
1842
+ # Convert 4-point bbox to x,y,w,h format
1843
+ xs = [point[0] for point in bbox_points]
1844
+ ys = [point[1] for point in bbox_points]
1845
+ x_min, x_max = min(xs), max(xs)
1846
+ y_min, y_max = min(ys), max(ys)
1847
+
1848
+ results.append(OCRResult(
1849
+ text=text.strip(),
1850
+ bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
1851
+ confidence=confidence,
1852
+ vertices=[(int(p[0]), int(p[1])) for p in bbox_points]
1853
+ ))
1854
+
1855
+ self._log(f"Detected {len(results)} text regions")
1856
+
1857
+ except Exception as e:
1858
+ self._log(f"Error in RapidOCR detection: {str(e)}", "error")
1859
+
1860
+ return results
1861
+
1862
+ class OCRManager:
1863
+ """Manager for multiple OCR providers"""
1864
+
1865
+ def __init__(self, log_callback=None):
1866
+ self.log_callback = log_callback
1867
+ self.providers = {
1868
+             'custom-api': CustomAPIProvider(log_callback),
1869
+ 'manga-ocr': MangaOCRProvider(log_callback),
1870
+ 'easyocr': EasyOCRProvider(log_callback),
1871
+ 'paddleocr': PaddleOCRProvider(log_callback),
1872
+ 'doctr': DocTROCRProvider(log_callback),
1873
+ 'rapidocr': RapidOCRProvider(log_callback),
1874
+ 'Qwen2-VL': Qwen2VL(log_callback)
1875
+ }
1876
+ self.current_provider = None
1877
+ self.stop_flag = None
1878
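+     # Minimal usage sketch for this manager (illustrative; loading the page with
+     # cv2.imread is an assumption of the example, not a requirement):
+     #
+     #   manager = OCRManager(log_callback=print)
+     #   manager.set_current_provider('easyocr')
+     #   if manager.check_provider_status('easyocr')['installed']:
+     #       regions = manager.detect_text(cv2.imread('page_001.png'))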
+
1879
+ def get_provider(self, name: str) -> Optional[OCRProvider]:
1880
+ """Get OCR provider by name"""
1881
+ return self.providers.get(name)
1882
+
1883
+ def set_current_provider(self, name: str):
1884
+ """Set current active provider"""
1885
+ if name in self.providers:
1886
+ self.current_provider = name
1887
+ return True
1888
+ return False
1889
+
1890
+ def check_provider_status(self, name: str) -> Dict[str, bool]:
1891
+ """Check installation and loading status of provider"""
1892
+ provider = self.providers.get(name)
1893
+ if not provider:
1894
+ return {'installed': False, 'loaded': False}
1895
+
1896
+ result = {
1897
+ 'installed': provider.check_installation(),
1898
+ 'loaded': provider.is_loaded
1899
+ }
1900
+ if self.log_callback:
1901
+ self.log_callback(f"DEBUG: check_provider_status({name}) returning loaded={result['loaded']}", "debug")
1902
+ return result
1903
+
1904
+ def install_provider(self, name: str, progress_callback=None) -> bool:
1905
+ """Install a provider"""
1906
+ provider = self.providers.get(name)
1907
+ if not provider:
1908
+ return False
1909
+
1910
+ return provider.install(progress_callback)
1911
+
1912
+ def load_provider(self, name: str, **kwargs) -> bool:
1913
+ """Load a provider's model with optional parameters"""
1914
+ provider = self.providers.get(name)
1915
+ if not provider:
1916
+ return False
1917
+
1918
+ return provider.load_model(**kwargs) # <-- Passes model_size and any other kwargs
1919
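+     # Example (illustrative): provider-specific options are forwarded unchanged, e.g.
+     # a hypothetical size hint for the Qwen2-VL provider:
+     #
+     #   manager.load_provider('manga-ocr')
+     #   manager.load_provider('Qwen2-VL', model_size='2B')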
+
1920
+ def shutdown(self):
1921
+ """Release models/processors/tokenizers for all providers and clear caches."""
1922
+ try:
1923
+ import gc
1924
+ for name, provider in list(self.providers.items()):
1925
+ try:
1926
+ if hasattr(provider, 'model'):
1927
+ provider.model = None
1928
+ if hasattr(provider, 'processor'):
1929
+ provider.processor = None
1930
+ if hasattr(provider, 'tokenizer'):
1931
+ provider.tokenizer = None
1932
+ if hasattr(provider, 'reader'):
1933
+ provider.reader = None
1934
+ if hasattr(provider, 'is_loaded'):
1935
+ provider.is_loaded = False
1936
+ except Exception:
1937
+ pass
1938
+ gc.collect()
1939
+ try:
1940
+ import torch
1941
+ torch.cuda.empty_cache()
1942
+ except Exception:
1943
+ pass
1944
+ except Exception:
1945
+ pass
1946
+
1947
+ def detect_text(self, image: np.ndarray, provider_name: str = None, **kwargs) -> List[OCRResult]:
1948
+ """Detect text using specified or current provider"""
1949
+ provider_name = provider_name or self.current_provider
1950
+ if not provider_name:
1951
+ return []
1952
+
1953
+ provider = self.providers.get(provider_name)
1954
+ if not provider:
1955
+ return []
1956
+
1957
+ return provider.detect_text(image, **kwargs)
1958
+
1959
+ def set_stop_flag(self, stop_flag):
1960
+ """Set stop flag for all providers"""
1961
+ self.stop_flag = stop_flag
1962
+ for provider in self.providers.values():
1963
+ if hasattr(provider, 'set_stop_flag'):
1964
+ provider.set_stop_flag(stop_flag)
1965
+
1966
+ def reset_stop_flags(self):
1967
+ """Reset stop flags for all providers"""
1968
+ for provider in self.providers.values():
1969
+ if hasattr(provider, 'reset_stop_flags'):
1970
+ provider.reset_stop_flags()
translator_gui.py ADDED
The diff for this file is too large to render. See raw diff