Shirochi committed on
Commit 7769622 · verified · 1 Parent(s): 0cc2912

Upload 8 files

app.py ADDED
The diff for this file is too large to render. See raw diff
 
bubble_detector.py ADDED
@@ -0,0 +1,2030 @@
1
+ """
2
+ bubble_detector.py - Modified version that works in frozen PyInstaller executables
3
+ Replace your bubble_detector.py with this version
4
+ """
5
+ import os
6
+ import sys
7
+ import json
8
+ import numpy as np
9
+ import cv2
10
+ from typing import List, Tuple, Optional, Dict, Any
11
+ import logging
12
+ import traceback
13
+ import hashlib
14
+ from pathlib import Path
15
+ import threading
16
+ import time
17
+
18
+ logging.basicConfig(level=logging.INFO)
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Check if we're running in a frozen environment
22
+ IS_FROZEN = getattr(sys, 'frozen', False)
23
+ if IS_FROZEN:
24
+ # In frozen environment, set proper paths for ML libraries
25
+ MEIPASS = sys._MEIPASS
26
+ os.environ['TORCH_HOME'] = MEIPASS
27
+ os.environ['TRANSFORMERS_CACHE'] = os.path.join(MEIPASS, 'transformers')
28
+ os.environ['HF_HOME'] = os.path.join(MEIPASS, 'huggingface')
29
+ logger.info(f"Running in frozen environment: {MEIPASS}")
30
+
31
+ # Modified import checks for frozen environment
32
+ YOLO_AVAILABLE = False
33
+ YOLO = None
34
+ torch = None
35
+ TORCH_AVAILABLE = False
36
+ ONNX_AVAILABLE = False
37
+ TRANSFORMERS_AVAILABLE = False
38
+ RTDetrForObjectDetection = None
39
+ RTDetrImageProcessor = None
40
+ PIL_AVAILABLE = False
41
+
42
+ # Try to import YOLO dependencies with better error handling
43
+ if IS_FROZEN:
44
+ # In frozen environment, try harder to import
45
+ try:
46
+ # First try to import torch components individually
47
+ import torch
48
+ import torch.nn
49
+ import torch.cuda
50
+ TORCH_AVAILABLE = True
51
+ logger.info("✓ PyTorch loaded in frozen environment")
52
+ except Exception as e:
53
+ logger.warning(f"PyTorch not available in frozen environment: {e}")
54
+ TORCH_AVAILABLE = False
55
+ torch = None
56
+
57
+ # Try ultralytics after torch
58
+ if TORCH_AVAILABLE:
59
+ try:
60
+ from ultralytics import YOLO
61
+ YOLO_AVAILABLE = True
62
+ logger.info("✓ Ultralytics YOLO loaded in frozen environment")
63
+ except Exception as e:
64
+ logger.warning(f"Ultralytics not available in frozen environment: {e}")
65
+ YOLO_AVAILABLE = False
66
+
67
+ # Try transformers
68
+ try:
69
+ import transformers
70
+ # Try specific imports
71
+ try:
72
+ from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
73
+ TRANSFORMERS_AVAILABLE = True
74
+ logger.info("✓ Transformers RT-DETR loaded in frozen environment")
75
+ except ImportError:
76
+ # Try alternative import
77
+ try:
78
+ from transformers import AutoModel, AutoImageProcessor
79
+ RTDetrForObjectDetection = AutoModel
80
+ RTDetrImageProcessor = AutoImageProcessor
81
+ TRANSFORMERS_AVAILABLE = True
82
+ logger.info("✓ Transformers loaded with AutoModel fallback")
83
+ except Exception:
84
+ TRANSFORMERS_AVAILABLE = False
85
+ logger.warning("Transformers RT-DETR not available in frozen environment")
86
+ except Exception as e:
87
+ logger.warning(f"Transformers not available in frozen environment: {e}")
88
+ TRANSFORMERS_AVAILABLE = False
89
+ else:
90
+ # Normal environment - original import logic
91
+ try:
92
+ from ultralytics import YOLO
93
+ YOLO_AVAILABLE = True
94
+ except Exception:
95
+ YOLO_AVAILABLE = False
96
+ logger.warning("Ultralytics YOLO not available")
97
+
98
+ try:
99
+ import torch
100
+ # Test if cuda attribute exists
101
+ _ = torch.cuda
102
+ TORCH_AVAILABLE = True
103
+ except (ImportError, AttributeError):
104
+ TORCH_AVAILABLE = False
105
+ torch = None
106
+ logger.warning("PyTorch not available or incomplete")
107
+
108
+ try:
109
+ from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
110
+ try:
111
+ from transformers import RTDetrV2ForObjectDetection
112
+ RTDetrForObjectDetection = RTDetrV2ForObjectDetection
113
+ except ImportError:
114
+ pass
115
+ TRANSFORMERS_AVAILABLE = True
116
+ except Exception:
117
+ TRANSFORMERS_AVAILABLE = False
118
+ logger.info("Transformers not available for RT-DETR")
119
+
120
+ # Configure ORT memory behavior before importing
121
+ try:
122
+ os.environ.setdefault('ORT_DISABLE_MEMORY_ARENA', '1')
123
+ except Exception:
124
+ pass
125
+ # ONNX Runtime - works well in frozen environments
126
+ try:
127
+ import onnxruntime as ort
128
+ ONNX_AVAILABLE = True
129
+ logger.info("✓ ONNX Runtime available")
130
+ except ImportError:
131
+ ONNX_AVAILABLE = False
132
+ logger.warning("ONNX Runtime not available")
133
+
134
+ # PIL
135
+ try:
136
+ from PIL import Image
137
+ PIL_AVAILABLE = True
138
+ except ImportError:
139
+ PIL_AVAILABLE = False
140
+ logger.info("PIL not available")
141
+
142
+
143
+ class BubbleDetector:
144
+ """
145
+ Combined YOLOv8 and RT-DETR speech bubble detector for comics and manga.
146
+ Supports multiple model formats and provides configurable detection.
147
+ Backward compatible with existing code while adding RT-DETR support.
148
+ """
149
+
150
+ # Process-wide shared RT-DETR to avoid concurrent meta-device loads
151
+ _rtdetr_init_lock = threading.Lock()
152
+ _rtdetr_shared_model = None
153
+ _rtdetr_shared_processor = None
154
+ _rtdetr_loaded = False
155
+ _rtdetr_repo_id = 'ogkalu/comic-text-and-bubble-detector'
156
+
157
+ # Shared RT-DETR (ONNX) across process to avoid device/context storms
158
+ _rtdetr_onnx_init_lock = threading.Lock()
159
+ _rtdetr_onnx_shared_session = None
160
+ _rtdetr_onnx_loaded = False
161
+ _rtdetr_onnx_providers = None
162
+ _rtdetr_onnx_model_path = None
163
+ # Limit concurrent runs to avoid device hangs. Defaults to 2 for better parallelism.
164
+ # Can be overridden via env DML_MAX_CONCURRENT or config rtdetr_max_concurrency
165
+ try:
166
+ _rtdetr_onnx_max_concurrent = int(os.environ.get('DML_MAX_CONCURRENT', '2'))
167
+ except Exception:
168
+ _rtdetr_onnx_max_concurrent = 2
169
+ _rtdetr_onnx_sema = threading.Semaphore(max(1, _rtdetr_onnx_max_concurrent))
170
+ _rtdetr_onnx_sema_initialized = False
171
+
172
+ def __init__(self, config_path: str = "config.json"):
173
+ """
174
+ Initialize the bubble detector.
175
+
176
+ Args:
177
+ config_path: Path to configuration file
178
+ """
179
+ # Set thread limits early if environment indicates single-threaded mode
180
+ try:
181
+ if os.environ.get('OMP_NUM_THREADS') == '1':
182
+ # Already in single-threaded mode, ensure it's applied to this process
183
+ # Check if torch is available at module level before trying to use it
184
+ if TORCH_AVAILABLE and torch is not None:
185
+ try:
186
+ torch.set_num_threads(1)
187
+ except (RuntimeError, AttributeError):
188
+ pass
189
+ try:
190
+ import cv2
191
+ cv2.setNumThreads(1)
192
+ except (ImportError, AttributeError):
193
+ pass
194
+ except Exception:
195
+ pass
196
+
197
+ self.config_path = config_path
198
+ self.config = self._load_config()
199
+
200
+ # YOLOv8 components (original)
201
+ self.model = None
202
+ self.model_loaded = False
203
+ self.model_type = None # 'yolo', 'onnx', or 'torch'
204
+ self.onnx_session = None
205
+
206
+ # RT-DETR components (new)
207
+ self.rtdetr_model = None
208
+ self.rtdetr_processor = None
209
+ self.rtdetr_loaded = False
210
+ self.rtdetr_repo = 'ogkalu/comic-text-and-bubble-detector'
211
+
212
+ # RT-DETR (ONNX) backend components
213
+ self.rtdetr_onnx_session = None
214
+ self.rtdetr_onnx_loaded = False
215
+ self.rtdetr_onnx_repo = 'ogkalu/comic-text-and-bubble-detector'
216
+
217
+ # RT-DETR class definitions
218
+ self.CLASS_BUBBLE = 0 # Empty speech bubble
219
+ self.CLASS_TEXT_BUBBLE = 1 # Bubble with text
220
+ self.CLASS_TEXT_FREE = 2 # Text without bubble
221
+
222
+ # Detection settings
223
+ self.default_confidence = 0.3
224
+ self.default_iou_threshold = 0.45
225
+ # Allow override from settings
226
+ try:
227
+ ocr_cfg = self.config.get('manga_settings', {}).get('ocr', {}) if isinstance(self.config, dict) else {}
228
+ self.default_max_detections = int(ocr_cfg.get('bubble_max_detections', 100))
229
+ self.max_det_yolo = int(ocr_cfg.get('bubble_max_detections_yolo', self.default_max_detections))
230
+ self.max_det_rtdetr = int(ocr_cfg.get('bubble_max_detections_rtdetr', self.default_max_detections))
231
+ except Exception:
232
+ self.default_max_detections = 100
233
+ self.max_det_yolo = 100
234
+ self.max_det_rtdetr = 100
235
+
236
+ # Cache directory for ONNX conversions
237
+ self.cache_dir = os.environ.get('BUBBLE_CACHE_DIR', 'models')
238
+ os.makedirs(self.cache_dir, exist_ok=True)
239
+
240
+ # RT-DETR concurrency setting from config
241
+ try:
242
+ rtdetr_max_conc = int(ocr_cfg.get('rtdetr_max_concurrency', 2))
243
+ # Update class-level semaphore if not yet initialized or if value changed
244
+ if not BubbleDetector._rtdetr_onnx_sema_initialized or rtdetr_max_conc != BubbleDetector._rtdetr_onnx_max_concurrent:
245
+ BubbleDetector._rtdetr_onnx_max_concurrent = max(1, rtdetr_max_conc)
246
+ BubbleDetector._rtdetr_onnx_sema = threading.Semaphore(BubbleDetector._rtdetr_onnx_max_concurrent)
247
+ BubbleDetector._rtdetr_onnx_sema_initialized = True
248
+ logger.info(f"RT-DETR concurrency set to: {BubbleDetector._rtdetr_onnx_max_concurrent}")
249
+ except Exception as e:
250
+ logger.warning(f"Failed to set RT-DETR concurrency: {e}")
251
+
252
+ # GPU availability
253
+ self.use_gpu = TORCH_AVAILABLE and torch.cuda.is_available()
254
+ self.device = 'cuda' if self.use_gpu else 'cpu'
255
+
256
+ # Quantization/precision settings
257
+ adv_cfg = self.config.get('manga_settings', {}).get('advanced', {}) if isinstance(self.config, dict) else {}
258
+ ocr_cfg = self.config.get('manga_settings', {}).get('ocr', {}) if isinstance(self.config, dict) else {}
259
+ env_quant = os.environ.get('MODEL_QUANTIZE', 'false').lower() == 'true'
260
+ self.quantize_enabled = bool(env_quant or adv_cfg.get('quantize_models', False) or ocr_cfg.get('quantize_bubble_detector', False))
261
+ self.quantize_dtype = str(adv_cfg.get('torch_precision', os.environ.get('TORCH_PRECISION', 'auto'))).lower()
262
+ # Prefer advanced.onnx_quantize; fall back to env or global quantize
263
+ self.onnx_quantize_enabled = bool(adv_cfg.get('onnx_quantize', os.environ.get('ONNX_QUANTIZE', 'false').lower() == 'true' or self.quantize_enabled))
264
+
265
+ # Stop flag support
266
+ self.stop_flag = None
267
+ self._stopped = False
268
+ self.log_callback = None
269
+
270
+ logger.info(f"🗨️ BubbleDetector initialized")
271
+ logger.info(f" GPU: {'Available' if self.use_gpu else 'Not available'}")
272
+ logger.info(f" YOLO: {'Available' if YOLO_AVAILABLE else 'Not installed'}")
273
+ logger.info(f" ONNX: {'Available' if ONNX_AVAILABLE else 'Not installed'}")
274
+ logger.info(f" RT-DETR: {'Available' if TRANSFORMERS_AVAILABLE else 'Not installed'}")
275
+ logger.info(f" Quantization: {'ENABLED' if self.quantize_enabled else 'disabled'} (torch_precision={self.quantize_dtype}, onnx_quantize={'on' if self.onnx_quantize_enabled else 'off'})" )
276
+
277
+ def _load_config(self) -> Dict[str, Any]:
278
+ """Load configuration from file."""
279
+ if os.path.exists(self.config_path):
280
+ try:
281
+ with open(self.config_path, 'r', encoding='utf-8') as f:
282
+ return json.load(f)
283
+ except Exception as e:
284
+ logger.warning(f"Failed to load config: {e}")
285
+ return {}
286
+
287
+ def _save_config(self):
288
+ """Save configuration to file."""
289
+ try:
290
+ with open(self.config_path, 'w', encoding='utf-8') as f:
291
+ json.dump(self.config, f, indent=2)
292
+ except Exception as e:
293
+ logger.error(f"Failed to save config: {e}")
294
+
295
+ def set_stop_flag(self, stop_flag):
296
+ """Set the stop flag for checking interruptions"""
297
+ self.stop_flag = stop_flag
298
+ self._stopped = False
299
+
300
+ def set_log_callback(self, log_callback):
301
+ """Set log callback for GUI integration"""
302
+ self.log_callback = log_callback
303
+
304
+ def _check_stop(self) -> bool:
305
+ """Check if stop has been requested"""
306
+ if self._stopped:
307
+ return True
308
+ if self.stop_flag and self.stop_flag.is_set():
309
+ self._stopped = True
310
+ return True
311
+ # Check global manga translator cancellation
312
+ try:
313
+ from manga_translator import MangaTranslator
314
+ if MangaTranslator.is_globally_cancelled():
315
+ self._stopped = True
316
+ return True
317
+ except Exception:
318
+ pass
319
+ return False
320
+
321
+ def _log(self, message: str, level: str = "info"):
322
+ """Log message with stop suppression"""
323
+ # Suppress logs when stopped (allow only essential stop confirmation messages)
324
+ if self._check_stop():
325
+ essential_stop_keywords = [
326
+ "⏹️ Translation stopped by user",
327
+ "⏹️ Bubble detection stopped",
328
+ "cleanup", "🧹"
329
+ ]
330
+ if not any(keyword in message for keyword in essential_stop_keywords):
331
+ return
332
+
333
+ if self.log_callback:
334
+ self.log_callback(message, level)
335
+ else:
336
+ getattr(logger, level, logger.info)(message)
337
+
338
+ def reset_stop_flags(self):
339
+ """Reset stop flags when starting new processing"""
340
+ self._stopped = False
341
+
342
+ def load_model(self, model_path: str, force_reload: bool = False) -> bool:
343
+ """
344
+ Load a YOLOv8 model for bubble detection.
345
+
346
+ Args:
347
+ model_path: Path to model file (.pt, .onnx, or .torchscript)
348
+ force_reload: Force reload even if model is already loaded
349
+
350
+ Returns:
351
+ True if model loaded successfully, False otherwise
352
+ """
353
+ try:
354
+ # If given a Hugging Face repo ID (e.g., 'owner/name'), fetch detector.onnx into models/
355
+ if model_path and (('/' in model_path) and not os.path.exists(model_path)):
356
+ try:
357
+ from huggingface_hub import hf_hub_download
358
+ os.makedirs(self.cache_dir, exist_ok=True)
359
+ logger.info(f"📥 Resolving repo '{model_path}' to detector.onnx in {self.cache_dir}...")
360
+ resolved = hf_hub_download(repo_id=model_path, filename='detector.onnx', cache_dir=self.cache_dir, local_dir=self.cache_dir, local_dir_use_symlinks=False)
361
+ if resolved and os.path.exists(resolved):
362
+ model_path = resolved
363
+ logger.info(f"✅ Downloaded detector.onnx to: {model_path}")
364
+ except Exception as repo_err:
365
+ logger.error(f"Failed to download from repo '{model_path}': {repo_err}")
366
+ if not os.path.exists(model_path):
367
+ logger.error(f"Model file not found: {model_path}")
368
+ return False
369
+
370
+ # Check if it's the same model already loaded
371
+ if self.model_loaded and not force_reload:
372
+ last_path = self.config.get('last_model_path', '')
373
+ if last_path == model_path:
374
+ logger.info("Model already loaded (same path)")
375
+ return True
376
+ else:
377
+ logger.info(f"Model path changed from {last_path} to {model_path}, reloading...")
378
+ force_reload = True
379
+
380
+ # Clear previous model if force reload
381
+ if force_reload:
382
+ logger.info("Force reloading model...")
383
+ self.model = None
384
+ self.onnx_session = None
385
+ self.model_loaded = False
386
+ self.model_type = None
387
+
388
+ logger.info(f"📥 Loading bubble detection model: {model_path}")
389
+
390
+ # Determine model type by extension
391
+ ext = Path(model_path).suffix.lower()
392
+
393
+ if ext in ['.pt', '.pth']:
394
+ if not YOLO_AVAILABLE:
395
+ logger.warning("Ultralytics package not available in this build")
396
+ logger.info("Bubble detection will be disabled - this is normal for lightweight builds")
397
+ # Mark the model as not loaded and report failure to the caller
398
+ self.model_loaded = False
399
+ return False
400
+
401
+ # Load YOLOv8 model
402
+ try:
403
+ self.model = YOLO(model_path)
404
+ self.model_type = 'yolo'
405
+
406
+ # Set to eval mode
407
+ if hasattr(self.model, 'model'):
408
+ self.model.model.eval()
409
+
410
+ # Move to GPU if available
411
+ if self.use_gpu and TORCH_AVAILABLE:
412
+ try:
413
+ self.model.to('cuda')
414
+ except Exception as gpu_error:
415
+ logger.warning(f"Could not move model to GPU: {gpu_error}")
416
+
417
+ logger.info("✅ YOLOv8 model loaded successfully")
418
+ # Apply optional FP16 precision to reduce VRAM if enabled
419
+ if self.quantize_enabled and self.use_gpu and TORCH_AVAILABLE:
420
+ try:
421
+ m = self.model.model if hasattr(self.model, 'model') else self.model
422
+ m.half()
423
+ logger.info("🔻 Applied FP16 precision to YOLO model (GPU)")
424
+ except Exception as _e:
425
+ logger.warning(f"Could not switch YOLO model to FP16: {_e}")
426
+
427
+ except Exception as yolo_error:
428
+ logger.error(f"Failed to load YOLO model: {yolo_error}")
429
+ return False
430
+
431
+ elif ext == '.onnx':
432
+ if not ONNX_AVAILABLE:
433
+ logger.warning("ONNX Runtime not available in this build")
434
+ logger.info("ONNX model support disabled - this is normal for lightweight builds")
435
+ return False
436
+
437
+ try:
438
+ # Load ONNX model
439
+ providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if self.use_gpu else ['CPUExecutionProvider']
440
+ session_path = model_path
441
+ if self.quantize_enabled:
442
+ try:
443
+ from onnxruntime.quantization import quantize_dynamic, QuantType
444
+ quant_path = os.path.splitext(model_path)[0] + ".int8.onnx"
445
+ if not os.path.exists(quant_path) or os.environ.get('FORCE_ONNX_REBUILD', 'false').lower() == 'true':
446
+ logger.info("🔻 Quantizing ONNX model weights to INT8 (dynamic)...")
447
+ quantize_dynamic(model_input=model_path, model_output=quant_path, weight_type=QuantType.QInt8, op_types_to_quantize=['Conv', 'MatMul'])
448
+ session_path = quant_path
449
+ self.config['last_onnx_quantized_path'] = quant_path
450
+ self._save_config()
451
+ logger.info(f"✅ Using quantized ONNX model: {quant_path}")
452
+ except Exception as qe:
453
+ logger.warning(f"ONNX quantization not applied: {qe}")
454
+ # Use conservative ORT memory options to reduce RAM growth
455
+ so = ort.SessionOptions()
456
+ try:
457
+ so.enable_mem_pattern = False
458
+ so.enable_cpu_mem_arena = False
459
+ except Exception:
460
+ pass
461
+ self.onnx_session = ort.InferenceSession(session_path, sess_options=so, providers=providers)
462
+ self.model_type = 'onnx'
463
+
464
+ logger.info("✅ ONNX model loaded successfully")
465
+
466
+ except Exception as onnx_error:
467
+ logger.error(f"Failed to load ONNX model: {onnx_error}")
468
+ return False
469
+
470
+ elif ext == '.torchscript':
471
+ if not TORCH_AVAILABLE:
472
+ logger.warning("PyTorch not available in this build")
473
+ logger.info("TorchScript model support disabled - this is normal for lightweight builds")
474
+ return False
475
+
476
+ try:
477
+ # Add safety check for torch being None
478
+ if torch is None:
479
+ logger.error("PyTorch module is None - cannot load TorchScript model")
480
+ return False
481
+
482
+ # Load TorchScript model
483
+ self.model = torch.jit.load(model_path, map_location='cpu')
484
+ self.model.eval()
485
+ self.model_type = 'torch'
486
+
487
+ if self.use_gpu:
488
+ try:
489
+ self.model = self.model.cuda()
490
+ except Exception as gpu_error:
491
+ logger.warning(f"Could not move TorchScript model to GPU: {gpu_error}")
492
+
493
+ logger.info("✅ TorchScript model loaded successfully")
494
+
495
+ # Optional FP16 precision on GPU
496
+ if self.quantize_enabled and self.use_gpu and TORCH_AVAILABLE:
497
+ try:
498
+ self.model = self.model.half()
499
+ logger.info("🔻 Applied FP16 precision to TorchScript model (GPU)")
500
+ except Exception as _e:
501
+ logger.warning(f"Could not switch TorchScript model to FP16: {_e}")
502
+
503
+ except Exception as torch_error:
504
+ logger.error(f"Failed to load TorchScript model: {torch_error}")
505
+ return False
506
+
507
+ else:
508
+ logger.error(f"Unsupported model format: {ext}")
509
+ logger.info("Supported formats: .pt/.pth (YOLOv8), .onnx (ONNX), .torchscript (TorchScript)")
510
+ return False
511
+
512
+ # Only set loaded if we actually succeeded
513
+ self.model_loaded = True
514
+ self.config['last_model_path'] = model_path
515
+ self.config['model_type'] = self.model_type
516
+ self._save_config()
517
+
518
+ return True
519
+
520
+ except Exception as e:
521
+ logger.error(f"Failed to load model: {e}")
522
+ logger.error(traceback.format_exc())
523
+ self.model_loaded = False
524
+
525
+ # Provide helpful context for .exe users
526
+ logger.info("Note: If running from .exe, some ML libraries may not be included")
527
+ logger.info("This is normal for lightweight builds - bubble detection will be disabled")
528
+
529
+ return False
530
+
531
+ def load_rtdetr_model(self, model_path: str = None, model_id: str = None, force_reload: bool = False) -> bool:
532
+ """
533
+ Load RT-DETR model for advanced bubble and text detection.
534
+ This implementation avoids the 'meta tensor' copy error by:
535
+ - Serializing the entire load under a class lock (no concurrent loads)
536
+ - Loading directly onto the target device (CUDA if available) via device_map='auto'
537
+ - Avoiding .to() on a potentially-meta model; no device migration post-load
538
+
539
+ Args:
540
+ model_path: Optional path to local model
541
+ model_id: Optional HuggingFace model ID (default: 'ogkalu/comic-text-and-bubble-detector')
542
+ force_reload: Force reload even if already loaded
543
+
544
+ Returns:
545
+ True if successful, False otherwise
546
+ """
547
+ if not TRANSFORMERS_AVAILABLE:
548
+ logger.error("Transformers library required for RT-DETR. Install with: pip install transformers")
549
+ return False
550
+
551
+ if not PIL_AVAILABLE:
552
+ logger.error("PIL required for RT-DETR. Install with: pip install pillow")
553
+ return False
554
+
555
+ if self.rtdetr_loaded and not force_reload:
556
+ logger.info("RT-DETR model already loaded")
557
+ return True
558
+
559
+ # Fast path: if shared already loaded and not forcing reload, attach
560
+ if BubbleDetector._rtdetr_loaded and not force_reload:
561
+ self.rtdetr_model = BubbleDetector._rtdetr_shared_model
562
+ self.rtdetr_processor = BubbleDetector._rtdetr_shared_processor
563
+ self.rtdetr_loaded = True
564
+ logger.info("RT-DETR model attached from shared cache")
565
+ return True
566
+
567
+ # Serialize the ENTIRE loading sequence to avoid concurrent init issues
568
+ with BubbleDetector._rtdetr_init_lock:
569
+ try:
570
+ # Re-check after acquiring lock
571
+ if BubbleDetector._rtdetr_loaded and not force_reload:
572
+ self.rtdetr_model = BubbleDetector._rtdetr_shared_model
573
+ self.rtdetr_processor = BubbleDetector._rtdetr_shared_processor
574
+ self.rtdetr_loaded = True
575
+ logger.info("RT-DETR model attached from shared cache (post-lock)")
576
+ return True
577
+
578
+ # Use custom model_id if provided, otherwise use default
579
+ repo_id = model_id if model_id else self.rtdetr_repo
580
+ logger.info(f"📥 Loading RT-DETR model from {repo_id}...")
581
+
582
+ # Ensure TorchDynamo/compile doesn't interfere on some builds
583
+ try:
584
+ os.environ.setdefault('TORCHDYNAMO_DISABLE', '1')
585
+ except Exception:
586
+ pass
587
+
588
+ # Decide device strategy
589
+ gpu_available = bool(TORCH_AVAILABLE and hasattr(torch, 'cuda') and torch.cuda.is_available())
590
+ device_map = 'auto' if gpu_available else None
591
+ # Choose dtype
592
+ dtype = None
593
+ if TORCH_AVAILABLE:
594
+ try:
595
+ dtype = torch.float16 if gpu_available else torch.float32
596
+ except Exception:
597
+ dtype = None
598
+ low_cpu = True if gpu_available else False
599
+
600
+ # Load processor (once)
601
+ self.rtdetr_processor = RTDetrImageProcessor.from_pretrained(
602
+ repo_id,
603
+ size={"width": 640, "height": 640},
604
+ cache_dir=self.cache_dir if not model_path else None
605
+ )
606
+
607
+ # Prepare kwargs for from_pretrained
608
+ from_kwargs = {
609
+ 'cache_dir': self.cache_dir if not model_path else None,
610
+ 'low_cpu_mem_usage': low_cpu,
611
+ 'device_map': device_map,
612
+ }
613
+ if dtype is not None:
614
+ from_kwargs['dtype'] = dtype
615
+
616
+ # First attempt: load directly to target (CUDA if available)
617
+ try:
618
+ self.rtdetr_model = RTDetrForObjectDetection.from_pretrained(
619
+ model_path if model_path else repo_id,
620
+ **from_kwargs,
621
+ )
622
+ except Exception as primary_err:
623
+ # Fallback to a simple CPU load (no device move) if CUDA path fails
624
+ logger.warning(f"RT-DETR primary load failed ({primary_err}); retrying on CPU...")
625
+ from_kwargs_fallback = {
626
+ 'cache_dir': self.cache_dir if not model_path else None,
627
+ 'low_cpu_mem_usage': False,
628
+ 'device_map': None,
629
+ }
630
+ if TORCH_AVAILABLE:
631
+ from_kwargs_fallback['dtype'] = torch.float32
632
+ self.rtdetr_model = RTDetrForObjectDetection.from_pretrained(
633
+ model_path if model_path else repo_id,
634
+ **from_kwargs_fallback,
635
+ )
636
+
637
+ # Optional dynamic quantization for linear layers (CPU only)
638
+ if self.quantize_enabled and TORCH_AVAILABLE and (not gpu_available):
639
+ try:
640
+ try:
641
+ import torch.ao.quantization as tq
642
+ quantize_dynamic = tq.quantize_dynamic # type: ignore
643
+ except Exception:
644
+ import torch.quantization as tq # type: ignore
645
+ quantize_dynamic = tq.quantize_dynamic # type: ignore
646
+ self.rtdetr_model = quantize_dynamic(self.rtdetr_model, {torch.nn.Linear}, dtype=torch.qint8)
647
+ logger.info("🔻 Applied dynamic INT8 quantization to RT-DETR linear layers (CPU)")
648
+ except Exception as qe:
649
+ logger.warning(f"RT-DETR dynamic quantization skipped: {qe}")
650
+
651
+ # Finalize
652
+ self.rtdetr_model.eval()
653
+
654
+ # Sanity check: ensure no parameter is left on 'meta' device
655
+ try:
656
+ for n, p in self.rtdetr_model.named_parameters():
657
+ dev = getattr(p, 'device', None)
658
+ if dev is not None and getattr(dev, 'type', '') == 'meta':
659
+ raise RuntimeError(f"Parameter {n} is on 'meta' device after load")
660
+ except Exception as e:
661
+ logger.error(f"RT-DETR load sanity check failed: {e}")
662
+ self.rtdetr_loaded = False
663
+ return False
664
+
665
+ # Publish shared cache
666
+ BubbleDetector._rtdetr_shared_model = self.rtdetr_model
667
+ BubbleDetector._rtdetr_shared_processor = self.rtdetr_processor
668
+ BubbleDetector._rtdetr_loaded = True
669
+ BubbleDetector._rtdetr_repo_id = repo_id
670
+
671
+ self.rtdetr_loaded = True
672
+
673
+ # Save the model ID that was used
674
+ self.config['rtdetr_loaded'] = True
675
+ self.config['rtdetr_model_id'] = repo_id
676
+ self._save_config()
677
+
678
+ loc = 'CUDA' if gpu_available else 'CPU'
679
+ logger.info(f"✅ RT-DETR model loaded successfully ({loc})")
680
+ logger.info(" Classes: Empty bubbles, Text bubbles, Free text")
681
+
682
+ # Auto-convert to ONNX for RT-DETR only if explicitly enabled
683
+ if os.environ.get('AUTO_CONVERT_RTDETR_ONNX', 'false').lower() == 'true':
684
+ onnx_path = os.path.join(self.cache_dir, 'rtdetr_comic.onnx')
685
+ if self.convert_to_onnx('rtdetr', onnx_path):
686
+ logger.info("🚀 RT-DETR converted to ONNX for faster inference")
687
+ # Store ONNX path for later use
688
+ self.config['rtdetr_onnx_path'] = onnx_path
689
+ self._save_config()
690
+ # Optionally quantize ONNX for reduced RAM
691
+ if self.onnx_quantize_enabled:
692
+ try:
693
+ from onnxruntime.quantization import quantize_dynamic, QuantType
694
+ quant_path = os.path.splitext(onnx_path)[0] + ".int8.onnx"
695
+ if not os.path.exists(quant_path) or os.environ.get('FORCE_ONNX_REBUILD', 'false').lower() == 'true':
696
+ logger.info("🔻 Quantizing RT-DETR ONNX to INT8 (dynamic)...")
697
+ quantize_dynamic(model_input=onnx_path, model_output=quant_path, weight_type=QuantType.QInt8, op_types_to_quantize=['Conv', 'MatMul'])
698
+ self.config['rtdetr_onnx_quantized_path'] = quant_path
699
+ self._save_config()
700
+ logger.info(f"✅ Quantized RT-DETR ONNX saved to: {quant_path}")
701
+ except Exception as qe:
702
+ logger.warning(f"ONNX quantization for RT-DETR skipped: {qe}")
703
+ else:
704
+ logger.info("ℹ️ Skipping RT-DETR ONNX export (converter not supported in current environment)")
705
+
706
+ return True
707
+ except Exception as e:
708
+ logger.error(f"❌ Failed to load RT-DETR: {e}")
709
+ self.rtdetr_loaded = False
710
+ return False
711
+
712
+ def check_rtdetr_available(self, model_id: str = None) -> bool:
713
+ """
714
+ Check if RT-DETR model is available (cached).
715
+
716
+ Args:
717
+ model_id: Optional HuggingFace model ID
718
+
719
+ Returns:
720
+ True if model is cached and available
721
+ """
722
+ try:
723
+ from pathlib import Path
724
+
725
+ # Use provided model_id or default
726
+ repo_id = model_id if model_id else self.rtdetr_repo
727
+
728
+ # Check HuggingFace cache
729
+ cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
730
+ model_id_formatted = repo_id.replace("/", "--")
731
+
732
+ # Look for model folder
733
+ model_folders = list(cache_dir.glob(f"models--{model_id_formatted}*"))
734
+
735
+ if model_folders:
736
+ for folder in model_folders:
737
+ if (folder / "snapshots").exists():
738
+ snapshots = list((folder / "snapshots").iterdir())
739
+ if snapshots:
740
+ return True
741
+
742
+ return False
743
+
744
+ except Exception:
745
+ return False
746
+
747
+ def detect_bubbles(self,
748
+ image_path: str,
749
+ confidence: float = None,
750
+ iou_threshold: float = None,
751
+ max_detections: int = None,
752
+ use_rtdetr: bool = None) -> List[Tuple[int, int, int, int]]:
753
+ """
754
+ Detect speech bubbles in an image (backward compatible method).
755
+
756
+ Args:
757
+ image_path: Path to image file
758
+ confidence: Minimum confidence threshold (0-1)
759
+ iou_threshold: IOU threshold for NMS (0-1)
760
+ max_detections: Maximum number of detections to return
761
+ use_rtdetr: If True, use RT-DETR instead of YOLOv8 (if available)
762
+
763
+ Returns:
764
+ List of bubble bounding boxes as (x, y, width, height) tuples
765
+ """
766
+ # Check for stop at start
767
+ if self._check_stop():
768
+ self._log("⏹️ Bubble detection stopped by user", "warning")
769
+ return []
770
+
771
+ # Decide which model to use
772
+ if use_rtdetr is None:
773
+ # Auto-select: prefer RT-DETR if available
774
+ use_rtdetr = self.rtdetr_loaded
775
+
776
+ if use_rtdetr:
777
+ # Prefer ONNX backend if available, else PyTorch
778
+ if getattr(self, 'rtdetr_onnx_loaded', False):
779
+ results = self.detect_with_rtdetr_onnx(
780
+ image_path=image_path,
781
+ confidence=confidence,
782
+ return_all_bubbles=True
783
+ )
784
+ return results
785
+ if self.rtdetr_loaded:
786
+ results = self.detect_with_rtdetr(
787
+ image_path=image_path,
788
+ confidence=confidence,
789
+ return_all_bubbles=True
790
+ )
791
+ return results
792
+
793
+ # Original YOLOv8 detection
794
+ if not self.model_loaded:
795
+ logger.error("No model loaded. Call load_model() first.")
796
+ return []
797
+
798
+ # Use defaults if not specified
799
+ confidence = confidence or self.default_confidence
800
+ iou_threshold = iou_threshold or self.default_iou_threshold
801
+ max_detections = max_detections or self.default_max_detections
802
+
803
+ try:
804
+ # Load image
805
+ image = cv2.imread(image_path)
806
+ if image is None:
807
+ logger.error(f"Failed to load image: {image_path}")
808
+ return []
809
+
810
+ h, w = image.shape[:2]
811
+ self._log(f"🔍 Detecting bubbles in {w}x{h} image")
812
+
813
+ # Check for stop before inference
814
+ if self._check_stop():
815
+ self._log("⏹️ Bubble detection inference stopped by user", "warning")
816
+ return []
817
+
818
+ if self.model_type == 'yolo':
819
+ # YOLOv8 inference
820
+ results = self.model(
821
+ image_path,
822
+ conf=confidence,
823
+ iou=iou_threshold,
824
+ max_det=min(max_detections, getattr(self, 'max_det_yolo', max_detections)),
825
+ verbose=False
826
+ )
827
+
828
+ bubbles = []
829
+ for r in results:
830
+ if r.boxes is not None:
831
+ for box in r.boxes:
832
+ # Get box coordinates
833
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
834
+ x, y = int(x1), int(y1)
835
+ width = int(x2 - x1)
836
+ height = int(y2 - y1)
837
+
838
+ # Get confidence
839
+ conf = float(box.conf[0])
840
+
841
+ # Add to list
842
+ if len(bubbles) < max_detections:
843
+ bubbles.append((x, y, width, height))
844
+
845
+ logger.debug(f" Bubble: ({x},{y}) {width}x{height} conf={conf:.2f}")
846
+
847
+ elif self.model_type == 'onnx':
848
+ # ONNX inference
849
+ bubbles = self._detect_with_onnx(image, confidence, iou_threshold, max_detections)
850
+
851
+ elif self.model_type == 'torch':
852
+ # TorchScript inference
853
+ bubbles = self._detect_with_torchscript(image, confidence, iou_threshold, max_detections)
854
+
855
+ else:
856
+ logger.error(f"Unknown model type: {self.model_type}")
857
+ return []
858
+
859
+ logger.info(f"✅ Detected {len(bubbles)} speech bubbles")
860
+ time.sleep(0.1) # Brief pause for stability
861
+ logger.debug("💤 Bubble detection pausing briefly for stability")
862
+ return bubbles
863
+
864
+ except Exception as e:
865
+ logger.error(f"Detection failed: {e}")
866
+ logger.error(traceback.format_exc())
867
+ return []
868
+
869
+ def detect_with_rtdetr(self,
870
+ image_path: str = None,
871
+ image: np.ndarray = None,
872
+ confidence: float = None,
873
+ return_all_bubbles: bool = False) -> Any:
874
+ """
875
+ Detect using RT-DETR model with 3-class detection (PyTorch backend).
876
+
877
+ Args:
878
+ image_path: Path to image file
879
+ image: Image array (BGR format)
880
+ confidence: Confidence threshold
881
+ return_all_bubbles: If True, return list of bubble boxes (for compatibility)
882
+ If False, return dict with all classes
883
+
884
+ Returns:
885
+ List of bubbles if return_all_bubbles=True, else dict with classes
886
+ """
887
+ # Check for stop at start
888
+ if self._check_stop():
889
+ self._log("⏹️ RT-DETR detection stopped by user", "warning")
890
+ if return_all_bubbles:
891
+ return []
892
+ return {'bubbles': [], 'text_bubbles': [], 'text_free': []}
893
+
894
+ if not self.rtdetr_loaded:
895
+ self._log("RT-DETR not loaded. Call load_rtdetr_model() first.", "warning")
896
+ if return_all_bubbles:
897
+ return []
898
+ return {'bubbles': [], 'text_bubbles': [], 'text_free': []}
899
+
900
+ confidence = confidence or self.default_confidence
901
+
902
+ try:
903
+ # Load image
904
+ if image_path:
905
+ image = cv2.imread(image_path)
906
+ elif image is None:
907
+ logger.error("No image provided")
908
+ if return_all_bubbles:
909
+ return []
910
+ return {'bubbles': [], 'text_bubbles': [], 'text_free': []}
911
+
912
+ # Convert BGR to RGB for PIL
913
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
914
+ pil_image = Image.fromarray(image_rgb)
915
+
916
+ # Prepare image for model
917
+ inputs = self.rtdetr_processor(images=pil_image, return_tensors="pt")
918
+
919
+ # Move inputs to the same device as the model and match model dtype for floating tensors
920
+ model_device = next(self.rtdetr_model.parameters()).device if self.rtdetr_model is not None else (torch.device('cpu') if TORCH_AVAILABLE else 'cpu')
921
+ model_dtype = None
922
+ if TORCH_AVAILABLE and self.rtdetr_model is not None:
923
+ try:
924
+ model_dtype = next(self.rtdetr_model.parameters()).dtype
925
+ except Exception:
926
+ model_dtype = None
927
+
928
+ if TORCH_AVAILABLE:
929
+ new_inputs = {}
930
+ for k, v in inputs.items():
931
+ if isinstance(v, torch.Tensor):
932
+ v = v.to(model_device)
933
+ if model_dtype is not None and torch.is_floating_point(v):
934
+ v = v.to(model_dtype)
935
+ new_inputs[k] = v
936
+ inputs = new_inputs
937
+
938
+ # Run inference with autocast when model is half/bfloat16 on CUDA
939
+ use_amp = TORCH_AVAILABLE and hasattr(model_device, 'type') and model_device.type == 'cuda' and (model_dtype in (torch.float16, torch.bfloat16))
940
+ autocast_dtype = model_dtype if model_dtype in (torch.float16, torch.bfloat16) else None
941
+
942
+ with torch.no_grad():
943
+ if use_amp and autocast_dtype is not None:
944
+ with torch.autocast('cuda', dtype=autocast_dtype):
945
+ outputs = self.rtdetr_model(**inputs)
946
+ else:
947
+ outputs = self.rtdetr_model(**inputs)
948
+
949
+ # Brief pause for stability after inference
950
+ time.sleep(0.1)
951
+ logger.debug("💤 RT-DETR inference pausing briefly for stability")
952
+
953
+ # Post-process results
954
+ target_sizes = torch.tensor([pil_image.size[::-1]]) if TORCH_AVAILABLE else None
955
+ if TORCH_AVAILABLE and hasattr(model_device, 'type') and model_device.type == "cuda":
956
+ target_sizes = target_sizes.to(model_device)
957
+
958
+ results = self.rtdetr_processor.post_process_object_detection(
959
+ outputs,
960
+ target_sizes=target_sizes,
961
+ threshold=confidence
962
+ )[0]
963
+
964
+ # Apply per-detector cap if configured
965
+ cap = getattr(self, 'max_det_rtdetr', self.default_max_detections)
966
+ if cap and len(results['boxes']) > cap:
967
+ # Keep top-scoring first
968
+ scores = results['scores']
969
+ top_idx = scores.topk(k=cap).indices if hasattr(scores, 'topk') else range(cap)
970
+ results = {
971
+ 'boxes': [results['boxes'][i] for i in top_idx],
972
+ 'scores': [results['scores'][i] for i in top_idx],
973
+ 'labels': [results['labels'][i] for i in top_idx]
974
+ }
975
+
976
+ logger.info(f"📊 RT-DETR found {len(results['boxes'])} detections above {confidence:.2f} confidence")
977
+
978
+ # Apply NMS to remove duplicate detections
979
+ # Group detections by class
980
+ class_detections = {self.CLASS_BUBBLE: [], self.CLASS_TEXT_BUBBLE: [], self.CLASS_TEXT_FREE: []}
981
+
982
+ for box, score, label in zip(results['boxes'], results['scores'], results['labels']):
983
+ x1, y1, x2, y2 = map(float, box.tolist())
984
+ label_id = label.item()
985
+ if label_id in class_detections:
986
+ class_detections[label_id].append((x1, y1, x2, y2, float(score.item())))
987
+
988
+ # Apply NMS per class to remove duplicates
989
+ def compute_iou(box1, box2):
990
+ """Compute IoU between two boxes (x1, y1, x2, y2)"""
991
+ x1_1, y1_1, x2_1, y2_1 = box1[:4]
992
+ x1_2, y1_2, x2_2, y2_2 = box2[:4]
993
+
994
+ # Intersection
995
+ x_left = max(x1_1, x1_2)
996
+ y_top = max(y1_1, y1_2)
997
+ x_right = min(x2_1, x2_2)
998
+ y_bottom = min(y2_1, y2_2)
999
+
1000
+ if x_right < x_left or y_bottom < y_top:
1001
+ return 0.0
1002
+
1003
+ intersection = (x_right - x_left) * (y_bottom - y_top)
1004
+
1005
+ # Union
1006
+ area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
1007
+ area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
1008
+ union = area1 + area2 - intersection
1009
+
1010
+ return intersection / union if union > 0 else 0.0
1011
+
1012
+ def apply_nms(boxes_with_scores, iou_threshold=0.45):
1013
+ """Apply Non-Maximum Suppression"""
1014
+ if not boxes_with_scores:
1015
+ return []
1016
+
1017
+ # Sort by score (descending)
1018
+ sorted_boxes = sorted(boxes_with_scores, key=lambda x: x[4], reverse=True)
1019
+ keep = []
1020
+
1021
+ while sorted_boxes:
1022
+ # Keep the box with highest score
1023
+ current = sorted_boxes.pop(0)
1024
+ keep.append(current)
1025
+
1026
+ # Remove boxes with high IoU
1027
+ sorted_boxes = [box for box in sorted_boxes if compute_iou(current, box) < iou_threshold]
1028
+
1029
+ return keep
1030
+
1031
+ # Apply NMS and organize by class
1032
+ detections = {
1033
+ 'bubbles': [], # Empty speech bubbles
1034
+ 'text_bubbles': [], # Bubbles with text
1035
+ 'text_free': [] # Text without bubbles
1036
+ }
1037
+
1038
+ for class_id, boxes_list in class_detections.items():
1039
+ nms_boxes = apply_nms(boxes_list, iou_threshold=self.default_iou_threshold)
1040
+
1041
+ for x1, y1, x2, y2, scr in nms_boxes:
1042
+ width = int(x2 - x1)
1043
+ height = int(y2 - y1)
1044
+ # Store as (x, y, width, height) to match YOLOv8 format
1045
+ bbox = (int(x1), int(y1), width, height)
1046
+
1047
+ if class_id == self.CLASS_BUBBLE:
1048
+ detections['bubbles'].append(bbox)
1049
+ elif class_id == self.CLASS_TEXT_BUBBLE:
1050
+ detections['text_bubbles'].append(bbox)
1051
+ elif class_id == self.CLASS_TEXT_FREE:
1052
+ detections['text_free'].append(bbox)
1053
+
1054
+ # Stop early if we hit the configured cap across all classes
1055
+ total_count = len(detections['bubbles']) + len(detections['text_bubbles']) + len(detections['text_free'])
1056
+ if total_count >= (self.config.get('manga_settings', {}).get('ocr', {}).get('bubble_max_detections', self.default_max_detections) if isinstance(self.config, dict) else self.default_max_detections):
1057
+ break
1058
+
1059
+ # Log results
1060
+ total = len(detections['bubbles']) + len(detections['text_bubbles']) + len(detections['text_free'])
1061
+ logger.info(f"✅ RT-DETR detected {total} objects:")
1062
+ logger.info(f" - Empty bubbles: {len(detections['bubbles'])}")
1063
+ logger.info(f" - Text bubbles: {len(detections['text_bubbles'])}")
1064
+ logger.info(f" - Free text: {len(detections['text_free'])}")
1065
+
1066
+ # Return format based on compatibility mode
1067
+ if return_all_bubbles:
1068
+ # Return all bubbles (empty + with text) for backward compatibility
1069
+ all_bubbles = detections['bubbles'] + detections['text_bubbles']
1070
+ return all_bubbles
1071
+ else:
1072
+ return detections
1073
+
1074
+ except Exception as e:
1075
+ logger.error(f"RT-DETR detection failed: {e}")
1076
+ logger.error(traceback.format_exc())
1077
+ if return_all_bubbles:
1078
+ return []
1079
+ return {'bubbles': [], 'text_bubbles': [], 'text_free': []}
1080
+
1081
+ def detect_all_text_regions(self, image_path: str = None, image: np.ndarray = None) -> List[Tuple[int, int, int, int]]:
1082
+ """
1083
+ Detect all text regions using RT-DETR (both in bubbles and free text).
1084
+
1085
+ Returns:
1086
+ List of bounding boxes for all text regions
1087
+ """
1088
+ if not self.rtdetr_loaded:
1089
+ logger.warning("RT-DETR required for text detection")
1090
+ return []
1091
+
1092
+ detections = self.detect_with_rtdetr(image_path=image_path, image=image, return_all_bubbles=False)
1093
+
1094
+ # Combine text bubbles and free text
1095
+ all_text = detections['text_bubbles'] + detections['text_free']
1096
+
1097
+ logger.info(f"📝 Found {len(all_text)} text regions total")
1098
+ return all_text
1099
+
1100
+ def _detect_with_onnx(self, image: np.ndarray, confidence: float,
1101
+ iou_threshold: float, max_detections: int) -> List[Tuple[int, int, int, int]]:
1102
+ """Run detection using ONNX model."""
1103
+ # Preprocess image
1104
+ img_size = 640 # Standard YOLOv8 input size
1105
+ img_resized = cv2.resize(image, (img_size, img_size))
1106
+ img_norm = img_resized.astype(np.float32) / 255.0
1107
+ img_transposed = np.transpose(img_norm, (2, 0, 1))
1108
+ img_batch = np.expand_dims(img_transposed, axis=0)
1109
+
1110
+ # Run inference
1111
+ input_name = self.onnx_session.get_inputs()[0].name
1112
+ outputs = self.onnx_session.run(None, {input_name: img_batch})
1113
+
1114
+ # Process outputs (YOLOv8 format)
1115
+ predictions = outputs[0][0] # Remove batch dimension
1116
+
1117
+ # Filter by confidence and apply NMS
1118
+ bubbles = []
1119
+ boxes = []
1120
+ scores = []
1121
+
1122
+ for pred in predictions.T: # Transpose to get predictions per detection
1123
+ if len(pred) >= 5:
1124
+ x_center, y_center, width, height, obj_conf = pred[:5]
1125
+
1126
+ if obj_conf >= confidence:
1127
+ # Convert to corner coordinates
1128
+ x1 = x_center - width / 2
1129
+ y1 = y_center - height / 2
1130
+
1131
+ # Scale to original image size
1132
+ h, w = image.shape[:2]
1133
+ x1 = int(x1 * w / img_size)
1134
+ y1 = int(y1 * h / img_size)
1135
+ width = int(width * w / img_size)
1136
+ height = int(height * h / img_size)
1137
+
1138
+ boxes.append([x1, y1, x1 + width, y1 + height])
1139
+ scores.append(float(obj_conf))
1140
+
1141
+ # Apply NMS
1142
+ if boxes:
1143
+ indices = cv2.dnn.NMSBoxes(boxes, scores, confidence, iou_threshold)
1144
+ if len(indices) > 0:
1145
+ indices = indices.flatten()[:max_detections]
1146
+ for i in indices:
1147
+ x1, y1, x2, y2 = boxes[i]
1148
+ bubbles.append((x1, y1, x2 - x1, y2 - y1))
1149
+
1150
+ return bubbles
1151
+
1152
+ def _detect_with_torchscript(self, image: np.ndarray, confidence: float,
1153
+ iou_threshold: float, max_detections: int) -> List[Tuple[int, int, int, int]]:
1154
+ """Run detection using TorchScript model."""
1155
+ # Similar to ONNX but using PyTorch tensors
1156
+ img_size = 640
1157
+ img_resized = cv2.resize(image, (img_size, img_size))
1158
+ img_norm = img_resized.astype(np.float32) / 255.0
1159
+ img_tensor = torch.from_numpy(img_norm).permute(2, 0, 1).unsqueeze(0)
1160
+
1161
+ if self.use_gpu:
1162
+ img_tensor = img_tensor.cuda()
1163
+
1164
+ with torch.no_grad():
1165
+ outputs = self.model(img_tensor)
1166
+
1167
+ # Process outputs similar to ONNX
1168
+ # Implementation depends on exact model output format
1169
+ # This is a placeholder - adjust based on your model
1170
+ return []
1171
+
1172
+ def visualize_detections(self, image_path: str, bubbles: List[Tuple[int, int, int, int]] = None,
1173
+ output_path: str = None, use_rtdetr: bool = False) -> np.ndarray:
1174
+ """
1175
+ Visualize detected bubbles on the image.
1176
+
1177
+ Args:
1178
+ image_path: Path to original image
1179
+ bubbles: List of bubble bounding boxes (if None, will detect)
1180
+ output_path: Optional path to save visualization
1181
+ use_rtdetr: Use RT-DETR for visualization with class colors
1182
+
1183
+ Returns:
1184
+ Image with drawn bounding boxes
1185
+ """
1186
+ image = cv2.imread(image_path)
1187
+ if image is None:
1188
+ logger.error(f"Failed to load image: {image_path}")
1189
+ return None
1190
+
1191
+ vis_image = image.copy()
1192
+
1193
+ if use_rtdetr and self.rtdetr_loaded:
1194
+ # RT-DETR visualization with different colors per class
1195
+ detections = self.detect_with_rtdetr(image_path=image_path, return_all_bubbles=False)
1196
+
1197
+ # Colors for each class
1198
+ colors = {
1199
+ 'bubbles': (0, 255, 0), # Green for empty bubbles
1200
+ 'text_bubbles': (255, 0, 0), # Blue for text bubbles
1201
+ 'text_free': (0, 0, 255) # Red for free text
1202
+ }
1203
+
1204
+ # Draw detections
1205
+ for class_name, bboxes in detections.items():
1206
+ color = colors[class_name]
1207
+
1208
+ for i, (x, y, w, h) in enumerate(bboxes):
1209
+ # Draw rectangle
1210
+ cv2.rectangle(vis_image, (x, y), (x + w, y + h), color, 2)
1211
+
1212
+ # Add label
1213
+ label = f"{class_name.replace('_', ' ').title()} {i+1}"
1214
+ label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
1215
+ cv2.rectangle(vis_image, (x, y - label_size[1] - 4),
1216
+ (x + label_size[0], y), color, -1)
1217
+ cv2.putText(vis_image, label, (x, y - 2),
1218
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
1219
+ else:
1220
+ # Original YOLOv8 visualization
1221
+ if bubbles is None:
1222
+ bubbles = self.detect_bubbles(image_path)
1223
+
1224
+ # Draw bounding boxes
1225
+ for i, (x, y, w, h) in enumerate(bubbles):
1226
+ # Draw rectangle
1227
+ color = (0, 255, 0) # Green
1228
+ thickness = 2
1229
+ cv2.rectangle(vis_image, (x, y), (x + w, y + h), color, thickness)
1230
+
1231
+ # Add label
1232
+ label = f"Bubble {i+1}"
1233
+ label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
1234
+ cv2.rectangle(vis_image, (x, y - label_size[1] - 4), (x + label_size[0], y), color, -1)
1235
+ cv2.putText(vis_image, label, (x, y - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
1236
+
1237
+ # Save if output path provided
1238
+ if output_path:
1239
+ cv2.imwrite(output_path, vis_image)
1240
+ logger.info(f"💾 Visualization saved to: {output_path}")
1241
+
1242
+ return vis_image
1243
+
1244
+ def convert_to_onnx(self, model_path: str, output_path: str = None) -> bool:
1245
+ """
1246
+ Convert a YOLOv8 or RT-DETR model to ONNX format.
1247
+
1248
+ Args:
1249
+ model_path: Path to model file or 'rtdetr' for loaded RT-DETR
1250
+ output_path: Path for ONNX output (auto-generated if None)
1251
+
1252
+ Returns:
1253
+ True if conversion successful, False otherwise
1254
+ """
1255
+ try:
1256
+ logger.info(f"🔄 Converting {model_path} to ONNX...")
1257
+
1258
+ # Generate output path if not provided
1259
+ if output_path is None:
1260
+ if model_path == 'rtdetr' and self.rtdetr_loaded:
1261
+ base_name = 'rtdetr_comic'
1262
+ else:
1263
+ base_name = Path(model_path).stem
1264
+ output_path = os.path.join(self.cache_dir, f"{base_name}.onnx")
1265
+
1266
+ # Check if already exists
1267
+ if os.path.exists(output_path) and not os.environ.get('FORCE_ONNX_REBUILD', 'false').lower() == 'true':
1268
+ logger.info(f"✅ ONNX model already exists: {output_path}")
1269
+ return True
1270
+
1271
+ # Handle RT-DETR conversion
1272
+ if model_path == 'rtdetr' and self.rtdetr_loaded:
1273
+ if not TORCH_AVAILABLE:
1274
+ logger.error("PyTorch required for RT-DETR ONNX conversion")
1275
+ return False
1276
+
1277
+ # RT-DETR specific conversion
1278
+ self.rtdetr_model.eval()
1279
+
1280
+ # Create dummy input (pixel values): BxCxHxW
1281
+ dummy_input = torch.randn(1, 3, 640, 640)
1282
+ if self.device == 'cuda':
1283
+ dummy_input = dummy_input.to('cuda')
1284
+
1285
+ # Wrap the model to return only tensors (logits, pred_boxes)
1286
+ class _RTDetrExportWrapper(torch.nn.Module):
1287
+ def __init__(self, mdl):
1288
+ super().__init__()
1289
+ self.mdl = mdl
1290
+ def forward(self, images):
1291
+ out = self.mdl(pixel_values=images)
1292
+ # Handle dict/ModelOutput/tuple outputs
1293
+ logits = None
1294
+ boxes = None
1295
+ try:
1296
+ if isinstance(out, dict):
1297
+ logits = out.get('logits', None)
1298
+ boxes = out.get('pred_boxes', out.get('boxes', None))
1299
+ else:
1300
+ logits = getattr(out, 'logits', None)
1301
+ boxes = getattr(out, 'pred_boxes', getattr(out, 'boxes', None))
1302
+ except Exception:
1303
+ pass
1304
+ if (logits is None or boxes is None) and isinstance(out, (tuple, list)) and len(out) >= 2:
1305
+ logits, boxes = out[0], out[1]
1306
+ return logits, boxes
1307
+
1308
+ wrapper = _RTDetrExportWrapper(self.rtdetr_model)
1309
+ if self.device == 'cuda':
1310
+ wrapper = wrapper.to('cuda')
1311
+
1312
+ # Try PyTorch 2.x dynamo_export first (more tolerant of newer aten ops)
1313
+ try:
1314
+ success = False
1315
+ try:
1316
+ from torch.onnx import dynamo_export
1317
+ try:
1318
+ exp = dynamo_export(wrapper, dummy_input)
1319
+ except TypeError:
1320
+ # Retry once on TypeError; note the fallback currently repeats the identical call
1321
+ exp = dynamo_export(wrapper, dummy_input)
1322
+ # exp may have save(); otherwise, it may expose model_proto
1323
+ try:
1324
+ exp.save(output_path) # type: ignore
1325
+ success = True
1326
+ except Exception:
1327
+ try:
1328
+ import onnx as _onnx
1329
+ _onnx.save(exp.model_proto, output_path) # type: ignore
1330
+ success = True
1331
+ except Exception as _se:
1332
+ logger.warning(f"dynamo_export produced model but could not save: {_se}")
1333
+ except Exception as de:
1334
+ logger.warning(f"dynamo_export failed; falling back to legacy exporter: {de}")
1335
+ if success:
1336
+ logger.info(f"✅ RT-DETR ONNX saved to: {output_path} (dynamo_export)")
1337
+ return True
1338
+ except Exception as de2:
1339
+ logger.warning(f"dynamo_export path error: {de2}")
1340
+
1341
+ # Legacy exporter with opset fallback
1342
+ last_err = None
1343
+ for opset in [19, 18, 17, 16, 15, 14, 13]:
1344
+ try:
1345
+ torch.onnx.export(
1346
+ wrapper,
1347
+ dummy_input,
1348
+ output_path,
1349
+ export_params=True,
1350
+ opset_version=opset,
1351
+ do_constant_folding=True,
1352
+ input_names=['pixel_values'],
1353
+ output_names=['logits', 'boxes'],
1354
+ dynamic_axes={
1355
+ 'pixel_values': {0: 'batch', 2: 'height', 3: 'width'},
1356
+ 'logits': {0: 'batch'},
1357
+ 'boxes': {0: 'batch'}
1358
+ }
1359
+ )
1360
+ logger.info(f"✅ RT-DETR ONNX saved to: {output_path} (opset {opset})")
1361
+ return True
1362
+ except Exception as _e:
1363
+ last_err = _e
1364
+ try:
1365
+ msg = str(_e)
1366
+ except Exception:
1367
+ msg = ''
1368
+ logger.warning(f"RT-DETR ONNX export failed at opset {opset}: {msg}")
1369
+ continue
1370
+
1371
+ logger.error(f"All RT-DETR ONNX export attempts failed. Last error: {last_err}")
1372
+ return False
1373
+
1374
+ # Handle YOLOv8 conversion - FIXED
1375
+ elif YOLO_AVAILABLE and os.path.exists(model_path):
1376
+ logger.info(f"Loading YOLOv8 model from: {model_path}")
1377
+
1378
+ # Load model
1379
+ model = YOLO(model_path)
1380
+
1381
+ # Export to ONNX - this returns the path to the exported model
1382
+ logger.info("Exporting to ONNX format...")
1383
+ exported_path = model.export(format='onnx', imgsz=640, simplify=True)
1384
+
1385
+ # exported_path could be a string or Path object
1386
+ exported_path = str(exported_path) if exported_path else None
1387
+
1388
+ if exported_path and os.path.exists(exported_path):
1389
+ # Move to desired location if different
1390
+ if exported_path != output_path:
1391
+ import shutil
1392
+ logger.info(f"Moving ONNX from {exported_path} to {output_path}")
1393
+ shutil.move(exported_path, output_path)
1394
+
1395
+ logger.info(f"✅ YOLOv8 ONNX saved to: {output_path}")
1396
+ return True
1397
+ else:
1398
+ # Fallback: check if it was created with expected name
1399
+ expected_onnx = model_path.replace('.pt', '.onnx')
1400
+ if os.path.exists(expected_onnx):
1401
+ if expected_onnx != output_path:
1402
+ import shutil
1403
+ shutil.move(expected_onnx, output_path)
1404
+ logger.info(f"✅ YOLOv8 ONNX saved to: {output_path}")
1405
+ return True
1406
+ else:
1407
+ logger.error(f"ONNX export failed - no output file found")
1408
+ return False
1409
+
1410
+ else:
1411
+ logger.error(f"Cannot convert {model_path}: Model not found or dependencies missing")
1412
+ return False
1413
+
1414
+ except Exception as e:
1415
+ logger.error(f"Conversion failed: {e}")
1416
+ # Avoid noisy full stack trace in production logs; return False gracefully
1417
+ return False
1418
+
1419
+ def batch_detect(self, image_paths: List[str], **kwargs) -> Dict[str, List[Tuple[int, int, int, int]]]:
1420
+ """
1421
+ Detect bubbles in multiple images.
1422
+
1423
+ Args:
1424
+ image_paths: List of image paths
1425
+ **kwargs: Detection parameters (confidence, iou_threshold, max_detections, use_rtdetr)
1426
+
1427
+ Returns:
1428
+ Dictionary mapping image paths to bubble lists
1429
+ """
1430
+ results = {}
1431
+
1432
+ for i, image_path in enumerate(image_paths):
1433
+ logger.info(f"Processing image {i+1}/{len(image_paths)}: {os.path.basename(image_path)}")
1434
+ bubbles = self.detect_bubbles(image_path, **kwargs)
1435
+ results[image_path] = bubbles
1436
+
1437
+ return results
1438
+
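+ # Illustrative usage sketch (hypothetical file names; assumes a YOLOv8 model was loaded first):
+ #   detector = BubbleDetector()
+ #   detector.load_model("models/comic-speech-bubble-detector-yolov8m.pt")
+ #   results = detector.batch_detect(["page_01.png", "page_02.png"], confidence=0.5)
+ #   for path, boxes in results.items():
+ #       print(os.path.basename(path), len(boxes), "bubbles")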
1439
+ def unload(self, release_shared: bool = False):
1440
+ """Release model resources held by this detector instance.
1441
+ Args:
1442
+ release_shared: If True, also clear class-level shared RT-DETR caches.
1443
+ """
1444
+ try:
1445
+ # Release instance-level models and sessions
1446
+ try:
1447
+ if getattr(self, 'onnx_session', None) is not None:
1448
+ self.onnx_session = None
1449
+ except Exception:
1450
+ pass
1451
+ try:
1452
+ if getattr(self, 'rtdetr_onnx_session', None) is not None:
1453
+ self.rtdetr_onnx_session = None
1454
+ except Exception:
1455
+ pass
1456
+ for attr in ['model', 'rtdetr_model', 'rtdetr_processor']:
1457
+ try:
1458
+ if hasattr(self, attr):
1459
+ setattr(self, attr, None)
1460
+ except Exception:
1461
+ pass
1462
+ for flag in ['model_loaded', 'rtdetr_loaded', 'rtdetr_onnx_loaded']:
1463
+ try:
1464
+ if hasattr(self, flag):
1465
+ setattr(self, flag, False)
1466
+ except Exception:
1467
+ pass
1468
+
1469
+ # Optional: release shared caches
1470
+ if release_shared:
1471
+ try:
1472
+ BubbleDetector._rtdetr_shared_model = None
1473
+ BubbleDetector._rtdetr_shared_processor = None
1474
+ BubbleDetector._rtdetr_loaded = False
1475
+ except Exception:
1476
+ pass
1477
+
1478
+ # Free CUDA cache and trigger GC
1479
+ try:
1480
+ if TORCH_AVAILABLE and torch is not None and torch.cuda.is_available():
1481
+ torch.cuda.empty_cache()
1482
+ except Exception:
1483
+ pass
1484
+ try:
1485
+ import gc
1486
+ gc.collect()
1487
+ except Exception:
1488
+ pass
1489
+ except Exception:
1490
+ # Best-effort only
1491
+ pass
1492
+
1493
+ def get_bubble_masks(self, image_path: str, bubbles: List[Tuple[int, int, int, int]]) -> np.ndarray:
1494
+ """
1495
+ Create a mask image with bubble regions.
1496
+
1497
+ Args:
1498
+ image_path: Path to original image
1499
+ bubbles: List of bubble bounding boxes
1500
+
1501
+ Returns:
1502
+ Binary mask with bubble regions as white (255)
1503
+ """
1504
+ image = cv2.imread(image_path)
1505
+ if image is None:
1506
+ return None
1507
+
1508
+ h, w = image.shape[:2]
1509
+ mask = np.zeros((h, w), dtype=np.uint8)
1510
+
1511
+ # Fill bubble regions
1512
+ for x, y, bw, bh in bubbles:
1513
+ cv2.rectangle(mask, (x, y), (x + bw, y + bh), 255, -1)
1514
+
1515
+ return mask
1516
+
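+ # Illustrative follow-up (a sketch with a hypothetical file name): the mask can feed OpenCV
+ # inpainting to blank out detected bubble regions.
+ #   mask = detector.get_bubble_masks("page_01.png", bubbles)
+ #   if mask is not None:
+ #       page = cv2.imread("page_01.png")
+ #       cleaned = cv2.inpaint(page, mask, 3, cv2.INPAINT_TELEA)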
1517
+ def filter_bubbles_by_size(self, bubbles: List[Tuple[int, int, int, int]],
1518
+ min_area: int = 100,
1519
+ max_area: int = None) -> List[Tuple[int, int, int, int]]:
1520
+ """
1521
+ Filter bubbles by area.
1522
+
1523
+ Args:
1524
+ bubbles: List of bubble bounding boxes
1525
+ min_area: Minimum area in pixels
1526
+ max_area: Maximum area in pixels (None for no limit)
1527
+
1528
+ Returns:
1529
+ Filtered list of bubbles
1530
+ """
1531
+ filtered = []
1532
+
1533
+ for x, y, w, h in bubbles:
1534
+ area = w * h
1535
+ if area >= min_area and (max_area is None or area <= max_area):
1536
+ filtered.append((x, y, w, h))
1537
+
1538
+ return filtered
1539
+
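+ # Illustrative example (thresholds are arbitrary, not tuned defaults):
+ #   bubbles = detector.filter_bubbles_by_size(bubbles, min_area=500, max_area=250000)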
1540
+ def merge_overlapping_bubbles(self, bubbles: List[Tuple[int, int, int, int]],
1541
+ overlap_threshold: float = 0.1) -> List[Tuple[int, int, int, int]]:
1542
+ """
1543
+ Merge overlapping bubble detections.
1544
+
1545
+ Args:
1546
+ bubbles: List of bubble bounding boxes
1547
+ overlap_threshold: Minimum overlap ratio to merge
1548
+
1549
+ Returns:
1550
+ Merged list of bubbles
1551
+ """
1552
+ if not bubbles:
1553
+ return []
1554
+
1555
+ # Convert to numpy array for easier manipulation
1556
+ boxes = np.array([(x, y, x+w, y+h) for x, y, w, h in bubbles])
1557
+
1558
+ merged = []
1559
+ used = set()
1560
+
1561
+ for i, box1 in enumerate(boxes):
1562
+ if i in used:
1563
+ continue
1564
+
1565
+ # Start with current box
1566
+ x1, y1, x2, y2 = box1
1567
+
1568
+ # Check for overlaps with remaining boxes
1569
+ for j in range(i + 1, len(boxes)):
1570
+ if j in used:
1571
+ continue
1572
+
1573
+ box2 = boxes[j]
1574
+
1575
+ # Calculate intersection
1576
+ ix1 = max(x1, box2[0])
1577
+ iy1 = max(y1, box2[1])
1578
+ ix2 = min(x2, box2[2])
1579
+ iy2 = min(y2, box2[3])
1580
+
1581
+ if ix1 < ix2 and iy1 < iy2:
1582
+ # Calculate overlap ratio
1583
+ intersection = (ix2 - ix1) * (iy2 - iy1)
1584
+ area1 = (x2 - x1) * (y2 - y1)
1585
+ area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
1586
+ overlap = intersection / min(area1, area2)
1587
+
1588
+ if overlap >= overlap_threshold:
1589
+ # Merge boxes
1590
+ x1 = min(x1, box2[0])
1591
+ y1 = min(y1, box2[1])
1592
+ x2 = max(x2, box2[2])
1593
+ y2 = max(y2, box2[3])
1594
+ used.add(j)
1595
+
1596
+ merged.append((int(x1), int(y1), int(x2 - x1), int(y2 - y1)))
1597
+
1598
+ return merged
1599
+
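+ # Worked example of the merge rule above (overlap is measured against the smaller box):
+ #   a = (0, 0, 100, 100)        # x, y, w, h
+ #   b = (90, 0, 100, 100)       # overlaps a by a 10x100 strip
+ #   detector.merge_overlapping_bubbles([a, b], overlap_threshold=0.1)
+ #   # intersection 1000 / min-area 10000 = 0.1 >= threshold, so the result is [(0, 0, 190, 100)]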
1600
+ # ============================
1601
+ # RT-DETR (ONNX) BACKEND
1602
+ # ============================
1603
+ def load_rtdetr_onnx_model(self, model_id: str = None, force_reload: bool = False) -> bool:
1604
+ """
1605
+ Load RT-DETR ONNX model using onnxruntime. Downloads detector.onnx and config.json
1606
+ from the provided Hugging Face repo if not already cached.
1607
+ """
1608
+ if not ONNX_AVAILABLE:
1609
+ logger.error("ONNX Runtime not available for RT-DETR ONNX backend")
1610
+ return False
1611
+ try:
1612
+ # If singleton mode and already loaded, just attach shared session
1613
+ try:
1614
+ adv = (self.config or {}).get('manga_settings', {}).get('advanced', {}) if isinstance(self.config, dict) else {}
1615
+ singleton = bool(adv.get('use_singleton_models', True))
1616
+ except Exception:
1617
+ singleton = True
1618
+ if singleton and BubbleDetector._rtdetr_onnx_loaded and not force_reload and BubbleDetector._rtdetr_onnx_shared_session is not None:
1619
+ self.rtdetr_onnx_session = BubbleDetector._rtdetr_onnx_shared_session
1620
+ self.rtdetr_onnx_loaded = True
1621
+ return True
1622
+
1623
+ repo = model_id or self.rtdetr_onnx_repo
1624
+ try:
1625
+ from huggingface_hub import hf_hub_download
1626
+ except Exception as e:
1627
+ logger.error(f"huggingface-hub required to fetch RT-DETR ONNX: {e}")
1628
+ return False
1629
+
1630
+ # Ensure local models dir (use configured cache_dir directly: e.g., 'models')
1631
+ cache_dir = self.cache_dir
1632
+ os.makedirs(cache_dir, exist_ok=True)
1633
+
1634
+ # Download files into models/ and avoid symlinks so the file is visible there
1635
+ try:
1636
+ _ = hf_hub_download(repo_id=repo, filename='config.json', cache_dir=cache_dir, local_dir=cache_dir, local_dir_use_symlinks=False)
1637
+ except Exception:
1638
+ pass
1639
+ onnx_fp = hf_hub_download(repo_id=repo, filename='detector.onnx', cache_dir=cache_dir, local_dir=cache_dir, local_dir_use_symlinks=False)
1640
+ BubbleDetector._rtdetr_onnx_model_path = onnx_fp
1641
+
1642
+ # Pick providers: prefer CUDA if available; otherwise CPU. Do NOT use DML.
1643
+ providers = ['CPUExecutionProvider']
1644
+ try:
1645
+ avail = ort.get_available_providers() if ONNX_AVAILABLE else []
1646
+ if 'CUDAExecutionProvider' in avail:
1647
+ providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
1648
+ except Exception:
1649
+ pass
1650
+
1651
+ # Session options with reduced memory arena and optional thread limiting in singleton mode
1652
+ so = ort.SessionOptions()
1653
+ try:
1654
+ so.enable_mem_pattern = False
1655
+ so.enable_cpu_mem_arena = False
1656
+ except Exception:
1657
+ pass
1658
+ # If singleton models mode is enabled in config, limit ORT threading to reduce CPU spikes
1659
+ try:
1660
+ adv = (self.config or {}).get('manga_settings', {}).get('advanced', {}) if isinstance(self.config, dict) else {}
1661
+ if bool(adv.get('use_singleton_models', True)):
1662
+ so.intra_op_num_threads = 1
1663
+ so.inter_op_num_threads = 1
1664
+ try:
1665
+ so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
1666
+ except Exception:
1667
+ pass
1668
+ try:
1669
+ so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
1670
+ except Exception:
1671
+ pass
1672
+ except Exception:
1673
+ pass
1674
+
1675
+ # Create session (serialize creation in singleton mode to avoid device storms)
1676
+ if singleton:
1677
+ with BubbleDetector._rtdetr_onnx_init_lock:
1678
+ # Re-check after acquiring lock
1679
+ if BubbleDetector._rtdetr_onnx_loaded and BubbleDetector._rtdetr_onnx_shared_session is not None and not force_reload:
1680
+ self.rtdetr_onnx_session = BubbleDetector._rtdetr_onnx_shared_session
1681
+ self.rtdetr_onnx_loaded = True
1682
+ return True
1683
+ sess = ort.InferenceSession(onnx_fp, providers=providers, sess_options=so)
1684
+ BubbleDetector._rtdetr_onnx_shared_session = sess
1685
+ BubbleDetector._rtdetr_onnx_loaded = True
1686
+ BubbleDetector._rtdetr_onnx_providers = providers
1687
+ self.rtdetr_onnx_session = sess
1688
+ self.rtdetr_onnx_loaded = True
1689
+ else:
1690
+ self.rtdetr_onnx_session = ort.InferenceSession(onnx_fp, providers=providers, sess_options=so)
1691
+ self.rtdetr_onnx_loaded = True
1692
+ logger.info("✅ RT-DETR (ONNX) model ready")
1693
+ return True
1694
+ except Exception as e:
1695
+ logger.error(f"Failed to load RT-DETR ONNX: {e}")
1696
+ self.rtdetr_onnx_session = None
1697
+ self.rtdetr_onnx_loaded = False
1698
+ return False
1699
+
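+ # Illustrative usage sketch (assumes onnxruntime and huggingface-hub are installed):
+ #   det = BubbleDetector()
+ #   if det.load_rtdetr_onnx_model():
+ #       regions = det.detect_with_rtdetr_onnx(image_path="page_01.png", confidence=0.3)
+ #       print(len(regions['text_bubbles']), "text bubbles")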
1700
+ def detect_with_rtdetr_onnx(self,
1701
+ image_path: str = None,
1702
+ image: np.ndarray = None,
1703
+ confidence: float = 0.3,
1704
+ return_all_bubbles: bool = False) -> Any:
1705
+ """Detect using RT-DETR ONNX backend.
1706
+ Returns bubbles list if return_all_bubbles else dict by classes similar to PyTorch path.
1707
+ """
1708
+ if not self.rtdetr_onnx_loaded or self.rtdetr_onnx_session is None:
1709
+ logger.warning("RT-DETR ONNX not loaded")
1710
+ return [] if return_all_bubbles else {'bubbles': [], 'text_bubbles': [], 'text_free': []}
1711
+ try:
1712
+ # Acquire image
1713
+ if image_path is not None:
1714
+ import cv2
1715
+ image = cv2.imread(image_path)
1716
+ if image is None:
1717
+ raise RuntimeError(f"Failed to read image: {image_path}")
1718
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
1719
+ else:
1720
+ if image is None:
1721
+ raise RuntimeError("No image provided")
1722
+ # Assume image is BGR np.ndarray if from OpenCV
1723
+ try:
1724
+ import cv2
1725
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
1726
+ except Exception:
1727
+ image_rgb = image
1728
+
1729
+ # To PIL then resize 640x640 as in reference
1730
+ from PIL import Image as _PILImage
1731
+ pil_image = _PILImage.fromarray(image_rgb)
1732
+ im_resized = pil_image.resize((640, 640))
1733
+ arr = np.asarray(im_resized, dtype=np.float32) / 255.0
1734
+ arr = np.transpose(arr, (2, 0, 1)) # (3,H,W)
1735
+ im_data = arr[np.newaxis, ...]
1736
+
1737
+ w, h = pil_image.size
1738
+ orig_size = np.array([[w, h]], dtype=np.int64)
1739
+
1740
+ # Run with a concurrency guard to prevent device hangs and limit memory usage
1741
+ # Apply semaphore for ALL providers (not just DML) to control concurrency
1742
+ providers = BubbleDetector._rtdetr_onnx_providers or []
1743
+ def _do_run(session):
1744
+ return session.run(None, {
1745
+ 'images': im_data,
1746
+ 'orig_target_sizes': orig_size
1747
+ })
1748
+
1749
+ # Always use semaphore to limit concurrent RT-DETR calls
1750
+ acquired = False
1751
+ try:
1752
+ BubbleDetector._rtdetr_onnx_sema.acquire()
1753
+ acquired = True
1754
+
1755
+ # Special DML error handling
1756
+ if 'DmlExecutionProvider' in providers:
1757
+ try:
1758
+ outputs = _do_run(self.rtdetr_onnx_session)
1759
+ except Exception as dml_err:
1760
+ msg = str(dml_err)
1761
+ if '887A0005' in msg or '887A0006' in msg or 'Dml' in msg:
1762
+ # Rebuild CPU session and retry once
1763
+ try:
1764
+ base_path = BubbleDetector._rtdetr_onnx_model_path
1765
+ if base_path:
1766
+ so = ort.SessionOptions()
1767
+ so.enable_mem_pattern = False
1768
+ so.enable_cpu_mem_arena = False
1769
+ cpu_providers = ['CPUExecutionProvider']
1770
+ # Serialize rebuild
1771
+ with BubbleDetector._rtdetr_onnx_init_lock:
1772
+ sess = ort.InferenceSession(base_path, providers=cpu_providers, sess_options=so)
1773
+ BubbleDetector._rtdetr_onnx_shared_session = sess
1774
+ BubbleDetector._rtdetr_onnx_providers = cpu_providers
1775
+ self.rtdetr_onnx_session = sess
1776
+ outputs = _do_run(self.rtdetr_onnx_session)
1777
+ else:
1778
+ raise
1779
+ except Exception:
1780
+ raise
1781
+ else:
1782
+ raise
1783
+ else:
1784
+ # Non-DML providers - just run directly
1785
+ outputs = _do_run(self.rtdetr_onnx_session)
1786
+ finally:
1787
+ if acquired:
1788
+ try:
1789
+ BubbleDetector._rtdetr_onnx_sema.release()
1790
+ except Exception:
1791
+ pass
1792
+
1793
+ # outputs expected: labels, boxes, scores
1794
+ labels, boxes, scores = outputs[:3]
1795
+ if labels.ndim == 2 and labels.shape[0] == 1:
1796
+ labels = labels[0]
1797
+ if scores.ndim == 2 and scores.shape[0] == 1:
1798
+ scores = scores[0]
1799
+ if boxes.ndim == 3 and boxes.shape[0] == 1:
1800
+ boxes = boxes[0]
1801
+
1802
+ # Apply NMS to remove duplicate detections
1803
+ # Group detections by class and apply NMS per class
1804
+ class_detections = {self.CLASS_BUBBLE: [], self.CLASS_TEXT_BUBBLE: [], self.CLASS_TEXT_FREE: []}
1805
+
1806
+ for lab, box, scr in zip(labels, boxes, scores):
1807
+ if float(scr) < float(confidence):
1808
+ continue
1809
+ label_id = int(lab)
1810
+ if label_id in class_detections:
1811
+ x1, y1, x2, y2 = map(float, box)
1812
+ class_detections[label_id].append((x1, y1, x2, y2, float(scr)))
1813
+
1814
+ # Apply NMS per class to remove duplicates
1815
+ def compute_iou(box1, box2):
1816
+ """Compute IoU between two boxes (x1, y1, x2, y2)"""
1817
+ x1_1, y1_1, x2_1, y2_1 = box1[:4]
1818
+ x1_2, y1_2, x2_2, y2_2 = box2[:4]
1819
+
1820
+ # Intersection
1821
+ x_left = max(x1_1, x1_2)
1822
+ y_top = max(y1_1, y1_2)
1823
+ x_right = min(x2_1, x2_2)
1824
+ y_bottom = min(y2_1, y2_2)
1825
+
1826
+ if x_right < x_left or y_bottom < y_top:
1827
+ return 0.0
1828
+
1829
+ intersection = (x_right - x_left) * (y_bottom - y_top)
1830
+
1831
+ # Union
1832
+ area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
1833
+ area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
1834
+ union = area1 + area2 - intersection
1835
+
1836
+ return intersection / union if union > 0 else 0.0
1837
+
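+ # Worked example for compute_iou (illustrative numbers):
+ #   box1 = (0, 0, 10, 10), box2 = (5, 0, 15, 10) -> intersection 50, union 150, IoU ~ 0.33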
1838
+ def apply_nms(boxes_with_scores, iou_threshold=0.45):
1839
+ """Apply Non-Maximum Suppression"""
1840
+ if not boxes_with_scores:
1841
+ return []
1842
+
1843
+ # Sort by score (descending)
1844
+ sorted_boxes = sorted(boxes_with_scores, key=lambda x: x[4], reverse=True)
1845
+ keep = []
1846
+
1847
+ while sorted_boxes:
1848
+ # Keep the box with highest score
1849
+ current = sorted_boxes.pop(0)
1850
+ keep.append(current)
1851
+
1852
+ # Remove boxes with high IoU
1853
+ sorted_boxes = [box for box in sorted_boxes if compute_iou(current, box) < iou_threshold]
1854
+
1855
+ return keep
1856
+
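+ # Quick illustrative trace of apply_nms:
+ #   apply_nms([(0, 0, 10, 10, 0.9), (1, 1, 11, 11, 0.8)], iou_threshold=0.45)
+ #   # -> keeps only the 0.9 box (the pair's IoU is ~0.68, above the threshold)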
1857
+ # Apply NMS and build final detections
1858
+ detections = {'bubbles': [], 'text_bubbles': [], 'text_free': []}
1859
+ bubbles_all = []
1860
+
1861
+ for class_id, boxes_list in class_detections.items():
1862
+ nms_boxes = apply_nms(boxes_list, iou_threshold=self.default_iou_threshold)
1863
+
1864
+ for x1, y1, x2, y2, scr in nms_boxes:
1865
+ bbox = (int(x1), int(y1), int(x2 - x1), int(y2 - y1))
1866
+
1867
+ if class_id == self.CLASS_BUBBLE:
1868
+ detections['bubbles'].append(bbox)
1869
+ bubbles_all.append(bbox)
1870
+ elif class_id == self.CLASS_TEXT_BUBBLE:
1871
+ detections['text_bubbles'].append(bbox)
1872
+ bubbles_all.append(bbox)
1873
+ elif class_id == self.CLASS_TEXT_FREE:
1874
+ detections['text_free'].append(bbox)
1875
+
1876
+ return bubbles_all if return_all_bubbles else detections
1877
+ except Exception as e:
1878
+ logger.error(f"RT-DETR ONNX detection failed: {e}")
1879
+ return [] if return_all_bubbles else {'bubbles': [], 'text_bubbles': [], 'text_free': []}
1880
+
1881
+
1882
+ # Standalone utility functions
1883
+ def download_model_from_huggingface(repo_id: str = "ogkalu/comic-speech-bubble-detector-yolov8m",
1884
+ filename: str = "comic-speech-bubble-detector-yolov8m.pt",
1885
+ cache_dir: str = "models") -> str:
1886
+ """
1887
+ Download model from Hugging Face Hub.
1888
+
1889
+ Args:
1890
+ repo_id: Hugging Face repository ID
1891
+ filename: Model filename in the repository
1892
+ cache_dir: Local directory to cache the model
1893
+
1894
+ Returns:
1895
+ Path to downloaded model file
1896
+ """
1897
+ try:
1898
+ from huggingface_hub import hf_hub_download
1899
+
1900
+ os.makedirs(cache_dir, exist_ok=True)
1901
+
1902
+ logger.info(f"📥 Downloading {filename} from {repo_id}...")
1903
+
1904
+ model_path = hf_hub_download(
1905
+ repo_id=repo_id,
1906
+ filename=filename,
1907
+ cache_dir=cache_dir,
1908
+ local_dir=cache_dir
1909
+ )
1910
+
1911
+ logger.info(f"✅ Model downloaded to: {model_path}")
1912
+ return model_path
1913
+
1914
+ except ImportError:
1915
+ logger.error("huggingface-hub package required. Install with: pip install huggingface-hub")
1916
+ return None
1917
+ except Exception as e:
1918
+ logger.error(f"Download failed: {e}")
1919
+ return None
1920
+
1921
+
1922
+ def download_rtdetr_model(cache_dir: str = "models") -> bool:
1923
+ """
1924
+ Download RT-DETR model for advanced detection.
1925
+
1926
+ Args:
1927
+ cache_dir: Directory to cache the model
1928
+
1929
+ Returns:
1930
+ True if successful
1931
+ """
1932
+ if not TRANSFORMERS_AVAILABLE:
1933
+ logger.error("Transformers required. Install with: pip install transformers")
1934
+ return False
1935
+
1936
+ try:
1937
+ logger.info("📥 Downloading RT-DETR model...")
1938
+ from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
1939
+
1940
+ # This will download and cache the model
1941
+ processor = RTDetrImageProcessor.from_pretrained(
1942
+ "ogkalu/comic-text-and-bubble-detector",
1943
+ cache_dir=cache_dir
1944
+ )
1945
+ model = RTDetrForObjectDetection.from_pretrained(
1946
+ "ogkalu/comic-text-and-bubble-detector",
1947
+ cache_dir=cache_dir
1948
+ )
1949
+
1950
+ logger.info("✅ RT-DETR model downloaded successfully")
1951
+ return True
1952
+
1953
+ except Exception as e:
1954
+ logger.error(f"Download failed: {e}")
1955
+ return False
1956
+
1957
+
1958
+ # Example usage and testing
1959
+ if __name__ == "__main__":
1960
+ import sys
1961
+
1962
+ # Create detector
1963
+ detector = BubbleDetector()
1964
+
1965
+ if len(sys.argv) > 1:
1966
+ if sys.argv[1] == "download":
1967
+ # Download model from Hugging Face
1968
+ model_path = download_model_from_huggingface()
1969
+ if model_path:
1970
+ print(f"YOLOv8 model downloaded to: {model_path}")
1971
+
1972
+ # Also download RT-DETR
1973
+ if download_rtdetr_model():
1974
+ print("RT-DETR model downloaded")
1975
+
1976
+ elif sys.argv[1] == "detect" and len(sys.argv) > 3:
1977
+ # Detect bubbles in an image
1978
+ model_path = sys.argv[2]
1979
+ image_path = sys.argv[3]
1980
+
1981
+ # Load appropriate model
1982
+ if 'rtdetr' in model_path.lower():
1983
+ if detector.load_rtdetr_model():
1984
+ # Use RT-DETR
1985
+ results = detector.detect_with_rtdetr(image_path)
1986
+ print(f"RT-DETR Detection:")
1987
+ print(f" Empty bubbles: {len(results['bubbles'])}")
1988
+ print(f" Text bubbles: {len(results['text_bubbles'])}")
1989
+ print(f" Free text: {len(results['text_free'])}")
1990
+ else:
1991
+ if detector.load_model(model_path):
1992
+ bubbles = detector.detect_bubbles(image_path, confidence=0.5)
1993
+ print(f"YOLOv8 detected {len(bubbles)} bubbles:")
1994
+ for i, (x, y, w, h) in enumerate(bubbles):
1995
+ print(f" Bubble {i+1}: position=({x},{y}) size=({w}x{h})")
1996
+
1997
+ # Optionally visualize
1998
+ if len(sys.argv) > 4:
1999
+ output_path = sys.argv[4]
2000
+ detector.visualize_detections(image_path, output_path=output_path,
2001
+ use_rtdetr='rtdetr' in model_path.lower())
2002
+
2003
+ elif sys.argv[1] == "test-both" and len(sys.argv) > 2:
2004
+ # Test both models
2005
+ image_path = sys.argv[2]
2006
+
2007
+ # Load YOLOv8
2008
+ yolo_path = "models/comic-speech-bubble-detector-yolov8m.pt"
2009
+ if os.path.exists(yolo_path):
2010
+ detector.load_model(yolo_path)
2011
+ yolo_bubbles = detector.detect_bubbles(image_path, use_rtdetr=False)
2012
+ print(f"YOLOv8: {len(yolo_bubbles)} bubbles")
2013
+
2014
+ # Load RT-DETR
2015
+ if detector.load_rtdetr_model():
2016
+ rtdetr_bubbles = detector.detect_bubbles(image_path, use_rtdetr=True)
2017
+ print(f"RT-DETR: {len(rtdetr_bubbles)} bubbles")
2018
+
2019
+ else:
2020
+ print("Usage:")
2021
+ print(" python bubble_detector.py download")
2022
+ print(" python bubble_detector.py detect <model_path> <image_path> [output_path]")
2023
+ print(" python bubble_detector.py test-both <image_path>")
2024
+
2025
+ else:
2026
+ print("Bubble Detector Module (YOLOv8 + RT-DETR)")
2027
+ print("Usage:")
2028
+ print(" python bubble_detector.py download")
2029
+ print(" python bubble_detector.py detect <model_path> <image_path> [output_path]")
2030
+ print(" python bubble_detector.py test-both <image_path>")
hyphen_textwrap.py ADDED
@@ -0,0 +1,508 @@
1
+ # modified textwrap module to add hyphens whenever it breaks a long word
2
+ # https://github.com/python/cpython/blob/main/Lib/textwrap.py
3
+
4
+ """Text wrapping and filling with improved hyphenation support.
5
+
6
+ This module is adapted from comic-translate's enhanced textwrap implementation.
7
+ It provides better hyphenation behavior when breaking long words across lines.
8
+ """
9
+
10
+ # Copyright (C) 1999-2001 Gregory P. Ward.
11
+ # Copyright (C) 2002, 2003 Python Software Foundation.
12
+ # Written by Greg Ward <gward@python.net>
13
+
14
+ import re
15
+
16
+ __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
17
+
18
+ # Hardcode the recognized whitespace characters to the US-ASCII
19
+ # whitespace characters. The main reason for doing this is that
20
+ # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
21
+ _whitespace = '\t\n\x0b\x0c\r '
22
+
23
+ class TextWrapper:
24
+ """
25
+ Object for wrapping/filling text. The public interface consists of
26
+ the wrap() and fill() methods; the other methods are just there for
27
+ subclasses to override in order to tweak the default behaviour.
28
+ If you want to completely replace the main wrapping algorithm,
29
+ you'll probably have to override _wrap_chunks().
30
+
31
+ Several instance attributes control various aspects of wrapping:
32
+ width (default: 70)
33
+ the maximum width of wrapped lines (unless break_long_words
34
+ is false)
35
+ initial_indent (default: "")
36
+ string that will be prepended to the first line of wrapped
37
+ output. Counts towards the line's width.
38
+ subsequent_indent (default: "")
39
+ string that will be prepended to all lines save the first
40
+ of wrapped output; also counts towards each line's width.
41
+ expand_tabs (default: true)
42
+ Expand tabs in input text to spaces before further processing.
43
+ Each tab will become 0 .. 'tabsize' spaces, depending on its position
44
+ in its line. If false, each tab is treated as a single character.
45
+ tabsize (default: 8)
46
+ Expand tabs in input text to 0 .. 'tabsize' spaces, unless
47
+ 'expand_tabs' is false.
48
+ replace_whitespace (default: true)
49
+ Replace all whitespace characters in the input text by spaces
50
+ after tab expansion. Note that if expand_tabs is false and
51
+ replace_whitespace is true, every tab will be converted to a
52
+ single space!
53
+ fix_sentence_endings (default: false)
54
+ Ensure that sentence-ending punctuation is always followed
55
+ by two spaces. Off by default because the algorithm is
56
+ (unavoidably) imperfect.
57
+ break_long_words (default: true)
58
+ Break words longer than 'width'. If false, those words will not
59
+ be broken, and some lines might be longer than 'width'.
60
+ break_on_hyphens (default: true)
61
+ Allow breaking hyphenated words. If true, wrapping will occur
62
+ preferably on whitespaces and right after hyphens part of
63
+ compound words.
64
+ drop_whitespace (default: true)
65
+ Drop leading and trailing whitespace from lines.
66
+ max_lines (default: None)
67
+ Truncate wrapped lines.
68
+ placeholder (default: ' [...]')
69
+ Append to the last line of truncated text.
70
+ hyphenate_broken_words (default: True)
71
+ Add hyphens when breaking long words across lines.
72
+ """
73
+
74
+ unicode_whitespace_trans = dict.fromkeys(map(ord, _whitespace), ord(' '))
75
+
76
+ # This funky little regex is just the trick for splitting
77
+ # text up into word-wrappable chunks. E.g.
78
+ # "Hello there -- you goof-ball, use the -b option!"
79
+ # splits into
80
+ # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
81
+ # (after stripping out empty strings).
82
+ word_punct = r'[\w!"\'\&.,?]'
83
+ letter = r'[^\d\W]'
84
+ whitespace = r'[%s]' % re.escape(_whitespace)
85
+ nowhitespace = '[^' + whitespace[1:]
86
+ wordsep_re = re.compile(r'''
87
+ ( # any whitespace
88
+ %(ws)s+
89
+ | # em-dash between words
90
+ (?<=%(wp)s) -{2,} (?=\w)
91
+ | # word, possibly hyphenated
92
+ %(nws)s+? (?:
93
+ # hyphenated word
94
+ -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
95
+ (?= %(lt)s -? %(lt)s)
96
+ | # end of word
97
+ (?=%(ws)s|\Z)
98
+ | # em-dash
99
+ (?<=%(wp)s) (?=-{2,}\w)
100
+ )
101
+ )''' % {'wp': word_punct, 'lt': letter,
102
+ 'ws': whitespace, 'nws': nowhitespace},
103
+ re.VERBOSE)
104
+ del word_punct, letter, nowhitespace
105
+
106
+ # This less funky little regex just split on recognized spaces. E.g.
107
+ # "Hello there -- you goof-ball, use the -b option!"
108
+ # splits into
109
+ # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
110
+ wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
111
+ del whitespace
112
+
113
+ # XXX this is not locale- or charset-aware -- string.lowercase
114
+ # is US-ASCII only (and therefore English-only)
115
+ sentence_end_re = re.compile(r'[a-z]' # lowercase letter
116
+ r'[\.\!\?]' # sentence-ending punct.
117
+ r'[\"\']?' # optional end-of-quote
118
+ r'\Z') # end of chunk
119
+
120
+ def __init__(self,
121
+ width=70,
122
+ initial_indent="",
123
+ subsequent_indent="",
124
+ expand_tabs=True,
125
+ replace_whitespace=True,
126
+ fix_sentence_endings=False,
127
+ break_long_words=True,
128
+ drop_whitespace=True,
129
+ break_on_hyphens=True,
130
+ hyphenate_broken_words=True,
131
+ tabsize=8,
132
+ *,
133
+ max_lines=None,
134
+ placeholder=' [...]'):
135
+ self.width = width
136
+ self.initial_indent = initial_indent
137
+ self.subsequent_indent = subsequent_indent
138
+ self.expand_tabs = expand_tabs
139
+ self.replace_whitespace = replace_whitespace
140
+ self.fix_sentence_endings = fix_sentence_endings
141
+ self.break_long_words = break_long_words
142
+ self.drop_whitespace = drop_whitespace
143
+ self.break_on_hyphens = break_on_hyphens
144
+ self.tabsize = tabsize
145
+ self.max_lines = max_lines
146
+ self.placeholder = placeholder
147
+ self.hyphenate_broken_words = hyphenate_broken_words
148
+
149
+
150
+ # -- Private methods -----------------------------------------------
151
+ # (possibly useful for subclasses to override)
152
+
153
+ def _munge_whitespace(self, text):
154
+ """_munge_whitespace(text : string) -> string
155
+
156
+ Munge whitespace in text: expand tabs and convert all other
157
+ whitespace characters to spaces. Eg. " foo\\tbar\\n\\nbaz"
158
+ becomes " foo bar baz".
159
+ """
160
+ if self.expand_tabs:
161
+ text = text.expandtabs(self.tabsize)
162
+ if self.replace_whitespace:
163
+ text = text.translate(self.unicode_whitespace_trans)
164
+ return text
165
+
166
+
167
+ def _split(self, text):
168
+ """_split(text : string) -> [string]
169
+
170
+ Split the text to wrap into indivisible chunks. Chunks are
171
+ not quite the same as words; see _wrap_chunks() for full
172
+ details. As an example, the text
173
+ Look, goof-ball -- use the -b option!
174
+ breaks into the following chunks:
175
+ 'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
176
+ 'use', ' ', 'the', ' ', '-b', ' ', 'option!'
177
+ if break_on_hyphens is True, or in:
178
+ 'Look,', ' ', 'goof-ball', ' ', '--', ' ',
179
+ 'use', ' ', 'the', ' ', '-b', ' ', 'option!'
180
+ otherwise.
181
+ """
182
+ if self.break_on_hyphens is True:
183
+ chunks = self.wordsep_re.split(text)
184
+ else:
185
+ chunks = self.wordsep_simple_re.split(text)
186
+ chunks = [c for c in chunks if c]
187
+
188
+ return chunks
189
+
190
+ def _fix_sentence_endings(self, chunks):
191
+ """_fix_sentence_endings(chunks : [string])
192
+
193
+ Correct for sentence endings buried in 'chunks'. Eg. when the
194
+ original text contains "... foo.\\nBar ...", munge_whitespace()
195
+ and split() will convert that to [..., "foo.", " ", "Bar", ...]
196
+ which has one too few spaces; this method simply changes the one
197
+ space to two.
198
+ """
199
+ i = 0
200
+ patsearch = self.sentence_end_re.search
201
+ while i < len(chunks)-1:
202
+ if chunks[i+1] == " " and patsearch(chunks[i]):
203
+ chunks[i+1] = "  "
204
+ i += 2
205
+ else:
206
+ i += 1
207
+
208
+ def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
209
+ """_handle_long_word(chunks : [string],
210
+ cur_line : [string],
211
+ cur_len : int, width : int)
212
+
213
+ Handle a chunk of text (most likely a word, not whitespace) that
214
+ is too long to fit in any line.
215
+ """
216
+ # Figure out when indent is larger than the specified width, and make
217
+ # sure at least one character is stripped off on every pass
218
+ if width < 1:
219
+ space_left = 1
220
+ else:
221
+ space_left = width - cur_len
222
+
223
+ # If we're allowed to break long words, then do so: put as much
224
+ # of the next chunk onto the current line as will fit.
225
+ if self.break_long_words:
226
+ end = space_left
227
+ chunk = reversed_chunks[-1]
228
+ if self.break_on_hyphens and len(chunk) > space_left:
229
+ # break after last hyphen, but only if there are
230
+ # non-hyphens before it
231
+ hyphen = chunk.rfind('-', 0, space_left)
232
+ if hyphen > 0 and any(c != '-' for c in chunk[:hyphen]):
233
+ end = hyphen + 1
234
+
235
+ if chunk[:end]:
236
+ cur_line.append(chunk[:end])
237
+ # Now adds a hyphen whenever a long word is split to the next line
238
+ # unless certain chracters already exists at the split
239
+ if self.hyphenate_broken_words and chunk[:end][-1] not in ['-','.',',']:
240
+ cur_line.append('-')
241
+ reversed_chunks[-1] = chunk[end:]
242
+
243
+ # Otherwise, we have to preserve the long word intact. Only add
244
+ # it to the current line if there's nothing already there --
245
+ # that minimizes how much we violate the width constraint.
246
+ elif not cur_line:
247
+ cur_line.append(reversed_chunks.pop())
248
+
249
+ # If we're not allowed to break long words, and there's already
250
+ # text on the current line, do nothing. Next time through the
251
+ # main loop of _wrap_chunks(), we'll wind up here again, but
252
+ # cur_len will be zero, so the next line will be entirely
253
+ # devoted to the long word that we can't handle right now.
254
+
255
+ def _wrap_chunks(self, chunks):
256
+ """_wrap_chunks(chunks : [string]) -> [string]
257
+
258
+ Wrap a sequence of text chunks and return a list of lines of
259
+ length 'self.width' or less. (If 'break_long_words' is false,
260
+ some lines may be longer than this.) Chunks correspond roughly
261
+ to words and the whitespace between them: each chunk is
262
+ indivisible (modulo 'break_long_words'), but a line break can
263
+ come between any two chunks. Chunks should not have internal
264
+ whitespace; ie. a chunk is either all whitespace or a "word".
265
+ Whitespace chunks will be removed from the beginning and end of
266
+ lines, but apart from that whitespace is preserved.
267
+ """
268
+ lines = []
269
+ if self.width <= 0:
270
+ raise ValueError("invalid width %r (must be > 0)" % self.width)
271
+ if self.max_lines is not None:
272
+ if self.max_lines > 1:
273
+ indent = self.subsequent_indent
274
+ else:
275
+ indent = self.initial_indent
276
+ if len(indent) + len(self.placeholder.lstrip()) > self.width:
277
+ raise ValueError("placeholder too large for max width")
278
+
279
+ # Arrange in reverse order so items can be efficiently popped
280
+ # from a stack of chucks.
281
+ chunks.reverse()
282
+
283
+ while chunks:
284
+
285
+ # Start the list of chunks that will make up the current line.
286
+ # cur_len is just the length of all the chunks in cur_line.
287
+ cur_line = []
288
+ cur_len = 0
289
+
290
+ # Figure out which static string will prefix this line.
291
+ if lines:
292
+ indent = self.subsequent_indent
293
+ else:
294
+ indent = self.initial_indent
295
+
296
+ # Maximum width for this line.
297
+ width = self.width - len(indent)
298
+
299
+ # First chunk on line is whitespace -- drop it, unless this
300
+ # is the very beginning of the text (ie. no lines started yet).
301
+ if self.drop_whitespace and chunks[-1].strip() == '' and lines:
302
+ del chunks[-1]
303
+
304
+ while chunks:
305
+ l = len(chunks[-1])
306
+
307
+ # Can at least squeeze this chunk onto the current line.
308
+ if cur_len + l <= width:
309
+ cur_line.append(chunks.pop())
310
+ cur_len += l
311
+
312
+ # Nope, this line is full.
313
+ else:
314
+ break
315
+
316
+ # The current line is full, and the next chunk is too big to
317
+ # fit on *any* line (not just this one).
318
+ if chunks and len(chunks[-1]) > width:
319
+ self._handle_long_word(chunks, cur_line, cur_len, width)
320
+ cur_len = sum(map(len, cur_line))
321
+
322
+ # If the last chunk on this line is all whitespace, drop it.
323
+ if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
324
+ cur_len -= len(cur_line[-1])
325
+ del cur_line[-1]
326
+
327
+ if cur_line:
328
+ if (self.max_lines is None or
329
+ len(lines) + 1 < self.max_lines or
330
+ (not chunks or
331
+ self.drop_whitespace and
332
+ len(chunks) == 1 and
333
+ not chunks[0].strip()) and cur_len <= width):
334
+ # Convert current line back to a string and store it in
335
+ # list of all lines (return value).
336
+ lines.append(indent + ''.join(cur_line))
337
+ else:
338
+ while cur_line:
339
+ if (cur_line[-1].strip() and
340
+ cur_len + len(self.placeholder) <= width):
341
+ cur_line.append(self.placeholder)
342
+ lines.append(indent + ''.join(cur_line))
343
+ break
344
+ cur_len -= len(cur_line[-1])
345
+ del cur_line[-1]
346
+ else:
347
+ if lines:
348
+ prev_line = lines[-1].rstrip()
349
+ if (len(prev_line) + len(self.placeholder) <=
350
+ self.width):
351
+ lines[-1] = prev_line + self.placeholder
352
+ break
353
+ lines.append(indent + self.placeholder.lstrip())
354
+ break
355
+
356
+ return lines
357
+
358
+ def _split_chunks(self, text):
359
+ text = self._munge_whitespace(text)
360
+ return self._split(text)
361
+
362
+ # -- Public interface ----------------------------------------------
363
+
364
+ def wrap(self, text):
365
+ """wrap(text : string) -> [string]
366
+
367
+ Reformat the single paragraph in 'text' so it fits in lines of
368
+ no more than 'self.width' columns, and return a list of wrapped
369
+ lines. Tabs in 'text' are expanded with string.expandtabs(),
370
+ and all other whitespace characters (including newline) are
371
+ converted to space.
372
+ """
373
+ chunks = self._split_chunks(text)
374
+ if self.fix_sentence_endings:
375
+ self._fix_sentence_endings(chunks)
376
+ return self._wrap_chunks(chunks)
377
+
378
+ def fill(self, text):
379
+ """fill(text : string) -> string
380
+
381
+ Reformat the single paragraph in 'text' to fit in lines of no
382
+ more than 'self.width' columns, and return a new string
383
+ containing the entire wrapped paragraph.
384
+ """
385
+ return "\n".join(self.wrap(text))
386
+
387
+
388
+ # -- Convenience interface ---------------------------------------------
389
+
390
+ def wrap(text, width=70, **kwargs):
391
+ """Wrap a single paragraph of text, returning a list of wrapped lines.
392
+
393
+ Reformat the single paragraph in 'text' so it fits in lines of no
394
+ more than 'width' columns, and return a list of wrapped lines. By
395
+ default, tabs in 'text' are expanded with string.expandtabs(), and
396
+ all other whitespace characters (including newline) are converted to
397
+ space. See TextWrapper class for available keyword args to customize
398
+ wrapping behaviour.
399
+ """
400
+ w = TextWrapper(width=width, **kwargs)
401
+ return w.wrap(text)
402
+
403
+ def fill(text, width=70, **kwargs):
404
+ """Fill a single paragraph of text, returning a new string.
405
+
406
+ Reformat the single paragraph in 'text' to fit in lines of no more
407
+ than 'width' columns, and return a new string containing the entire
408
+ wrapped paragraph. As with wrap(), tabs are expanded and other
409
+ whitespace characters converted to space. See TextWrapper class for
410
+ available keyword args to customize wrapping behaviour.
411
+ """
412
+ w = TextWrapper(width=width, **kwargs)
413
+ return w.fill(text)
414
+
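+ # Illustrative example of the hyphenation behaviour this module adds (hand-traced, treat as a sketch):
+ #   wrap("supercalifragilistic", width=10)
+ #   # -> ['supercalif-', 'ragilistic']  (the appended '-' can push a line one character past 'width')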
415
+ def shorten(text, width, **kwargs):
416
+ """Collapse and truncate the given text to fit in the given width.
417
+
418
+ The text first has its whitespace collapsed. If it then fits in
419
+ the *width*, it is returned as is. Otherwise, as many words
420
+ as possible are joined and then the placeholder is appended::
421
+
422
+ >>> textwrap.shorten("Hello world!", width=12)
423
+ 'Hello world!'
424
+ >>> textwrap.shorten("Hello world!", width=11)
425
+ 'Hello [...]'
426
+ """
427
+ w = TextWrapper(width=width, max_lines=1, **kwargs)
428
+ return w.fill(' '.join(text.strip().split()))
429
+
430
+
431
+ # -- Loosely related functionality -------------------------------------
432
+
433
+ _whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
434
+ _leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
435
+
436
+ def dedent(text):
437
+ """Remove any common leading whitespace from every line in `text`.
438
+
439
+ This can be used to make triple-quoted strings line up with the left
440
+ edge of the display, while still presenting them in the source code
441
+ in indented form.
442
+
443
+ Note that tabs and spaces are both treated as whitespace, but they
444
+ are not equal: the lines " hello" and "\\thello" are
445
+ considered to have no common leading whitespace.
446
+
447
+ Entirely blank lines are normalized to a newline character.
448
+ """
449
+ # Look for the longest leading string of spaces and tabs common to
450
+ # all lines.
451
+ margin = None
452
+ text = _whitespace_only_re.sub('', text)
453
+ indents = _leading_whitespace_re.findall(text)
454
+ for indent in indents:
455
+ if margin is None:
456
+ margin = indent
457
+
458
+ # Current line more deeply indented than previous winner:
459
+ # no change (previous winner is still on top).
460
+ elif indent.startswith(margin):
461
+ pass
462
+
463
+ # Current line consistent with and no deeper than previous winner:
464
+ # it's the new winner.
465
+ elif margin.startswith(indent):
466
+ margin = indent
467
+
468
+ # Find the largest common whitespace between current line and previous
469
+ # winner.
470
+ else:
471
+ for i, (x, y) in enumerate(zip(margin, indent)):
472
+ if x != y:
473
+ margin = margin[:i]
474
+ break
475
+
476
+ # sanity check (testing/debugging only)
477
+ if 0 and margin:
478
+ for line in text.split("\n"):
479
+ assert not line or line.startswith(margin), \
480
+ "line = %r, margin = %r" % (line, margin)
481
+
482
+ if margin:
483
+ text = re.sub(r'(?m)^' + margin, '', text)
484
+ return text
485
+
486
+
487
+ def indent(text, prefix, predicate=None):
488
+ """Adds 'prefix' to the beginning of selected lines in 'text'.
489
+
490
+ If 'predicate' is provided, 'prefix' will only be added to the lines
491
+ where 'predicate(line)' is True. If 'predicate' is not provided,
492
+ it will default to adding 'prefix' to all non-empty lines that do not
493
+ consist solely of whitespace characters.
494
+ """
495
+ if predicate is None:
496
+ # str.splitlines(True) doesn't produce empty string.
497
+ # ''.splitlines(True) => []
498
+ # 'foo\n'.splitlines(True) => ['foo\n']
499
+ # So we can use just `not s.isspace()` here.
500
+ predicate = lambda s: not s.isspace()
501
+
502
+ prefixed_lines = []
503
+ for line in text.splitlines(True):
504
+ if predicate(line):
505
+ prefixed_lines.append(prefix)
506
+ prefixed_lines.append(line)
507
+
508
+ return ''.join(prefixed_lines)
local_inpainter.py ADDED
The diff for this file is too large to render. See raw diff
 
manga_integration.py ADDED
The diff for this file is too large to render. See raw diff
 
manga_settings_dialog.py ADDED
The diff for this file is too large to render. See raw diff
 
manga_translator.py ADDED
The diff for this file is too large to render. See raw diff
 
ocr_manager.py ADDED
@@ -0,0 +1,1904 @@
1
+ # ocr_manager.py
2
+ """
3
+ OCR Manager for handling multiple OCR providers
4
+ Handles installation, model downloading, and OCR processing
5
+ Updated with HuggingFace donut model and proper bubble detection integration
6
+ """
7
+ import os
8
+ import sys
9
+ import cv2
10
+ import json
11
+ import subprocess
12
+ import threading
13
+ import traceback
14
+ from typing import List, Dict, Optional, Tuple, Any
15
+ import numpy as np
16
+ from dataclasses import dataclass
17
+ from PIL import Image
18
+ import logging
19
+ import time
20
+ import random
21
+ import base64
22
+ import io
23
+ import requests
24
+
25
+ try:
26
+ import gptqmodel
27
+ HAS_GPTQ = True
28
+ except ImportError:
29
+ try:
30
+ import auto_gptq
31
+ HAS_GPTQ = True
32
+ except ImportError:
33
+ HAS_GPTQ = False
34
+
35
+ try:
36
+ import optimum
37
+ HAS_OPTIMUM = True
38
+ except ImportError:
39
+ HAS_OPTIMUM = False
40
+
41
+ try:
42
+ import accelerate
43
+ HAS_ACCELERATE = True
44
+ except ImportError:
45
+ HAS_ACCELERATE = False
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+ @dataclass
50
+ class OCRResult:
51
+ """Unified OCR result format with built-in sanitization to prevent data corruption."""
52
+ text: str
53
+ bbox: Tuple[int, int, int, int] # x, y, w, h
54
+ confidence: float
55
+ vertices: Optional[List[Tuple[int, int]]] = None
56
+
57
+ def __post_init__(self):
58
+ """
59
+ This special method is called automatically after the object is created.
60
+ It acts as a final safeguard to ensure the 'text' attribute is ALWAYS a clean string.
61
+ """
62
+ # --- THIS IS THE DEFINITIVE FIX ---
63
+ # If the text we received is a tuple, we extract the first element.
64
+ # This makes it impossible for a tuple to exist in a finished object.
65
+ if isinstance(self.text, tuple):
66
+ # Log that we are fixing a critical data error.
67
+ print(f"CRITICAL WARNING: Corrupted tuple detected in OCRResult. Sanitizing '{self.text}' to '{self.text[0]}'.")
68
+ self.text = self.text[0]
69
+
70
+ # Ensure the final result is always a stripped string.
71
+ self.text = str(self.text).strip()
72
+
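+ # Illustrative sketch of the sanitization above (hypothetical values):
+ #   r = OCRResult(text=("こんにちは", 0.98), bbox=(10, 20, 100, 40), confidence=0.98)
+ #   assert r.text == "こんにちは"   # the tuple is unwrapped and stripped automatically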
73
+ class OCRProvider:
74
+ """Base class for OCR providers"""
75
+
76
+ def __init__(self, log_callback=None):
77
+ # Set thread limits early if environment indicates single-threaded mode
78
+ try:
79
+ if os.environ.get('OMP_NUM_THREADS') == '1':
80
+ # Already in single-threaded mode, ensure it's applied to this process
81
+ try:
82
+ import sys
83
+ if 'torch' in sys.modules:
84
+ import torch
85
+ torch.set_num_threads(1)
86
+ except (ImportError, RuntimeError, AttributeError):
87
+ pass
88
+ try:
89
+ import cv2
90
+ cv2.setNumThreads(1)
91
+ except (ImportError, AttributeError):
92
+ pass
93
+ except Exception:
94
+ pass
95
+
96
+ self.log_callback = log_callback
97
+ self.is_installed = False
98
+ self.is_loaded = False
99
+ self.model = None
100
+ self.stop_flag = None
101
+ self._stopped = False
102
+
103
+ def _log(self, message: str, level: str = "info"):
104
+ """Log message with stop suppression"""
105
+ # Suppress logs when stopped (allow only essential stop confirmation messages)
106
+ if self._check_stop():
107
+ essential_stop_keywords = [
108
+ "⏹️ Translation stopped by user",
109
+ "⏹️ OCR processing stopped",
110
+ "cleanup", "🧹"
111
+ ]
112
+ if not any(keyword in message for keyword in essential_stop_keywords):
113
+ return
114
+
115
+ if self.log_callback:
116
+ self.log_callback(message, level)
117
+ else:
118
+ print(f"[{level.upper()}] {message}")
119
+
120
+ def set_stop_flag(self, stop_flag):
121
+ """Set the stop flag for checking interruptions"""
122
+ self.stop_flag = stop_flag
123
+ self._stopped = False
124
+
125
+ def _check_stop(self) -> bool:
126
+ """Check if stop has been requested"""
127
+ if self._stopped:
128
+ return True
129
+ if self.stop_flag and self.stop_flag.is_set():
130
+ self._stopped = True
131
+ return True
132
+ # Check global manga translator cancellation
133
+ try:
134
+ from manga_translator import MangaTranslator
135
+ if MangaTranslator.is_globally_cancelled():
136
+ self._stopped = True
137
+ return True
138
+ except Exception:
139
+ pass
140
+ return False
141
+
142
+ def reset_stop_flags(self):
143
+ """Reset stop flags when starting new processing"""
144
+ self._stopped = False
145
+
146
+ def check_installation(self) -> bool:
147
+ """Check if provider is installed"""
148
+ raise NotImplementedError
149
+
150
+ def install(self, progress_callback=None) -> bool:
151
+ """Install the provider"""
152
+ raise NotImplementedError
153
+
154
+ def load_model(self, **kwargs) -> bool:
155
+ """Load the OCR model"""
156
+ raise NotImplementedError
157
+
158
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
159
+ """Detect text in image"""
160
+ raise NotImplementedError
161
+
162
+ class CustomAPIProvider(OCRProvider):
163
+ """Custom API OCR provider that uses existing GUI variables"""
164
+
165
+ def __init__(self, log_callback=None):
166
+ super().__init__(log_callback)
167
+
168
+ # Use EXISTING environment variables from TranslatorGUI
169
+ self.api_url = os.environ.get('OPENAI_CUSTOM_BASE_URL', '')
170
+ self.api_key = os.environ.get('API_KEY', '') or os.environ.get('OPENAI_API_KEY', '')
171
+ self.model_name = os.environ.get('MODEL', 'gpt-4o-mini')
172
+
173
+ # OCR prompt - use system prompt or a dedicated OCR prompt variable
174
+ self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT',
175
+ os.environ.get('SYSTEM_PROMPT',
176
+ "YOU ARE AN OCR SYSTEM. YOUR ONLY JOB IS TEXT EXTRACTION.\n\n"
177
+ "CRITICAL RULES:\n"
178
+ "1. DO NOT TRANSLATE ANYTHING\n"
179
+ "2. DO NOT MODIFY THE TEXT\n"
180
+ "3. DO NOT EXPLAIN OR COMMENT\n"
181
+ "4. ONLY OUTPUT THE EXACT TEXT YOU SEE\n"
182
+ "5. PRESERVE NATURAL TEXT FLOW - DO NOT ADD UNNECESSARY LINE BREAKS\n\n"
183
+ "If you see Korean text, output it in Korean.\n"
184
+ "If you see Japanese text, output it in Japanese.\n"
185
+ "If you see Chinese text, output it in Chinese.\n"
186
+ "If you see English text, output it in English.\n\n"
187
+ "IMPORTANT: Only use line breaks where they naturally occur in the original text "
188
+ "(e.g., between dialogue lines or paragraphs). Do not break text mid-sentence or "
189
+ "between every word/character.\n\n"
190
+ "For vertical text common in manga/comics, transcribe it as a continuous line unless "
191
+ "there are clear visual breaks.\n\n"
192
+ "NEVER translate. ONLY extract exactly what is written.\n"
193
+ "Output ONLY the raw text, nothing else."
194
+ ))
195
+
196
+ # Use existing temperature and token settings
197
+ self.temperature = float(os.environ.get('TRANSLATION_TEMPERATURE', '0.01'))
198
+ # Don't hardcode to 8192 - get fresh value when actually used
199
+ self.max_tokens = int(os.environ.get('MAX_OUTPUT_TOKENS', '4096'))
200
+
201
+ # Image settings from existing compression variables
202
+ self.image_format = 'jpeg' if os.environ.get('IMAGE_COMPRESSION_FORMAT', 'auto') != 'png' else 'png'
203
+ self.image_quality = int(os.environ.get('JPEG_QUALITY', '100'))
204
+
205
+ # Simple defaults
206
+ self.api_format = 'openai' # Most custom endpoints are OpenAI-compatible
207
+ self.timeout = int(os.environ.get('CHUNK_TIMEOUT', '30'))
208
+ self.api_headers = {} # Additional custom headers
209
+
210
+ # Retry configuration for Custom API OCR calls
211
+ self.max_retries = int(os.environ.get('CUSTOM_OCR_MAX_RETRIES', '3'))
212
+ self.retry_initial_delay = float(os.environ.get('CUSTOM_OCR_RETRY_INITIAL_DELAY', '0.8'))
213
+ self.retry_backoff = float(os.environ.get('CUSTOM_OCR_RETRY_BACKOFF', '1.8'))
214
+ self.retry_jitter = float(os.environ.get('CUSTOM_OCR_RETRY_JITTER', '0.4'))
215
+ self.retry_on_empty = os.environ.get('CUSTOM_OCR_RETRY_ON_EMPTY', '1') == '1'
216
+
217
+ def check_installation(self) -> bool:
218
+ """Always installed - uses UnifiedClient"""
219
+ self.is_installed = True
220
+ return True
221
+
222
+ def install(self, progress_callback=None) -> bool:
223
+ """No installation needed for API-based provider"""
224
+ return self.check_installation()
225
+
226
+ def load_model(self, **kwargs) -> bool:
227
+ """Initialize UnifiedClient with current settings"""
228
+ try:
229
+ from unified_api_client import UnifiedClient
230
+
231
+ # Support passing API key from GUI if available
232
+ if 'api_key' in kwargs:
233
+ api_key = kwargs['api_key']
234
+ else:
235
+ api_key = os.environ.get('API_KEY', '') or os.environ.get('OPENAI_API_KEY', '')
236
+
237
+ if 'model' in kwargs:
238
+ model = kwargs['model']
239
+ else:
240
+ model = os.environ.get('MODEL', 'gpt-4o-mini')
241
+
242
+ if not api_key:
243
+ self._log("❌ No API key configured", "error")
244
+ return False
245
+
246
+ # Create UnifiedClient just like translations do
247
+ self.client = UnifiedClient(model=model, api_key=api_key)
248
+
249
+ #self._log(f"✅ Using {model} for OCR via UnifiedClient")
250
+ self.is_loaded = True
251
+ return True
252
+
253
+ except Exception as e:
254
+ self._log(f"❌ Failed to initialize UnifiedClient: {str(e)}", "error")
255
+ return False
256
+
257
+ def _test_connection(self) -> bool:
258
+ """Test API connection with a simple request"""
259
+ try:
260
+ # Create a small test image
261
+ test_image = np.ones((100, 100, 3), dtype=np.uint8) * 255
262
+ cv2.putText(test_image, "TEST", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 0), 2)
263
+
264
+ # Encode image
265
+ image_base64 = self._encode_image(test_image)
266
+
267
+ # Prepare test request based on API format
268
+ if self.api_format == 'openai':
269
+ test_payload = {
270
+ "model": self.model_name,
271
+ "messages": [
272
+ {
273
+ "role": "user",
274
+ "content": [
275
+ {"type": "text", "text": "What text do you see?"},
276
+ {"type": "image_url", "image_url": {"url": f"data:image/{self.image_format};base64,{image_base64}"}}
277
+ ]
278
+ }
279
+ ],
280
+ "max_tokens": 50
281
+ }
282
+ else:
283
+ # For other formats, just try a basic health check
284
+ return True
285
+
286
+ headers = self._prepare_headers()
287
+ response = requests.post(
288
+ self.api_url,
289
+ headers=headers,
290
+ json=test_payload,
291
+ timeout=10
292
+ )
293
+
294
+ return response.status_code == 200
295
+
296
+ except Exception:
297
+ return False
298
+
299
+ def _encode_image(self, image: np.ndarray) -> str:
300
+ """Encode numpy array to base64 string"""
301
+ # Convert BGR to RGB if needed
302
+ if len(image.shape) == 3 and image.shape[2] == 3:
303
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
304
+ else:
305
+ image_rgb = image
306
+
307
+ # Convert to PIL Image
308
+ pil_image = Image.fromarray(image_rgb)
309
+
310
+ # Save to bytes buffer
311
+ buffer = io.BytesIO()
312
+ if self.image_format.lower() == 'png':
313
+ pil_image.save(buffer, format='PNG')
314
+ else:
315
+ pil_image.save(buffer, format='JPEG', quality=self.image_quality)
316
+
317
+ # Encode to base64
318
+ buffer.seek(0)
319
+ image_base64 = base64.b64encode(buffer.read()).decode('utf-8')
320
+
321
+ return image_base64
322
+
323
+ def _prepare_headers(self) -> dict:
324
+ """Prepare request headers"""
325
+ headers = {
326
+ "Content-Type": "application/json"
327
+ }
328
+
329
+ # Add API key if configured
330
+ if self.api_key:
331
+ if self.api_format == 'anthropic':
332
+ headers["x-api-key"] = self.api_key
333
+ else:
334
+ headers["Authorization"] = f"Bearer {self.api_key}"
335
+
336
+ # Add any custom headers
337
+ headers.update(self.api_headers)
338
+
339
+ return headers
340
+
341
+ def _prepare_request_payload(self, image_base64: str) -> dict:
342
+ """Prepare request payload based on API format"""
343
+ if self.api_format == 'openai':
344
+ return {
345
+ "model": self.model_name,
346
+ "messages": [
347
+ {
348
+ "role": "user",
349
+ "content": [
350
+ {"type": "text", "text": self.ocr_prompt},
351
+ {
352
+ "type": "image_url",
353
+ "image_url": {
354
+ "url": f"data:image/{self.image_format};base64,{image_base64}"
355
+ }
356
+ }
357
+ ]
358
+ }
359
+ ],
360
+ "max_tokens": self.max_tokens,
361
+ "temperature": self.temperature
362
+ }
363
+
364
+ elif self.api_format == 'anthropic':
365
+ return {
366
+ "model": self.model_name,
367
+ "max_tokens": self.max_tokens,
368
+ "temperature": self.temperature,
369
+ "messages": [
370
+ {
371
+ "role": "user",
372
+ "content": [
373
+ {
374
+ "type": "text",
375
+ "text": self.ocr_prompt
376
+ },
377
+ {
378
+ "type": "image",
379
+ "source": {
380
+ "type": "base64",
381
+ "media_type": f"image/{self.image_format}",
382
+ "data": image_base64
383
+ }
384
+ }
385
+ ]
386
+ }
387
+ ]
388
+ }
389
+
390
+ else:
391
+ # Custom format - use environment variable for template
392
+ template = os.environ.get('CUSTOM_OCR_REQUEST_TEMPLATE', '{}')
393
+ payload = json.loads(template)
394
+
395
+ # Replace placeholders
396
+ payload_str = json.dumps(payload)
397
+ payload_str = payload_str.replace('{{IMAGE_BASE64}}', image_base64)
398
+ payload_str = payload_str.replace('{{PROMPT}}', self.ocr_prompt)
399
+ payload_str = payload_str.replace('{{MODEL}}', self.model_name)
400
+ payload_str = payload_str.replace('{{MAX_TOKENS}}', str(self.max_tokens))
401
+ payload_str = payload_str.replace('{{TEMPERATURE}}', str(self.temperature))
402
+
403
+ return json.loads(payload_str)
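+ # Hypothetical example (illustration only): a CUSTOM_OCR_REQUEST_TEMPLATE such as
+ #   {"model": "{{MODEL}}", "prompt": "{{PROMPT}}", "image": "{{IMAGE_BASE64}}"}
+ # must itself be valid JSON; the {{...}} placeholders are substituted into the dumped
+ # string before the payload is re-parsed and sent.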
404
+
405
+ def _extract_text_from_response(self, response_data: dict) -> str:
406
+ """Extract text from API response based on format"""
407
+ try:
408
+ if self.api_format == 'openai':
409
+ # OpenAI format: response.choices[0].message.content
410
+ return response_data.get('choices', [{}])[0].get('message', {}).get('content', '')
411
+
412
+ elif self.api_format == 'anthropic':
413
+ # Anthropic format: response.content[0].text
414
+ content = response_data.get('content', [])
415
+ if content and isinstance(content, list):
416
+ return content[0].get('text', '')
417
+ return ''
418
+
419
+ else:
420
+ # Custom format - use environment variable for path
421
+ response_path = os.environ.get('CUSTOM_OCR_RESPONSE_PATH', 'text')
422
+
423
+ # Navigate through the response using the path
424
+ result = response_data
425
+ for key in response_path.split('.'):
426
+ if isinstance(result, dict):
427
+ result = result.get(key, '')
428
+ elif isinstance(result, list) and key.isdigit():
429
+ idx = int(key)
430
+ result = result[idx] if idx < len(result) else ''
431
+ else:
432
+ result = ''
433
+ break
434
+
435
+ return str(result)
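+ # Hypothetical example (illustration only): for an OpenAI-style body,
+ # CUSTOM_OCR_RESPONSE_PATH="choices.0.message.content" makes the loop above walk
+ # response_data['choices'][0]['message']['content'].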
436
+
437
+ except Exception as e:
438
+ self._log(f"Failed to extract text from response: {e}", "error")
439
+ return ''
440
+
441
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
442
+ """Process image using UnifiedClient.send_image()"""
443
+ results = []
444
+
445
+ try:
446
+ # Get fresh max_tokens from environment - GUI will have set this
447
+ max_tokens = int(os.environ.get('MAX_OUTPUT_TOKENS', '4096'))
448
+ if not self.is_loaded:
449
+ if not self.load_model():
450
+ return results
451
+
452
+ import cv2
453
+ from PIL import Image
454
+ import base64
455
+ import io
456
+
457
+ # Convert numpy array to PIL Image
458
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
459
+ pil_image = Image.fromarray(image_rgb)
460
+ h, w = image.shape[:2]
461
+
462
+ # Convert PIL Image to base64 string
463
+ buffer = io.BytesIO()
464
+
465
+ # Use the image format from settings
466
+ if self.image_format.lower() == 'png':
467
+ pil_image.save(buffer, format='PNG')
468
+ else:
469
+ pil_image.save(buffer, format='JPEG', quality=self.image_quality)
470
+
471
+ buffer.seek(0)
472
+ image_base64 = base64.b64encode(buffer.read()).decode('utf-8')
473
+
474
+ # For OpenAI vision models, we need BOTH:
475
+ # 1. System prompt with instructions
476
+ # 2. User message that includes the image
477
+ messages = [
478
+ {
479
+ "role": "system",
480
+ "content": self.ocr_prompt # The OCR instruction as system prompt
481
+ },
482
+ {
483
+ "role": "user",
484
+ "content": [
485
+ {
486
+ "type": "text",
487
+ "text": "Image:" # Minimal text, just to have something
488
+ },
489
+ {
490
+ "type": "image_url",
491
+ "image_url": {
492
+ "url": f"data:image/jpeg;base64,{image_base64}"
493
+ }
494
+ }
495
+ ]
496
+ }
497
+ ]
498
+
499
+ # Now send this properly formatted message
500
+ # The UnifiedClient should handle this correctly
501
+ # But we're NOT using send_image, we're using regular send
502
+
503
+ # Retry-aware call
504
+ from unified_api_client import UnifiedClientError # local import to avoid hard dependency at module import time
505
+ max_attempts = max(1, self.max_retries)
506
+ attempt = 0
507
+ last_error = None
508
+
509
+ # Common refusal/error phrases that indicate a non-OCR response
510
+ refusal_phrases = [
511
+ "I can't extract", "I cannot extract",
512
+ "I'm sorry", "I am sorry",
513
+ "I'm unable", "I am unable",
514
+ "cannot process images",
515
+ "I can't help with that",
516
+ "cannot view images",
517
+ "no text in the image"
518
+ ]
519
+
520
+ while attempt < max_attempts:
521
+ # Check for stop before each attempt
522
+ if self._check_stop():
523
+ self._log("⏹️ OCR processing stopped by user", "warning")
524
+ return results
525
+
526
+ try:
527
+ response = self.client.send(
528
+ messages=messages,
529
+ temperature=self.temperature,
530
+ max_tokens=max_tokens
531
+ )
532
+
533
+ # Unpack the (content, finish_reason) tuple returned by client.send()
534
+ content, finish_reason = response
535
+
536
+ # Validate content
537
+ has_content = bool(content and str(content).strip())
538
+ refused = False
539
+ if has_content:
540
+ # Filter out explicit failure markers
541
+ if "[" in content and "FAILED]" in content:
542
+ refused = True
543
+ elif any(phrase.lower() in content.lower() for phrase in refusal_phrases):
544
+ refused = True
545
+
546
+ # Decide success or retry
547
+ if has_content and not refused:
548
+ text = str(content).strip()
549
+ results.append(OCRResult(
550
+ text=text,
551
+ bbox=(0, 0, w, h),
552
+ confidence=kwargs.get('confidence', 0.85),
553
+ vertices=[(0, 0), (w, 0), (w, h), (0, h)]
554
+ ))
555
+ self._log(f"✅ Detected: {text[:50]}...")
556
+ break # success
557
+ else:
558
+ reason = "empty result" if not has_content else "refusal/non-OCR response"
559
+ last_error = f"{reason} (finish_reason: {finish_reason})"
560
+ # Check if we should retry on empty or refusal
561
+ should_retry = (not has_content and self.retry_on_empty) or refused
562
+ attempt += 1
563
+ if attempt >= max_attempts or not should_retry:
564
+ # No more retries or shouldn't retry
565
+ if not has_content:
566
+ self._log(f"⚠️ No text detected (finish_reason: {finish_reason})")
567
+ else:
568
+ self._log(f"❌ Model returned non-OCR response: {str(content)[:120]}", "warning")
569
+ break
570
+ # Backoff before retrying
571
+ delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
572
+ self._log(f"🔄 Retry {attempt}/{max_attempts - 1} after {delay:.1f}s due to {reason}...", "warning")
573
+ time.sleep(delay)
574
+ time.sleep(0.1) # Brief pause for stability
575
+ self._log("💤 OCR retry pausing briefly for stability", "debug")
576
+ continue
577
+
578
+ except UnifiedClientError as ue:
579
+ msg = str(ue)
580
+ last_error = msg
581
+ # Do not retry on explicit user cancellation
582
+ if 'cancelled' in msg.lower() or 'stopped by user' in msg.lower():
583
+ self._log(f"❌ OCR cancelled: {msg}", "error")
584
+ break
585
+ attempt += 1
586
+ if attempt >= max_attempts:
587
+ self._log(f"❌ OCR failed after {attempt} attempts: {msg}", "error")
588
+ break
589
+ delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
590
+ self._log(f"🔄 API error, retry {attempt}/{max_attempts - 1} after {delay:.1f}s: {msg}", "warning")
591
+ time.sleep(delay)
592
+ time.sleep(0.1) # Brief pause for stability
593
+ self._log("💤 OCR API error retry pausing briefly for stability", "debug")
594
+ continue
595
+ except Exception as e_inner:
596
+ last_error = str(e_inner)
597
+ attempt += 1
598
+ if attempt >= max_attempts:
599
+ self._log(f"❌ OCR exception after {attempt} attempts: {last_error}", "error")
600
+ break
601
+ delay = self.retry_initial_delay * (self.retry_backoff ** (attempt - 1)) + random.uniform(0, self.retry_jitter)
602
+ self._log(f"🔄 Exception, retry {attempt}/{max_attempts - 1} after {delay:.1f}s: {last_error}", "warning")
603
+ time.sleep(delay)
604
+ time.sleep(0.1) # Brief pause for stability
605
+ self._log("💤 OCR exception retry pausing briefly for stability", "debug")
606
+ continue
607
+
608
+ except Exception as e:
609
+ self._log(f"❌ Error: {str(e)}", "error")
610
+ import traceback
611
+ self._log(traceback.format_exc(), "debug")
612
+
613
+ return results
614
+
615
+ class MangaOCRProvider(OCRProvider):
616
+ """Manga OCR provider using HuggingFace model directly"""
617
+
618
+ def __init__(self, log_callback=None):
619
+ super().__init__(log_callback)
620
+ self.processor = None
621
+ self.model = None
622
+ self.tokenizer = None
623
+
624
+ def check_installation(self) -> bool:
625
+ """Check if transformers is installed"""
626
+ try:
627
+ import transformers
628
+ import torch
629
+ self.is_installed = True
630
+ return True
631
+ except ImportError:
632
+ return False
633
+
634
+ def install(self, progress_callback=None) -> bool:
635
+ """Install transformers and torch"""
636
+ return False  # no auto-install implemented; install manually: pip install transformers torch
637
+
638
+ def _is_valid_local_model_dir(self, path: str) -> bool:
639
+ """Check that a local HF model directory has required files."""
640
+ try:
641
+ if not path or not os.path.isdir(path):
642
+ return False
643
+ needed_any_weights = any(
644
+ os.path.exists(os.path.join(path, name)) for name in (
645
+ 'pytorch_model.bin',
646
+ 'model.safetensors'
647
+ )
648
+ )
649
+ has_config = os.path.exists(os.path.join(path, 'config.json'))
650
+ has_processor = (
651
+ os.path.exists(os.path.join(path, 'preprocessor_config.json')) or
652
+ os.path.exists(os.path.join(path, 'processor_config.json'))
653
+ )
654
+ has_tokenizer = (
655
+ os.path.exists(os.path.join(path, 'tokenizer.json')) or
656
+ os.path.exists(os.path.join(path, 'tokenizer_config.json'))
657
+ )
658
+ return has_config and needed_any_weights and has_processor and has_tokenizer
659
+ except Exception:
660
+ return False
661
+
662
+ def load_model(self, **kwargs) -> bool:
663
+ """Load the manga-ocr model, preferring a local directory to avoid re-downloading"""
664
+ print("\n>>> MangaOCRProvider.load_model() called")
665
+ try:
666
+ if not self.is_installed and not self.check_installation():
667
+ print("ERROR: Transformers not installed")
668
+ self._log("❌ Transformers not installed", "error")
669
+ return False
670
+
671
+ # Always disable progress bars to avoid tqdm issues in some environments
672
+ import os
673
+ os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
674
+
675
+ from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoImageProcessor
676
+ import torch
677
+
678
+ # Prefer a local model directory if present to avoid any Hub access
679
+ candidates = []
680
+ env_local = os.environ.get("MANGA_OCR_LOCAL_DIR")
681
+ if env_local:
682
+ candidates.append(env_local)
683
+
684
+ # Project root one level up from this file
685
+ root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
686
+ candidates.append(os.path.join(root_dir, 'models', 'manga-ocr-base'))
687
+ candidates.append(os.path.join(root_dir, 'models', 'kha-white', 'manga-ocr-base'))
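+ # Illustrative layout (matching _is_valid_local_model_dir above): a usable local directory
+ # contains config.json, preprocessor_config.json (or processor_config.json),
+ # tokenizer.json (or tokenizer_config.json), and pytorch_model.bin or model.safetensors.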
688
+
689
+ model_source = None
690
+ local_only = False
691
+ # Find a valid local dir
692
+ for cand in candidates:
693
+ if self._is_valid_local_model_dir(cand):
694
+ model_source = cand
695
+ local_only = True
696
+ break
697
+
698
+ # If no valid local dir, use Hub
699
+ if not model_source:
700
+ model_source = "kha-white/manga-ocr-base"
701
+ # Make sure we are not forcing offline mode
702
+ if os.environ.get("HF_HUB_OFFLINE") == "1":
703
+ try:
704
+ del os.environ["HF_HUB_OFFLINE"]
705
+ except Exception:
706
+ pass
707
+ self._log("🔥 Loading manga-ocr model from Hugging Face Hub")
708
+ self._log(f" Repo: {model_source}")
709
+ else:
710
+ # Only set offline when local dir is fully valid
711
+ os.environ.setdefault("HF_HUB_OFFLINE", "1")
712
+ self._log("🔥 Loading manga-ocr model from local directory")
713
+ self._log(f" Local path: {model_source}")
714
+
715
+ # Decide target device once; we will move after full CPU load to avoid meta tensors
716
+ use_cuda = torch.cuda.is_available()
717
+
718
+ # Try loading components, falling back to Hub if local-only fails
719
+ def _load_components(source: str, local_flag: bool):
720
+ self._log(" Loading tokenizer...")
721
+ tok = AutoTokenizer.from_pretrained(source, local_files_only=local_flag)
722
+
723
+ self._log(" Loading image processor...")
724
+ try:
725
+ from transformers import AutoProcessor
726
+ except Exception:
727
+ AutoProcessor = None
728
+ try:
729
+ proc = AutoImageProcessor.from_pretrained(source, local_files_only=local_flag)
730
+ except Exception as e_proc:
731
+ if AutoProcessor is not None:
732
+ self._log(f" ⚠️ AutoImageProcessor failed: {e_proc}. Trying AutoProcessor...", "warning")
733
+ proc = AutoProcessor.from_pretrained(source, local_files_only=local_flag)
734
+ else:
735
+ raise
736
+
737
+ self._log(" Loading model...")
738
+ # Prevent meta tensors by forcing full materialization on CPU at load time
739
+ os.environ.setdefault('TORCHDYNAMO_DISABLE', '1')
740
+ mdl = VisionEncoderDecoderModel.from_pretrained(
741
+ source,
742
+ local_files_only=local_flag,
743
+ low_cpu_mem_usage=False,
744
+ device_map=None,
745
+ torch_dtype=torch.float32 # Use torch_dtype instead of dtype
746
+ )
747
+ return tok, proc, mdl
748
+
749
+ try:
750
+ self.tokenizer, self.processor, self.model = _load_components(model_source, local_only)
751
+ except Exception as e_local:
752
+ if local_only:
753
+ # Fallback to Hub once if local fails
754
+ self._log(f" ⚠️ Local model load failed: {e_local}", "warning")
755
+ try:
756
+ if os.environ.get("HF_HUB_OFFLINE") == "1":
757
+ del os.environ["HF_HUB_OFFLINE"]
758
+ except Exception:
759
+ pass
760
+ model_source = "kha-white/manga-ocr-base"
761
+ local_only = False
762
+ self._log(" Retrying from Hugging Face Hub...")
763
+ self.tokenizer, self.processor, self.model = _load_components(model_source, local_only)
764
+ else:
765
+ raise
766
+
767
+ # Move to CUDA only after full CPU materialization
768
+ target_device = 'cpu'
769
+ if use_cuda:
770
+ try:
771
+ self.model = self.model.to('cuda')
772
+ target_device = 'cuda'
773
+ except Exception as move_err:
774
+ self._log(f" ⚠️ Could not move model to CUDA: {move_err}", "warning")
775
+ target_device = 'cpu'
776
+
777
+ # Finalize eval mode
778
+ self.model.eval()
779
+
780
+ # Sanity-check: ensure no parameter remains on 'meta' device
781
+ try:
782
+ for n, p in self.model.named_parameters():
783
+ dev = getattr(p, 'device', None)
784
+ if dev is not None and getattr(dev, 'type', '') == 'meta':
785
+ raise RuntimeError(f"Parameter {n} is on 'meta' after load")
786
+ except Exception as sanity_err:
787
+ self._log(f"❌ Manga-OCR model load sanity check failed: {sanity_err}", "error")
788
+ return False
789
+
790
+ print(f"SUCCESS: Model loaded on {target_device.upper()}")
791
+ self._log(f" ✅ Model loaded on {target_device.upper()}")
792
+ self.is_loaded = True
793
+ self._log("✅ Manga OCR model ready")
794
+ print(">>> Returning True from load_model()")
795
+ return True
796
+
797
+ except Exception as e:
798
+ print(f"\nEXCEPTION in load_model: {e}")
799
+ import traceback
800
+ print(traceback.format_exc())
801
+ self._log(f"❌ Failed to load manga-ocr model: {str(e)}", "error")
802
+ self._log(traceback.format_exc(), "error")
803
+ try:
804
+ if 'local_only' in locals() and local_only:
805
+ self._log("Hint: Local load failed. Ensure your models/manga-ocr-base contains required files (config.json, preprocessor_config.json, tokenizer.json or tokenizer_config.json, and model weights).", "warning")
806
+ except Exception:
807
+ pass
808
+ return False
809
+
810
+ def _run_ocr(self, pil_image):
811
+ """Run OCR on a PIL image using the HuggingFace model"""
812
+ import torch
813
+
814
+ # Process image (keyword arg for broader compatibility across transformers versions)
815
+ inputs = self.processor(images=pil_image, return_tensors="pt")
816
+ pixel_values = inputs["pixel_values"]
817
+
818
+ # Move to same device as model
819
+ try:
820
+ model_device = next(self.model.parameters()).device
821
+ except StopIteration:
822
+ model_device = torch.device('cpu')
823
+ pixel_values = pixel_values.to(model_device)
824
+
825
+ # Generate text
826
+ with torch.no_grad():
827
+ generated_ids = self.model.generate(pixel_values)
828
+
829
+ # Decode
830
+ generated_text = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
831
+
832
+ return generated_text
833
+
834
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
835
+ """
836
+ Process the image region passed to it.
837
+ This could be a bubble region or the full image.
838
+ """
839
+ results = []
840
+
841
+ # Check for stop at start
842
+ if self._check_stop():
843
+ self._log("⏹️ Manga-OCR processing stopped by user", "warning")
844
+ return results
845
+
846
+ try:
847
+ if not self.is_loaded:
848
+ if not self.load_model():
849
+ return results
850
+
851
+ import cv2
852
+ from PIL import Image
853
+
854
+ # Get confidence from kwargs
855
+ confidence = kwargs.get('confidence', 0.7)
856
+
857
+ # Convert numpy array to PIL
858
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
859
+ pil_image = Image.fromarray(image_rgb)
860
+ h, w = image.shape[:2]
861
+
862
+ self._log("🔍 Processing region with manga-ocr...")
863
+
864
+ # Check for stop before inference
865
+ if self._check_stop():
866
+ self._log("⏹️ Manga-OCR inference stopped by user", "warning")
867
+ return results
868
+
869
+ # Run OCR on the image region
870
+ text = self._run_ocr(pil_image)
871
+
872
+ if text and text.strip():
873
+ # Return result for this region with its actual bbox
874
+ results.append(OCRResult(
875
+ text=text.strip(),
876
+ bbox=(0, 0, w, h), # Relative to the region passed in
877
+ confidence=confidence,
878
+ vertices=[(0, 0), (w, 0), (w, h), (0, h)]
879
+ ))
880
+ self._log(f"✅ Detected text: {text[:50]}...")
881
+
882
+ except Exception as e:
883
+ self._log(f"❌ Error in manga-ocr: {str(e)}", "error")
884
+
885
+ return results
886
+
887
+ class Qwen2VL(OCRProvider):
888
+ """OCR using Qwen2-VL - Vision Language Model that can read Korean text"""
889
+
890
+ def __init__(self, log_callback=None):
891
+ super().__init__(log_callback)
892
+ self.processor = None
893
+ self.model = None
894
+ self.tokenizer = None
895
+
896
+ # Get OCR prompt from environment or use default
897
+ self.ocr_prompt = os.environ.get('OCR_SYSTEM_PROMPT',
898
+ "YOU ARE AN OCR SYSTEM. YOUR ONLY JOB IS TEXT EXTRACTION.\n\n"
899
+ "CRITICAL RULES:\n"
900
+ "1. DO NOT TRANSLATE ANYTHING\n"
901
+ "2. DO NOT MODIFY THE TEXT\n"
902
+ "3. DO NOT EXPLAIN OR COMMENT\n"
903
+ "4. ONLY OUTPUT THE EXACT TEXT YOU SEE\n"
904
+ "5. PRESERVE NATURAL TEXT FLOW - DO NOT ADD UNNECESSARY LINE BREAKS\n\n"
905
+ "If you see Korean text, output it in Korean.\n"
906
+ "If you see Japanese text, output it in Japanese.\n"
907
+ "If you see Chinese text, output it in Chinese.\n"
908
+ "If you see English text, output it in English.\n\n"
909
+ "IMPORTANT: Only use line breaks where they naturally occur in the original text "
910
+ "(e.g., between dialogue lines or paragraphs). Do not break text mid-sentence or "
911
+ "between every word/character.\n\n"
912
+ "For vertical text common in manga/comics, transcribe it as a continuous line unless "
913
+ "there are clear visual breaks.\n\n"
914
+ "NEVER translate. ONLY extract exactly what is written.\n"
915
+ "Output ONLY the raw text, nothing else."
916
+ )
917
+
918
+ def set_ocr_prompt(self, prompt: str):
919
+ """Allow setting the OCR prompt dynamically"""
920
+ self.ocr_prompt = prompt
921
+
922
+ def check_installation(self) -> bool:
923
+ """Check if required packages are installed"""
924
+ try:
925
+ import transformers
926
+ import torch
927
+ self.is_installed = True
928
+ return True
929
+ except ImportError:
930
+ return False
931
+
932
+ def install(self, progress_callback=None) -> bool:
933
+ """Install requirements for Qwen2-VL"""
934
+ return False  # no auto-install implemented; install manually: pip install transformers torch
935
+
936
+ def load_model(self, model_size=None, **kwargs) -> bool:
937
+ """Load Qwen2-VL model with size selection"""
938
+ self._log(f"DEBUG: load_model called with model_size={model_size}")
939
+
940
+ try:
941
+ if not self.is_installed and not self.check_installation():
942
+ self._log("❌ Not installed", "error")
943
+ return False
944
+
945
+ self._log("🔥 Loading Qwen2-VL for Advanced OCR...")
946
+
947
+
948
+
949
+ from transformers import AutoProcessor, AutoTokenizer
950
+ import torch
951
+
952
+ # Model options
953
+ model_options = {
954
+ "1": "Qwen/Qwen2-VL-2B-Instruct",
955
+ "2": "Qwen/Qwen2-VL-7B-Instruct",
956
+ "3": "Qwen/Qwen2-VL-72B-Instruct",
957
+ "4": "custom"
958
+ }
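+ # Illustrative usage (assumption about how the caller sets this): QWEN2VL_MODEL_SIZE='2'
+ # selects the 7B model, while a value such as 'custom:Qwen/Qwen2-VL-7B-Instruct'
+ # is handled by the "custom:" branch below and loads that repo id directly.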
959
+ # Default comes from QWEN2VL_MODEL_SIZE when no size is passed in (option "1" = 2B)
960
+ # Check for saved preference first
961
+ if model_size is None:
962
+ # Try to get from environment or config
963
+ import os
964
+ model_size = os.environ.get('QWEN2VL_MODEL_SIZE', '1')
965
+
966
+ # Determine which model to load
967
+ if model_size and str(model_size).startswith("custom:"):
968
+ # Custom model passed with ID
969
+ model_id = str(model_size).replace("custom:", "")
970
+ self.loaded_model_size = "Custom"
971
+ self.model_id = model_id
972
+ self._log(f"Loading custom model: {model_id}")
973
+ elif model_size == "4":
974
+ # Custom option selected but no ID - shouldn't happen
975
+ self._log("❌ Custom model selected but no ID provided", "error")
976
+ return False
977
+ elif model_size and str(model_size) in model_options:
978
+ # Standard model option
979
+ option = model_options[str(model_size)]
980
+ if option == "custom":
981
+ self._log("❌ Custom model needs an ID", "error")
982
+ return False
983
+ model_id = option
984
+ # Set loaded_model_size for status display
985
+ if model_size == "1":
986
+ self.loaded_model_size = "2B"
987
+ elif model_size == "2":
988
+ self.loaded_model_size = "7B"
989
+ elif model_size == "3":
990
+ self.loaded_model_size = "72B"
991
+ else:
992
+ # Fall back to the 2B model (option "1") when no valid size was selected
993
+ model_id = model_options["1"]
994
+ self.loaded_model_size = "2B"
995
+ self._log("No model size specified, defaulting to 2B")
996
+
997
+ self._log(f"Loading model: {model_id}")
998
+
999
+ # Load processor and tokenizer
1000
+ self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
1001
+ self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1002
+
1003
+ # Load the model - let it figure out the class dynamically
1004
+ if torch.cuda.is_available():
1005
+ self._log(f"GPU: {torch.cuda.get_device_name(0)}")
1006
+ # Use auto model class
1007
+ from transformers import AutoModelForVision2Seq
1008
+ self.model = AutoModelForVision2Seq.from_pretrained(
1009
+ model_id,
1010
+ dtype=torch.float16,
1011
+ device_map="auto",
1012
+ trust_remote_code=True
1013
+ )
1014
+ self._log("✅ Model loaded on GPU")
1015
+ else:
1016
+ self._log("Loading on CPU...")
1017
+ from transformers import AutoModelForVision2Seq
1018
+ self.model = AutoModelForVision2Seq.from_pretrained(
1019
+ model_id,
1020
+ dtype=torch.float32,
1021
+ trust_remote_code=True
1022
+ )
1023
+ self._log("✅ Model loaded on CPU")
1024
+
1025
+ self.model.eval()
1026
+ self.is_loaded = True
1027
+ self._log("✅ Qwen2-VL ready for Advanced OCR!")
1028
+ return True
1029
+
1030
+ except Exception as e:
1031
+ self._log(f"❌ Failed to load: {str(e)}", "error")
1032
+ import traceback
1033
+ self._log(traceback.format_exc(), "debug")
1034
+ return False
1035
+
1036
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1037
+ """Process image with Qwen2-VL for Korean text extraction"""
1038
+ results = []
1039
+ if hasattr(self, 'model_id'):
1040
+ self._log(f"DEBUG: Using model: {self.model_id}", "debug")
1041
+
1042
+ # Check if OCR prompt was passed in kwargs (for dynamic updates)
1043
+ if 'ocr_prompt' in kwargs:
1044
+ self.ocr_prompt = kwargs['ocr_prompt']
1045
+
1046
+ try:
1047
+ if not self.is_loaded:
1048
+ if not self.load_model():
1049
+ return results
1050
+
1051
+ import cv2
1052
+ from PIL import Image
1053
+ import torch
1054
+
1055
+ # Convert to PIL
1056
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
1057
+ pil_image = Image.fromarray(image_rgb)
1058
+ h, w = image.shape[:2]
1059
+
1060
+ self._log(f"🔍 Processing with Qwen2-VL ({w}x{h} pixels)...")
1061
+
1062
+ # Use the configurable OCR prompt
1063
+ messages = [
1064
+ {
1065
+ "role": "user",
1066
+ "content": [
1067
+ {
1068
+ "type": "image",
1069
+ "image": pil_image,
1070
+ },
1071
+ {
1072
+ "type": "text",
1073
+ "text": self.ocr_prompt # Use the configurable prompt
1074
+ }
1075
+ ]
1076
+ }
1077
+ ]
1078
+
1079
+ # Alternative simpler prompt if the above still causes issues:
1080
+ # "text": "OCR: Extract text as-is"
1081
+
1082
+ # Process with Qwen2-VL
1083
+ text = self.processor.apply_chat_template(
1084
+ messages,
1085
+ tokenize=False,
1086
+ add_generation_prompt=True
1087
+ )
1088
+
1089
+ inputs = self.processor(
1090
+ text=[text],
1091
+ images=[pil_image],
1092
+ padding=True,
1093
+ return_tensors="pt"
1094
+ )
1095
+
1096
+ # Get the device and dtype the model is currently on
1097
+ model_device = next(self.model.parameters()).device
1098
+ model_dtype = next(self.model.parameters()).dtype
1099
+
1100
+ # Move inputs to the same device as the model and cast float tensors to model dtype
1101
+ try:
1102
+ # Move first
1103
+ inputs = inputs.to(model_device)
1104
+ # Then align dtypes only for floating tensors (e.g., pixel_values)
1105
+ for k, v in inputs.items():
1106
+ if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
1107
+ inputs[k] = v.to(model_dtype)
1108
+ except Exception:
1109
+ # Fallback: ensure at least pixel_values is correct if present
1110
+ try:
1111
+ if isinstance(inputs, dict) and "pixel_values" in inputs:
1112
+ pv = inputs["pixel_values"].to(model_device)
1113
+ if torch.is_floating_point(pv):
1114
+ inputs["pixel_values"] = pv.to(model_dtype)
1115
+ except Exception:
1116
+ pass
1117
+
1118
+ # Ensure pixel_values explicitly matches model dtype if present
1119
+ try:
1120
+ if isinstance(inputs, dict) and "pixel_values" in inputs:
1121
+ inputs["pixel_values"] = inputs["pixel_values"].to(device=model_device, dtype=model_dtype)
1122
+ except Exception:
1123
+ pass
1124
+
1125
+ # Generate text with stricter parameters to avoid creative responses
1126
+ use_amp = (hasattr(torch, 'cuda') and model_device.type == 'cuda' and model_dtype in (torch.float16, torch.bfloat16))
1127
+ autocast_dev = 'cuda' if model_device.type == 'cuda' else 'cpu'
1128
+ autocast_dtype = model_dtype if model_dtype in (torch.float16, torch.bfloat16) else None
1129
+
1130
+ with torch.no_grad():
1131
+ if use_amp and autocast_dtype is not None:
1132
+ with torch.autocast(autocast_dev, dtype=autocast_dtype):
1133
+ generated_ids = self.model.generate(
1134
+ **inputs,
1135
+ max_new_tokens=128, # Reduced from 512 - manga bubbles are typically short
1136
+ do_sample=False, # Keep deterministic
1137
+ temperature=0.01, # Keep your very low temperature
1138
+ top_p=1.0, # Keep no nucleus sampling
1139
+ repetition_penalty=1.0, # Keep no repetition penalty
1140
+ num_beams=1, # Ensure greedy decoding (faster than beam search)
1141
+ use_cache=True, # Enable KV cache for speed
1142
+ early_stopping=True, # Stop at EOS token
1143
+ pad_token_id=self.tokenizer.pad_token_id, # Proper padding
1144
+ eos_token_id=self.tokenizer.eos_token_id, # Proper stopping
1145
+ )
1146
+ else:
1147
+ generated_ids = self.model.generate(
1148
+ **inputs,
1149
+ max_new_tokens=128, # Reduced from 512 - manga bubbles are typically short
1150
+ do_sample=False, # Keep deterministic
1151
+ temperature=0.01, # Keep your very low temperature
1152
+ top_p=1.0, # Keep no nucleus sampling
1153
+ repetition_penalty=1.0, # Keep no repetition penalty
1154
+ num_beams=1, # Ensure greedy decoding (faster than beam search)
1155
+ use_cache=True, # Enable KV cache for speed
1156
+ early_stopping=True, # Stop at EOS token
1157
+ pad_token_id=self.tokenizer.pad_token_id, # Proper padding
1158
+ eos_token_id=self.tokenizer.eos_token_id, # Proper stopping
1159
+ )
1160
+
1161
+ # Decode the output
1162
+ generated_ids_trimmed = [
1163
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
1164
+ ]
1165
+ output_text = self.processor.batch_decode(
1166
+ generated_ids_trimmed,
1167
+ skip_special_tokens=True,
1168
+ clean_up_tokenization_spaces=False
1169
+ )[0]
1170
+
1171
+ if output_text and output_text.strip():
1172
+ text = output_text.strip()
1173
+
1174
+ # ADDED: Filter out any response that looks like an explanation or apology
1175
+ # Common patterns that indicate the model is being "helpful" instead of just extracting
1176
+ unwanted_patterns = [
1177
+ "죄송합니다", # "I apologize"
1178
+ "sorry",
1179
+ "apologize",
1180
+ "이미지에는", # "in this image"
1181
+ "텍스트가 없습니다", # "there is no text"
1182
+ "I cannot",
1183
+ "I don't see",
1184
+ "There is no",
1185
+ "질문이 있으시면", # "if you have questions"
1186
+ ]
1187
+
1188
+ # Check if response contains unwanted patterns
1189
+ text_lower = text.lower()
1190
+ is_explanation = any(pattern.lower() in text_lower for pattern in unwanted_patterns)
1191
+
1192
+ # Also check if the response is suspiciously long for a bubble
1193
+ # Most manga bubbles are short, if we get 50+ chars it might be an explanation
1194
+ is_too_long = len(text) > 100 and ('.' in text or ',' in text or '!' in text)
1195
+
1196
+ if is_explanation or is_too_long:
1197
+ self._log(f"⚠️ Model returned explanation instead of text, ignoring", "warning")
1198
+ # Return empty result or just skip this region
1199
+ return results
1200
+
1201
+ # Check language
1202
+ has_korean = any('\uAC00' <= c <= '\uD7AF' for c in text)
1203
+ has_japanese = any('\u3040' <= c <= '\u309F' or '\u30A0' <= c <= '\u30FF' for c in text)
1204
+ has_chinese = any('\u4E00' <= c <= '\u9FFF' for c in text)
1205
+
1206
+ if has_korean:
1207
+ self._log(f"✅ Korean detected: {text[:50]}...")
1208
+ elif has_japanese:
1209
+ self._log(f"✅ Japanese detected: {text[:50]}...")
1210
+ elif has_chinese:
1211
+ self._log(f"✅ Chinese detected: {text[:50]}...")
1212
+ else:
1213
+ self._log(f"✅ Text: {text[:50]}...")
1214
+
1215
+ results.append(OCRResult(
1216
+ text=text,
1217
+ bbox=(0, 0, w, h),
1218
+ confidence=0.9,
1219
+ vertices=[(0, 0), (w, 0), (w, h), (0, h)]
1220
+ ))
1221
+ else:
1222
+ self._log("⚠️ No text detected", "warning")
1223
+
1224
+ except Exception as e:
1225
+ self._log(f"❌ Error: {str(e)}", "error")
1226
+ import traceback
1227
+ self._log(traceback.format_exc(), "debug")
1228
+
1229
+ return results
1230
+
1231
+ class EasyOCRProvider(OCRProvider):
1232
+ """EasyOCR provider for multiple languages"""
1233
+
1234
+ def __init__(self, log_callback=None, languages=None):
1235
+ super().__init__(log_callback)
1236
+ # Default to safe language combination
1237
+ self.languages = languages or ['ja', 'en'] # Safe default
1238
+ self._validate_language_combination()
1239
+
1240
+ def _validate_language_combination(self):
1241
+ """Validate and fix EasyOCR language combinations"""
1242
+ # EasyOCR language compatibility rules
1243
+ incompatible_pairs = [
1244
+ (['ja', 'ko'], 'Japanese and Korean cannot be used together'),
1245
+ (['ja', 'zh'], 'Japanese and Chinese cannot be used together'),
1246
+ (['ko', 'zh'], 'Korean and Chinese cannot be used together')
1247
+ ]
1248
+
1249
+ for incompatible, reason in incompatible_pairs:
1250
+ if all(lang in self.languages for lang in incompatible):
1251
+ self._log(f"⚠️ EasyOCR: {reason}", "warning")
1252
+ # Keep first language + English
1253
+ self.languages = [self.languages[0], 'en']
1254
+ self._log(f"🔧 Auto-adjusted to: {self.languages}", "info")
1255
+ break
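+ # Worked example (illustration only): languages=['ja', 'ko'] trips the first rule above
+ # and is auto-adjusted to ['ja', 'en'] before the EasyOCR reader is created.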
1256
+
1257
+ def check_installation(self) -> bool:
1258
+ """Check if easyocr is installed"""
1259
+ try:
1260
+ import easyocr
1261
+ self.is_installed = True
1262
+ return True
1263
+ except ImportError:
1264
+ return False
1265
+
1266
+ def install(self, progress_callback=None) -> bool:
1267
+ """Install easyocr"""
1268
+ return False  # no auto-install implemented; install manually: pip install easyocr
1269
+
1270
+ def load_model(self, **kwargs) -> bool:
1271
+ """Load easyocr model"""
1272
+ try:
1273
+ if not self.is_installed and not self.check_installation():
1274
+ self._log("❌ easyocr not installed", "error")
1275
+ return False
1276
+
1277
+ self._log(f"🔥 Loading easyocr model for languages: {self.languages}...")
1278
+ import easyocr
1279
+
1280
+ # This will download models on first run
1281
+ self.model = easyocr.Reader(self.languages, gpu=True)
1282
+ self.is_loaded = True
1283
+
1284
+ self._log("✅ easyocr model loaded successfully")
1285
+ return True
1286
+
1287
+ except Exception as e:
1288
+ self._log(f"❌ Failed to load easyocr: {str(e)}", "error")
1289
+ # Try CPU mode if GPU fails
1290
+ try:
1291
+ import easyocr
1292
+ self.model = easyocr.Reader(self.languages, gpu=False)
1293
+ self.is_loaded = True
1294
+ self._log("✅ easyocr loaded in CPU mode")
1295
+ return True
1296
+ except:
1297
+ return False
1298
+
1299
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1300
+ """Detect text using easyocr"""
1301
+ results = []
1302
+
1303
+ try:
1304
+ if not self.is_loaded:
1305
+ if not self.load_model():
1306
+ return results
1307
+
1308
+ # EasyOCR can work directly with numpy arrays
1309
+ ocr_results = self.model.readtext(image, detail=1)
1310
+
1311
+ # Parse results
1312
+ for (bbox, text, confidence) in ocr_results:
1313
+ # bbox is a list of 4 points
1314
+ xs = [point[0] for point in bbox]
1315
+ ys = [point[1] for point in bbox]
1316
+ x_min, x_max = min(xs), max(xs)
1317
+ y_min, y_max = min(ys), max(ys)
1318
+
1319
+ results.append(OCRResult(
1320
+ text=text,
1321
+ bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
1322
+ confidence=confidence,
1323
+ vertices=[(int(p[0]), int(p[1])) for p in bbox]
1324
+ ))
1325
+
1326
+ self._log(f"✅ Detected {len(results)} text regions")
1327
+
1328
+ except Exception as e:
1329
+ self._log(f"❌ Error in easyocr detection: {str(e)}", "error")
1330
+
1331
+ return results
1332
+
1333
+
1334
+ class PaddleOCRProvider(OCRProvider):
1335
+ """PaddleOCR provider with memory safety measures"""
1336
+
1337
+ def check_installation(self) -> bool:
1338
+ """Check if paddleocr is installed"""
1339
+ try:
1340
+ from paddleocr import PaddleOCR
1341
+ self.is_installed = True
1342
+ return True
1343
+ except ImportError:
1344
+ return False
1345
+
1346
+ def install(self, progress_callback=None) -> bool:
1347
+ """Install paddleocr"""
1348
+ return False  # no auto-install implemented; install manually: pip install paddleocr
1349
+
1350
+ def load_model(self, **kwargs) -> bool:
1351
+ """Load paddleocr model with memory-safe configurations"""
1352
+ try:
1353
+ if not self.is_installed and not self.check_installation():
1354
+ self._log("❌ paddleocr not installed", "error")
1355
+ return False
1356
+
1357
+ self._log("🔥 Loading PaddleOCR model...")
1358
+
1359
+ # Set memory-safe environment variables BEFORE importing
1360
+ import os
1361
+ os.environ['OMP_NUM_THREADS'] = '1' # Prevent OpenMP conflicts
1362
+ os.environ['MKL_NUM_THREADS'] = '1' # Prevent MKL conflicts
1363
+ os.environ['OPENBLAS_NUM_THREADS'] = '1' # Prevent OpenBLAS conflicts
1364
+ os.environ['FLAGS_use_mkldnn'] = '0' # Disable MKL-DNN
1365
+
1366
+ from paddleocr import PaddleOCR
1367
+
1368
+ # Try memory-safe configurations
1369
+ configs_to_try = [
1370
+ # Config 1: Most memory-safe configuration
1371
+ {
1372
+ 'use_angle_cls': False, # Disable angle to save memory
1373
+ 'lang': 'ch',
1374
+ 'rec_batch_num': 1, # Process one at a time
1375
+ 'max_text_length': 100, # Limit text length
1376
+ 'drop_score': 0.5, # Higher threshold to reduce detections
1377
+ 'cpu_threads': 1, # Single thread to avoid conflicts
1378
+ },
1379
+ # Config 2: Minimal memory footprint
1380
+ {
1381
+ 'lang': 'ch',
1382
+ 'rec_batch_num': 1,
1383
+ 'cpu_threads': 1,
1384
+ },
1385
+ # Config 3: Absolute minimal
1386
+ {
1387
+ 'lang': 'ch'
1388
+ },
1389
+ # Config 4: Empty config
1390
+ {}
1391
+ ]
1392
+
1393
+ for i, config in enumerate(configs_to_try):
1394
+ try:
1395
+ self._log(f" Trying configuration {i+1}/{len(configs_to_try)}: {config}")
1396
+
1397
+ # Force garbage collection before loading
1398
+ import gc
1399
+ gc.collect()
1400
+
1401
+ self.model = PaddleOCR(**config)
1402
+ self.is_loaded = True
1403
+ self.current_config = config
1404
+ self._log(f"✅ PaddleOCR loaded successfully with config: {config}")
1405
+ return True
1406
+ except Exception as e:
1407
+ error_str = str(e)
1408
+ self._log(f" Config {i+1} failed: {error_str}", "debug")
1409
+
1410
+ # Clean up on failure
1411
+ if hasattr(self, 'model'):
1412
+ del self.model
1413
+ gc.collect()
1414
+ continue
1415
+
1416
+ self._log(f"❌ PaddleOCR failed to load with any configuration", "error")
1417
+ return False
1418
+
1419
+ except Exception as e:
1420
+ self._log(f"❌ Failed to load paddleocr: {str(e)}", "error")
1421
+ import traceback
1422
+ self._log(traceback.format_exc(), "debug")
1423
+ return False
1424
+
1425
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1426
+ """Detect text with memory safety measures"""
1427
+ results = []
1428
+
1429
+ try:
1430
+ if not self.is_loaded:
1431
+ if not self.load_model():
1432
+ return results
1433
+
1434
+ import cv2
1435
+ import numpy as np
1436
+ import gc
1437
+
1438
+ # Memory safety: Ensure image isn't too large
1439
+ h, w = image.shape[:2] if len(image.shape) >= 2 else (0, 0)
1440
+
1441
+ # Limit image size to prevent memory issues
1442
+ MAX_DIMENSION = 1500
1443
+ if h > MAX_DIMENSION or w > MAX_DIMENSION:
1444
+ scale = min(MAX_DIMENSION/h, MAX_DIMENSION/w)
1445
+ new_h, new_w = int(h*scale), int(w*scale)
1446
+ self._log(f"⚠️ Resizing large image from {w}x{h} to {new_w}x{new_h} for memory safety", "warning")
1447
+ image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)
1448
+ scale_factor = 1/scale
1449
+ else:
1450
+ scale_factor = 1.0
1451
+
1452
+ # Ensure correct format
1453
+ if len(image.shape) == 2: # Grayscale
1454
+ image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
1455
+ elif len(image.shape) == 4: # Batch
1456
+ image = image[0]
1457
+
1458
+ # Ensure uint8 type
1459
+ if image.dtype != np.uint8:
1460
+ if image.max() <= 1.0:
1461
+ image = (image * 255).astype(np.uint8)
1462
+ else:
1463
+ image = image.astype(np.uint8)
1464
+
1465
+ # Make a copy to avoid memory corruption
1466
+ image_copy = image.copy()
1467
+
1468
+ # Force garbage collection before OCR
1469
+ gc.collect()
1470
+
1471
+ # Process with timeout protection
1473
+ import threading
1474
+
1475
+ ocr_results = None
1476
+ ocr_error = None
1477
+
1478
+ def run_ocr():
1479
+ nonlocal ocr_results, ocr_error
1480
+ try:
1481
+ ocr_results = self.model.ocr(image_copy)
1482
+ except Exception as e:
1483
+ ocr_error = e
1484
+
1485
+ # Run OCR in a separate thread with timeout
1486
+ ocr_thread = threading.Thread(target=run_ocr)
1487
+ ocr_thread.daemon = True
1488
+ ocr_thread.start()
1489
+ ocr_thread.join(timeout=30) # 30 second timeout
1490
+
1491
+ if ocr_thread.is_alive():
1492
+ self._log("❌ PaddleOCR timeout - taking too long", "error")
1493
+ return results
1494
+
1495
+ if ocr_error:
1496
+ raise ocr_error
1497
+
1498
+ # Parse results
1499
+ results = self._parse_ocr_results(ocr_results)
1500
+
1501
+ # Scale coordinates back if image was resized
1502
+ if scale_factor != 1.0 and results:
1503
+ for r in results:
1504
+ x, y, width, height = r.bbox
1505
+ r.bbox = (int(x*scale_factor), int(y*scale_factor),
1506
+ int(width*scale_factor), int(height*scale_factor))
1507
+ r.vertices = [(int(v[0]*scale_factor), int(v[1]*scale_factor))
1508
+ for v in r.vertices]
1509
+
1510
+ if results:
1511
+ self._log(f"✅ Detected {len(results)} text regions", "info")
1512
+ else:
1513
+ self._log("No text regions found", "debug")
1514
+
1515
+ # Clean up
1516
+ del image_copy
1517
+ gc.collect()
1518
+
1519
+ except Exception as e:
1520
+ error_msg = str(e) if str(e) else type(e).__name__
1521
+
1522
+ if "memory" in error_msg.lower() or "0x" in error_msg:
1523
+ self._log("❌ Memory access violation in PaddleOCR", "error")
1524
+ self._log(" This is a known Windows issue with PaddleOCR", "info")
1525
+ self._log(" Please switch to EasyOCR or manga-ocr instead", "warning")
1526
+ elif "trace_order.size()" in error_msg:
1527
+ self._log("❌ PaddleOCR internal error", "error")
1528
+ self._log(" Please switch to EasyOCR or manga-ocr", "warning")
1529
+ else:
1530
+ self._log(f"❌ Error in paddleocr detection: {error_msg}", "error")
1531
+
1532
+ import traceback
1533
+ self._log(traceback.format_exc(), "debug")
1534
+
1535
+ return results
1536
+
1537
+ def _parse_ocr_results(self, ocr_results) -> List[OCRResult]:
1538
+ """Parse OCR results safely"""
1539
+ results = []
1540
+
1541
+ if ocr_results is False:
1542
+ return results
1543
+
1544
+ if ocr_results is None or not isinstance(ocr_results, list):
1545
+ return results
1546
+
1547
+ if len(ocr_results) == 0:
1548
+ return results
1549
+
1550
+ # Handle batch format
1551
+ if isinstance(ocr_results[0], list) and len(ocr_results[0]) > 0:
1552
+ first_item = ocr_results[0][0]
1553
+ if isinstance(first_item, list) and len(first_item) > 0:
1554
+ if isinstance(first_item[0], (list, tuple)) and len(first_item[0]) == 2:
1555
+ ocr_results = ocr_results[0]
1556
+
1557
+ # Parse detections
1558
+ for detection in ocr_results:
1559
+ if not detection or isinstance(detection, bool):
1560
+ continue
1561
+
1562
+ if not isinstance(detection, (list, tuple)) or len(detection) < 2:
1563
+ continue
1564
+
1565
+ try:
1566
+ bbox_points = detection[0]
1567
+ text_data = detection[1]
1568
+
1569
+ if not isinstance(bbox_points, (list, tuple)) or len(bbox_points) != 4:
1570
+ continue
1571
+
1572
+ if not isinstance(text_data, (tuple, list)) or len(text_data) < 2:
1573
+ continue
1574
+
1575
+ text = str(text_data[0]).strip()
1576
+ confidence = float(text_data[1])
1577
+
1578
+ if not text or confidence < 0.3:
1579
+ continue
1580
+
1581
+ xs = [float(p[0]) for p in bbox_points]
1582
+ ys = [float(p[1]) for p in bbox_points]
1583
+ x_min, x_max = min(xs), max(xs)
1584
+ y_min, y_max = min(ys), max(ys)
1585
+
1586
+ if (x_max - x_min) < 5 or (y_max - y_min) < 5:
1587
+ continue
1588
+
1589
+ results.append(OCRResult(
1590
+ text=text,
1591
+ bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
1592
+ confidence=confidence,
1593
+ vertices=[(int(p[0]), int(p[1])) for p in bbox_points]
1594
+ ))
1595
+
1596
+ except Exception:
1597
+ continue
1598
+
1599
+ return results
1600
+
1601
+ class DocTROCRProvider(OCRProvider):
1602
+ """DocTR OCR provider"""
1603
+
1604
+ def check_installation(self) -> bool:
1605
+ """Check if doctr is installed"""
1606
+ try:
1607
+ from doctr.models import ocr_predictor
1608
+ self.is_installed = True
1609
+ return True
1610
+ except ImportError:
1611
+ return False
1612
+
1613
+ def install(self, progress_callback=None) -> bool:
1614
+ """Install doctr"""
1615
+ return False  # no auto-install implemented; install manually: pip install python-doctr
1616
+
1617
+ def load_model(self, **kwargs) -> bool:
1618
+ """Load doctr model"""
1619
+ try:
1620
+ if not self.is_installed and not self.check_installation():
1621
+ self._log("❌ doctr not installed", "error")
1622
+ return False
1623
+
1624
+ self._log("🔥 Loading DocTR model...")
1625
+ from doctr.models import ocr_predictor
1626
+
1627
+ # Load pretrained model
1628
+ self.model = ocr_predictor(pretrained=True)
1629
+ self.is_loaded = True
1630
+
1631
+ self._log("✅ DocTR model loaded successfully")
1632
+ return True
1633
+
1634
+ except Exception as e:
1635
+ self._log(f"❌ Failed to load doctr: {str(e)}", "error")
1636
+ return False
1637
+
1638
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1639
+ """Detect text using doctr"""
1640
+ results = []
1641
+
1642
+ try:
1643
+ if not self.is_loaded:
1644
+ if not self.load_model():
1645
+ return results
1646
+
1647
+ from doctr.io import DocumentFile
1648
+
1649
+ # DocTR expects document format
1650
+ # Convert numpy array to PIL and save temporarily
1651
+ import tempfile
1652
+ import cv2
1653
+
1654
+ with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
1655
+ cv2.imwrite(tmp.name, image)
1656
+ doc = DocumentFile.from_images(tmp.name)
1657
+
1658
+ # Run OCR
1659
+ result = self.model(doc)
1660
+
1661
+ # Parse results
1662
+ h, w = image.shape[:2]
1663
+ for page in result.pages:
1664
+ for block in page.blocks:
1665
+ for line in block.lines:
1666
+ for word in line.words:
1667
+ # Handle different geometry formats
1668
+ geometry = word.geometry
1669
+
1670
+ if len(geometry) == 4:
1671
+ # Standard format: (x1, y1, x2, y2)
1672
+ x1, y1, x2, y2 = geometry
1673
+ elif len(geometry) == 2:
1674
+ # Alternative format: ((x1, y1), (x2, y2))
1675
+ (x1, y1), (x2, y2) = geometry
1676
+ else:
1677
+ self._log(f"Unexpected geometry format: {geometry}", "warning")
1678
+ continue
1679
+
1680
+ # Convert relative coordinates to absolute
1681
+ x1, x2 = int(x1 * w), int(x2 * w)
1682
+ y1, y2 = int(y1 * h), int(y2 * h)
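+ # Example (illustration only): geometry (0.1, 0.2, 0.5, 0.4) on a 1000x800 image
+ # becomes x1=100, x2=500, y1=160, y2=320, i.e. bbox (100, 160, 400, 160) below.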
1683
+
1684
+ results.append(OCRResult(
1685
+ text=word.value,
1686
+ bbox=(x1, y1, x2 - x1, y2 - y1),
1687
+ confidence=word.confidence,
1688
+ vertices=[(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
1689
+ ))
1690
+
1691
+ # Clean up temp file
1692
+ try:
1693
+ os.unlink(tmp.name)
1694
+ except:
1695
+ pass
1696
+
1697
+ self._log(f"DocTR detected {len(results)} text regions")
1698
+
1699
+ except Exception as e:
1700
+ self._log(f"Error in doctr detection: {str(e)}", "error")
1701
+ import traceback
1702
+ self._log(traceback.format_exc(), "error")
1703
+
1704
+ return results
1705
+
1706
+
1707
+ class RapidOCRProvider(OCRProvider):
1708
+ """RapidOCR provider for fast local OCR"""
1709
+
1710
+ def check_installation(self) -> bool:
1711
+ """Check if rapidocr is installed"""
1712
+ try:
1713
+ import rapidocr_onnxruntime
1714
+ self.is_installed = True
1715
+ return True
1716
+ except ImportError:
1717
+ return False
1718
+
1719
+ def install(self, progress_callback=None) -> bool:
1720
+ """Install rapidocr (requires manual pip install)"""
1721
+ # RapidOCR requires manual installation
1722
+ if progress_callback:
1723
+ progress_callback("RapidOCR requires manual pip installation")
1724
+ self._log("Run: pip install rapidocr-onnxruntime", "info")
1725
+ return False # Always return False since we can't auto-install
1726
+
1727
+ def load_model(self, **kwargs) -> bool:
1728
+ """Load RapidOCR model"""
1729
+ try:
1730
+ if not self.is_installed and not self.check_installation():
1731
+ self._log("RapidOCR not installed", "error")
1732
+ return False
1733
+
1734
+ self._log("Loading RapidOCR...")
1735
+ from rapidocr_onnxruntime import RapidOCR
1736
+
1737
+ self.model = RapidOCR()
1738
+ self.is_loaded = True
1739
+
1740
+ self._log("RapidOCR model loaded successfully")
1741
+ return True
1742
+
1743
+ except Exception as e:
1744
+ self._log(f"Failed to load RapidOCR: {str(e)}", "error")
1745
+ return False
1746
+
1747
+ def detect_text(self, image: np.ndarray, **kwargs) -> List[OCRResult]:
1748
+ """Detect text using RapidOCR"""
1749
+ if not self.is_loaded:
1750
+ self._log("RapidOCR model not loaded", "error")
1751
+ return []
1752
+
1753
+ results = []
1754
+
1755
+ try:
1756
+ # Convert numpy array to PIL Image for RapidOCR
1757
+ if len(image.shape) == 3:
1758
+ # BGR to RGB
1759
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
1760
+ else:
1761
+ image_rgb = image
1762
+
1763
+ # RapidOCR expects PIL Image or numpy array
1764
+ ocr_results, _ = self.model(image_rgb)
1765
+
1766
+ if ocr_results:
1767
+ for result in ocr_results:
1768
+ # RapidOCR returns [bbox, text, confidence]
1769
+ bbox_points = result[0] # 4 corner points
1770
+ text = result[1]
1771
+ confidence = float(result[2])
1772
+
1773
+ if not text or not text.strip():
1774
+ continue
1775
+
1776
+ # Convert 4-point bbox to x,y,w,h format
1777
+ xs = [point[0] for point in bbox_points]
1778
+ ys = [point[1] for point in bbox_points]
1779
+ x_min, x_max = min(xs), max(xs)
1780
+ y_min, y_max = min(ys), max(ys)
1781
+
1782
+ results.append(OCRResult(
1783
+ text=text.strip(),
1784
+ bbox=(int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)),
1785
+ confidence=confidence,
1786
+ vertices=[(int(p[0]), int(p[1])) for p in bbox_points]
1787
+ ))
1788
+
1789
+ self._log(f"Detected {len(results)} text regions")
1790
+
1791
+ except Exception as e:
1792
+ self._log(f"Error in RapidOCR detection: {str(e)}", "error")
1793
+
1794
+ return results
1795
+
1796
+ class OCRManager:
1797
+ """Manager for multiple OCR providers"""
1798
+
1799
+ def __init__(self, log_callback=None):
1800
+ self.log_callback = log_callback
1801
+ self.providers = {
1802
+ 'custom-api': CustomAPIProvider(log_callback),
1803
+ 'manga-ocr': MangaOCRProvider(log_callback),
1804
+ 'easyocr': EasyOCRProvider(log_callback),
1805
+ 'paddleocr': PaddleOCRProvider(log_callback),
1806
+ 'doctr': DocTROCRProvider(log_callback),
1807
+ 'rapidocr': RapidOCRProvider(log_callback),
1808
+ 'Qwen2-VL': Qwen2VL(log_callback)
1809
+ }
1810
+ self.current_provider = None
1811
+ self.stop_flag = None
1812
+
1813
+ def get_provider(self, name: str) -> Optional[OCRProvider]:
1814
+ """Get OCR provider by name"""
1815
+ return self.providers.get(name)
1816
+
1817
+ def set_current_provider(self, name: str):
1818
+ """Set current active provider"""
1819
+ if name in self.providers:
1820
+ self.current_provider = name
1821
+ return True
1822
+ return False
1823
+
1824
+ def check_provider_status(self, name: str) -> Dict[str, bool]:
1825
+ """Check installation and loading status of provider"""
1826
+ provider = self.providers.get(name)
1827
+ if not provider:
1828
+ return {'installed': False, 'loaded': False}
1829
+
1830
+ result = {
1831
+ 'installed': provider.check_installation(),
1832
+ 'loaded': provider.is_loaded
1833
+ }
1834
+ if self.log_callback:
1835
+ self.log_callback(f"DEBUG: check_provider_status({name}) returning loaded={result['loaded']}", "debug")
1836
+ return result
1837
+
1838
+ def install_provider(self, name: str, progress_callback=None) -> bool:
1839
+ """Install a provider"""
1840
+ provider = self.providers.get(name)
1841
+ if not provider:
1842
+ return False
1843
+
1844
+ return provider.install(progress_callback)
1845
+
1846
+ def load_provider(self, name: str, **kwargs) -> bool:
1847
+ """Load a provider's model with optional parameters"""
1848
+ provider = self.providers.get(name)
1849
+ if not provider:
1850
+ return False
1851
+
1852
+ return provider.load_model(**kwargs) # <-- Passes model_size and any other kwargs
1853
+
1854
+ def shutdown(self):
1855
+ """Release models/processors/tokenizers for all providers and clear caches."""
1856
+ try:
1857
+ import gc
1858
+ for name, provider in list(self.providers.items()):
1859
+ try:
1860
+ if hasattr(provider, 'model'):
1861
+ provider.model = None
1862
+ if hasattr(provider, 'processor'):
1863
+ provider.processor = None
1864
+ if hasattr(provider, 'tokenizer'):
1865
+ provider.tokenizer = None
1866
+ if hasattr(provider, 'reader'):
1867
+ provider.reader = None
1868
+ if hasattr(provider, 'is_loaded'):
1869
+ provider.is_loaded = False
1870
+ except Exception:
1871
+ pass
1872
+ gc.collect()
1873
+ try:
1874
+ import torch
1875
+ torch.cuda.empty_cache()
1876
+ except Exception:
1877
+ pass
1878
+ except Exception:
1879
+ pass
1880
+
1881
+ def detect_text(self, image: np.ndarray, provider_name: str = None, **kwargs) -> List[OCRResult]:
1882
+ """Detect text using specified or current provider"""
1883
+ provider_name = provider_name or self.current_provider
1884
+ if not provider_name:
1885
+ return []
1886
+
1887
+ provider = self.providers.get(provider_name)
1888
+ if not provider:
1889
+ return []
1890
+
1891
+ return provider.detect_text(image, **kwargs)
1892
+
1893
+ def set_stop_flag(self, stop_flag):
1894
+ """Set stop flag for all providers"""
1895
+ self.stop_flag = stop_flag
1896
+ for provider in self.providers.values():
1897
+ if hasattr(provider, 'set_stop_flag'):
1898
+ provider.set_stop_flag(stop_flag)
1899
+
1900
+ def reset_stop_flags(self):
1901
+ """Reset stop flags for all providers"""
1902
+ for provider in self.providers.values():
1903
+ if hasattr(provider, 'reset_stop_flags'):
1904
+ provider.reset_stop_flags()
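+ 
+ # Minimal usage sketch (illustration only; assumes `img` is an OpenCV BGR numpy array):
+ #   manager = OCRManager(log_callback=print)
+ #   manager.set_current_provider('manga-ocr')
+ #   if manager.load_provider('manga-ocr'):
+ #       for r in manager.detect_text(img):
+ #           print(r.text, r.bbox, r.confidence)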