# -------------------------------
# AI Fast Image Server — ZeroGPU Ready (Gradio 5)
# -------------------------------

from __future__ import annotations

import os
import sys
import logging
import subprocess
from typing import Optional

# ---------- Fast, safe defaults ----------
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster model downloads
os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1")     # silence NVML in headless envs
os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")

# ---------- Logging ----------
logging.basicConfig(
    level=os.environ.get("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    stream=sys.stdout,
)
log = logging.getLogger("ai-fast-image-server")

# ---------- Config via ENV ----------
# MODEL_BACKEND: "sdxl_lcm_lora" (default), "sdxl_lcm_unet" (heavy), "ssd1b_lcm_lora" (light)
MODEL_BACKEND = os.getenv("MODEL_BACKEND", "sdxl_lcm_lora").lower()
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
DEFAULT_SIZE = int(os.getenv("DEFAULT_SIZE", "1024"))
SECRET_TOKEN = os.getenv("SECRET_TOKEN", "default_secret")
PORT = int(os.getenv("PORT", "7860"))
CONCURRENCY = int(os.getenv("CONCURRENCY", "2"))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", "32"))
ENABLE_SSR = os.getenv("ENABLE_SSR", "false").lower() == "true"  # SSR off by default for stability
WARMUP = os.getenv("WARMUP", "false").lower() == "true"          # default False for ZeroGPU
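
# Example launch commands (illustrative; assumes this file is saved as `app.py` and
# that you export a real token instead of the `default_secret` fallback):
#   SECRET_TOKEN=my-token python app.py                               # default SDXL + LCM-LoRA
#   MODEL_BACKEND=ssd1b_lcm_lora SECRET_TOKEN=my-token python app.py  # lighter SSD-1B backend
#   WARMUP=true LOG_LEVEL=DEBUG SECRET_TOKEN=my-token python app.py   # CPU warmup + verbose logs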

# ============================================================
# Import `spaces` BEFORE any CUDA-related libs (torch/diffusers)
# ============================================================
try:
    import spaces  # real decorator on HF Spaces
except ImportError:
    # Local/dev fallback: no-op decorator so app still runs without ZeroGPU
    class _DummySpaces:
        def GPU(self, *args, **kwargs):
            def _wrap(f):
                return f
            return _wrap

    spaces = _DummySpaces()

# ---------- Third-party imports (safe to import after `spaces`) ----------
import warnings

warnings.filterwarnings("ignore", message="Can't initialize NVML")

import numpy as np
import torch
from PIL import Image
import gradio as gr
from diffusers import (
    DiffusionPipeline,
    UNet2DConditionModel,
    LCMScheduler,
    AutoPipelineForText2Image,
)

# ---------- Version guard: Torch 2.1 + NumPy 2.x is incompatible ----------
try:
    _np_major = int(np.__version__.split(".")[0])
    if torch.__version__.startswith("2.1") and _np_major >= 2:
        raise RuntimeError(
            f"Incompatible versions: torch=={torch.__version__} with numpy=={np.__version__}. "
            "Pin numpy==1.26.4 or upgrade torch to >=2.3."
        )
except Exception as e:
    log.error(str(e))
    raise

# ---------- Paths ----------
CURRENT_DIR = os.getcwd()
CACHE_DIR = os.path.join(CURRENT_DIR, "cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# ---------- GPU info (logs only) ----------
def print_nvidia_smi() -> None:
    try:
        proc = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=False)
        if proc.returncode == 0 and proc.stdout.strip():
            log.info("\n" + proc.stdout.strip())
        else:
            msg = proc.stderr.strip() if proc.stderr else "nvidia-smi not available or returned no output."
            log.info(msg)
    except FileNotFoundError:
        log.info("nvidia-smi not found on PATH.")

print_nvidia_smi()

# ---------- Global pipeline handle (kept on CPU between calls) ----------
pipe: Optional[DiffusionPipeline] = None

def _gpu_mem_efficiency(p: DiffusionPipeline) -> None:
    """Enable memory-efficient attention and VAE tiling where possible."""
    enabled = False
    try:
        p.enable_xformers_memory_efficient_attention()
        enabled = True
    except Exception:
        try:
            p.enable_attention_slicing("max")
            enabled = True
        except Exception:
            pass
    try:
        p.enable_vae_tiling()
    except Exception:
        pass
    if enabled:
        try:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.set_float32_matmul_precision("high")
        except Exception:
            pass

def _build_pipeline_cpu() -> DiffusionPipeline:
    """
    Build the pipeline on CPU with float32 to keep it stable in ZeroGPU's
    CPU-only startup environment. We'll move it to CUDA inside the GPU-decorated
    function per call and return it to CPU after.
    """
    log.info(f"Building pipeline for model backend: {MODEL_BACKEND}")

    if MODEL_BACKEND == "sdxl_lcm_unet":
        # SDXL base with LCM UNet (no LoRA required)
        unet = UNet2DConditionModel.from_pretrained(
            "latent-consistency/lcm-sdxl",
            torch_dtype=torch.float32,
            cache_dir=CACHE_DIR,
        )
        _p = DiffusionPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            unet=unet,
            torch_dtype=torch.float32,
            cache_dir=CACHE_DIR,
        )
    elif MODEL_BACKEND == "ssd1b_lcm_lora":
        # SSD-1B with LCM-LoRA (Diffusers backend; no PEFT required)
        _p = AutoPipelineForText2Image.from_pretrained(
            "segmind/SSD-1B",
            torch_dtype=torch.float32,
            cache_dir=CACHE_DIR,
        )
        _p.load_lora_weights(
            "latent-consistency/lcm-lora-ssd-1b",
            adapter_name="lcm",
            use_peft_backend=False,  # <-- avoid PEFT requirement
        )
        _p.fuse_lora()
    else:
        # Default: SDXL + LCM-LoRA (smaller download, great speed/quality)
        _p = DiffusionPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch.float32,
            cache_dir=CACHE_DIR,
        )
        _p.load_lora_weights(
            "latent-consistency/lcm-lora-sdxl",
            adapter_name="lcm",
            use_peft_backend=False,  # <-- avoid PEFT requirement
        )
        _p.fuse_lora()

    _p.scheduler = LCMScheduler.from_config(_p.scheduler.config)
    _p.to("cpu", torch.float32)
    try:
        _p.enable_vae_tiling()
    except Exception:
        pass

    log.info("Pipeline built successfully on CPU.")
    return _p

def ensure_pipe() -> DiffusionPipeline:
    """Initializes and returns the global pipeline object."""
    global pipe
    if pipe is None:
        pipe = _build_pipeline_cpu()
    return pipe
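
# Optional introspection helper: an illustrative sketch that is not called anywhere in
# this file (the name `describe_pipeline` is added here for illustration only). It only
# reads attributes the builder above already sets, so it should be safe to call from a
# REPL or a log statement after startup.
def describe_pipeline() -> dict:
    """Return a small summary of the cached pipeline for debugging/logging."""
    p = ensure_pipe()
    return {
        "backend": MODEL_BACKEND,
        "scheduler": type(p.scheduler).__name__,   # expected: LCMScheduler
        "device": str(p.device),                   # "cpu" between GPU calls
        "dtype": str(getattr(p, "dtype", "unknown")),
    }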

# ---------- Cold-start aware duration estimator ----------
GPU_COLD = True  # first GPU invocation will upload weights & warm kernels

def _estimate_duration(prompt: str, negative_prompt: str, seed: int, width: int, height: int,
                       guidance_scale: float, steps: int, secret_token: str) -> int:
    """
    ZeroGPU runtime budget (seconds).

    Includes:
      - model->GPU transfer + warmup (cold start tax)
      - per-step cost scaled by resolution
    """
    # normalize size to 1024x1024 ~= 1.0
    px_scale = (max(256, width) * max(256, height)) / (1024 * 1024)

    # conservative costs (tuned for SDXL+LCM on H200 slice)
    cold_tax = 22.0 if GPU_COLD else 10.0  # seconds
    step_cost = 1.2                        # sec/step at 1024^2
    base = 6.0                             # misc overhead

    est = base + cold_tax + steps * step_cost * max(0.5, px_scale)

    # floors: bigger images need a higher minimum
    floor = 45 if px_scale >= 1.0 else (30 if px_scale >= 0.5 else 20)
    return int(min(120, max(floor, est)))
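
# Worked examples for the budget above (numbers follow directly from the constants;
# note that the per-step term clamps px_scale to a minimum of 0.5):
#   1024x1024, 4 steps, cold: 6.0 + 22.0 + 4 * 1.2 * 1.0 = 32.8 -> floor 45 -> 45 s
#   512x512,   4 steps, cold: 6.0 + 22.0 + 4 * 1.2 * 0.5 = 30.4 -> floor 20 -> 30 s
#   1024x1024, 8 steps, warm: 6.0 + 10.0 + 8 * 1.2 * 1.0 = 25.6 -> floor 45 -> 45 s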

# ---------- Public generate (token gate) ----------
@spaces.GPU(duration=_estimate_duration)  # ZeroGPU uses this to schedule a GPU window
def generate(
    prompt: str,
    negative_prompt: str = "",
    seed: int = 0,
    width: int = DEFAULT_SIZE,
    height: int = DEFAULT_SIZE,
    guidance_scale: float = 0.0,
    steps: int = 4,
    secret_token: str = "",
) -> Image.Image:
    # Declare global BEFORE any reference or assignment to GPU_COLD
    global GPU_COLD

    if secret_token != SECRET_TOKEN:
        raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")

    # For logs: what window we asked ZeroGPU for (based on current cold/warm state)
    try:
        requested = _estimate_duration(prompt, negative_prompt, seed, width, height,
                                       guidance_scale, steps, secret_token)
        log.info(f"ZeroGPU duration requested: {requested}s (cold={GPU_COLD}, size={width}x{height}, steps={steps})")
    except Exception:
        pass

    _p = ensure_pipe()  # already built on CPU & cached weights on disk

    # Clamp user inputs for safety
    width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
    height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
    steps = int(np.clip(steps, 1, 12))
    guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))

    # Try to use CUDA when available (ZeroGPU will make it available inside this call)
    moved_to_cuda = False
    try:
        if torch.cuda.is_available():
            _p.to("cuda", torch.float16)
            _gpu_mem_efficiency(_p)
            moved_to_cuda = True
        else:
            _p.to("cpu", torch.float32)
    except Exception as e:
        log.warning(f"Falling back to CPU: {e}")
        _p.to("cpu", torch.float32)

    # mark that we've done our cold GPU upload for this process
    if moved_to_cuda:
        GPU_COLD = False

    try:
        device = "cuda" if moved_to_cuda else "cpu"
        gen = torch.Generator(device=device)
        if seed is not None:
            gen = gen.manual_seed(int(seed))

        out = _p(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            guidance_scale=guidance_scale,
            num_inference_steps=steps,
            generator=gen,
            output_type="pil",
        )
        return out.images[0]
    finally:
        # Return model to CPU so the GPU can be released immediately after call
        try:
            _p.to("cpu", torch.float32)
            _p.enable_vae_tiling()
        except Exception:
            pass

# ---------- Optional warmup (CPU only for ZeroGPU) ----------
def warmup():
    """Performs a minimal inference on CPU to warm up the components."""
    try:
        _p = ensure_pipe()
        _ = _p(
            prompt="minimal warmup",
            width=256,
            height=256,
            guidance_scale=0.0,
            num_inference_steps=1,
            generator=torch.Generator(device="cpu").manual_seed(1),
            output_type="pil",
        ).images[0]
        log.info("CPU warmup inference complete.")
    except Exception as e:
        log.warning(f"Warmup skipped or failed: {e}")

# ---------- Gradio UI (v5) ----------
def build_ui() -> gr.Blocks:
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("## Image Generator (LCM) — SDXL / SSD-1B (ZeroGPU Ready)")

        with gr.Row():
            prompt = gr.Textbox(label="Prompt", lines=3, placeholder="Describe the image…")
            negative = gr.Textbox(label="Negative Prompt", lines=2, placeholder="(optional)")

        with gr.Row():
            seed = gr.Number(label="Seed", value=0, precision=0)
            width = gr.Slider(256, MAX_IMAGE_SIZE, value=DEFAULT_SIZE, step=32, label="Width")
            height = gr.Slider(256, MAX_IMAGE_SIZE, value=DEFAULT_SIZE, step=32, label="Height")

        with gr.Row():
            guidance = gr.Slider(0.0, 2.0, value=0.0, step=0.1, label="Guidance scale")
            steps = gr.Slider(1, 12, value=4, step=1, label="Inference steps")

        token = gr.Textbox(label="Secret Token", type="password", lines=1)
        out = gr.Image(label="Result", height=DEFAULT_SIZE, width=DEFAULT_SIZE)
        run = gr.Button("Generate", variant="primary")

        inputs = [prompt, negative, seed, width, height, guidance, steps, token]

        # Per-event concurrency control (Gradio v5)
        run.click(fn=generate, inputs=inputs, outputs=out, concurrency_limit=CONCURRENCY)

        gr.Markdown(
            f"*Backend:* `{MODEL_BACKEND}`   |   "
            f"*ZeroGPU:* `@spaces.GPU` enabled   |   "
            f"*Max size:* {MAX_IMAGE_SIZE}px"
        )

    return demo

# ---------- Launch ----------
def main():
    # --- Pre-load the model on startup (downloads happen here, not in GPU window) ---
    log.info("Application starting up. Pre-loading model on CPU...")
    ensure_pipe()
    log.info("Model pre-loaded successfully.")

    # --- Optional: Run a single inference on CPU if WARMUP is enabled ---
    if WARMUP:
        log.info("Warmup enabled. Running a test inference on CPU.")
        warmup()

    # --- Build and launch the Gradio UI ---
    demo = build_ui()
    demo.queue(max_size=QUEUE_SIZE)
    log.info("Starting Gradio server...")
    demo.launch(
        server_name="0.0.0.0",
        server_port=PORT,
        show_api=True,
        ssr_mode=ENABLE_SSR,  # Off by default; enable with ENABLE_SSR=true if needed
        share=False,
        show_error=True,
    )

if __name__ == "__main__":
    main()
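
# Client-side usage sketch (illustrative; not executed by this module). The endpoint
# name below assumes Gradio's default of exposing the click handler under the wired
# function's name ("/generate"); confirm it on the running app's "Use via API" page.
#
#   from gradio_client import Client
#   client = Client("http://localhost:7860")
#   result = client.predict(
#       "a watercolor fox in a snowy forest",  # prompt
#       "",                                    # negative_prompt
#       0,                                     # seed
#       1024, 1024,                            # width, height
#       0.0,                                   # guidance_scale
#       4,                                     # steps
#       "default_secret",                      # must match SECRET_TOKEN
#       api_name="/generate",
#   )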