# -------------------------------
# AI Fast Image Server — ZeroGPU Ready (Gradio 5)
# -------------------------------

from __future__ import annotations

import os
import sys
import logging
import subprocess
from typing import Optional

# ---------- Fast, safe defaults ----------
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster model downloads
os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1")     # silence NVML in headless envs
os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")

# ---------- Logging ----------
logging.basicConfig(
    level=os.environ.get("LOG_LEVEL", "INFO").upper(),
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    stream=sys.stdout,
)
log = logging.getLogger("ai-fast-image-server")

# ---------- Config via ENV ----------
# MODEL_BACKEND: "sdxl_lcm_lora" (default), "sdxl_lcm_unet" (heavy), "ssd1b_lcm_lora" (light)
MODEL_BACKEND = os.getenv("MODEL_BACKEND", "sdxl_lcm_lora").lower()
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
DEFAULT_SIZE = int(os.getenv("DEFAULT_SIZE", "1024"))
SECRET_TOKEN = os.getenv("SECRET_TOKEN", "default_secret")
PORT = int(os.getenv("PORT", "7860"))
CONCURRENCY = int(os.getenv("CONCURRENCY", "2"))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", "32"))
ENABLE_SSR = os.getenv("ENABLE_SSR", "false").lower() == "true"  # SSR off by default for stability
WARMUP = os.getenv("WARMUP", "false").lower() == "true"          # default False for ZeroGPU
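
# Example launch commands (illustrative; assumes this file is saved as `app.py` and
# that you export a real token instead of the `default_secret` fallback):
#   SECRET_TOKEN=my-token python app.py                               # default SDXL + LCM-LoRA
#   MODEL_BACKEND=ssd1b_lcm_lora SECRET_TOKEN=my-token python app.py  # lighter SSD-1B backend
#   WARMUP=true LOG_LEVEL=DEBUG SECRET_TOKEN=my-token python app.py   # CPU warmup + verbose logs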

# ============================================================
# Import `spaces` BEFORE any CUDA-related libs (torch/diffusers)
# ============================================================
try:
    import spaces  # real decorator on HF Spaces
except ImportError:
    # Local/dev fallback: no-op decorator so app still runs without ZeroGPU
    class _DummySpaces:
        def GPU(self, *args, **kwargs):
            def _wrap(f):
                return f
            return _wrap

    spaces = _DummySpaces()

# ---------- Third-party imports (safe to import after `spaces`) ----------
import warnings

warnings.filterwarnings("ignore", message="Can't initialize NVML")

import numpy as np
import torch
from PIL import Image
import gradio as gr
from diffusers import (
    DiffusionPipeline,
    UNet2DConditionModel,
    LCMScheduler,
    AutoPipelineForText2Image,
)

# ---------- Version guard: Torch 2.1 + NumPy 2.x is incompatible ----------
try:
    _np_major = int(np.__version__.split(".")[0])
    if torch.__version__.startswith("2.1") and _np_major >= 2:
        raise RuntimeError(
            f"Incompatible versions: torch=={torch.__version__} with numpy=={np.__version__}. "
            "Pin numpy==1.26.4 or upgrade torch to >=2.3."
        )
except Exception as e:
    log.error(str(e))
    raise

# ---------- Paths ----------
CURRENT_DIR = os.getcwd()
CACHE_DIR = os.path.join(CURRENT_DIR, "cache")
os.makedirs(CACHE_DIR, exist_ok=True)

# ---------- GPU info (logs only) ----------
def print_nvidia_smi() -> None:
    try:
        proc = subprocess.run(["nvidia-smi"], capture_output=True, text=True, check=False)
        if proc.returncode == 0 and proc.stdout.strip():
            log.info("\n" + proc.stdout.strip())
        else:
            msg = proc.stderr.strip() if proc.stderr else "nvidia-smi not available or returned no output."
            log.info(msg)
    except FileNotFoundError:
        log.info("nvidia-smi not found on PATH.")

print_nvidia_smi()

# ---------- Global pipeline handle (kept on CPU between calls) ----------
pipe: Optional[DiffusionPipeline] = None

def _gpu_mem_efficiency(p: DiffusionPipeline) -> None:
    """Enable memory-efficient attention and VAE tiling where possible."""
    enabled = False
    try:
        p.enable_xformers_memory_efficient_attention()
        enabled = True
    except Exception:
        try:
            p.enable_attention_slicing("max")
            enabled = True
        except Exception:
            pass
    try:
        p.enable_vae_tiling()
    except Exception:
        pass
    if enabled:
        try:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.set_float32_matmul_precision("high")
        except Exception:
            pass

def _build_pipeline_cpu() -> DiffusionPipeline:
    """
    Build the pipeline on CPU with float32 to keep it stable in ZeroGPU's
    CPU-only startup environment. We'll move it to CUDA inside the GPU-decorated
    function per call and return it to CPU after.
    """
    log.info(f"Building pipeline for model backend: {MODEL_BACKEND}")

    if MODEL_BACKEND == "sdxl_lcm_unet":
        # SDXL base with LCM UNet (no LoRA required)
        unet = UNet2DConditionModel.from_pretrained(
            "latent-consistency/lcm-sdxl",
            torch_dtype=torch.float32,
            cache_dir=CACHE_DIR,
        )
        _p = DiffusionPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            unet=unet,
            torch_dtype=torch.float32,
            cache_dir=CACHE_DIR,
        )
    elif MODEL_BACKEND == "ssd1b_lcm_lora":
        # SSD-1B with LCM-LoRA (Diffusers backend; no PEFT required)
        _p = AutoPipelineForText2Image.from_pretrained(
            "segmind/SSD-1B",
            torch_dtype=torch.float32,
            cache_dir=CACHE_DIR,
        )
        _p.load_lora_weights(
            "latent-consistency/lcm-lora-ssd-1b",
            adapter_name="lcm",
            use_peft_backend=False,  # <-- avoid PEFT requirement
        )
        _p.fuse_lora()
    else:
        # Default: SDXL + LCM-LoRA (smaller download, great speed/quality)
        _p = DiffusionPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0",
            torch_dtype=torch.float32,
            cache_dir=CACHE_DIR,
        )
        _p.load_lora_weights(
            "latent-consistency/lcm-lora-sdxl",
            adapter_name="lcm",
            use_peft_backend=False,  # <-- avoid PEFT requirement
        )
        _p.fuse_lora()

    _p.scheduler = LCMScheduler.from_config(_p.scheduler.config)
    _p.to("cpu", torch.float32)
    try:
        _p.enable_vae_tiling()
    except Exception:
        pass

    log.info("Pipeline built successfully on CPU.")
    return _p

def ensure_pipe() -> DiffusionPipeline:
    """Initializes and returns the global pipeline object."""
    global pipe
    if pipe is None:
        pipe = _build_pipeline_cpu()
    return pipe
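
# Optional introspection helper: an illustrative sketch that is not called anywhere in
# this file (the name `describe_pipeline` is added here for illustration only). It only
# reads attributes the builder above already sets, so it should be safe to call from a
# REPL or a log statement after startup.
def describe_pipeline() -> dict:
    """Return a small summary of the cached pipeline for debugging/logging."""
    p = ensure_pipe()
    return {
        "backend": MODEL_BACKEND,
        "scheduler": type(p.scheduler).__name__,   # expected: LCMScheduler
        "device": str(p.device),                   # "cpu" between GPU calls
        "dtype": str(getattr(p, "dtype", "unknown")),
    }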

# ---------- Cold-start aware duration estimator ----------
GPU_COLD = True  # first GPU invocation will upload weights & warm kernels

def _estimate_duration(prompt: str, negative_prompt: str, seed: int, width: int, height: int,
                       guidance_scale: float, steps: int, secret_token: str) -> int:
    """
    ZeroGPU runtime budget (seconds).

    Includes:
      - model->GPU transfer + warmup (cold start tax)
      - per-step cost scaled by resolution
    """
    # normalize size to 1024x1024 ~= 1.0
    px_scale = (max(256, width) * max(256, height)) / (1024 * 1024)

    # conservative costs (tuned for SDXL+LCM on H200 slice)
    cold_tax = 22.0 if GPU_COLD else 10.0  # seconds
    step_cost = 1.2                        # sec/step at 1024^2
    base = 6.0                             # misc overhead

    est = base + cold_tax + steps * step_cost * max(0.5, px_scale)

    # floors: bigger images need a higher minimum
    floor = 45 if px_scale >= 1.0 else (30 if px_scale >= 0.5 else 20)
    return int(min(120, max(floor, est)))
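
# Worked examples for the budget above (numbers follow directly from the constants;
# note that the per-step term clamps px_scale to a minimum of 0.5):
#   1024x1024, 4 steps, cold: 6.0 + 22.0 + 4 * 1.2 * 1.0 = 32.8 -> floor 45 -> 45 s
#   512x512,   4 steps, cold: 6.0 + 22.0 + 4 * 1.2 * 0.5 = 30.4 -> floor 20 -> 30 s
#   1024x1024, 8 steps, warm: 6.0 + 10.0 + 8 * 1.2 * 1.0 = 25.6 -> floor 45 -> 45 s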

# ---------- Public generate (token gate) ----------
@spaces.GPU(duration=_estimate_duration)  # ZeroGPU uses this to schedule a GPU window
def generate(
    prompt: str,
    negative_prompt: str = "",
    seed: int = 0,
    width: int = DEFAULT_SIZE,
    height: int = DEFAULT_SIZE,
    guidance_scale: float = 0.0,
    steps: int = 4,
    secret_token: str = "",
) -> Image.Image:
    # Declare global BEFORE any reference or assignment to GPU_COLD
    global GPU_COLD

    if secret_token != SECRET_TOKEN:
        raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")

    # For logs: what window we asked ZeroGPU for (based on current cold/warm state)
    try:
        requested = _estimate_duration(prompt, negative_prompt, seed, width, height,
                                       guidance_scale, steps, secret_token)
        log.info(f"ZeroGPU duration requested: {requested}s (cold={GPU_COLD}, size={width}x{height}, steps={steps})")
    except Exception:
        pass

    _p = ensure_pipe()  # already built on CPU & cached weights on disk

    # Clamp user inputs for safety
    width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
    height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
    steps = int(np.clip(steps, 1, 12))
    guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))

    # Try to use CUDA when available (ZeroGPU will make it available inside this call)
    moved_to_cuda = False
    try:
        if torch.cuda.is_available():
            _p.to("cuda", torch.float16)
            _gpu_mem_efficiency(_p)
            moved_to_cuda = True
        else:
            _p.to("cpu", torch.float32)
    except Exception as e:
        log.warning(f"Falling back to CPU: {e}")
        _p.to("cpu", torch.float32)

    # mark that we've done our cold GPU upload for this process
    if moved_to_cuda:
        GPU_COLD = False

    try:
        device = "cuda" if moved_to_cuda else "cpu"
        gen = torch.Generator(device=device)
        if seed is not None:
            gen = gen.manual_seed(int(seed))

        out = _p(
            prompt=prompt,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            guidance_scale=guidance_scale,
            num_inference_steps=steps,
            generator=gen,
            output_type="pil",
        )
        return out.images[0]
    finally:
        # Return model to CPU so the GPU can be released immediately after call
        try:
            _p.to("cpu", torch.float32)
            _p.enable_vae_tiling()
        except Exception:
            pass

# ---------- Optional warmup (CPU only for ZeroGPU) ----------
def warmup():
    """Performs a minimal inference on CPU to warm up the components."""
    try:
        _p = ensure_pipe()
        _ = _p(
            prompt="minimal warmup",
            width=256,
            height=256,
            guidance_scale=0.0,
            num_inference_steps=1,
            generator=torch.Generator(device="cpu").manual_seed(1),
            output_type="pil",
        ).images[0]
        log.info("CPU warmup inference complete.")
    except Exception as e:
        log.warning(f"Warmup skipped or failed: {e}")

# ---------- Gradio UI (v5) ----------
def build_ui() -> gr.Blocks:
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("## Image Generator (LCM) — SDXL / SSD-1B (ZeroGPU Ready)")

        with gr.Row():
            prompt = gr.Textbox(label="Prompt", lines=3, placeholder="Describe the image…")
            negative = gr.Textbox(label="Negative Prompt", lines=2, placeholder="(optional)")

        with gr.Row():
            seed = gr.Number(label="Seed", value=0, precision=0)
            width = gr.Slider(256, MAX_IMAGE_SIZE, value=DEFAULT_SIZE, step=32, label="Width")
            height = gr.Slider(256, MAX_IMAGE_SIZE, value=DEFAULT_SIZE, step=32, label="Height")

        with gr.Row():
            guidance = gr.Slider(0.0, 2.0, value=0.0, step=0.1, label="Guidance scale")
            steps = gr.Slider(1, 12, value=4, step=1, label="Inference steps")

        token = gr.Textbox(label="Secret Token", type="password", lines=1)
        out = gr.Image(label="Result", height=DEFAULT_SIZE, width=DEFAULT_SIZE)
        run = gr.Button("Generate", variant="primary")

        inputs = [prompt, negative, seed, width, height, guidance, steps, token]

        # Per-event concurrency control (Gradio v5)
        run.click(fn=generate, inputs=inputs, outputs=out, concurrency_limit=CONCURRENCY)

        gr.Markdown(
            f"*Backend:* `{MODEL_BACKEND}`   |   "
            f"*ZeroGPU:* `@spaces.GPU` enabled   |   "
            f"*Max size:* {MAX_IMAGE_SIZE}px"
        )

    return demo

# ---------- Launch ----------
def main():
    # --- Pre-load the model on startup (downloads happen here, not in GPU window) ---
    log.info("Application starting up. Pre-loading model on CPU...")
    ensure_pipe()
    log.info("Model pre-loaded successfully.")

    # --- Optional: Run a single inference on CPU if WARMUP is enabled ---
    if WARMUP:
        log.info("Warmup enabled. Running a test inference on CPU.")
        warmup()

    # --- Build and launch the Gradio UI ---
    demo = build_ui()
    demo.queue(max_size=QUEUE_SIZE)
    log.info("Starting Gradio server...")
    demo.launch(
        server_name="0.0.0.0",
        server_port=PORT,
        show_api=True,
        ssr_mode=ENABLE_SSR,  # Off by default; enable with ENABLE_SSR=true if needed
        share=False,
        show_error=True,
    )

if __name__ == "__main__":
    main()
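
# Client-side usage sketch (illustrative; not executed by this module). The endpoint
# name below assumes Gradio's default of exposing the click handler under the wired
# function's name ("/generate"); confirm it on the running app's "Use via API" page.
#
#   from gradio_client import Client
#   client = Client("http://localhost:7860")
#   result = client.predict(
#       "a watercolor fox in a snowy forest",  # prompt
#       "",                                    # negative_prompt
#       0,                                     # seed
#       1024, 1024,                            # width, height
#       0.0,                                   # guidance_scale
#       4,                                     # steps
#       "default_secret",                      # must match SECRET_TOKEN
#       api_name="/generate",
#   )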