Spaces:

ruslanmv
/

ai-fast-image-server

Running on Zero

App Files Files Community

ruslanmv committed on Sep 28

Commit

6fdbc47

1 Parent(s): 44b7f30

Update app.py

Browse files

Files changed (1) hide show

app.py +275 -158

app.py CHANGED Viewed

@@ -1,206 +1,323 @@
-# ---- Flags ----
-run_api = False
-SSD_1B = False  # True = use SSD-1B + LCM LoRA, False = SDXL Base + LCM (default)
-# ---- Standard imports ----
 import os
 import subprocess
-import numpy as np
-# Optional: clear_output is nice in notebooks; ignore if not available
-try:
-    from IPython.display import clear_output  # noqa: F401
-except Exception:
-    def clear_output():  # no-op outside notebooks
-        pass
-# ---- Tame NVML noise in containers without GPU drivers (optional) ----
-os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1")
 import warnings
 warnings.filterwarnings("ignore", message="Can't initialize NVML")
-# ---- App imports (expect deps from requirements.txt already installed) ----
 import torch
-import gradio as gr
 from PIL import Image
-from diffusers import UNet2DConditionModel, DiffusionPipeline, LCMScheduler
-# ---- Config / constants ----
-current_dir = os.getcwd()
-cache_path = os.path.join(current_dir, "cache")
-os.makedirs(cache_path, exist_ok=True)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
-SECRET_TOKEN = os.getenv("SECRET_TOKEN", "default_secret")
-# ---- GPU visibility / info (for logs only) ----
-def print_nvidia_smi():
     try:
-        proc = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
         if proc.returncode == 0 and proc.stdout.strip():
-            print(proc.stdout)
         else:
-            # Show stderr when present to help debugging; not used for logic
-            if proc.stderr:
-                print(proc.stderr)
-            else:
-                print("nvidia-smi not available or returned no output.")
     except FileNotFoundError:
-        print("nvidia-smi not found on PATH.")
 print_nvidia_smi()
-# ---- Device + dtype selection (robust) ----
-is_gpu = torch.cuda.is_available()
-print(f"CUDA available: {is_gpu}")
-device = torch.device("cuda") if is_gpu else torch.device("cpu")
-dtype = torch.float16 if is_gpu else torch.float32
-# ---- Helpers to only pass 'variant' when needed (Diffusers <=0.23 friendly) ----
-def _add_variant(kwargs: dict) -> dict:
-    """Only include 'variant' when running on GPU."""
-    if is_gpu:
-        kwargs = dict(kwargs)  # shallow copy
-        kwargs["variant"] = "fp16"
-    return kwargs
-# ---- Pipeline setup ----
-if not SSD_1B:
-    # SDXL base + LCM UNet
-    unet = UNet2DConditionModel.from_pretrained(
-        "latent-consistency/lcm-sdxl",
-        torch_dtype=dtype,
-        cache_dir=cache_path,
-        **_add_variant({})
-    )
-    pipe = DiffusionPipeline.from_pretrained(
-        "stabilityai/stable-diffusion-xl-base-1.0",
-        unet=unet,
-        torch_dtype=dtype,
-        cache_dir=cache_path,
-        **_add_variant({})
-    )
-    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
-    pipe.to(device)
-else:
-    # SSD-1B + LCM LoRA
-    from diffusers import AutoPipelineForText2Image
-    pipe = AutoPipelineForText2Image.from_pretrained(
-        "segmind/SSD-1B",
-        torch_dtype=dtype,
-        cache_dir=cache_path,
-        **_add_variant({})
-    )
-    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
-    pipe.to(device)
-    pipe.load_lora_weights("latent-consistency/lcm-lora-ssd-1b")
-    pipe.fuse_lora()
-# ---- Core generate function ----
-def generate(
     prompt: str,
     negative_prompt: str = "",
-    seed: int = 0,
-    width: int = 1024,
-    height: int = 1024,
     guidance_scale: float = 0.0,
     num_inference_steps: int = 4,
-    secret_token: str = "",
 ) -> Image.Image:
-    # Token gate
-    if secret_token != SECRET_TOKEN:
-        raise gr.Error("Invalid secret token. Set SECRET_TOKEN on the server or pass the correct token.")
-    # Clamp sizes (avoid OOM on CPU)
     width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
     height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
-    # Deterministic generator on the active device
-    generator = torch.Generator(device=device)
     if seed is not None:
         generator = generator.manual_seed(int(seed))
-    out = pipe(
         prompt=prompt,
         negative_prompt=negative_prompt,
         width=width,
         height=height,
-        guidance_scale=guidance_scale,
         num_inference_steps=num_inference_steps,
         generator=generator,
         output_type="pil",
     )
-    return out.images[0]
-# ---- Optional notebook helper ----
-def generate_image(prompt="A scenic watercolor landscape, mountains at dawn"):
-    img = generate(
         prompt=prompt,
-        negative_prompt="",
-        seed=0,
-        width=1024,
-        height=1024,
-        guidance_scale=0.0,
-        num_inference_steps=4,
-        secret_token=SECRET_TOKEN,
     )
     try:
-        from IPython.display import display
-        display(img)
-    except Exception:
-        pass  # Non-notebook environment
-# ---- UI (Gradio 3.39.0 components) ----
-if not run_api:
-    secret_token = gr.Textbox(
-        label="Secret Token",
-        placeholder="Enter your secret token",
-        type="password",
-    )
-    prompt = gr.Textbox(
-        label="Prompt",
-        show_label=True,
-        max_lines=2,
-        placeholder="Enter your prompt",
-    )
-    negative_prompt = gr.Textbox(
-        label="Negative prompt",
-        max_lines=2,
-        placeholder="Enter a negative prompt (optional)",
-    )
-    seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
-    width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
-    height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
-    guidance_scale = gr.Slider(label="Guidance scale", minimum=0, maximum=2, step=0.1, value=0.0)
-    num_inference_steps = gr.Slider(label="Inference steps", minimum=1, maximum=8, step=1, value=4)
-    iface = gr.Interface(
-        fn=generate,
-        inputs=[prompt, negative_prompt, seed, width, height, guidance_scale, num_inference_steps, secret_token],
-        outputs=gr.Image(label="Result"),
-        title="Image Generator (LCM)",
-        description="Fast SDXL/SSD-1B image generation with LCM. Uses CPU if CUDA is unavailable.",
-    )
-    iface.launch()
-if run_api:
-    with gr.Blocks() as demo:
         gr.Markdown(
-            "### REST API for LCM Text-to-Image\n"
-            "Use the `/run` endpoint programmatically with your secret."
         )
-        secret_token = gr.Textbox(label="Secret Token", type="password")
-        prompt = gr.Textbox(label="Prompt")
-        negative_prompt = gr.Textbox(label="Negative prompt")
-        seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
-        width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
-        height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
-        guidance_scale = gr.Slider(label="Guidance scale", minimum=0, maximum=2, step=0.1, value=0.0)
-        num_inference_steps = gr.Slider(label="Inference steps", minimum=1, maximum=8, step=1, value=4)
-        inputs = [prompt, negative_prompt, seed, width, height, guidance_scale, num_inference_steps, secret_token]
-        prompt.submit(fn=generate, inputs=inputs, outputs=gr.Image(), api_name="run")
-    demo.queue(max_size=32).launch(debug=False)

+# -------------------------------
+# AI Fast Image Server (Production)
+# -------------------------------
+from __future__ import annotations
 import os
+import sys
+import logging
 import subprocess
+from typing import Optional
+# ---------- Early, safe env defaults (setdefault: values already set in the environment win) ----------
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster model downloads; NOTE(review): presumably requires the hf_transfer package — confirm it is installed
+os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1")      # silence NVML in headless envs
+os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")  # presumably suppresses the bitsandbytes startup banner — confirm
+# ---------- Logging (level from LOG_LEVEL, default INFO; records are written to stdout) ----------
+logging.basicConfig(
+    level=os.environ.get("LOG_LEVEL", "INFO").upper(),
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    stream=sys.stdout,
+)
+log = logging.getLogger("ai-fast-image-server")
+# ---------- Config via ENV (read once at import; int() raises ValueError on non-numeric values) ----------
+# MODEL_BACKEND: sdxl_lcm_unet (heavy), sdxl_lcm_lora (light), ssd1b_lcm_lora (light)
+MODEL_BACKEND = os.getenv("MODEL_BACKEND", "sdxl_lcm_lora").lower()
+MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))  # upper clamp for width/height and for the UI sliders
+DEFAULT_SIZE = int(os.getenv("DEFAULT_SIZE", "1024"))  # default width/height of a request and of the result widget
+SECRET_TOKEN = os.getenv("SECRET_TOKEN", "default_secret")  # token every request must present; the fallback is a fixed string visible in this file
+PORT = int(os.getenv("PORT", "7860"))  # port handed to demo.launch()
+CONCURRENCY = int(os.getenv("CONCURRENCY", "2"))  # number of generate calls allowed to run in parallel
+QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", "32"))  # max_size of the Gradio request queue
+ENABLE_SSR = os.getenv("ENABLE_SSR", "false").lower() == "true"  # SSR can be flaky; default off (only the string "true", case-insensitive, enables it)
+# ---------- Imports that require deps ----------
 import warnings
 warnings.filterwarnings("ignore", message="Can't initialize NVML")
+import numpy as np
 import torch
 from PIL import Image
+import gradio as gr
+from diffusers import (
+    DiffusionPipeline,
+    UNet2DConditionModel,
+    LCMScheduler,
+    AutoPipelineForText2Image,
+)
+# ---------- Version guard: Torch 2.1 + NumPy 2.x is incompatible ----------
+# Fails fast at import time with an actionable message instead of crashing later
+# inside torch/numpy interop.
+try:
+    _np_major = int(np.__version__.split(".")[0])
+    # Compare the parsed (major, minor) pair rather than a string prefix:
+    # `startswith("2.1")` would also match torch 2.10, 2.11, ... and wrongly
+    # abort startup on those releases. Any local build suffix ("+cu118") is
+    # dropped before splitting.
+    _torch_major_minor = tuple(str(torch.__version__).split("+")[0].split(".")[:2])
+    if _torch_major_minor == ("2", "1") and _np_major >= 2:
+        raise RuntimeError(
+            f"Incompatible versions: torch=={torch.__version__} with numpy=={np.__version__}. "
+            "Pin numpy==1.26.4 or upgrade torch to >=2.3."
+        )
+except Exception as e:
+    # Log through the app logger so the reason shows up in the Space logs, then abort.
+    log.error(str(e))
+    raise
+# ---------- Paths (resolved against the process working directory at import time) ----------
+CURRENT_DIR = os.getcwd()
+CACHE_DIR = os.path.join(CURRENT_DIR, "cache")  # handed to from_pretrained(cache_dir=...) when models are loaded
+os.makedirs(CACHE_DIR, exist_ok=True)  # created eagerly at import; no error if it already exists
+# ---------- GPU info (logs only) ----------
+def print_nvidia_smi() -> None:
+    """Log the output of ``nvidia-smi`` for diagnostics only.
+
+    The result never influences program logic. The call is best-effort: a
+    missing binary, an OS-level failure to start it, or a hang (bounded by a
+    timeout) is logged at INFO level and otherwise ignored, so this helper
+    cannot block or abort application startup.
+    """
     try:
+        proc = subprocess.run(
+            ["nvidia-smi"], capture_output=True, text=True, check=False, timeout=15
+        )
     except FileNotFoundError:
+        log.info("nvidia-smi not found on PATH.")
+        return
+    except (OSError, subprocess.TimeoutExpired) as exc:
+        # e.g. permission denied on the binary, or a driver that never answers
+        log.info("nvidia-smi could not be run: %s", exc)
+        return
+    stdout = proc.stdout.strip()
+    if proc.returncode == 0 and stdout:
+        log.info("\n" + stdout)
+    else:
+        # Prefer stderr when it carries text; whitespace-only stderr falls back
+        # to the generic message instead of logging an empty line.
+        log.info(proc.stderr.strip() or "nvidia-smi not available or returned no output.")
 print_nvidia_smi()
+IS_GPU = torch.cuda.is_available()  # evaluated once at import; later device decisions key off this flag
+DEVICE = torch.device("cuda") if IS_GPU else torch.device("cpu")
+DTYPE = torch.float16 if IS_GPU else torch.float32  # half precision on GPU, full precision on CPU
+log.info(f"CUDA available: {IS_GPU} | device={DEVICE} | dtype={DTYPE}")
+# ---------- Torch perf knobs (GPU only, best-effort) ----------
+try:
+    if IS_GPU:
+        torch.backends.cuda.matmul.allow_tf32 = True  # safe perf on Ampere+
+        torch.set_float32_matmul_precision("high")
+except Exception:  # any failure is ignored so startup continues without these tweaks
+    pass
+# ---------- Helpers ----------
+def _variant_kwargs() -> dict:
+    """Return extra ``from_pretrained`` kwargs: the fp16 weight variant on GPU, nothing on CPU."""
+    if not IS_GPU:
+        return {}
+    return {"variant": "fp16"}
+def _cpu_safety_settings(pipe: DiffusionPipeline) -> None:
+    """Apply memory-saving settings for CPU inference.
+
+    Tries to enable VAE tiling on ``pipe`` to reduce RAM usage and avoid giant
+    VAE allocations. This is best-effort: if the call fails the reason is
+    logged at DEBUG level and the pipeline is left unchanged.
+    """
+    try:
+        pipe.enable_vae_tiling()
+    except Exception as exc:
+        # Deliberately non-fatal, but leave a trace instead of swallowing silently.
+        log.debug("VAE tiling not enabled on CPU pipeline: %s", exc)
+def _gpu_memory_efficiency(pipe: DiffusionPipeline) -> None:
+    """Enable memory-efficient attention on ``pipe`` when available.
+
+    Order of attempts:
+    1. xformers memory-efficient attention;
+    2. if that fails, attention slicing in "max" mode.
+    VAE tiling is attempted only when one of the two succeeded. Every step is
+    best-effort: failures are logged at DEBUG level and never raised, and the
+    optimisation that was applied is logged at INFO level.
+    """
+    enabled = False
+    try:
+        pipe.enable_xformers_memory_efficient_attention()
+        enabled = True
+        log.info("Enabled xformers memory-efficient attention.")
+    except Exception as xformers_exc:
+        log.debug("xformers attention unavailable: %s", xformers_exc)
+        try:
+            pipe.enable_attention_slicing("max")
+            enabled = True
+            log.info("Enabled attention slicing (max).")
+        except Exception as slicing_exc:
+            log.debug("Attention slicing unavailable: %s", slicing_exc)
+    if enabled:
+        try:
+            pipe.enable_vae_tiling()
+        except Exception as tiling_exc:
+            log.debug("VAE tiling not enabled on GPU pipeline: %s", tiling_exc)
+# ---------- Model loading ----------
+pipe: Optional[DiffusionPipeline] = None
+def load_pipeline() -> DiffusionPipeline:
+    """
+    Load the selected backend with sensible defaults.
+    - sdxl_lcm_unet: SDXL base + full LCM UNet (heavy, high VRAM)
+    - sdxl_lcm_lora: SDXL base + LCM-LoRA (light, recommended)
+    - ssd1b_lcm_lora: SSD-1B + LCM-LoRA (light)
+    Any other MODEL_BACKEND value falls back to sdxl_lcm_lora; a warning is
+    logged so a mistyped setting does not go unnoticed.
+
+    Returns the pipeline with its scheduler replaced by LCMScheduler, moved to
+    DEVICE, and with the GPU or CPU memory settings applied.
+    """
+    log.info(f"Loading model backend: {MODEL_BACKEND}")
+    if MODEL_BACKEND not in ("sdxl_lcm_unet", "sdxl_lcm_lora", "ssd1b_lcm_lora"):
+        log.warning(
+            "Unknown MODEL_BACKEND %r; falling back to 'sdxl_lcm_lora'.", MODEL_BACKEND
+        )
+    if MODEL_BACKEND == "sdxl_lcm_unet":
+        # Heavy: downloads ~10 GB UNet; best quality/speed on big GPUs
+        unet = UNet2DConditionModel.from_pretrained(
+            "latent-consistency/lcm-sdxl",
+            torch_dtype=DTYPE,
+            cache_dir=CACHE_DIR,
+            **_variant_kwargs(),
+        )
+        _pipe = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            unet=unet,
+            torch_dtype=DTYPE,
+            cache_dir=CACHE_DIR,
+            **_variant_kwargs(),
+        )
+    elif MODEL_BACKEND == "ssd1b_lcm_lora":
+        _pipe = AutoPipelineForText2Image.from_pretrained(
+            "segmind/SSD-1B",
+            torch_dtype=DTYPE,
+            cache_dir=CACHE_DIR,
+            **_variant_kwargs(),
+        )
+        # LoRA weights are merged into the base weights after loading.
+        _pipe.load_lora_weights("latent-consistency/lcm-lora-ssd-1b")
+        _pipe.fuse_lora()
+    else:
+        # Default & recommended: SDXL + LCM-LoRA (smaller downloads, good quality)
+        _pipe = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            torch_dtype=DTYPE,
+            cache_dir=CACHE_DIR,
+            **_variant_kwargs(),
+        )
+        _pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
+        _pipe.fuse_lora()
+    # Use LCM scheduler
+    _pipe.scheduler = LCMScheduler.from_config(_pipe.scheduler.config)
+    # Device & memory efficiency
+    _pipe.to(DEVICE)
+    if IS_GPU:
+        _gpu_memory_efficiency(_pipe)
+    else:
+        _cpu_safety_settings(_pipe)
+    log.info("Pipeline loaded.")
+    return _pipe
+# warmup lazily
+import threading  # needed only by the lazy loader below
+_PIPE_LOCK = threading.Lock()  # serialises the one-time pipeline load
+def ensure_pipe() -> DiffusionPipeline:
+    """Return the process-wide pipeline, loading it on first use.
+
+    The Generate handler allows several calls to run in parallel, so two first
+    requests could otherwise both observe ``pipe is None`` and each load the
+    full model. The lock makes the load happen exactly once; after that the
+    fast path returns the cached pipeline without taking the lock.
+    """
+    global pipe
+    if pipe is None:
+        with _PIPE_LOCK:
+            # Re-check under the lock: another thread may have finished loading
+            # while this one was waiting.
+            if pipe is None:
+                pipe = load_pipeline()
+    return pipe
+# ---------- HF Spaces GPU decorator (fixes “No @spaces.GPU function detected”) ----------
+try:
+    import spaces  # type: ignore
+    GPU_DECORATOR = spaces.GPU
+    log.info("`spaces` package detected. GPU-decorating inference function.")
+except Exception:  # import failed for any reason (e.g. package not installed): fall back to a pass-through
+    GPU_DECORATOR = lambda f: f  # no-op: returns the decorated function unchanged
+# ---------- Inference ----------
+# `gpu_dec` used to be bound by a walrus expression inside the decorator line
+# (syntax that requires Python >= 3.9); it is kept as a plain alias so the
+# module still exposes the name.
+gpu_dec = GPU_DECORATOR
+@GPU_DECORATOR
+def generate_image_internal(
     prompt: str,
     negative_prompt: str = "",
+    seed: Optional[int] = 0,
+    width: int = DEFAULT_SIZE,
+    height: int = DEFAULT_SIZE,
     guidance_scale: float = 0.0,
     num_inference_steps: int = 4,
 ) -> Image.Image:
+    """Run one text-to-image generation and return the first image.
+
+    Args:
+        prompt: Text description of the image.
+        negative_prompt: Text describing what to avoid.
+        seed: Seed for reproducible output; ``None`` draws a fresh random seed.
+        width: Requested width, clamped to [256, MAX_IMAGE_SIZE] and snapped
+            down to a multiple of 8.
+        height: Requested height, clamped and snapped like ``width``.
+        guidance_scale: Clamped to [0.0, 2.0].
+        num_inference_steps: Clamped to [1, 12].
+
+    Returns:
+        The first PIL image produced by the pipeline.
+    """
+    _pipe = ensure_pipe()
+    # Clamp to safe bounds
     width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
     height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
+    # The SDXL pipelines reject sizes that are not divisible by 8; API callers
+    # are not bound by the UI slider step, so snap down here.
+    width -= width % 8
+    height -= height % 8
+    num_inference_steps = int(np.clip(num_inference_steps, 1, 12))
+    guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))
+    generator = torch.Generator(device=DEVICE)
     if seed is not None:
         generator = generator.manual_seed(int(seed))
+    else:
+        # A fresh torch.Generator starts from a fixed default seed, so without
+        # this every "unseeded" call would return the same image.
+        generator.seed()
+    result = _pipe(
         prompt=prompt,
         negative_prompt=negative_prompt,
         width=width,
         height=height,
+        guidance_scale=guidance_scale,         # LCM prefers low/no guidance
         num_inference_steps=num_inference_steps,
         generator=generator,
         output_type="pil",
     )
+    return result.images[0]
+# thin wrapper that enforces the token (kept out of the GPU-decorated function)
+def generate(
+    prompt: str,
+    negative_prompt: str = "",
+    seed: int = 0,
+    width: int = DEFAULT_SIZE,
+    height: int = DEFAULT_SIZE,
+    guidance_scale: float = 0.0,
+    num_inference_steps: int = 4,
+    secret_token: str = "",
+) -> Image.Image:
+    """Validate the caller's secret token, then generate an image.
+
+    All generation arguments are forwarded unchanged to
+    ``generate_image_internal``.
+
+    Raises:
+        gr.Error: if ``secret_token`` does not equal SECRET_TOKEN.
+    """
+    import hmac  # stdlib; local import keeps this block self-contained
+    # The token arrives from untrusted clients: compare in constant time so the
+    # response time does not leak how many leading characters matched. Non-string
+    # values are rejected, exactly as the previous `!=` comparison did.
+    token_ok = isinstance(secret_token, str) and hmac.compare_digest(
+        secret_token.encode("utf-8", "surrogatepass"),
+        SECRET_TOKEN.encode("utf-8", "surrogatepass"),
+    )
+    if not token_ok:
+        raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
+    return generate_image_internal(
         prompt=prompt,
+        negative_prompt=negative_prompt,
+        seed=seed,
+        width=width,
+        height=height,
+        guidance_scale=guidance_scale,
+        num_inference_steps=num_inference_steps,
     )
+# ---------- Optional warmup at startup ----------
+def warmup():
+    """Load the pipeline and run one small generation ahead of the first request.
+
+    Any exception is caught and reported as a warning; nothing is raised.
+    """
+    warmup_request = dict(
+        prompt="A quick warmup prompt, minimal style",
+        seed=42,
+        width=512,
+        height=512,
+        num_inference_steps=2,
+    )
     try:
+        ensure_pipe()
+        generate_image_internal(**warmup_request)
+        log.info("Warmup complete.")
+    except Exception as exc:
+        log.warning(f"Warmup skipped or failed: {exc}")
+# Warm up only when WARMUP is "true" (the default) and a GPU is present,
+# so CPU-only startup is not blocked for long.
+if os.getenv("WARMUP", "true").lower() == "true" and IS_GPU:
+    warmup()
+# ---------- Gradio UI (v5) ----------
+def build_ui() -> gr.Blocks:
+    """Assemble the Blocks app: prompt inputs, generation controls, result image and a status line.
+
+    The Generate button calls ``generate`` with the eight inputs in the order
+    that function expects, limited to CONCURRENCY parallel runs.
+    """
+    with gr.Blocks(theme=gr.themes.Soft()) as app:
+        gr.Markdown("## Image Generator (LCM) — SDXL / SSD-1B")
+        # Row 1: text prompts
+        with gr.Row():
+            prompt_box = gr.Textbox(label="Prompt", lines=3, placeholder="Describe the image...")
+            negative_box = gr.Textbox(label="Negative Prompt", lines=2, placeholder="(optional)")
+        # Row 2: seed and output size
+        with gr.Row():
+            seed_input = gr.Number(label="Seed", value=0, precision=0)
+            width_slider = gr.Slider(
+                minimum=256, maximum=MAX_IMAGE_SIZE, value=DEFAULT_SIZE, step=32, label="Width"
+            )
+            height_slider = gr.Slider(
+                minimum=256, maximum=MAX_IMAGE_SIZE, value=DEFAULT_SIZE, step=32, label="Height"
+            )
+        # Row 3: sampling controls and the access token
+        with gr.Row():
+            guidance_slider = gr.Slider(
+                minimum=0.0, maximum=2.0, value=0.0, step=0.1, label="Guidance scale"
+            )
+            steps_slider = gr.Slider(minimum=1, maximum=12, value=4, step=1, label="Inference steps")
+            token_box = gr.Textbox(label="Secret Token", type="password", lines=1)
+        result_image = gr.Image(label="Result", height=DEFAULT_SIZE, width=DEFAULT_SIZE)
+        generate_button = gr.Button("Generate", variant="primary")
+        generate_button.click(
+            fn=generate,
+            inputs=[
+                prompt_box,
+                negative_box,
+                seed_input,
+                width_slider,
+                height_slider,
+                guidance_slider,
+                steps_slider,
+                token_box,
+            ],
+            outputs=result_image,
+            concurrency_limit=CONCURRENCY,
+        )
+        # Simple health info
+        status_line = (
+            f"*Backend:* `{MODEL_BACKEND}` &nbsp; | &nbsp; "
+            f"*Device:* `{DEVICE}` &nbsp; | &nbsp; "
+            f"*dtype:* `{DTYPE}`"
+        )
+        gr.Markdown(status_line)
+    return app
+# ---------- Launch ----------
+def main():
+    """Build the UI, enable the request queue and start the Gradio server.
+
+    The server binds to all interfaces (0.0.0.0) on PORT, with the API page
+    and error display enabled and public sharing disabled.
+    """
+    demo = build_ui()
+    # Queue for backpressure and concurrency control.
+    # `concurrency_count` was removed from Blocks.queue() in Gradio 4+ and
+    # raises when passed; `default_concurrency_limit` is its replacement and
+    # applies to event listeners that do not set their own `concurrency_limit`.
+    demo.queue(max_size=QUEUE_SIZE, default_concurrency_limit=CONCURRENCY)
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=PORT,
+        show_api=True,
+        ssr_mode=ENABLE_SSR,  # SSR off by default (can be flaky on Spaces)
+        share=False,
+        show_error=True,
+    )
+if __name__ == "__main__":
+    main()