ruslanmv committed
Commit 35220ff · Parent: 74942a4

Update app.py

Files changed (1):
  app.py  (+20, -33)
app.py CHANGED
@@ -1,5 +1,5 @@
 # -------------------------------
-# AI Fast Image Server — ZeroGPU Ready
+# AI Fast Image Server — ZeroGPU Ready (Gradio 5)
 # -------------------------------
 
 from __future__ import annotations
@@ -7,7 +7,7 @@ import os
 import sys
 import logging
 import subprocess
-from typing import Optional, Callable
+from typing import Optional
 
 # ---------- Fast, safe defaults ----------
 os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster model downloads
@@ -55,7 +55,6 @@ try:
 except Exception:
     class _DummySpaces:
         def GPU(self, *args, **kwargs):
-            # identity decorator if not on Spaces
            def _wrap(f):
                return f
            return _wrap
@@ -112,17 +111,12 @@ def _gpu_mem_efficiency(p: DiffusionPipeline) -> None:
     except Exception:
         pass
     if enabled:
-        # faster matmul on Ampere+
         try:
             torch.backends.cuda.matmul.allow_tf32 = True
             torch.set_float32_matmul_precision("high")
         except Exception:
             pass
 
-def _variant_kwargs() -> dict:
-    # Use fp16 repo variants only when on GPU (avoid oddities on CPU)
-    return {"variant": "fp16"}
-
 def _build_pipeline_cpu() -> DiffusionPipeline:
     """
     Build the pipeline on CPU with float32 to keep it stable in ZeroGPU's
@@ -131,12 +125,10 @@ def _build_pipeline_cpu() -> DiffusionPipeline:
     """
     log.info(f"Loading model backend: {MODEL_BACKEND}")
     if MODEL_BACKEND == "sdxl_lcm_unet":
-        # Heavy: full LCM UNet (~10GB). Use only if you have big VRAM.
         unet = UNet2DConditionModel.from_pretrained(
             "latent-consistency/lcm-sdxl",
             torch_dtype=torch.float32,
             cache_dir=CACHE_DIR,
-            # no variant on CPU
         )
         _p = DiffusionPipeline.from_pretrained(
             "stabilityai/stable-diffusion-xl-base-1.0",
@@ -162,13 +154,10 @@ def _build_pipeline_cpu() -> DiffusionPipeline:
         _p.load_lora_weights("latent-consistency/lcm-lora-sdxl")
         _p.fuse_lora()
 
-    # Use LCM scheduler
     _p.scheduler = LCMScheduler.from_config(_p.scheduler.config)
-
-    # Stay on CPU by default (ZeroGPU will give us CUDA only during calls)
     _p.to("cpu", torch.float32)
     try:
-        _p.enable_vae_tiling()  # also fine on CPU
+        _p.enable_vae_tiling()
     except Exception:
         pass
 
@@ -181,23 +170,26 @@ def ensure_pipe() -> DiffusionPipeline:
         pipe = _build_pipeline_cpu()
     return pipe
 
-# ---------- Duration model for ZeroGPU ----------
-def _estimate_duration(prompt: str, negative_prompt: str, seed: int,
-                       width: int, height: int, guidance_scale: float, steps: int,
-                       secret_token: str) -> int:
+# ---------- Duration model for ZeroGPU (match decorated function signature) ----------
+def _estimate_duration(prompt: str,
+                       negative_prompt: str,
+                       seed: int,
+                       width: int,
+                       height: int,
+                       guidance_scale: float,
+                       steps: int) -> int:
     """
     Rough estimate (seconds) to inform ZeroGPU scheduler for better queuing.
     Scale by pixel count and steps. Conservative upper bound.
     """
-    base = 3.0  # pipeline dispatch + overhead
+    base = 3.0
     px_scale = (max(256, width) * max(256, height)) / (1024 * 1024)
     step_cost = 0.85  # ~0.85s/step @1024^2 (H200 slice; tune as needed)
    est = base + steps * step_cost * max(0.5, px_scale)
-    # Clamp between 10 and 120 seconds
    return int(min(120, max(10, est)))
 
 # ---------- GPU-decorated inference (Spaces detects this) ----------
-@spaces.GPU(duration=_estimate_duration)  # dynamic duration; no-op outside Spaces
+@spaces.GPU(duration=_estimate_duration)  # no-op outside Spaces
 def _generate_gpu_call(
     prompt: str,
     negative_prompt: str,
@@ -212,19 +204,15 @@ def _generate_gpu_call(
     start and back to CPU at the end so that it remains usable when GPU is released.
     """
     _p = ensure_pipe()
-
-    # Move to CUDA with half precision (safe with LCM)
     _p.to("cuda", torch.float16)
     _gpu_mem_efficiency(_p)
 
     try:
-        # Clamp inputs
         width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
         height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
         steps = int(np.clip(steps, 1, 12))
         guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))
 
-        # Deterministic generator on CUDA
         gen = torch.Generator(device="cuda")
         if seed is not None:
             gen = gen.manual_seed(int(seed))
@@ -234,21 +222,20 @@ def _generate_gpu_call(
             negative_prompt=negative_prompt,
             width=width,
             height=height,
-            guidance_scale=guidance_scale,  # LCM prefers low guidance
+            guidance_scale=guidance_scale,
             num_inference_steps=steps,
             generator=gen,
             output_type="pil",
         )
         return out.images[0]
     finally:
-        # Always return pipeline to CPU so next non-GPU context is safe
         try:
             _p.to("cpu", torch.float32)
             _p.enable_vae_tiling()
         except Exception:
             pass
 
-# ---------- Public generate (token gate kept outside GPU context) ----------
+# ---------- Public generate (token gate) ----------
 def generate(
     prompt: str,
     negative_prompt: str = "",
@@ -261,7 +248,6 @@ def generate(
 ) -> Image.Image:
     if secret_token != SECRET_TOKEN:
         raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
-
     return _generate_gpu_call(
         prompt=prompt,
         negative_prompt=negative_prompt,
@@ -272,11 +258,10 @@ def generate(
         steps=num_inference_steps,
     )
 
-# ---------- Optional warmup (CPU only by default for ZeroGPU) ----------
+# ---------- Optional warmup (CPU only for ZeroGPU) ----------
 def warmup():
     try:
         ensure_pipe()
-        # Tiny CPU warmup to load weights into RAM/cache
         _ = pipe(
             prompt="minimal warmup",
             width=256,
@@ -316,6 +301,7 @@ def build_ui() -> gr.Blocks:
         run = gr.Button("Generate", variant="primary")
 
         inputs = [prompt, negative, seed, width, height, guidance, steps, token]
+        # Per-event concurrency control (Gradio v5)
         run.click(fn=generate, inputs=inputs, outputs=out, concurrency_limit=CONCURRENCY)
 
         gr.Markdown(
@@ -328,12 +314,13 @@ def build_ui() -> gr.Blocks:
 # ---------- Launch ----------
 def main():
     demo = build_ui()
-    demo.queue(max_size=QUEUE_SIZE, concurrency_count=CONCURRENCY)
+    # Gradio v5: queue() no longer accepts `concurrency_count`; use per-event limits.
+    demo.queue(max_size=QUEUE_SIZE)
     demo.launch(
         server_name="0.0.0.0",
         server_port=PORT,
         show_api=True,
-        ssr_mode=ENABLE_SSR,  # Off by default; turn on with ENABLE_SSR=true if needed
+        ssr_mode=ENABLE_SSR,  # Off by default; enable with ENABLE_SSR=true if needed
         share=False,
         show_error=True,
     )
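For context on the `_estimate_duration` change: ZeroGPU supports a dynamic duration, where `spaces.GPU(duration=...)` is given a callable that is invoked with the same arguments as the decorated function and returns the number of seconds to reserve. The old estimator still listed `secret_token`, which `_generate_gpu_call` never receives; the commit aligns the parameter list and keeps the token check in `generate()`, outside the GPU context. A minimal sketch of the pattern, using illustrative names (`estimate_duration`, `run`) rather than the app's own functions:

try:
    import spaces  # provided on Hugging Face Spaces (ZeroGPU)
except ImportError:
    class _DummySpaces:  # identity decorator off-Spaces, mirroring app.py's fallback
        def GPU(self, *args, **kwargs):
            def _wrap(f):
                return f
            return _wrap
    spaces = _DummySpaces()

def estimate_duration(prompt: str, steps: int) -> int:
    # Must accept the same parameters as the decorated function below.
    return int(min(120, max(10, 3 + 0.85 * steps)))

@spaces.GPU(duration=estimate_duration)
def run(prompt: str, steps: int) -> str:
    # GPU work would go here; ZeroGPU reserves roughly estimate_duration(prompt, steps) seconds.
    return f"{prompt!r} rendered in {steps} steps"

Outside Spaces the dummy decorator makes `spaces.GPU` a no-op, matching the fallback already present in app.py.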
 
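The `demo.queue(...)` change reflects the queue API in Gradio 4 and later (including the Gradio 5 this commit targets): `queue()` no longer accepts `concurrency_count`; concurrency is set per event with `concurrency_limit` (or globally with `default_concurrency_limit`), while `queue(max_size=...)` still caps how many requests wait in line. A minimal sketch with placeholder components, not the app's real UI:

import gradio as gr

def echo(text: str) -> str:
    return text

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Prompt")
    out = gr.Textbox(label="Result")
    btn = gr.Button("Run")
    # Per-event concurrency replaces the old queue(concurrency_count=...) knob.
    btn.click(fn=echo, inputs=inp, outputs=out, concurrency_limit=2)

demo.queue(max_size=20)  # queue() now only controls queue-level settings such as max_size

if __name__ == "__main__":
    demo.launch()

Keeping the limit on the `click` event mirrors what app.py now does with `concurrency_limit=CONCURRENCY`.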