ruslanmv committed on
Commit
20ea5a4
·
1 Parent(s): 35220ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -47
app.py CHANGED
@@ -51,8 +51,9 @@ from diffusers import (
51
 
52
  # ---------- ZeroGPU decorator (works even off-Spaces) ----------
53
  try:
54
- import spaces # real decorator on Spaces
55
- except Exception:
 
56
  class _DummySpaces:
57
  def GPU(self, *args, **kwargs):
58
  def _wrap(f):
@@ -96,7 +97,7 @@ pipe: Optional[DiffusionPipeline] = None
96
 
97
  def _gpu_mem_efficiency(p: DiffusionPipeline) -> None:
98
  """Enable memory-efficient attention and VAE tiling where possible."""
99
- enabled = False
100
  try:
101
  p.enable_xformers_memory_efficient_attention()
102
  enabled = True
@@ -177,7 +178,8 @@ def _estimate_duration(prompt: str,
177
  width: int,
178
  height: int,
179
  guidance_scale: float,
180
- steps: int) -> int:
 
181
  """
182
  Rough estimate (seconds) to inform ZeroGPU scheduler for better queuing.
183
  Scale by pixel count and steps. Conservative upper bound.
@@ -188,32 +190,46 @@ def _estimate_duration(prompt: str,
188
  est = base + steps * step_cost * max(0.5, px_scale)
189
  return int(min(120, max(10, est)))
190
 
191
- # ---------- GPU-decorated inference (Spaces detects this) ----------
192
- @spaces.GPU(duration=_estimate_duration) # no-op outside Spaces
193
- def _generate_gpu_call(
194
  prompt: str,
195
- negative_prompt: str,
196
- seed: Optional[int],
197
- width: int,
198
- height: int,
199
- guidance_scale: float,
200
- steps: int,
 
201
  ) -> Image.Image:
202
- """
203
- Runs under a ZeroGPU-allocated context. We move the pipeline to CUDA at the
204
- start and back to CPU at the end so that it remains usable when GPU is released.
205
- """
206
  _p = ensure_pipe()
207
- _p.to("cuda", torch.float16)
208
- _gpu_mem_efficiency(_p)
209
 
 
 
 
 
 
 
 
 
210
  try:
211
- width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
212
- height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
213
- steps = int(np.clip(steps, 1, 12))
214
- guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))
 
 
 
 
 
215
 
216
- gen = torch.Generator(device="cuda")
 
 
217
  if seed is not None:
218
  gen = gen.manual_seed(int(seed))
219
 
@@ -229,35 +245,13 @@ def _generate_gpu_call(
229
  )
230
  return out.images[0]
231
  finally:
 
232
  try:
233
  _p.to("cpu", torch.float32)
234
  _p.enable_vae_tiling()
235
  except Exception:
236
  pass
237
 
238
- # ---------- Public generate (token gate) ----------
239
- def generate(
240
- prompt: str,
241
- negative_prompt: str = "",
242
- seed: int = 0,
243
- width: int = DEFAULT_SIZE,
244
- height: int = DEFAULT_SIZE,
245
- guidance_scale: float = 0.0,
246
- num_inference_steps: int = 4,
247
- secret_token: str = "",
248
- ) -> Image.Image:
249
- if secret_token != SECRET_TOKEN:
250
- raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
251
- return _generate_gpu_call(
252
- prompt=prompt,
253
- negative_prompt=negative_prompt,
254
- seed=seed,
255
- width=width,
256
- height=height,
257
- guidance_scale=guidance_scale,
258
- steps=num_inference_steps,
259
- )
260
-
261
  # ---------- Optional warmup (CPU only for ZeroGPU) ----------
262
  def warmup():
263
  try:
 
51
 
52
  # ---------- ZeroGPU decorator (works even off-Spaces) ----------
53
  try:
54
+ import spaces # real decorator on HF Spaces
55
+ except ImportError:
56
+ # Local/dev fallback: no-op decorator so app still runs without ZeroGPU
57
  class _DummySpaces:
58
  def GPU(self, *args, **kwargs):
59
  def _wrap(f):
 
97
 
98
  def _gpu_mem_efficiency(p: DiffusionPipeline) -> None:
99
  """Enable memory-efficient attention and VAE tiling where possible."""
100
+ enabled = false_flag = False
101
  try:
102
  p.enable_xformers_memory_efficient_attention()
103
  enabled = True
 
178
  width: int,
179
  height: int,
180
  guidance_scale: float,
181
+ steps: int,
182
+ secret_token: str) -> int:
183
  """
184
  Rough estimate (seconds) to inform ZeroGPU scheduler for better queuing.
185
  Scale by pixel count and steps. Conservative upper bound.
 
190
  est = base + steps * step_cost * max(0.5, px_scale)
191
  return int(min(120, max(10, est)))
192
 
193
+ # ---------- Public generate (token gate) ----------
194
+ @spaces.GPU(duration=_estimate_duration) # <- MUST decorate the function Gradio calls
195
+ def generate(
196
  prompt: str,
197
+ negative_prompt: str = "",
198
+ seed: int = 0,
199
+ width: int = DEFAULT_SIZE,
200
+ height: int = DEFAULT_SIZE,
201
+ guidance_scale: float = 0.0,
202
+ steps: int = 4,
203
+ secret_token: str = "",
204
  ) -> Image.Image:
205
+ if secret_token != SECRET_TOKEN:
206
+ # Using gr.Error keeps the nice Gradio toast in UI
207
+ raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
208
+
209
  _p = ensure_pipe()
 
 
210
 
211
+ # Clamp user inputs for safety
212
+ width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
213
+ height = int(np.clip(height, 256, MAX_IMAGE_SIZE))
214
+ steps = int(np.clip(steps, 1, 12))
215
+ guidance_scale = float(np.clip(guidance_scale, 0.0, 2.0))
216
+
217
+ # Try to use CUDA when available (ZeroGPU will make it available inside this call)
218
+ moved_to_cuda = False
219
  try:
220
+ if torch.cuda.is_available():
221
+ _p.to("cuda", torch.float16)
222
+ _gpu_mem_efficiency(_p)
223
+ moved_to_cuda = True
224
+ else:
225
+ _p.to("cpu", torch.float32)
226
+ except Exception as e:
227
+ log.warning(f"Falling back to CPU: {e}")
228
+ _p.to("cpu", torch.float32)
229
 
230
+ try:
231
+ device = "cuda" if moved_to_cuda else "cpu"
232
+ gen = torch.Generator(device=device)
233
  if seed is not None:
234
  gen = gen.manual_seed(int(seed))
235
 
 
245
  )
246
  return out.images[0]
247
  finally:
248
+ # Return model to CPU so the GPU can be released immediately after call
249
  try:
250
  _p.to("cpu", torch.float32)
251
  _p.enable_vae_tiling()
252
  except Exception:
253
  pass
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  # ---------- Optional warmup (CPU only for ZeroGPU) ----------
256
  def warmup():
257
  try: