ruslanmv committed
Commit 8888e64 · 1 Parent(s): 3025d81

Update app.py

Files changed (1)
  1. app.py +40 -17
app.py CHANGED
@@ -10,7 +10,7 @@ import subprocess
 from typing import Optional
 
 # ---------- Fast, safe defaults ----------
-os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster model downloads
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster model downloads
 os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1")  # silence NVML in headless envs
 os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")
 
@@ -184,7 +184,9 @@ def ensure_pipe() -> DiffusionPipeline:
         pipe = _build_pipeline_cpu()
     return pipe
 
-# ---------- Duration model for ZeroGPU (match decorated function signature) ----------
+# ---------- Cold-start aware duration estimator ----------
+GPU_COLD = True  # first GPU invocation will upload weights & warm kernels
+
 def _estimate_duration(prompt: str,
                        negative_prompt: str,
                        seed: int,
@@ -194,17 +196,28 @@ def _estimate_duration(prompt: str,
                        steps: int,
                        secret_token: str) -> int:
     """
-    Rough estimate (seconds) to inform ZeroGPU scheduler for better queuing.
-    Scale by pixel count and steps. Conservative upper bound.
+    ZeroGPU runtime budget (seconds).
+    Includes:
+      - model->GPU transfer + warmup (cold start tax)
+      - per-step cost scaled by resolution
     """
-    base = 3.0
+    # normalize size to 1024x1024 ~= 1.0
     px_scale = (max(256, width) * max(256, height)) / (1024 * 1024)
-    step_cost = 0.85  # ~0.85s/step @1024^2 (H200 slice; tune as needed)
-    est = base + steps * step_cost * max(0.5, px_scale)
-    return int(min(120, max(10, est)))
+
+    # conservative costs (tuned for SDXL+LCM on H200 slice)
+    cold_tax = 22.0 if GPU_COLD else 10.0  # seconds
+    step_cost = 1.2  # sec/step at 1024^2
+    base = 6.0  # misc overhead
+
+    est = base + cold_tax + steps * step_cost * max(0.5, px_scale)
+
+    # floors: bigger images need a higher minimum
+    floor = 45 if px_scale >= 1.0 else (30 if px_scale >= 0.5 else 20)
+
+    return int(min(120, max(floor, est)))
 
 # ---------- Public generate (token gate) ----------
-@spaces.GPU(duration=_estimate_duration)  # <- MUST decorate the function Gradio calls
+@spaces.GPU(duration=_estimate_duration)  # ZeroGPU uses this to schedule a GPU window
 def generate(
     prompt: str,
     negative_prompt: str = "",
@@ -218,7 +231,14 @@ def generate(
     if secret_token != SECRET_TOKEN:
         raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
 
-    _p = ensure_pipe()  # This will now return the pre-loaded pipe
+    # For logs: what window we asked ZeroGPU for (based on current cold/warm state)
+    try:
+        requested = _estimate_duration(prompt, negative_prompt, seed, width, height, guidance_scale, steps, secret_token)
+        log.info(f"ZeroGPU duration requested: {requested}s (cold={GPU_COLD}, size={width}x{height}, steps={steps})")
+    except Exception:
+        pass
+
+    _p = ensure_pipe()  # already built on CPU & cached weights on disk
 
     # Clamp user inputs for safety
     width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
@@ -239,6 +259,11 @@ def generate(
         log.warning(f"Falling back to CPU: {e}")
         _p.to("cpu", torch.float32)
 
+    # mark that we've done our cold GPU upload for this process
+    global GPU_COLD
+    if moved_to_cuda:
+        GPU_COLD = False
+
     try:
         device = "cuda" if moved_to_cuda else "cpu"
         gen = torch.Generator(device=device)
@@ -268,7 +293,6 @@ def generate(
 def warmup():
     """Performs a minimal inference on CPU to warm up the components."""
     try:
-        # Ensure pipe is loaded, though it should be already by main()
         _p = ensure_pipe()
         _ = _p(
             prompt="minimal warmup",
@@ -318,11 +342,11 @@ def build_ui() -> gr.Blocks:
 
 # ---------- Launch ----------
 def main():
-    # --- FIX: Pre-load the model on startup ---
-    log.info("Application starting up. Pre-loading model...")
-    ensure_pipe()  # This will download and build the pipeline on the CPU
+    # --- Pre-load the model on startup (downloads happen here, not in GPU window) ---
+    log.info("Application starting up. Pre-loading model on CPU...")
+    ensure_pipe()
     log.info("Model pre-loaded successfully.")
-
+
     # --- Optional: Run a single inference on CPU if WARMUP is enabled ---
     if WARMUP:
         log.info("Warmup enabled. Running a test inference on CPU.")
@@ -330,9 +354,8 @@ def main():
 
     # --- Build and launch the Gradio UI ---
    demo = build_ui()
-    # Gradio v5: queue() no longer accepts `concurrency_count`; use per-event limits.
     demo.queue(max_size=QUEUE_SIZE)
-
+
     log.info("Starting Gradio server...")
     demo.launch(
         server_name="0.0.0.0",
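For reference, the budget math introduced in the new _estimate_duration can be checked in isolation. The sketch below is a minimal standalone reproduction of the formula from this diff (the constants are copied from the commit; the sample resolutions and step counts are purely illustrative) and prints the cold-start and warm-start budgets side by side.

# Standalone check of the duration formula from this commit.
# Constants mirror the diff; the sample inputs are illustrative only.

def estimate(width: int, height: int, steps: int, cold: bool) -> int:
    px_scale = (max(256, width) * max(256, height)) / (1024 * 1024)
    cold_tax = 22.0 if cold else 10.0   # model->GPU upload + warmup
    step_cost = 1.2                     # sec/step at 1024x1024
    base = 6.0                          # misc overhead
    est = base + cold_tax + steps * step_cost * max(0.5, px_scale)
    floor = 45 if px_scale >= 1.0 else (30 if px_scale >= 0.5 else 20)
    return int(min(120, max(floor, est)))

if __name__ == "__main__":
    for w, h, steps in [(1024, 1024, 8), (768, 768, 8), (512, 512, 4)]:
        print(f"{w}x{h}, {steps} steps -> "
              f"cold: {estimate(w, h, steps, True)}s, "
              f"warm: {estimate(w, h, steps, False)}s")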
 
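The commit's broader pattern, building the pipeline on CPU at startup, moving it to CUDA only inside the GPU-decorated handler, falling back to CPU if the move fails, and clearing the cold-start flag after the first successful move, can be sketched independently of diffusers. The outline below is an assumption-laden sketch using a plain torch module: load_model, handler, and the placeholder model are hypothetical stand-ins for the app's real objects, and the @spaces.GPU(duration=...) decorator from the diff is omitted so the snippet runs anywhere.

import torch
import torch.nn as nn

GPU_COLD = True   # first CUDA move in this process pays the upload cost
_model = None     # cached CPU-resident model

def load_model() -> nn.Module:
    """Build (or return the cached) model on CPU so downloads happen at startup."""
    global _model
    if _model is None:
        _model = nn.Linear(16, 16)   # placeholder for the real pipeline
    return _model

def handler(x: torch.Tensor) -> torch.Tensor:
    """Per-request entry point: try GPU, fall back to CPU, then run inference."""
    global GPU_COLD
    model = load_model()
    moved_to_cuda = False
    try:
        if torch.cuda.is_available():
            model.to("cuda", torch.float16)
            moved_to_cuda = True
    except Exception:
        model.to("cpu", torch.float32)   # CPU fallback, mirroring the diff
    if moved_to_cuda:
        GPU_COLD = False                 # later calls can request the smaller warm budget
    device = "cuda" if moved_to_cuda else "cpu"
    return model(x.to(device, model.weight.dtype))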