ruslanmv committed
Commit a8d4cbb
1 Parent(s): 23c2f20

Update app.py

Files changed (1):
  app.py  +25 -12
app.py CHANGED
@@ -10,7 +10,7 @@ import subprocess
 from typing import Optional
 
 # ---------- Fast, safe defaults ----------
-os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster model downloads
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster model downloads
 os.environ.setdefault("DEEPSPEED_DISABLE_NVML", "1")  # silence NVML in headless envs
 os.environ.setdefault("BITSANDBYTES_NOWELCOME", "1")
 
@@ -126,7 +126,7 @@ def _build_pipeline_cpu() -> DiffusionPipeline:
     CPU-only startup environment. We'll move it to CUDA inside the GPU-decorated
     function per call and return it to CPU after.
     """
-    log.info(f"Loading model backend: {MODEL_BACKEND}")
+    log.info(f"Building pipeline for model backend: {MODEL_BACKEND}")
     if MODEL_BACKEND == "sdxl_lcm_unet":
         # SDXL base with LCM UNet (no LoRA required)
         unet = UNet2DConditionModel.from_pretrained(
@@ -150,7 +150,7 @@ def _build_pipeline_cpu() -> DiffusionPipeline:
         _p.load_lora_weights(
             "latent-consistency/lcm-lora-ssd-1b",
             adapter_name="lcm",
-            use_peft_backend=False,  # <-- avoid PEFT requirement
+            use_peft_backend=False,  # <-- avoid PEFT requirement
         )
         _p.fuse_lora()
     else:
@@ -163,7 +163,7 @@ def _build_pipeline_cpu() -> DiffusionPipeline:
         _p.load_lora_weights(
             "latent-consistency/lcm-lora-sdxl",
             adapter_name="lcm",
-            use_peft_backend=False,  # <-- avoid PEFT requirement
+            use_peft_backend=False,  # <-- avoid PEFT requirement
         )
         _p.fuse_lora()
 
@@ -174,10 +174,11 @@ def _build_pipeline_cpu() -> DiffusionPipeline:
     except Exception:
         pass
 
-    log.info("Pipeline built on CPU.")
+    log.info("Pipeline built successfully on CPU.")
     return _p
 
 def ensure_pipe() -> DiffusionPipeline:
+    """Initializes and returns the global pipeline object."""
     global pipe
     if pipe is None:
         pipe = _build_pipeline_cpu()
@@ -217,7 +218,7 @@ def generate(
     if secret_token != SECRET_TOKEN:
         raise gr.Error("Invalid secret token. Set SECRET_TOKEN or pass the correct token.")
 
-    _p = ensure_pipe()
+    _p = ensure_pipe()  # This will now return the pre-loaded pipe
 
     # Clamp user inputs for safety
     width = int(np.clip(width, 256, MAX_IMAGE_SIZE))
@@ -265,9 +266,11 @@
 
 # ---------- Optional warmup (CPU only for ZeroGPU) ----------
 def warmup():
+    """Performs a minimal inference on CPU to warm up the components."""
     try:
-        ensure_pipe()
-        _ = pipe(
+        # Ensure pipe is loaded, though it should be already by main()
+        _p = ensure_pipe()
+        _ = _p(
             prompt="minimal warmup",
             width=256,
             height=256,
@@ -276,13 +279,10 @@ def warmup():
             generator=torch.Generator(device="cpu").manual_seed(1),
             output_type="pil",
         ).images[0]
-        log.info("CPU warmup complete.")
+        log.info("CPU warmup inference complete.")
     except Exception as e:
         log.warning(f"Warmup skipped or failed: {e}")
 
-if WARMUP:
-    warmup()
-
 # ---------- Gradio UI (v5) ----------
 def build_ui() -> gr.Blocks:
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -318,9 +318,22 @@ def build_ui() -> gr.Blocks:
 
 # ---------- Launch ----------
 def main():
+    # --- FIX: Pre-load the model on startup ---
+    log.info("Application starting up. Pre-loading model...")
+    ensure_pipe()  # This will download and build the pipeline on the CPU
+    log.info("Model pre-loaded successfully.")
+
+    # --- Optional: Run a single inference on CPU if WARMUP is enabled ---
+    if WARMUP:
+        log.info("Warmup enabled. Running a test inference on CPU.")
+        warmup()
+
+    # --- Build and launch the Gradio UI ---
     demo = build_ui()
     # Gradio v5: queue() no longer accepts `concurrency_count`; use per-event limits.
     demo.queue(max_size=QUEUE_SIZE)
+
+    log.info("Starting Gradio server...")
     demo.launch(
         server_name="0.0.0.0",
         server_port=PORT,
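
The `_build_pipeline_cpu` docstring in the diff describes the ZeroGPU pattern: build the pipeline on CPU at startup, then move it to CUDA only inside the GPU-decorated handler and move it back afterwards. A minimal sketch of that pattern, assuming the `spaces` helper package used on ZeroGPU Spaces; the checkpoint id, `duration`, step count, and guidance value below are illustrative and not taken from this commit:

import spaces                      # helper package available on ZeroGPU Spaces
import torch
from diffusers import DiffusionPipeline

# Built once on CPU at startup (mirrors ensure_pipe() in app.py).
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # illustrative SDXL base checkpoint
    torch_dtype=torch.float16,
)

@spaces.GPU(duration=60)  # duration value is an assumption, not from the commit
def generate(prompt: str):
    try:
        pipe.to("cuda")   # borrow the GPU only for this call
        return pipe(prompt=prompt, num_inference_steps=4, guidance_scale=1.0).images[0]
    finally:
        pipe.to("cpu")    # hand the GPU back so other ZeroGPU calls can use it

Moving the weights back to CPU in a `finally` block keeps the GPU allocation short even when generation raises.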
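
The comment in `main()` about Gradio v5 dropping `concurrency_count` from `queue()` refers to per-event concurrency limits. A minimal sketch of that replacement, with a placeholder handler and illustrative limit values that are not part of this commit:

import gradio as gr

def echo(prompt: str) -> str:
    # Placeholder handler standing in for the app's generate() function.
    return prompt

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    output = gr.Textbox(label="Output")
    run = gr.Button("Run")
    # Gradio 4/5: the limit is set per event listener instead of on queue().
    run.click(echo, inputs=prompt, outputs=output, concurrency_limit=1)

demo.queue(max_size=20)  # queue() still controls how many requests may wait
demo.launch()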