Spaces:

zerogpu-aoti
/

wan2-2-fp8da-aoti

Running on Zero

App Files Files Community

linoyts HF Staff committed on Aug 1

Commit

048bf77

verified ·

1 Parent(s): a4b1327

Update app.py (#2)

Browse files

- Update app.py (cea01700bf2260c5aacdd47f80e0e5810abb0cde)
- Update optimization.py (201ea86637fa9ad7fb65d35aa467abe396108f3c)

Files changed (2) hide show

app.py +82 -37
optimization.py +17 -0

app.py CHANGED Viewed

@@ -13,19 +13,22 @@ import tempfile
 import numpy as np
 from PIL import Image
 import random
 from optimization import optimize_pipeline_
-MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
 LANDSCAPE_WIDTH = 832
 LANDSCAPE_HEIGHT = 480
 MAX_SEED = np.iinfo(np.int32).max
-FIXED_FPS = 24
 MIN_FRAMES_MODEL = 8
-MAX_FRAMES_MODEL = 121
 pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
@@ -42,6 +45,39 @@ pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
     torch_dtype=torch.bfloat16,
 ).to('cuda')
 optimize_pipeline_(pipe,
     image=Image.new('RGB', (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT)),
@@ -53,7 +89,7 @@ optimize_pipeline_(pipe,
 default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
-default_negative_prompt = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
 def resize_image(image: Image.Image) -> Image.Image:
@@ -82,8 +118,9 @@ def get_duration(
     input_image,
     prompt,
     negative_prompt,
-    num_frames,
     guidance_scale,
     steps,
     seed,
     randomize_seed,
@@ -96,29 +133,32 @@ def generate_video(
     input_image,
     prompt,
     negative_prompt=default_negative_prompt,
-    num_frames = MAX_FRAMES_MODEL,
-    guidance_scale = 3.5,
-    steps = 28,
     seed = 42,
     randomize_seed = False,
     progress=gr.Progress(track_tqdm=True),
 ):
     """
-    Generate a video from an input image using the Wan 2.1 I2V model with CausVid LoRA.
     This function takes an input image and generates a video animation based on the provided
-    prompt and parameters. It uses the Wan 2.1 14B Image-to-Video model with CausVid LoRA
-    for fast generation in 4-8 steps.
     Args:
         input_image (PIL.Image): The input image to animate. Will be resized to target dimensions.
         prompt (str): Text prompt describing the desired animation or motion.
         negative_prompt (str, optional): Negative prompt to avoid unwanted elements.
             Defaults to default_negative_prompt (contains unwanted visual artifacts).
-        num_frames (int, optional): Number of frames.
-            Defaults to MAX_FRAMES_MODEL
         guidance_scale (float, optional): Controls adherence to the prompt. Higher values = more adherence.
             Defaults to 1.0. Range: 0.0-20.0.
         steps (int, optional): Number of inference steps. More steps = higher quality but slower.
             Defaults to 4. Range: 1-30.
         seed (int, optional): Random seed for reproducible results. Defaults to 42.
@@ -137,23 +177,27 @@ def generate_video(
     Note:
         - The function automatically resizes the input image to the target dimensions
         - Output dimensions are adjusted to be multiples of MOD_VALUE (32)
         - The function uses GPU acceleration via the @spaces.GPU decorator
     """
-    if input_image is None:
-        raise gr.Error("Please upload an input image.")
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
-    resized_image = resize_image(input_image)
     output_frames_list = pipe(
-        image=resized_image,
         prompt=prompt,
         negative_prompt=negative_prompt,
-        height=resized_image.height,
-        width=resized_image.width,
         num_frames=num_frames,
         guidance_scale=float(guidance_scale),
         num_inference_steps=int(steps),
         generator=torch.Generator(device="cuda").manual_seed(current_seed),
     ).frames[0]
@@ -166,20 +210,21 @@ def generate_video(
     return video_path, current_seed
 with gr.Blocks() as demo:
-    gr.Markdown("# Fast 4 steps Wan 2.1 I2V (14B) with CausVid LoRA")
-    gr.Markdown("[CausVid](https://github.com/tianweiy/CausVid) is a distilled version of Wan 2.1 to run faster in just 4-8 steps, [extracted as LoRA by Kijai](https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors) and is compatible with 🧨 diffusers")
     with gr.Row():
         with gr.Column():
-            input_image_component = gr.Image(type="pil", label="Input Image (auto-resized to target H/W)")
             prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
-            num_frames_input = gr.Slider(minimum=MIN_FRAMES_MODEL, maximum=MAX_FRAMES_MODEL, step=1, value=MAX_FRAMES_MODEL, label="Frames")
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                 seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                 randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
-                steps_slider = gr.Slider(minimum=1, maximum=40, step=1, value=28, label="Inference Steps")
-                guidance_scale_input = gr.Slider(minimum=0.0, maximum=20.0, step=0.5, value=1.0, label="Guidance Scale", visible=False)
             generate_button = gr.Button("Generate Video", variant="primary")
         with gr.Column():
@@ -187,20 +232,20 @@ with gr.Blocks() as demo:
     ui_inputs = [
         input_image_component, prompt_input,
-        negative_prompt_input, num_frames_input,
-        guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox
     ]
     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
-    gr.Examples(
-        examples=[
-            [
-                "wan_i2v_input.JPG",
-                "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside.",
-            ],
-        ],
-        inputs=[input_image_component, prompt_input], outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
-    )
 if __name__ == "__main__":
     demo.queue().launch(mcp_server=True)

 import numpy as np
 from PIL import Image
 import random
+import gc
 from optimization import optimize_pipeline_
+MODEL_ID = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
 LANDSCAPE_WIDTH = 832
 LANDSCAPE_HEIGHT = 480
 MAX_SEED = np.iinfo(np.int32).max
+FIXED_FPS = 16
 MIN_FRAMES_MODEL = 8
+MAX_FRAMES_MODEL = 81
+MIN_DURATION = round(MIN_FRAMES_MODEL/FIXED_FPS,1)
+MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
 pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
     torch_dtype=torch.bfloat16,
 ).to('cuda')
+# load, fuse, unload before compilation
+# pipe.load_lora_weights(
+#    "vrgamedevgirl84/Wan14BT2VFusioniX",
+#    weight_name="FusionX_LoRa/Phantom_Wan_14B_FusionX_LoRA.safetensors",
+#     adapter_name="phantom"
+# )
+# pipe.set_adapters(["phantom"], adapter_weights=[0.95])
+# pipe.fuse_lora(adapter_names=["phantom"], lora_scale=1.0)
+# pipe.unload_lora_weights()
+# pipe.load_lora_weights(
+#    "vrgamedevgirl84/Wan14BT2VFusioniX",
+#    weight_name="FusionX_LoRa/Phantom_Wan_14B_FusionX_LoRA.safetensors",
+#     adapter_name="phantom"
+# )
+# kwargs = {}
+# kwargs["load_into_transformer_2"] = True
+# pipe.load_lora_weights(
+#    "vrgamedevgirl84/Wan14BT2VFusioniX",
+#    weight_name="FusionX_LoRa/Phantom_Wan_14B_FusionX_LoRA.safetensors",
+#     adapter_name="phantom_2", **kwargs
+# )
+# pipe.set_adapters(["phantom", "phantom_2"], adapter_weights=[1., 1.])
+# pipe.fuse_lora(adapter_names=["phantom"], lora_scale=3., components=["transformer"])
+# pipe.fuse_lora(adapter_names=["phantom_2"], lora_scale=1., components=["transformer_2"])
+# pipe.unload_lora_weights()
+for i in range(3):
+    gc.collect()
+    torch.cuda.synchronize()
+    torch.cuda.empty_cache()
 optimize_pipeline_(pipe,
     image=Image.new('RGB', (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT)),
 default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
+default_negative_prompt = "色调艳丽, 过曝, 静态, 细节模糊不清, 字幕, 风格, 作品, 画作, 画面, 静止, 整体发灰, 最差质量, 低质量, JPEG压缩残留, 丑陋的, 残缺的, 多余的手指, 画得不好的手部, 画得不好的脸部, 畸形的, 毁容的, 形态畸形的肢体, 手指融合, 静止不动的画面, 杂乱的背景, 三条腿, 背景人很多, 倒着走"
 def resize_image(image: Image.Image) -> Image.Image:
     input_image,
     prompt,
     negative_prompt,
+    duration_seconds,
     guidance_scale,
+    guidance_scale_2,
     steps,
     seed,
     randomize_seed,
     input_image,
     prompt,
     negative_prompt=default_negative_prompt,
+    duration_seconds = MAX_DURATION,
+    guidance_scale = 1,
+    guidance_scale_2 = 3,
+    steps = 6,
     seed = 42,
     randomize_seed = False,
     progress=gr.Progress(track_tqdm=True),
 ):
     """
+    Generate a video from an input image using the Wan 2.2 14B I2V model with Phantom LoRA.
     This function takes an input image and generates a video animation based on the provided
+    prompt and parameters. It uses an FP8 quantized Wan 2.2 14B Image-to-Video model with Phantom LoRA
+    for fast generation in 6-8 steps.
     Args:
         input_image (PIL.Image): The input image to animate. Will be resized to target dimensions.
         prompt (str): Text prompt describing the desired animation or motion.
         negative_prompt (str, optional): Negative prompt to avoid unwanted elements.
             Defaults to default_negative_prompt (contains unwanted visual artifacts).
+        duration_seconds (float, optional): Duration of the generated video in seconds.
+            Defaults to MAX_DURATION. Clamped between MIN_FRAMES_MODEL/FIXED_FPS and MAX_FRAMES_MODEL/FIXED_FPS.
         guidance_scale (float, optional): Controls adherence to the prompt. Higher values = more adherence.
             Defaults to 1.0. Range: 0.0-20.0.
+        guidance_scale_2 (float, optional): Guidance scale for the low noise stage. Higher values = more adherence to the prompt.
+            Defaults to 3. UI range: 0.0-10.0.
         steps (int, optional): Number of inference steps. More steps = higher quality but slower.
             Defaults to 6. Range: 1-30.
         seed (int, optional): Random seed for reproducible results. Defaults to 42.
     Note:
         - The function automatically resizes the input image to the target dimensions
+        - Frame count is calculated as duration_seconds * FIXED_FPS (16), clamped to MIN_FRAMES_MODEL-MAX_FRAMES_MODEL
         - Output dimensions are adjusted to be multiples of MOD_VALUE (32)
         - The function uses GPU acceleration via the @spaces.GPU decorator
+        - Generation time varies based on steps and duration (see get_duration function)
     """
+    # if input_image is None:
+    #     raise gr.Error("Please upload an input image.")
+    num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
+    # resized_image = resize_image(input_image)
     output_frames_list = pipe(
+        #image=resized_image,
         prompt=prompt,
         negative_prompt=negative_prompt,
+        height=480,
+        width=832,
         num_frames=num_frames,
         guidance_scale=float(guidance_scale),
+        guidance_scale_2=float(guidance_scale_2),
         num_inference_steps=int(steps),
         generator=torch.Generator(device="cuda").manual_seed(current_seed),
     ).frames[0]
     return video_path, current_seed
 with gr.Blocks() as demo:
+    gr.Markdown("# Fast 6 steps Wan 2.2 I2V (14B) with Phantom LoRA")
+    gr.Markdown("run Wan 2.2 in just 6-8 steps, with [FusionX Phantom LoRA by DeeJayT](https://huggingface.co/vrgamedevgirl84/Wan14BT2VFusioniX/tree/main/FusionX_LoRa), compatible with 🧨 diffusers")
     with gr.Row():
         with gr.Column():
+            input_image_component = gr.Image(type="pil", label="Input Image (auto-resized to target H/W)", visible=False)
             prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
+            duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=MAX_DURATION, label="Duration (seconds)", info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                 seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                 randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
+                steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Inference Steps")
+                guidance_scale_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale - high noise stage")
+                guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=3, label="Guidance Scale 2 - low noise stage")
             generate_button = gr.Button("Generate Video", variant="primary")
         with gr.Column():
     ui_inputs = [
         input_image_component, prompt_input,
+        negative_prompt_input, duration_seconds_input,
+        guidance_scale_input, guidance_scale_2_input, steps_slider, seed_input, randomize_seed_checkbox
     ]
     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
+    # gr.Examples(
+    #     examples=[
+    #         [
+    #             "wan_i2v_input.JPG",
+    #             "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside.",
+    #         ],
+    #     ],
+    #     inputs=[input_image_component, prompt_input], outputs=[video_output, seed_input], fn=generate_video, cache_examples="lazy"
+    # )
 if __name__ == "__main__":
     demo.queue().launch(mcp_server=True)

optimization.py CHANGED Viewed

@@ -36,6 +36,23 @@ def optimize_pipeline_(pipeline: Callable[P, Any], *args: P.args, **kwargs: P.kw
     @spaces.GPU(duration=1500)
     def compile_transformer():
         with capture_component_call(pipeline, 'transformer') as call:
             pipeline(*args, **kwargs)

     @spaces.GPU(duration=1500)
     def compile_transformer():
+        pipeline.load_lora_weights(
+           "vrgamedevgirl84/Wan14BT2VFusioniX",
+           weight_name="FusionX_LoRa/Phantom_Wan_14B_FusionX_LoRA.safetensors",
+            adapter_name="phantom"
+        )
+        kwargs_lora = {}
+        kwargs_lora["load_into_transformer_2"] = True
+        pipeline.load_lora_weights(
+           "vrgamedevgirl84/Wan14BT2VFusioniX",
+           weight_name="FusionX_LoRa/Phantom_Wan_14B_FusionX_LoRA.safetensors",
+            adapter_name="phantom_2", **kwargs_lora
+        )
+        pipeline.set_adapters(["phantom", "phantom_2"], adapter_weights=[1., 1.])
+        pipeline.fuse_lora(adapter_names=["phantom"], lora_scale=3., components=["transformer"])
+        pipeline.fuse_lora(adapter_names=["phantom_2"], lora_scale=1., components=["transformer_2"])
+        pipeline.unload_lora_weights()
         with capture_component_call(pipeline, 'transformer') as call:
             pipeline(*args, **kwargs)