Files changed (1)
  1. app.py +168 -42
app.py CHANGED
@@ -10,6 +10,8 @@ from PIL import Image
 import random
 import numpy as np
 import spaces
+import cv2
+import tempfile
 
 import wan
 from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
@@ -61,7 +63,52 @@ pipeline = wan.WanTI2V(
 )
 print("Pipeline initialized and ready.")
 
-# --- Helper Functions (from Wan 2.1 Fast demo) ---
+# --- Helper Functions ---
+
+def extract_first_frame_from_video(video_path):
+    """
+    Extract the first frame from a video file.
+
+    Args:
+        video_path: Path to the video file
+
+    Returns:
+        PIL Image of the first frame, or None if extraction fails
+    """
+    try:
+        cap = cv2.VideoCapture(video_path)
+        ret, frame = cap.read()
+        cap.release()
+
+        if ret:
+            # Convert BGR to RGB
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            return Image.fromarray(frame_rgb)
+        return None
+    except Exception as e:
+        print(f"Error extracting frame from video: {e}")
+        return None
+
+def get_video_dimensions(video_path):
+    """
+    Get the dimensions of a video file.
+
+    Args:
+        video_path: Path to the video file
+
+    Returns:
+        Tuple of (width, height) or None if extraction fails
+    """
+    try:
+        cap = cv2.VideoCapture(video_path)
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        cap.release()
+        return width, height
+    except Exception as e:
+        print(f"Error getting video dimensions: {e}")
+        return None
+
 def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area,
                                   min_slider_h, max_slider_h,
                                   min_slider_w, max_slider_w,
@@ -83,38 +130,65 @@ def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area,
 
     return new_h, new_w
 
-def handle_image_upload_for_dims_wan(uploaded_pil_image, current_h_val, current_w_val):
+def handle_media_upload_for_dims_wan(uploaded_media, current_h_val, current_w_val):
     """
-    Handle image upload and calculate appropriate dimensions for video generation.
+    Handle image or video upload and calculate appropriate dimensions.
 
     Args:
-        uploaded_pil_image: The uploaded image (PIL Image or numpy array)
+        uploaded_media: The uploaded file (can be image or video path)
         current_h_val: Current height slider value
         current_w_val: Current width slider value
 
     Returns:
-        Tuple of gr.update objects for height and width sliders
+        Tuple of (gr.update for height, gr.update for width, first frame as numpy array or None)
     """
-    if uploaded_pil_image is None:
-        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
+    if uploaded_media is None:
+        return (gr.update(value=DEFAULT_H_SLIDER_VALUE),
+                gr.update(value=DEFAULT_W_SLIDER_VALUE),
+                None)
+
     try:
-        # Convert numpy array to PIL Image if needed
-        if hasattr(uploaded_pil_image, 'shape'): # numpy array
-            pil_image = Image.fromarray(uploaded_pil_image).convert("RGB")
-        else: # already PIL Image
-            pil_image = uploaded_pil_image
-
+        pil_image = None
+
+        # Check if it's a video file
+        if isinstance(uploaded_media, str) and uploaded_media.lower().endswith(('.mp4', '.avi', '.mov', '.mkv', '.webm')):
+            # Extract first frame from video
+            pil_image = extract_first_frame_from_video(uploaded_media)
+            if pil_image is None:
+                gr.Warning("Could not extract frame from video")
+                return (gr.update(value=DEFAULT_H_SLIDER_VALUE),
+                        gr.update(value=DEFAULT_W_SLIDER_VALUE),
+                        None)
+        else:
+            # Handle as image
+            if hasattr(uploaded_media, 'shape'): # numpy array
+                pil_image = Image.fromarray(uploaded_media).convert("RGB")
+            elif isinstance(uploaded_media, str): # file path
+                pil_image = Image.open(uploaded_media).convert("RGB")
+            else: # PIL Image
+                pil_image = uploaded_media
+
+        # Calculate dimensions
         new_h, new_w = _calculate_new_dimensions_wan(
             pil_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
            SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
            DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE
        )
-        return gr.update(value=new_h), gr.update(value=new_w)
+
+        # Convert PIL image to numpy array for display
+        display_image = np.array(pil_image)
+
+        return gr.update(value=new_h), gr.update(value=new_w), display_image
+
     except Exception as e:
-        gr.Warning("Error attempting to calculate new dimensions")
-        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
+        print(f"Error in handle_media_upload_for_dims_wan: {e}")
+        gr.Warning("Error processing uploaded file")
+        return (gr.update(value=DEFAULT_H_SLIDER_VALUE),
+                gr.update(value=DEFAULT_W_SLIDER_VALUE),
+                None)
 
-def get_duration(image,
+def get_duration(video_input,
+                 image_preview,
                  prompt,
                  height,
                  width,
@@ -130,7 +204,8 @@ def get_duration(image,
 # --- 2. Gradio Inference Function ---
 @spaces.GPU(duration=get_duration)
 def generate_video(
-    image,
+    video_input,
+    image_preview,
     prompt,
     height,
    width,
@@ -142,10 +217,11 @@
     progress=gr.Progress(track_tqdm=True)
 ):
     """
-    Generate a video from text prompt and optional image using the Wan 2.2 TI2V model.
+    Generate a video from text prompt and optional image/video using the Wan 2.2 TI2V model.
 
     Args:
-        image: Optional input image (numpy array) for image-to-video generation
+        video_input: Optional input video file path
+        image_preview: Preview image (numpy array) extracted from video or uploaded image
         prompt: Text prompt describing the desired video
         height: Target video height in pixels
         width: Target video width in pixels
@@ -167,9 +243,21 @@
     target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
 
     input_image = None
-    if image is not None:
-        input_image = Image.fromarray(image).convert("RGB")
-        # Resize image to match target dimensions
+
+    # Process video input if provided
+    if video_input is not None:
+        if isinstance(video_input, str) and video_input.lower().endswith(('.mp4', '.avi', '.mov', '.mkv', '.webm')):
+            input_image = extract_first_frame_from_video(video_input)
+        else:
+            # Fallback to image preview
+            if image_preview is not None:
+                input_image = Image.fromarray(image_preview).convert("RGB")
+    elif image_preview is not None:
+        # Use image preview if no video input
+        input_image = Image.fromarray(image_preview).convert("RGB")
+
+    # Resize image to match target dimensions if we have an input image
+    if input_image is not None:
         input_image = input_image.resize((target_w, target_h))
 
     # Calculate number of frames based on duration
@@ -183,7 +271,7 @@
         img=input_image, # Pass None for T2V, Image for I2V
         size=SIZE_CONFIGS.get(size_str, (target_h, target_w)),
         max_area=MAX_AREA_CONFIGS.get(size_str, target_h * target_w),
-        frame_num=num_frames, # Use calculated frames instead of cfg.frame_num
+        frame_num=num_frames,
         shift=shift,
         sample_solver='unipc',
         sampling_steps=int(sampling_steps),
@@ -206,16 +294,29 @@
 
 
 # --- 3. Gradio Interface ---
-css = ".gradio-container {max-width: 1100px !important; margin: 0 auto} #output_video {height: 500px;} #input_image {height: 500px;}"
+css = ".gradio-container {max-width: 1200px !important; margin: 0 auto} #output_video {height: 500px;} #image_preview {height: 400px;}"
 
 with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
-    gr.Markdown("# Wan 2.2 TI2V 5B")
-    gr.Markdown("generate high quality videos using **Wan 2.2 5B Text-Image-to-Video model**,[[model]](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B),[[paper]](https://arxiv.org/abs/2503.20314)")
+    gr.Markdown("# Wan 2.2 TI2V 5B - Video/Image to Video")
+    gr.Markdown("Generate high quality videos using **Wan 2.2 5B Text-Image-to-Video model** with support for video input. [[model]](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B), [[paper]](https://arxiv.org/abs/2503.20314)")
 
     with gr.Row():
         with gr.Column(scale=2):
-            image_input = gr.Image(type="numpy", label="Optional (blank = text-to-image)", elem_id="input_image")
-            prompt_input = gr.Textbox(label="Prompt", value="A beautiful waterfall in a lush jungle, cinematic.", lines=3)
+            video_input = gr.Video(
+                label="Upload Video or Image (optional - blank for text-to-video)",
+                sources=["upload"],
+            )
+            image_preview = gr.Image(
+                type="numpy",
+                label="Preview (first frame will be extracted from video)",
+                elem_id="image_preview",
+                interactive=False
+            )
+            prompt_input = gr.Textbox(
+                label="Prompt",
+                value="A beautiful waterfall in a lush jungle, cinematic.",
+                lines=3
+            )
             duration_input = gr.Slider(
                 minimum=round(MIN_FRAMES_MODEL/FIXED_FPS, 1),
                 maximum=round(MAX_FRAMES_MODEL/FIXED_FPS, 1),
@@ -227,8 +328,20 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
 
             with gr.Accordion("Advanced Settings", open=False):
                 with gr.Row():
-                    height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
-                    width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
+                    height_input = gr.Slider(
+                        minimum=SLIDER_MIN_H,
+                        maximum=SLIDER_MAX_H,
+                        step=MOD_VALUE,
+                        value=DEFAULT_H_SLIDER_VALUE,
+                        label=f"Output Height (multiple of {MOD_VALUE})"
+                    )
+                    width_input = gr.Slider(
+                        minimum=SLIDER_MIN_W,
+                        maximum=SLIDER_MAX_W,
+                        step=MOD_VALUE,
+                        value=DEFAULT_W_SLIDER_VALUE,
+                        label=f"Output Width (multiple of {MOD_VALUE})"
+                    )
                 steps_input = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=38, step=1)
                 scale_input = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, value=cfg.sample_guide_scale, step=0.1)
                 shift_input = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
@@ -238,17 +351,19 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
            video_output = gr.Video(label="Generated Video", elem_id="output_video")
            run_button = gr.Button("Generate Video", variant="primary")
 
-    # Add image upload handler
-    image_input.upload(
-        fn=handle_image_upload_for_dims_wan,
-        inputs=[image_input, height_input, width_input],
-        outputs=[height_input, width_input]
+    # Add video/image upload handler
+    video_input.upload(
+        fn=handle_media_upload_for_dims_wan,
+        inputs=[video_input, height_input, width_input],
+        outputs=[height_input, width_input, image_preview]
     )
 
-    image_input.clear(
-        fn=handle_image_upload_for_dims_wan,
-        inputs=[image_input, height_input, width_input],
-        outputs=[height_input, width_input]
+    video_input.clear(
+        fn=lambda: (gr.update(value=DEFAULT_H_SLIDER_VALUE),
+                    gr.update(value=DEFAULT_W_SLIDER_VALUE),
+                    None),
+        inputs=[],
+        outputs=[height_input, width_input, image_preview]
     )
 
     example_image_path = os.path.join(os.path.dirname(__file__), "examples/i2v_input.JPG")
@@ -258,7 +373,7 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
            [None, "A cinematic shot of a boat sailing on a calm sea at sunset.", 704, 1280, 2.0],
            [None, "Drone footage flying over a futuristic city with flying cars.", 704, 1280, 2.0],
        ],
-        inputs=[image_input, prompt_input, height_input, width_input, duration_input],
+        inputs=[video_input, prompt_input, height_input, width_input, duration_input],
        outputs=video_output,
        fn=generate_video,
        cache_examples="lazy",
@@ -266,7 +381,18 @@ with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
 
    run_button.click(
        fn=generate_video,
-        inputs=[image_input, prompt_input, height_input, width_input, duration_input, steps_input, scale_input, shift_input, seed_input],
+        inputs=[
+            video_input,
+            image_preview,
+            prompt_input,
+            height_input,
+            width_input,
+            duration_input,
+            steps_input,
+            scale_input,
+            shift_input,
+            seed_input
+        ],
        outputs=video_output
    )
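The new OpenCV path can be sanity-checked outside the Space. Below is a minimal sketch of the frame-extraction plus dimension-snapping flow; the file name `sample.mp4` and `MOD_VALUE = 32` are placeholders I'm assuming for illustration (the real `MOD_VALUE` is defined earlier in app.py and not shown in this diff):

```python
# Standalone sketch of the PR's frame-extraction + dimension-snapping flow.
# Assumptions: "sample.mp4" is any local video file; MOD_VALUE = 32 is a
# placeholder for the app's real constant (defined outside this diff).
import cv2
from PIL import Image

MOD_VALUE = 32  # assumed placeholder

def extract_first_frame_from_video(video_path):
    # Same logic as the helper added in this PR
    cap = cv2.VideoCapture(video_path)
    ret, frame = cap.read()
    cap.release()
    if ret:
        # OpenCV decodes frames as BGR; convert to RGB for PIL
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    return None

frame = extract_first_frame_from_video("sample.mp4")
if frame is not None:
    # Snap both sides down to multiples of MOD_VALUE, mirroring the
    # target_h/target_w computation in generate_video()
    w, h = frame.size
    target_w = max(MOD_VALUE, (w // MOD_VALUE) * MOD_VALUE)
    target_h = max(MOD_VALUE, (h // MOD_VALUE) * MOD_VALUE)
    print(f"first frame {w}x{h} -> model input {target_w}x{target_h}")
```

Note that the upload handler goes further than this sketch: it rescales toward `NEW_FORMULA_MAX_AREA` via `_calculate_new_dimensions_wan` before updating the sliders, while the snapping above only mirrors the final resize in `generate_video`.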