# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from diffusers.modular_pipelines import ModularPipelineBlocks, InputParam, OutputParam, ModularPipeline, PipelineState
import numpy as np
import torch
import PIL.Image
from typing import List
from diffusers.modular_pipelines.wan.before_denoise import WanInputStep


def calculate_dimensions(image, mod_value):
    """
    Calculate output dimensions that preserve the input aspect ratio while
    matching a 720*1280 target pixel area.

    Args:
        image: PIL Image
        mod_value: Modulo value for dimension alignment

    Returns:
        Tuple of (width, height)
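
    Example:
        Illustrative values only: a 1080x1920 portrait input with mod_value=16
        maps to (720, 1280). The 720*1280 target area is redistributed at the
        input aspect ratio, then each side is floored to the nearest multiple
        of mod_value.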
    """

    # Target pixel area (fixed at 720 * 1280)
    target_area = 720 * 1280

    # Calculate dimensions maintaining aspect ratio
    aspect_ratio = image.height / image.width
    calculated_height = round(np.sqrt(target_area * aspect_ratio)) // mod_value * mod_value
    calculated_width = round(np.sqrt(target_area / aspect_ratio)) // mod_value * mod_value

    return calculated_width, calculated_height


# Make the input step aware of `negative_prompt_embeds` as an optional input.
# ChronoEdit uses a `guidance_scale` of 1, so negative embeddings are not required.
class ChronoEditInputStep(WanInputStep):
    model_name = "chronoedit"

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("num_videos_per_prompt", default=1),
            InputParam(
                "prompt_embeds",
                required=True,
                type_hint=torch.Tensor,
                description="Pre-generated text embeddings. Can be generated from text_encoder step.",
            ),
            InputParam(
                "negative_prompt_embeds",
                type_hint=torch.Tensor,
                description="Pre-generated negative text embeddings. Can be generated from text_encoder step.",
            ),
        ]


class ChronoEditImageInputStep(ModularPipelineBlocks):
    model_name = "chronoedit"

    @property
    def inputs(self) -> List[InputParam]:
        return [InputParam(name="image")]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(name="image", type_hint=PIL.Image.Image),
            OutputParam(name="height", type_hint=int, description="Output height derived from the input image size and model constraints"),
            OutputParam(name="width", type_hint=int, description="Output width derived from the input image size and model constraints"),
        ]

    def __call__(self, components: ModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)
        image = block_state.image
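        # Output dimensions must be divisible by the VAE spatial downsample factor
        # times the transformer patch size (e.g. 8 * 2 = 16 for Wan-style models);
        # the exact values are read from the loaded components below.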
        mod_value = components.vae_scale_factor_spatial * components.transformer.config.patch_size[1]
        
        width, height = calculate_dimensions(image, mod_value)
        block_state.image = image.resize((width, height))
        block_state.height = height
        block_state.width = width
        
        self.set_block_state(state, block_state)
        return components, state
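

# ---------------------------------------------------------------------------
# Minimal wiring sketch (illustrative only, not part of this file's API).
# The two blocks above are meant to stand in for the corresponding input steps
# of the Wan image-to-video preset via SequentialPipelineBlocks. The block
# names other than the two classes defined here, and the repo id, are
# assumptions and may differ from the actual ChronoEdit preset.
#
# from diffusers.modular_pipelines import SequentialPipelineBlocks
#
# blocks = SequentialPipelineBlocks.from_blocks_dict({
#     "image_input": ChronoEditImageInputStep(),
#     "input": ChronoEditInputStep(),
#     # ... text encoder, image encoder, denoise and decode blocks go here ...
# })
# pipeline = blocks.init_pipeline("<chronoedit-modular-repo-id>")  # placeholder repo id
# pipeline.load_components(torch_dtype=torch.bfloat16)
# ---------------------------------------------------------------------------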