diffusers-internal-dev
/

chronoedit-modular

Diffusers

modular_diffusers

Model card Files Files and versions

xet

Community

sayakpaul HF Staff committed 15 days ago

Commit

1b761f5

verified ·

1 Parent(s): 0afe5e3

Upload encoders.py with huggingface_hub

Browse files

Files changed (1) hide show

encoders.py +169 -0

encoders.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, List
+from diffusers.modular_pipelines import (
+    ModularPipelineBlocks,
+    ComponentSpec,
+    InputParam,
+    OutputParam,
+    ModularPipeline,
+    PipelineState,
+)
+from diffusers.guiders import ClassifierFreeGuidance
+from transformers import UMT5EncoderModel, AutoTokenizer
+from diffusers.image_processor import PipelineImageInput
+import torch
+from diffusers.modular_pipelines.wan.encoders import WanTextEncoderStep
+from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor
+from diffusers.video_processor import VideoProcessor
+from diffusers.configuration_utils import FrozenDict
class ChronoEditImageEncoderStep(ModularPipelineBlocks):
    """Pipeline block that encodes the input image with a CLIP vision model.

    Reads ``image`` from the block state and stores the resulting
    ``image_embeds`` back on it.
    """

    model_name = "chronoedit"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components this block needs: a CLIP image processor and vision encoder."""
        processor_spec = ComponentSpec("image_processor", CLIPImageProcessor)
        encoder_spec = ComponentSpec("image_encoder", CLIPVisionModelWithProjection)
        return [processor_spec, encoder_spec]

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs read from the pipeline state: the conditioning ``image``."""
        image_param = InputParam("image", type_hint=PipelineImageInput)
        return [image_param]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Outputs written to the pipeline state: ``image_embeds``."""
        embeds_param = OutputParam(
            "image_embeds",
            type_hint=torch.Tensor,
            description="Image embeddings to use as conditions during the denoising process.",
        )
        return [embeds_param]

    @staticmethod
    def encode_image(components, image: PipelineImageInput, device: Optional[torch.device] = None):
        """Run ``image`` through the image processor and the image encoder.

        Args:
            components: Object exposing ``image_processor`` and ``image_encoder``.
            image: Image input handed to the image processor.
            device: Device the processed inputs are moved to. Falls back to
                the image encoder's own device when not given.

        Returns:
            The second-to-last entry of the encoder's ``hidden_states``.
        """
        if not device:
            device = components.image_encoder.device

        processed = components.image_processor(images=image, return_tensors="pt")
        processed = processed.to(device)

        # Hidden states are requested explicitly because the penultimate one
        # (index -2) is what gets returned, not the final encoder output.
        encoder_output = components.image_encoder(**processed, output_hidden_states=True)
        return encoder_output.hidden_states[-2]

    @torch.no_grad()
    def __call__(self, components: ModularPipeline, state: PipelineState) -> PipelineState:
        """Encode ``block_state.image`` and store the result as ``image_embeds``.

        Returns the ``(components, state)`` pair after the block state has
        been written back to ``state``.
        """
        block_state = self.get_block_state(state)

        embeddings = self.encode_image(
            components,
            block_state.image,
            components._execution_device,
        )
        block_state.image_embeds = embeddings

        self.set_block_state(state, block_state)
        return components, state
class ChronoEditProcessImageStep(ModularPipelineBlocks):
    """Pipeline block that preprocesses the input image and batches image embeddings.

    The image is resized/preprocessed by the ``video_processor`` component and
    cast to ``torch.bfloat16`` on the execution device. If ``image_embeds`` is
    already present in the state, it is repeated ``batch_size`` times along the
    first dimension and cast to ``torch.bfloat16`` as well.
    """

    model_name = "chronoedit"

    @property
    def inputs(self) -> List[InputParam]:
        """Inputs read from the pipeline state."""
        return [
            InputParam("image", type_hint=PipelineImageInput),
            InputParam("image_embeds", type_hint=torch.Tensor, required=False),
            InputParam("batch_size", type_hint=int, required=False),
            InputParam("height", type_hint=int),
            InputParam("width", type_hint=int),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Outputs written to the pipeline state."""
        return [
            OutputParam("processed_image", type_hint=PipelineImageInput),
            OutputParam("image_embeds", type_hint=torch.Tensor),
        ]

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components this block needs: a ``VideoProcessor`` built from config."""
        return [
            ComponentSpec(
                "video_processor",
                VideoProcessor,
                config=FrozenDict({"vae_scale_factor": 8}),
                default_creation_method="from_config",
            )
        ]

    @torch.no_grad()
    def __call__(self, components: ModularPipeline, state: PipelineState) -> PipelineState:
        """Preprocess the image and, when available, batch the image embeddings.

        Args:
            components: Pipeline exposing ``video_processor`` and
                ``_execution_device``.
            state: Pipeline state holding this block's inputs.

        Returns:
            The ``(components, state)`` pair after the block state has been
            written back to ``state``.
        """
        block_state = self.get_block_state(state)
        device = components._execution_device

        block_state.processed_image = components.video_processor.preprocess(
            block_state.image, height=block_state.height, width=block_state.width
        ).to(device, dtype=torch.bfloat16)

        if block_state.image_embeds is not None:
            # `batch_size` is declared optional, so it can be None here.
            # Passing None to `Tensor.repeat` raises a TypeError; treat a
            # missing value as 1, which leaves the embeddings unrepeated.
            batch_size = block_state.batch_size
            if batch_size is None:
                batch_size = 1

            # NOTE(review): three repeat factors assume `image_embeds` has at
            # most three dimensions with the batch first - confirm with the
            # producer of `image_embeds`.
            block_state.image_embeds = block_state.image_embeds.repeat(batch_size, 1, 1).to(torch.bfloat16)

        self.set_block_state(state, block_state)
        return components, state
+# Configure CFG with a guidance scale of 1.
class ChronoEditTextEncoderStep(WanTextEncoderStep):
    """Text-encoding block for ChronoEdit, derived from ``WanTextEncoderStep``.

    The ``guider`` component is declared as a ``ClassifierFreeGuidance``
    created from a config whose ``guidance_scale`` is 1.0.
    """

    model_name = "chronoedit"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        """Components this block needs: text encoder, tokenizer and guider."""
        guider_spec = ComponentSpec(
            "guider",
            ClassifierFreeGuidance,
            config=FrozenDict({"guidance_scale": 1.0}),
            default_creation_method="from_config",
        )
        return [
            ComponentSpec("text_encoder", UMT5EncoderModel),
            ComponentSpec("tokenizer", AutoTokenizer),
            guider_spec,
        ]

    @torch.no_grad()
    def __call__(self, components: ModularPipeline, state: PipelineState) -> PipelineState:
        """Encode the prompt and store the embeddings on the block state.

        Writes ``prompt_embeds`` and ``negative_prompt_embeds`` (as returned by
        ``encode_prompt``) and returns the ``(components, state)`` pair.
        """
        block_state = self.get_block_state(state)
        self.check_inputs(block_state)

        # Unconditional embeddings are requested only when the guider reports
        # more than one condition.
        needs_unconditional = components.guider.num_conditions > 1
        block_state.prepare_unconditional_embeds = needs_unconditional
        block_state.device = components._execution_device
        # Any incoming negative embeddings are discarded before encoding.
        block_state.negative_prompt_embeds = None

        # The positional order is kept exactly as the inherited `encode_prompt`
        # is called; the literal 1 is presumably the per-prompt repeat count -
        # confirm against WanTextEncoderStep.encode_prompt.
        encoded = self.encode_prompt(
            components,
            block_state.prompt,
            block_state.device,
            1,
            block_state.prepare_unconditional_embeds,
            block_state.negative_prompt,
            prompt_embeds=None,
            negative_prompt_embeds=block_state.negative_prompt_embeds,
        )
        block_state.prompt_embeds, block_state.negative_prompt_embeds = encoded

        self.set_block_state(state, block_state)
        return components, state