Update inference/flovd_demo.py

inference/flovd_demo.py: 1 file changed, +24 −7

The single hunk, `@@ -264,28 +264,45 @@ def save_flow_warped_video(image, flow, filename, fps=16)`, covers the tail of that function and the `patch_prepare_latents_safe()` helper defined after it.
The seven removed lines were the remains of an earlier, incomplete draft of the helper: a bare `def new_prepare_latents(` header with no body, a `torch.zeros(...)` padding call that passed `device=image_latents.device,` with no matching dtype, and a truncated `return latents, image_latents.to(` statement, plus surrounding blank lines.
The twenty-four added lines fill the helper in. Below is the section as it reads after the change, with two repairs so the added code actually runs: the padding shape is written as a full five-tuple and concatenated on the frame axis, and the noise draw uses diffusers' `randn_tensor()`, since `torch.randn_like()` accepts no `generator` argument.

```python
        frame_list.append(Image.fromarray(frame))

    export_to_video(frame_list, filename, fps=fps)


from diffusers.pipelines.cogvideo.pipeline_cogvideox_image2video import CogVideoXImageToVideoPipeline
from diffusers.utils.torch_utils import randn_tensor


def patch_prepare_latents_safe():
    def new_prepare_latents(
        self,
        image,
        batch_size,
        latent_channels,
        num_frames,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
    ):
        # Encode the conditioning image and scale it into the VAE latent space.
        image_latents = self.vae.encode(image.to(device, dtype=dtype)).latent_dist.sample()
        image_latents = image_latents * self.vae.config.scaling_factor

        # Pad the temporal dimension with zeros if needed (this assumes the VAE
        # returns latents laid out [B, C, F, H, W], so frames live on dim 2).
        if image_latents.shape[2] != num_frames:
            latent_padding = torch.zeros(
                (
                    image_latents.shape[0],
                    image_latents.shape[1],
                    num_frames - image_latents.shape[2],
                    image_latents.shape[3],
                    image_latents.shape[4],
                ),
                device=image_latents.device,
                dtype=image_latents.dtype,
            )
            image_latents = torch.cat([image_latents, latent_padding], dim=2)

        # torch.randn_like() takes no generator argument; diffusers'
        # randn_tensor() draws seeded noise and moves it to the target device.
        if latents is None:
            latents = randn_tensor(
                image_latents.shape, generator=generator, device=device, dtype=dtype
            )

        return latents, image_latents.to(device, dtype=dtype)

    CogVideoXImageToVideoPipeline.prepare_latents = new_prepare_latents


def generate_video(
    prompt: str,
    fvsm_path: str,
```