Spaces:

jhshao
/

ChronoDepth

Runtime error

App Files Files Community

jhshao committed on Jun 11, 2024

Commit

02f6d94

1 Parent(s): 8bd250a

add app

Browse files

Files changed (6) hide show

.gitattributes copy +37 -0
README.md +18 -6
app.py +362 -0
chronodepth_pipeline.py +530 -0
gradio_patches/examples.py +13 -0
requirements.txt +14 -0

.gitattributes copy ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+files/sora_1764106507569053773.mp4 filter=lfs diff=lfs merge=lfs -text
+files/sora_e2.mp4 filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,13 +1,25 @@
 ---
 title: ChronoDepth
-emoji: 👁
-colorFrom: gray
-colorTo: gray
 sdk: gradio
-sdk_version: 4.36.1
 app_file: app.py
 pinned: false
-license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: ChronoDepth
+emoji: 🔥
+colorFrom: pink
+colorTo: blue
 sdk: gradio
+sdk_version: 4.36.0
 app_file: app.py
 pinned: false
+license: cc-by-4.0
 ---
+This is a demo of the monocular video depth estimation pipeline, described in the paper titled ["Learning Temporally Consistent Video Depth from Video Diffusion Priors"](https://arxiv.org/abs/2406.01493).
+```bibtex
+@misc{shao2024learning,
+      title={Learning Temporally Consistent Video Depth from Video Diffusion Priors},
+      author={Jiahao Shao and Yuanbo Yang and Hongyu Zhou and Youmin Zhang and Yujun Shen and Matteo Poggi and Yiyi Liao},
+      year={2024},
+      eprint={2406.01493},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```

app.py ADDED Viewed

	@@ -0,0 +1,362 @@

+# MIT License
+# Copyright (c) 2024 Jiahao Shao
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import functools
+import os
+import zipfile
+import tempfile
+from io import BytesIO
+import spaces
+import gradio as gr
+import numpy as np
+import torch as torch
+from PIL import Image
+from tqdm import tqdm
+import mediapy as media
+from huggingface_hub import login
+from chronodepth_pipeline import ChronoDepthPipeline
+from gradio_patches.examples import Examples
+# Defaults used by the demo below.
+# Seed for the torch.Generator created in process_video.
+default_seed = 2024
+# Denoising steps forwarded to the pipeline.
+default_num_inference_steps = 5
+# Number of frames per clip handed to the pipeline.
+default_num_frames = 10
+# Stride between clip starts; being smaller than default_num_frames it makes
+# process_video run in overlapping ("inpaint") mode.
+default_window_size = 9
+# NOTE(review): not referenced anywhere in the visible part of this file.
+default_video_processing_resolution = 768
+# Upper bound on the number of input frames process_video will handle.
+default_video_out_max_frames = 10
+# Passed to the pipeline as decode_chunk_size.
+default_decode_chunk_size = 10
+def process_video(
+    pipe,
+    path_input,
+    num_inference_steps=default_num_inference_steps,
+    num_frames=default_num_frames,
+    window_size=default_window_size,
+    out_max_frames=default_video_out_max_frames,
+    progress=gr.Progress(),
+):
+    if path_input is None:
+        raise gr.Error(
+            "Missing video in the first pane: upload a file or use one from the gallery below."
+        )
+    name_base, name_ext = os.path.splitext(os.path.basename(path_input))
+    print(f"Processing video {name_base}{name_ext}")
+    path_output_dir = tempfile.mkdtemp()
+    path_out_vis = os.path.join(path_output_dir, f"{name_base}_depth_colored.mp4")
+    path_out_16bit = os.path.join(path_output_dir, f"{name_base}_depth_16bit.zip")
+    generator = torch.Generator(device=pipe.device).manual_seed(default_seed)
+    import time
+    start_time = time.time()
+    zipf = None
+    try:
+        if window_size is None or window_size == num_frames:
+            inpaint_inference = False
+        else:
+            inpaint_inference = True
+        data_ls = []
+        video_data = media.read_video(path_input)
+        video_length = len(video_data)
+        fps = video_data.metadata.fps
+        duration_sec = video_length / fps
+        out_duration_sec = out_max_frames / fps
+        if duration_sec > out_duration_sec:
+            gr.Warning(
+                f"Only the first ~{int(out_duration_sec)} seconds will be processed; "
+                f"use alternative setups such as ChronoDepth on github for full processing"
+            )
+            video_length = out_max_frames
+        for i in tqdm(range(video_length-num_frames+1)):
+            is_first_clip = i == 0
+            is_last_clip = i == video_length - num_frames
+            is_new_clip = (
+                (inpaint_inference and i % window_size == 0)
+                or (inpaint_inference == False and i % num_frames == 0)
+            )
+            if is_first_clip or is_last_clip or is_new_clip:
+                data_ls.append(np.array(video_data[i: i+num_frames])) # [t, H, W, 3]
+        zipf = zipfile.ZipFile(path_out_16bit, "w", zipfile.ZIP_DEFLATED)
+        depth_colored_pred = []
+        depth_pred = []
+        # -------------------- Inference and saving --------------------
+        with torch.no_grad():
+            for iter, batch in enumerate(tqdm(data_ls)):
+                rgb_int = batch
+                input_images = [Image.fromarray(rgb_int[i]) for i in range(num_frames)]
+                # Predict depth
+                if iter == 0: # First clip
+                    pipe_out = pipe(
+                        input_images,
+                        num_frames=len(input_images),
+                        num_inference_steps=num_inference_steps,
+                        decode_chunk_size=default_decode_chunk_size,
+                        motion_bucket_id=127,
+                        fps=7,
+                        noise_aug_strength=0.0,
+                        generator=generator,
+                    )
+                elif inpaint_inference and (iter == len(data_ls) - 1): # temporal inpaint inference for last clip
+                    last_window_size = window_size if video_length%window_size == 0 else video_length%window_size
+                    pipe_out = pipe(
+                        input_images,
+                        num_frames=num_frames,
+                        num_inference_steps=num_inference_steps,
+                        decode_chunk_size=default_decode_chunk_size,
+                        motion_bucket_id=127,
+                        fps=7,
+                        noise_aug_strength=0.0,
+                        generator=generator,
+                        depth_pred_last=depth_frames_pred_ts[last_window_size:],
+                    )
+                elif inpaint_inference and iter > 0: # temporal inpaint inference
+                    pipe_out = pipe(
+                        input_images,
+                        num_frames=num_frames,
+                        num_inference_steps=num_inference_steps,
+                        decode_chunk_size=default_decode_chunk_size,
+                        motion_bucket_id=127,
+                        fps=7,
+                        noise_aug_strength=0.0,
+                        generator=generator,
+                        depth_pred_last=depth_frames_pred_ts[window_size:],
+                    )
+                else: # separate inference
+                    pipe_out = pipe(
+                        input_images,
+                        num_frames=num_frames,
+                        num_inference_steps=num_inference_steps,
+                        decode_chunk_size=default_decode_chunk_size,
+                        motion_bucket_id=127,
+                        fps=7,
+                        noise_aug_strength=0.0,
+                        generator=generator,
+                    )
+                depth_frames_pred = [pipe_out.depth_np[i] for i in range(num_frames)]
+                depth_frames_colored_pred = []
+                for i in range(num_frames):
+                    depth_frame_colored_pred = np.array(pipe_out.depth_colored[i])
+                    depth_frames_colored_pred.append(depth_frame_colored_pred)
+                depth_frames_colored_pred = np.stack(depth_frames_colored_pred, axis=0)
+                depth_frames_pred = np.stack(depth_frames_pred, axis=0)
+                depth_frames_pred_ts = torch.from_numpy(depth_frames_pred).to(pipe.device)
+                depth_frames_pred_ts = depth_frames_pred_ts * 2 - 1
+                if inpaint_inference == False:
+                    if iter == len(data_ls) - 1:
+                        last_window_size = num_frames if video_length%num_frames == 0 else video_length%num_frames
+                        depth_colored_pred.append(depth_frames_colored_pred[-last_window_size:])
+                        depth_pred.append(depth_frames_pred[-last_window_size:])
+                    else:
+                        depth_colored_pred.append(depth_frames_colored_pred)
+                        depth_pred.append(depth_frames_pred)
+                else:
+                    if iter == 0:
+                        depth_colored_pred.append(depth_frames_colored_pred)
+                        depth_pred.append(depth_frames_pred)
+                    elif iter == len(data_ls) - 1:
+                        depth_colored_pred.append(depth_frames_colored_pred[-last_window_size:])
+                        depth_pred.append(depth_frames_pred[-last_window_size:])
+                    else:
+                        depth_colored_pred.append(depth_frames_colored_pred[-window_size:])
+                        depth_pred.append(depth_frames_pred[-window_size:])
+        depth_colored_pred = np.concatenate(depth_colored_pred, axis=0)
+        depth_pred = np.concatenate(depth_pred, axis=0)
+        # -------------------- Save results --------------------
+        # Save images
+        for i in tqdm(range(len(depth_pred))):
+            archive_path = os.path.join(
+                f"{name_base}_depth_16bit", f"{i:05d}.png"
+            )
+            img_byte_arr = BytesIO()
+            depth_16bit = Image.fromarray((depth_pred[i] * 65535.0).astype(np.uint16))
+            depth_16bit.save(img_byte_arr, format="png")
+            img_byte_arr.seek(0)
+            zipf.writestr(archive_path, img_byte_arr.read())
+        # Export to video
+        media.write_video(path_out_vis, depth_colored_pred, fps=fps)
+    finally:
+        if zipf is not None:
+            zipf.close()
+    end_time = time.time()
+    print(f"Processing time: {end_time - start_time} seconds")
+    return (
+        path_out_vis,
+        [path_out_vis, path_out_16bit],
+    )
+def run_demo_server(pipe):
+    process_pipe_video = spaces.GPU(
+        functools.partial(process_video, pipe), duration=210
+    )
+    os.environ["GRADIO_ALLOW_FLAGGING"] = "never"
+    with gr.Blocks(
+        analytics_enabled=False,
+        title="ChronoDepth Video Depth Estimation",
+        css="""
+            #download {
+                height: 118px;
+            }
+            .slider .inner {
+                width: 5px;
+                background: #FFF;
+            }
+            .viewport {
+                aspect-ratio: 4/3;
+            }
+            h1 {
+                text-align: center;
+                display: block;
+            }
+            h2 {
+                text-align: center;
+                display: block;
+            }
+            h3 {
+                text-align: center;
+                display: block;
+            }
+        """,
+    ) as demo:
+        gr.Markdown(
+            """
+            # ChronoDepth Video Depth Estimation
+            <p align="center">
+            <a title="Website" href="https://jhaoshao.github.io/ChronoDepth/" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/website?url=https%3A%2F%2Fjhaoshao.github.io%2FChronoDepth%2F&up_message=ChronoDepth&up_color=blue&style=flat&logo=timescale&logoColor=%23FFDC0F">
+            </a>
+            <a title="arXiv" href="https://arxiv.org/abs/2312.02145" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/badge/arXiv-PDF-b31b1b">
+            </a>
+            <a title="Github" href="https://github.com/jhaoshao/ChronoDepth" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
+                <img src="https://img.shields.io/github/stars/jhaoshao/ChronoDepth?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
+            </a>
+            </p>
+            ChronoDepth is the state-of-the-art video depth estimator for videos in the wild.
+            Upload your video and have a try!<br>
+            We set denoising steps to 5, number of frames for each video clip to 10, and overlap between clips to 1.
+        """
+        )
+        with gr.Row():
+            with gr.Column():
+                video_input = gr.Video(
+                    label="Input Video",
+                    sources=["upload"],
+                )
+                with gr.Row():
+                    video_submit_btn = gr.Button(
+                        value="Compute Depth", variant="primary"
+                    )
+                    video_reset_btn = gr.Button(value="Reset")
+            with gr.Column():
+                video_output_video = gr.Video(
+                    label="Output video depth (red-near, blue-far)",
+                    interactive=False,
+                )
+                video_output_files = gr.Files(
+                    label="Depth outputs",
+                    elem_id="download",
+                    interactive=False,
+                )
+        Examples(
+            fn=process_pipe_video,
+            examples=[
+                os.path.join("files", name)
+                for name in [
+                    "sora_e2.mp4",
+                    "sora_1758192960116785459.mp4",
+                ]
+            ],
+            inputs=[video_input],
+            outputs=[video_output_video, video_output_files],
+            cache_examples=True,
+            directory_name="examples_video",
+        )
+        video_submit_btn.click(
+            fn=process_pipe_video,
+            inputs=[video_input],
+            outputs=[video_output_video, video_output_files],
+            concurrency_limit=1,
+        )
+        video_reset_btn.click(
+            fn=lambda: (None, None, None),
+            inputs=[],
+            outputs=[video_input, video_output_video],
+            concurrency_limit=1,
+        )
+        demo.queue(
+            api_open=False,
+        ).launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+        )
+def main():
+    CHECKPOINT = "jhshao/ChronoDepth"
+    if "HF_TOKEN_LOGIN" in os.environ:
+        login(token=os.environ["HF_TOKEN_LOGIN"])
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Running on device: {device}")
+    pipe = ChronoDepthPipeline.from_pretrained(CHECKPOINT)
+    try:
+        import xformers
+        pipe.enable_xformers_memory_efficient_attention()
+    except:
+        pass  # run without xformers
+    pipe = pipe.to(device)
+    run_demo_server(pipe)
+if __name__ == "__main__":
+    main()

chronodepth_pipeline.py ADDED Viewed

	@@ -0,0 +1,530 @@

+# Adapted from Marigold: https://github.com/prs-eth/Marigold and diffusers
+import inspect
+from typing import Union, Optional, List
+import torch
+import numpy as np
+import matplotlib.pyplot as plt
+from tqdm.auto import tqdm
+import PIL
+from PIL import Image
+from diffusers import (
+    DiffusionPipeline,
+    EulerDiscreteScheduler,
+    UNetSpatioTemporalConditionModel,
+    AutoencoderKLTemporalDecoder,
+)
+from diffusers.image_processor import VaeImageProcessor
+from diffusers.utils import BaseOutput
+from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
+from transformers import (
+    CLIPVisionModelWithProjection,
+    CLIPImageProcessor,
+)
+from einops import rearrange, repeat
+class ChronoDepthOutput(BaseOutput):
+    r"""
+    Output class for the ChronoDepth video depth estimation pipeline.
+    Args:
+        depth_np (`np.ndarray`):
+            Per-frame depth predictions. `ChronoDepthPipeline.__call__` clips every frame to the range [0, 1]
+            before storing it here.
+        depth_colored (`List[PIL.Image.Image]` or `np.ndarray`):
+            Colorized rendering of each depth frame. `ChronoDepthPipeline.__call__` fills it with one PIL image
+            per frame, produced with a matplotlib colormap.
+    """
+    depth_np: np.ndarray
+    depth_colored: Union[List[PIL.Image.Image], np.ndarray]
+class ChronoDepthPipeline(DiffusionPipeline):
+    model_cpu_offload_seq = "image_encoder->unet->vae"
+    _callback_tensor_inputs = ["latents"]
+    rgb_latent_scale_factor = 0.18215
+    depth_latent_scale_factor = 0.18215
+    def __init__(
+        self,
+        vae: AutoencoderKLTemporalDecoder,
+        image_encoder: CLIPVisionModelWithProjection,
+        unet: UNetSpatioTemporalConditionModel,
+        scheduler: EulerDiscreteScheduler,
+        feature_extractor: CLIPImageProcessor,
+    ):
+        super().__init__()
+        self.register_modules(
+            vae=vae,
+            image_encoder=image_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+        if not hasattr(self, "dtype"):
+            self.dtype = self.unet.dtype
+    def encode_RGB(self,
+                   image: torch.Tensor,
+                   ):
+        video_length = image.shape[1]
+        image = rearrange(image, "b f c h w -> (b f) c h w")
+        latents = self.vae.encode(image).latent_dist.sample()
+        latents = rearrange(latents, "(b f) c h w -> b f c h w", f=video_length)
+        latents = latents * self.vae.config.scaling_factor
+        return latents
+    def _encode_image(self, image, device, discard=True):
+        '''
+        Compute the image-encoder embedding for `image`.
+
+        For non-tensor input, the image is converted to a tensor, resized to
+        224x224 and normalized by the feature extractor before encoding. If
+        `discard` is True, that input is first replaced by zeros of the same
+        shape, so the embedding carries no image content. Tensor input is
+        passed to the encoder as is (and `discard` has no effect on it).
+
+        Returns the `image_embeds` output with an extra axis inserted at
+        position 1.
+        '''
+        # Run the encoder in whatever dtype its parameters use.
+        dtype = next(self.image_encoder.parameters()).dtype
+        if not isinstance(image, torch.Tensor):
+            image = self.image_processor.pil_to_numpy(image)
+            if discard:
+                # Zero out the pixels: the conditioning image is deliberately dropped.
+                image = np.zeros_like(image)
+            image = self.image_processor.numpy_to_pt(image)
+            # We normalize the image before resizing to match with the original implementation.
+            # Then we unnormalize it after resizing.
+            image = image * 2.0 - 1.0
+            # NOTE(review): _resize_with_antialiasing is not defined in the visible part of this file.
+            image = _resize_with_antialiasing(image, (224, 224))
+            image = (image + 1.0) / 2.0
+            # Normalize the image for CLIP input; resizing, cropping and
+            # rescaling are switched off because they were handled above.
+            image = self.feature_extractor(
+                images=image,
+                do_normalize=True,
+                do_center_crop=False,
+                do_resize=False,
+                do_rescale=False,
+                return_tensors="pt",
+            ).pixel_values
+        image = image.to(device=device, dtype=dtype)
+        image_embeddings = self.image_encoder(image).image_embeds
+        image_embeddings = image_embeddings.unsqueeze(1)
+        return image_embeddings
+    def decode_depth(self, depth_latent: torch.Tensor, decode_chunk_size=5) -> torch.Tensor:
+        num_frames = depth_latent.shape[1]
+        depth_latent = rearrange(depth_latent, "b f c h w -> (b f) c h w")
+        depth_latent = depth_latent / self.vae.config.scaling_factor
+        forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
+        accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
+        depth_frames = []
+        for i in range(0, depth_latent.shape[0], decode_chunk_size):
+            num_frames_in = depth_latent[i : i + decode_chunk_size].shape[0]
+            decode_kwargs = {}
+            if accepts_num_frames:
+                # we only pass num_frames_in if it's expected
+                decode_kwargs["num_frames"] = num_frames_in
+            depth_frame = self.vae.decode(depth_latent[i : i + decode_chunk_size], **decode_kwargs).sample
+            depth_frames.append(depth_frame)
+        depth_frames = torch.cat(depth_frames, dim=0)
+        depth_frames = depth_frames.reshape(-1, num_frames, *depth_frames.shape[1:])
+        depth_mean = depth_frames.mean(dim=2, keepdim=True)
+        return depth_mean
+    def _get_add_time_ids(self,
+                          fps,
+                          motion_bucket_id,
+                          noise_aug_strength,
+                          dtype,
+                          batch_size,
+                          ):
+        add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
+        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * \
+            len(add_time_ids)
+        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
+        if expected_add_embed_dim != passed_add_embed_dim:
+            raise ValueError(
+                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+            )
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        add_time_ids = add_time_ids.repeat(batch_size, 1)
+        return add_time_ids
+    def decode_latents(self, latents, num_frames, decode_chunk_size=14):
+        # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width]
+        latents = latents.flatten(0, 1)
+        latents = 1 / self.vae.config.scaling_factor * latents
+        forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward
+        accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys())
+        # decode decode_chunk_size frames at a time to avoid OOM
+        frames = []
+        for i in range(0, latents.shape[0], decode_chunk_size):
+            num_frames_in = latents[i : i + decode_chunk_size].shape[0]
+            decode_kwargs = {}
+            if accepts_num_frames:
+                # we only pass num_frames_in if it's expected
+                decode_kwargs["num_frames"] = num_frames_in
+            frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample
+            frames.append(frame)
+        frames = torch.cat(frames, dim=0)
+        # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width]
+        frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        frames = frames.float()
+        return frames
+    def check_inputs(self, image, height, width):
+        if (
+            not isinstance(image, torch.Tensor)
+            and not isinstance(image, PIL.Image.Image)
+            and not isinstance(image, list)
+        ):
+            raise ValueError(
+                "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
+                f" {type(image)}"
+            )
+        if height % 64 != 0 or width % 64 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+    def prepare_latents(
+        self,
+        shape,
+        dtype,
+        device,
+        generator,
+        latent=None,
+    ):
+        if isinstance(generator, list) and len(generator) != shape[0]:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {shape[0]}. Make sure the batch size matches the length of the generators."
+            )
+        if latent is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+    @property
+    def num_timesteps(self):
+        # Read-only accessor for `self._num_timesteps`.
+        # NOTE(review): no assignment to `_num_timesteps` is visible in this
+        # part of the file - confirm it is set before this property is read.
+        return self._num_timesteps
+    @torch.no_grad()
+    def __call__(
+        self,
+        input_image: Union[List[PIL.Image.Image], torch.FloatTensor],
+        height: int = 576,
+        width: int = 768,
+        num_frames: Optional[int] = None,
+        num_inference_steps: int = 10,
+        fps: int = 7,
+        motion_bucket_id: int = 127,
+        noise_aug_strength: float = 0.02,
+        decode_chunk_size: Optional[int] = None,
+        color_map: str="Spectral",
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        show_progress_bar: bool = True,
+        match_input_res: bool = True,
+        depth_pred_last: Optional[torch.FloatTensor] = None,
+    ):
+        assert height >= 0 and width >=0
+        assert num_inference_steps >=1
+        num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
+        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(input_image, height, width)
+        # 2. Define call parameters
+        if isinstance(input_image, list):
+            batch_size = 1
+            input_size = input_image[0].size
+        elif isinstance(input_image, torch.Tensor):
+            batch_size = input_image.shape[0]
+            input_size = input_image.shape[:-3:-1]
+        assert batch_size == 1, "Batch size must be 1 for now"
+        device = self._execution_device
+        # 3. Encode input image
+        image_embeddings = self._encode_image(input_image[0], device)
+        image_embeddings = image_embeddings.repeat((batch_size, 1, 1))
+        # NOTE: Stable Diffusion Video was conditioned on fps - 1, which
+        # is why it is reduced here.
+        # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
+        fps = fps - 1
+        # 4. Encode input image using VAE
+        input_image = self.image_processor.preprocess(input_image, height=height, width=width).to(device)
+        assert input_image.min() >= -1.0 and input_image.max() <= 1.0
+        noise = randn_tensor(input_image.shape, generator=generator, device=device, dtype=input_image.dtype)
+        input_image = input_image + noise_aug_strength * noise
+        if depth_pred_last is not None:
+            depth_pred_last = depth_pred_last.to(device)
+            # resize depth
+            from torchvision.transforms import InterpolationMode
+            from torchvision.transforms.functional import resize
+            depth_pred_last = resize(depth_pred_last.unsqueeze(1), (height, width), InterpolationMode.NEAREST_EXACT, antialias=True)
+            depth_pred_last = repeat(depth_pred_last, 'f c h w ->b f c h w', b=batch_size)
+        rgb_batch = repeat(input_image, 'f c h w ->b f c h w', b=batch_size)
+        added_time_ids = self._get_add_time_ids(
+            fps,
+            motion_bucket_id,
+            noise_aug_strength,
+            image_embeddings.dtype,
+            batch_size,
+        )
+        added_time_ids = added_time_ids.to(device)
+        depth_pred_raw = self.single_infer(rgb_batch,
+                                           image_embeddings,
+                                           added_time_ids,
+                                           num_inference_steps,
+                                           show_progress_bar,
+                                           generator,
+                                           depth_pred_last=depth_pred_last,
+                                           decode_chunk_size=decode_chunk_size)
+        depth_colored_img_list = []
+        depth_frames = []
+        for i in range(num_frames):
+            depth_frame = depth_pred_raw[:, i].squeeze()
+            # Convert to numpy
+            depth_frame = depth_frame.cpu().numpy().astype(np.float32)
+            if match_input_res:
+                pred_img = Image.fromarray(depth_frame)
+                pred_img = pred_img.resize(input_size, resample=Image.NEAREST)
+                depth_frame = np.asarray(pred_img)
+            # Clip output range: current size is the original size
+            depth_frame = depth_frame.clip(0, 1)
+            # Colorize
+            depth_colored = plt.get_cmap(color_map)(depth_frame, bytes=True)[..., :3]
+            depth_colored_img = Image.fromarray(depth_colored)
+            depth_colored_img_list.append(depth_colored_img)
+            depth_frames.append(depth_frame)
+        depth_frame = np.stack(depth_frames)
+        self.maybe_free_model_hooks()
+        return ChronoDepthOutput(
+            depth_np = depth_frames,
+            depth_colored = depth_colored_img_list,
+        )
+    @torch.no_grad()
+    def single_infer(self,
+                     input_rgb: torch.Tensor,
+                     image_embeddings: torch.Tensor,
+                     added_time_ids: torch.Tensor,
+                     num_inference_steps: int,
+                     show_pbar: bool,
+                     generator: Optional[Union[torch.Generator, List[torch.Generator]]],
+                     depth_pred_last: Optional[torch.Tensor] = None,
+                     decode_chunk_size=1,
+                     ):
+        """Run one full denoising pass and return the decoded depth prediction.
+
+        The RGB input is encoded with ``self.encode_RGB``, a depth latent is
+        initialised through ``self.prepare_latents`` and then refined over the
+        scheduler's timesteps by ``self.unet``. The final latent is decoded
+        with ``self.decode_depth``, clipped to ``[-1, 1]`` and rescaled to
+        ``[0, 1]``.
+
+        Args:
+            input_rgb: Tensor handed to ``self.encode_RGB``; its device is used
+                for the scheduler timesteps and for the sampled noise.
+            image_embeddings: Conditioning passed to the UNet; its dtype is the
+                dtype used for all latents in this method.
+            added_time_ids: Forwarded unchanged to the UNet.
+            num_inference_steps: Number of steps given to
+                ``self.scheduler.set_timesteps``.
+            show_pbar: When true, the denoising loop is wrapped in ``tqdm``.
+            generator: Random generator(s) forwarded to ``prepare_latents`` and
+                ``randn_tensor``.
+            depth_pred_last: Optional 5-D tensor (it is repeated 3x along dim 2
+                before encoding). Presumably the depth predicted for frames
+                shared with the previous clip -- confirm against the caller.
+            decode_chunk_size: Forwarded to ``self.decode_depth``.
+
+        Returns:
+            The decoded depth tensor with values in ``[0, 1]``.
+        """
+        device = input_rgb.device
+        # Temporarily run the VAE in float32 when it is stored in float16 and its
+        # config requests upcasting.
+        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+        if needs_upcasting:
+            self.vae.to(dtype=torch.float32)
+        rgb_latent = self.encode_RGB(input_rgb)
+        rgb_latent = rgb_latent.to(image_embeddings.dtype)
+        if depth_pred_last is not None:
+            # Repeat along dim 2 three times so the depth goes through the same
+            # encoder as the RGB input (assumes dim 2 is the channel axis -- TODO confirm).
+            depth_pred_last = depth_pred_last.repeat(1, 1, 3, 1, 1)
+            depth_pred_last_latent = self.encode_RGB(depth_pred_last)
+            depth_pred_last_latent = depth_pred_last_latent.to(image_embeddings.dtype)
+        else:
+            depth_pred_last_latent = None
+        # cast back to fp16 if needed
+        if needs_upcasting:
+            self.vae.to(dtype=torch.float16)
+        # Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # Initial depth latent with the same shape as the RGB latent.
+        depth_latent = self.prepare_latents(
+            rgb_latent.shape,
+            image_embeddings.dtype,
+            device,
+            generator
+        )
+        if show_pbar:
+            iterable = tqdm(
+                enumerate(timesteps),
+                total=len(timesteps),
+                leave=False,
+                desc=" " * 4 + "Diffusion denoising",
+            )
+        else:
+            iterable = enumerate(timesteps)
+        for i, t in iterable:
+            if depth_pred_last_latent is not None:
+                # Overwrite the leading entries along dim 1 with the known depth
+                # latent plus fresh noise scaled by the current step's sigma, so
+                # they stay tied to the supplied prediction at every step.
+                known_frames_num = depth_pred_last_latent.shape[1]
+                epsilon = randn_tensor(
+                    depth_pred_last_latent.shape,
+                    generator=generator,
+                    device=device,
+                    dtype=image_embeddings.dtype
+                    )
+                depth_latent[:, :known_frames_num] = depth_pred_last_latent + epsilon * self.scheduler.sigmas[i]
+            # NOTE(review): the scaled tensor replaces depth_latent, so it is also
+            # the sample passed to scheduler.step below -- confirm this is intended.
+            depth_latent = self.scheduler.scale_model_input(depth_latent, t)
+            # RGB and depth latents are concatenated along dim 2 before the UNet.
+            unet_input = torch.cat([rgb_latent, depth_latent], dim=2)
+            noise_pred = self.unet(
+                unet_input, t, image_embeddings, added_time_ids=added_time_ids
+            )[0]
+            # compute the previous noisy sample x_t -> x_t-1
+            depth_latent = self.scheduler.step(noise_pred, t, depth_latent).prev_sample
+        torch.cuda.empty_cache()
+        # NOTE(review): the VAE was already cast back to float16 after encoding
+        # above; this second cast looks redundant -- confirm before removing.
+        if needs_upcasting:
+            self.vae.to(dtype=torch.float16)
+        depth = self.decode_depth(depth_latent, decode_chunk_size=decode_chunk_size)
+        # clip prediction
+        depth = torch.clip(depth, -1.0, 1.0)
+        # shift to [0, 1]
+        depth = (depth + 1.0) / 2.0
+        return depth
+# resizing utils
+def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
+    h, w = input.shape[-2:]
+    factors = (h / size[0], w / size[1])
+    # First, we have to determine sigma
+    # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
+    sigmas = (
+        max((factors[0] - 1.0) / 2.0, 0.001),
+        max((factors[1] - 1.0) / 2.0, 0.001),
+    )
+    # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
+    # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
+    # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
+    ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
+    # Make sure it is odd
+    if (ks[0] % 2) == 0:
+        ks = ks[0] + 1, ks[1]
+    if (ks[1] % 2) == 0:
+        ks = ks[0], ks[1] + 1
+    input = _gaussian_blur2d(input, ks, sigmas)
+    output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners)
+    return output
+def _compute_padding(kernel_size):
+    """Compute padding tuple."""
+    # 4 or 6 ints:  (padding_left, padding_right,padding_top,padding_bottom)
+    # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
+    if len(kernel_size) < 2:
+        raise AssertionError(kernel_size)
+    computed = [k - 1 for k in kernel_size]
+    # for even kernels we need to do asymmetric padding :(
+    out_padding = 2 * len(kernel_size) * [0]
+    for i in range(len(kernel_size)):
+        computed_tmp = computed[-(i + 1)]
+        pad_front = computed_tmp // 2
+        pad_rear = computed_tmp - pad_front
+        out_padding[2 * i + 0] = pad_front
+        out_padding[2 * i + 1] = pad_rear
+    return out_padding
+def _filter2d(input, kernel):
+    # prepare kernel
+    b, c, h, w = input.shape
+    tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)
+    tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
+    height, width = tmp_kernel.shape[-2:]
+    padding_shape: list[int] = _compute_padding([height, width])
+    input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
+    # kernel and input tensor reshape to align element-wise or batch-wise params
+    tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
+    input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
+    # convolve the tensor with the kernel.
+    output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
+    out = output.view(b, c, h, w)
+    return out
+def _gaussian(window_size: int, sigma):
+    if isinstance(sigma, float):
+        sigma = torch.tensor([[sigma]])
+    batch_size = sigma.shape[0]
+    x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
+    if window_size % 2 == 0:
+        x = x + 0.5
+    gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
+    return gauss / gauss.sum(-1, keepdim=True)
+def _gaussian_blur2d(input, kernel_size, sigma):
+    if isinstance(sigma, tuple):
+        sigma = torch.tensor([sigma], dtype=input.dtype)
+    else:
+        sigma = sigma.to(dtype=input.dtype)
+    ky, kx = int(kernel_size[0]), int(kernel_size[1])
+    bs = sigma.shape[0]
+    kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
+    kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
+    out_x = _filter2d(input, kernel_x[..., None, :])
+    out = _filter2d(out_x, kernel_y[..., None])
+    return out

gradio_patches/examples.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from pathlib import Path
+import gradio
+from gradio.utils import get_cache_folder
+class Examples(gradio.helpers.Examples):
+    def __init__(self, *args, directory_name=None, **kwargs):
+        super().__init__(*args, **kwargs, _initiated_directly=False)
+        if directory_name is not None:
+            self.cached_folder = get_cache_folder() / directory_name
+            self.cached_file = Path(self.cached_folder) / "log.csv"
+        self.create()

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+spaces
+gradio>=4.32.1
+diffusers==0.26.0
+easydict==1.13
+einops==0.8.0
+matplotlib==3.8.4
+mediapy==1.2.2
+numpy==1.26.4
+Pillow==10.3.0
+torch==2.0.1
+torchvision==0.15.2
+tqdm==4.66.2
+accelerate==0.28.0
+transformers==4.36.2