bubbliiiing committed on
Commit e262715 · 1 Parent(s): 788d423
update v3

Files changed:
- .gitignore +160 -0
- app.py +3 -3
- easyanimate/api/api.py +38 -4
- easyanimate/api/post_infer.py +9 -7
- easyanimate/data/dataset_image_video.py +64 -3
- easyanimate/models/attention.py +196 -139
- easyanimate/models/autoencoder_magvit.py +9 -3
- easyanimate/models/motion_module.py +146 -277
- easyanimate/models/norm.py +97 -0
- easyanimate/models/patch.py +1 -1
- easyanimate/models/transformer3d.py +81 -75
- easyanimate/pipeline/pipeline_easyanimate.py +1 -1
- easyanimate/pipeline/pipeline_easyanimate_inpaint.py +257 -91
- easyanimate/ui/ui.py +810 -173
- easyanimate/utils/utils.py +107 -0
 
    	
.gitignore ADDED
@@ -0,0 +1,160 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
    	
app.py CHANGED
@@ -11,9 +11,9 @@ if __name__ == "__main__":
     server_port = 7860
 
     # Params below is used when ui_mode = "modelscope"
-    edition = "
-    config_path = "config/
-    model_name = "models/Diffusion_Transformer/
+    edition = "v3"
+    config_path = "config/easyanimate_video_slicevae_motion_module_v3.yaml"
+    model_name = "models/Diffusion_Transformer/EasyAnimateV3-XL-2-InP-512x512"
     savedir_sample = "samples"
 
     if ui_mode == "modelscope":
    	
easyanimate/api/api.py CHANGED
@@ -1,10 +1,14 @@
 import io
+import gc
 import base64
 import torch
 import gradio as gr
+import tempfile
+import hashlib
 
 from fastapi import FastAPI
 from io import BytesIO
+from PIL import Image
 
 # Function to encode a file to Base64
 def encode_file_to_base64(file_path):
@@ -59,16 +63,34 @@ def infer_forward_api(_: gr.Blocks, app: FastAPI, controller):
         lora_model_path = datas.get('lora_model_path', 'none')
         lora_alpha_slider = datas.get('lora_alpha_slider', 0.55)
         prompt_textbox = datas.get('prompt_textbox', None)
-        negative_prompt_textbox = datas.get('negative_prompt_textbox', '')
+        negative_prompt_textbox = datas.get('negative_prompt_textbox', 'The video is not of a high quality, it has a low resolution, and the audio quality is not clear. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, rotating camera, blurry camera, shaking camera. Deformation, low-resolution, blurry, ugly, distortion.')
         sampler_dropdown = datas.get('sampler_dropdown', 'Euler')
         sample_step_slider = datas.get('sample_step_slider', 30)
+        resize_method = datas.get('resize_method', "Generate by")
         width_slider = datas.get('width_slider', 672)
         height_slider = datas.get('height_slider', 384)
+        base_resolution = datas.get('base_resolution', 512)
         is_image = datas.get('is_image', False)
+        generation_method = datas.get('generation_method', False)
         length_slider = datas.get('length_slider', 144)
+        overlap_video_length = datas.get('overlap_video_length', 4)
+        partial_video_length = datas.get('partial_video_length', 72)
         cfg_scale_slider = datas.get('cfg_scale_slider', 6)
+        start_image = datas.get('start_image', None)
+        end_image = datas.get('end_image', None)
         seed_textbox = datas.get("seed_textbox", 43)
 
+        generation_method = "Image Generation" if is_image else generation_method
+
+        temp_directory = tempfile.gettempdir()
+        if start_image is not None:
+            start_image = base64.b64decode(start_image)
+            start_image = [Image.open(BytesIO(start_image))]
+
+        if end_image is not None:
+            end_image = base64.b64decode(end_image)
+            end_image = [Image.open(BytesIO(end_image))]
+
         try:
             save_sample_path, comment = controller.generate(
                 "",
@@ -80,17 +102,29 @@ def infer_forward_api(_: gr.Blocks, app: FastAPI, controller):
                 negative_prompt_textbox, 
                 sampler_dropdown, 
                 sample_step_slider, 
+                resize_method,
                 width_slider, 
                 height_slider, 
-
+                base_resolution,
+                generation_method,
                 length_slider, 
+                overlap_video_length, 
+                partial_video_length, 
                 cfg_scale_slider, 
+                start_image,
+                end_image,
                 seed_textbox,
                 is_api = True,
             )
         except Exception as e:
+            gc.collect()
             torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
            save_sample_path = ""
            comment = f"Error. error information is {str(e)}"
-
-
+            return {"message": comment}
+
+        if save_sample_path != "":
+            return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": encode_file_to_base64(save_sample_path)}
+        else:
+            return {"message": comment, "save_sample_path": save_sample_path}
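For reference, the new start_image / end_image fields are plain base64 strings: the handler above decodes them with base64.b64decode and opens them via Image.open(BytesIO(...)). Below is a minimal client-side sketch of how such a payload could be built; the helper name and the example prompt/values are illustrative and not part of this commit.

import base64

def encode_image_to_base64(image_path):
    # Hypothetical client helper: send the raw image file bytes as a base64
    # string so the server-side b64decode + Image.open round-trips cleanly.
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

# Example request body using fields read by infer_forward_api above.
datas = {
    "prompt_textbox": "an example prompt",
    "generation_method": "Video Generation",
    "length_slider": 72,
    "start_image": encode_image_to_base64("start.png"),
    "end_image": None,  # optional; same encoding when provided
}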
    	
easyanimate/api/post_infer.py CHANGED
@@ -26,7 +26,7 @@ def post_update_edition(edition, url='http://0.0.0.0:7860'):
     data = r.content.decode('utf-8')
     return data
 
-def post_infer(is_image, length_slider, url='http://127.0.0.1:7860'):
+def post_infer(generation_method, length_slider, url='http://127.0.0.1:7860'):
     datas = json.dumps({
         "base_model_path": "none",
         "motion_module_path": "none",
@@ -38,7 +38,7 @@ def post_infer(is_image, length_slider, url='http://127.0.0.1:7860'):
         "sample_step_slider": 30, 
         "width_slider": 672, 
         "height_slider": 384, 
-        "
+        "generation_method": "Video Generation",
         "length_slider": length_slider,
         "cfg_scale_slider": 6,
         "seed_textbox": 43,
@@ -55,29 +55,31 @@ if __name__ == '__main__':
     # -------------------------- #
     #  Step 1: update edition
     # -------------------------- #
-    edition = "
+    edition = "v3"
     outputs = post_update_edition(edition)
     print('Output update edition: ', outputs)
 
     # -------------------------- #
     #  Step 2: update edition
     # -------------------------- #
-    diffusion_transformer_path = "
+    diffusion_transformer_path = "models/Diffusion_Transformer/EasyAnimateV3-XL-2-512x512"
     outputs = post_diffusion_transformer(diffusion_transformer_path)
     print('Output update edition: ', outputs)
 
     # -------------------------- #
     #  Step 3: infer
     # -------------------------- #
-
-
-
+    # "Video Generation" and "Image Generation"
+    generation_method = "Video Generation"
+    length_slider = 72
+    outputs = post_infer(generation_method, length_slider)
 
     # Get decoded data
     outputs = json.loads(outputs)
     base64_encoding = outputs["base64_encoding"]
     decoded_data = base64.b64decode(base64_encoding)
 
+    is_image = True if generation_method == "Image Generation" else False
     if is_image or length_slider == 1:
         file_path = "1.png"
     else:
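An illustrative end-to-end use of the updated client above, assuming post_infer and its imports are available. Note that, as committed, post_infer() hardcodes "generation_method": "Video Generation" in its request body, and the video filename used below is an assumption because the else: branch is truncated in this view.

import base64
import json

# Sketch only: request a 72-frame video and write the decoded result to disk.
generation_method = "Video Generation"
length_slider = 72
outputs = json.loads(post_infer(generation_method, length_slider))

decoded_data = base64.b64decode(outputs["base64_encoding"])
# "1.mp4" is an assumed name; the else-branch filename is cut off above.
file_path = "1.png" if (generation_method == "Image Generation" or length_slider == 1) else "1.mp4"
with open(file_path, "wb") as f:
    f.write(decoded_data)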
    	
easyanimate/data/dataset_image_video.py CHANGED
@@ -12,6 +12,7 @@ import gc
 import numpy as np
 import torch
 import torchvision.transforms as transforms
+
 from func_timeout import func_timeout, FunctionTimedOut
 from decord import VideoReader
 from PIL import Image
@@ -21,6 +22,52 @@ from contextlib import contextmanager
 
 VIDEO_READER_TIMEOUT = 20
 
+def get_random_mask(shape):
+    f, c, h, w = shape
+
+    if f != 1:
+        mask_index = np.random.randint(1, 4)
+    else:
+        mask_index = np.random.randint(1, 2)
+    mask = torch.zeros((f, 1, h, w), dtype=torch.uint8)
+
+    if mask_index == 0:
+        center_x = torch.randint(0, w, (1,)).item()
+        center_y = torch.randint(0, h, (1,)).item()
+        block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item()  # width range of the block
+        block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item()  # height range of the block
+
+        start_x = max(center_x - block_size_x // 2, 0)
+        end_x = min(center_x + block_size_x // 2, w)
+        start_y = max(center_y - block_size_y // 2, 0)
+        end_y = min(center_y + block_size_y // 2, h)
+        mask[:, :, start_y:end_y, start_x:end_x] = 1
+    elif mask_index == 1:
+        mask[:, :, :, :] = 1
+    elif mask_index == 2:
+        mask_frame_index = np.random.randint(1, 5)
+        mask[mask_frame_index:, :, :, :] = 1
+    elif mask_index == 3:
+        mask_frame_index = np.random.randint(1, 5)
+        mask[mask_frame_index:-mask_frame_index, :, :, :] = 1
+    elif mask_index == 4:
+        center_x = torch.randint(0, w, (1,)).item()
+        center_y = torch.randint(0, h, (1,)).item()
+        block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item()  # width range of the block
+        block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item()  # height range of the block
+
+        start_x = max(center_x - block_size_x // 2, 0)
+        end_x = min(center_x + block_size_x // 2, w)
+        start_y = max(center_y - block_size_y // 2, 0)
+        end_y = min(center_y + block_size_y // 2, h)
+
+        mask_frame_before = np.random.randint(0, f // 2)
+        mask_frame_after = np.random.randint(f // 2, f)
+        mask[mask_frame_before:mask_frame_after, :, start_y:end_y, start_x:end_x] = 1
+    else:
+        raise ValueError(f"The mask_index {mask_index} is not define")
+    return mask
+
 class ImageVideoSampler(BatchSampler):
     """A sampler wrapper for grouping images with similar aspect ratio into a same batch.
 
@@ -88,10 +135,11 @@ class ImageVideoDataset(Dataset):
             video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16,
             image_sample_size=512,
             video_repeat=0,
-            text_drop_ratio
+            text_drop_ratio=-1,
             enable_bucket=False,
             video_length_drop_start=0.1, 
             video_length_drop_end=0.9,
+            enable_inpaint=False,
         ):
         # Loading annotations from files
         print(f"loading annotations from {ann_path} ...")
@@ -120,6 +168,8 @@ class ImageVideoDataset(Dataset):
         # TODO: enable bucket training
         self.enable_bucket = enable_bucket
         self.text_drop_ratio = text_drop_ratio
+        self.enable_inpaint  = enable_inpaint
+
         self.video_length_drop_start = video_length_drop_start
         self.video_length_drop_end = video_length_drop_end
 
@@ -165,7 +215,7 @@ class ImageVideoDataset(Dataset):
 
                 video_length = int(self.video_length_drop_end * len(video_reader))
                 clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1)
-                start_idx   = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length)
+                start_idx   = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0
                 batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int)
 
                 try:
@@ -230,6 +280,17 @@ class ImageVideoDataset(Dataset):
            except Exception as e:
                print(e, self.dataset[idx % len(self.dataset)])
                idx = random.randint(0, self.length-1)
+
+        if self.enable_inpaint and not self.enable_bucket:
+            mask = get_random_mask(pixel_values.size())
+            mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask
+            sample["mask_pixel_values"] = mask_pixel_values
+            sample["mask"] = mask
+
+            clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous()
+            clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255
+            sample["clip_pixel_values"] = clip_pixel_values
+
         return sample
 
 if __name__ == "__main__":
@@ -238,4 +299,4 @@ if __name__ == "__main__":
     )
     dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=16)
     for idx, batch in enumerate(dataloader):
-        print(batch["pixel_values"].shape, len(batch["text"]))
+        print(batch["pixel_values"].shape, len(batch["text"]))
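A small sanity-check sketch for the new get_random_mask helper and the enable_inpaint branch above, using a dummy tensor in place of real decoded frames; the shapes and value range follow the dataset code, and nothing in this snippet is part of the commit itself.

import torch
# get_random_mask is the helper added in this commit (see the diff above).

# Dummy (frames, channels, height, width) clip scaled to [-1, 1].
pixel_values = torch.rand(16, 3, 256, 256) * 2 - 1

mask = get_random_mask(pixel_values.size())  # (f, 1, h, w) uint8, broadcasts over channels
mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask

# Masked regions are filled with -1, matching the inpaint branch of __getitem__.
print(mask.shape, mask_pixel_values.min().item(), mask_pixel_values.max().item())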
    	
        easyanimate/models/attention.py
    CHANGED
    
    | 
@@ -11,17 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import math
 from typing import Any, Dict, Optional
 
 import torch
 import torch.nn.functional as F
 import torch.nn.init as init
-
 from diffusers.models.attention import AdaLayerNorm, FeedForward
-from diffusers.models.attention_processor import Attention
 from diffusers.models.embeddings import SinusoidalPositionalEmbedding
-from diffusers.models.lora import LoRACompatibleLinear
 from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormZero
 from diffusers.utils import USE_PEFT_BACKEND
 from diffusers.utils.import_utils import is_xformers_available
@@ -29,7 +37,8 @@ from diffusers.utils.torch_utils import maybe_allow_in_graph
 from einops import rearrange, repeat
 from torch import nn
 
-from .motion_module import get_motion_module
 
 if is_xformers_available():
     import xformers
@@ -38,6 +47,13 @@ else:
     xformers = None
 
 
 @maybe_allow_in_graph
 class GatedSelfAttentionDense(nn.Module):
     r"""
@@ -59,8 +75,8 @@ class GatedSelfAttentionDense(nn.Module):
         self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
         self.ff = FeedForward(query_dim, activation_fn="geglu")
 
-        self.norm1 = nn.LayerNorm(query_dim)
-        self.norm2 = nn.LayerNorm(query_dim)
 
         self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
         self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
@@ -80,14 +96,6 @@ class GatedSelfAttentionDense(nn.Module):
         return x
 
 
-def zero_module(module):
-    # Zero out the parameters of a module and return it.
-    for p in module.parameters():
-        p.detach().zero_()
-    return module
-
-
-
 class KVCompressionCrossAttention(nn.Module):
     r"""
     A cross attention layer.
@@ -154,7 +162,7 @@ class KVCompressionCrossAttention(nn.Module):
             stride=2,
             bias=True
         )
-        self.kv_compression_norm = nn.LayerNorm(query_dim)
         init.constant_(self.kv_compression.weight, 1 / 4)
         if self.kv_compression.bias is not None:
             init.constant_(self.kv_compression.bias, 0)
@@ -410,6 +418,8 @@ class TemporalTransformerBlock(nn.Module):
         # motion module kwargs
         motion_module_type = "VanillaGrid",
         motion_module_kwargs = None,
     ):
         super().__init__()
         self.only_cross_attention = only_cross_attention
@@ -442,7 +452,7 @@ class TemporalTransformerBlock(nn.Module):
         elif self.use_ada_layer_norm_zero:
             self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
         else:
-            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
 
         self.kvcompression = kvcompression
         if kvcompression:
@@ -456,16 +466,28 @@ class TemporalTransformerBlock(nn.Module):
                 upcast_attention=upcast_attention,
             )
         else:
-            self.attn1 = Attention(
-                query_dim=dim,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                cross_attention_dim=cross_attention_dim if only_cross_attention else None,
-                upcast_attention=upcast_attention,
-            )
-
 
         self.attn_temporal = get_motion_module(
             in_channels = dim,
@@ -481,27 +503,45 @@ class TemporalTransformerBlock(nn.Module):
             self.norm2 = (
                 AdaLayerNorm(dim, num_embeds_ada_norm)
                 if self.use_ada_layer_norm
-                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
             )
-            self.attn2 = Attention(
-                query_dim=dim,
-                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                upcast_attention=upcast_attention,
-            )  # is self-attn if encoder_hidden_states is none
         else:
             self.norm2 = None
             self.attn2 = None
 
         # 3. Feed-forward
         if not self.use_ada_layer_norm_single:
-            self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
 
         self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
 
         # 4. Fuser
         if attention_type == "gated" or attention_type == "gated-text-image":
             self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
@@ -654,6 +694,9 @@ class TemporalTransformerBlock(nn.Module):
             )
         else:
             ff_output = self.ff(norm_hidden_states, scale=lora_scale)
 
         if self.use_ada_layer_norm_zero:
             ff_output = gate_mlp.unsqueeze(1) * ff_output
@@ -723,6 +766,8 @@ class SelfAttentionTemporalTransformerBlock(nn.Module):
         attention_type: str = "default",
         positional_embeddings: Optional[str] = None,
         num_positional_embeddings: Optional[int] = None,
     ):
         super().__init__()
         self.only_cross_attention = only_cross_attention
@@ -755,17 +800,30 @@ class SelfAttentionTemporalTransformerBlock(nn.Module):
         elif self.use_ada_layer_norm_zero:
             self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
         else:
-            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
 
-        self.attn1 = Attention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            dropout=dropout,
-            bias=attention_bias,
-            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
-            upcast_attention=upcast_attention,
-        )
 
         # 2. Cross-Attn
         if cross_attention_dim is not None or double_self_attention:
@@ -775,27 +833,45 @@ class SelfAttentionTemporalTransformerBlock(nn.Module):
             self.norm2 = (
                 AdaLayerNorm(dim, num_embeds_ada_norm)
                 if self.use_ada_layer_norm
-                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
             )
-            self.attn2 = Attention(
-                query_dim=dim,
-                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                upcast_attention=upcast_attention,
-            )  # is self-attn if encoder_hidden_states is none
         else:
             self.norm2 = None
             self.attn2 = None
 
         # 3. Feed-forward
         if not self.use_ada_layer_norm_single:
-            self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
 
         self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
 
         # 4. Fuser
         if attention_type == "gated" or attention_type == "gated-text-image":
             self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
@@ -927,6 +1003,9 @@ class SelfAttentionTemporalTransformerBlock(nn.Module):
             )
         else:
             ff_output = self.ff(norm_hidden_states, scale=lora_scale)
 
         if self.use_ada_layer_norm_zero:
             ff_output = gate_mlp.unsqueeze(1) * ff_output
@@ -997,6 +1076,8 @@ class KVCompressionTransformerBlock(nn.Module):
         positional_embeddings: Optional[str] = None,
         num_positional_embeddings: Optional[int] = None,
         kvcompression: Optional[bool] = False,
     ):
         super().__init__()
         self.only_cross_attention = only_cross_attention
@@ -1029,7 +1110,7 @@ class KVCompressionTransformerBlock(nn.Module):
         elif self.use_ada_layer_norm_zero:
             self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
         else:
-            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
 
         self.kvcompression = kvcompression
         if kvcompression:
@@ -1043,16 +1124,28 @@ class KVCompressionTransformerBlock(nn.Module):
                 upcast_attention=upcast_attention,
             )
         else:
-            self.attn1 = Attention(
-                query_dim=dim,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                cross_attention_dim=cross_attention_dim if only_cross_attention else None,
-                upcast_attention=upcast_attention,
-            )
-
 
         # 2. Cross-Attn
         if cross_attention_dim is not None or double_self_attention:
@@ -1062,27 +1155,45 @@ class KVCompressionTransformerBlock(nn.Module):
             self.norm2 = (
                 AdaLayerNorm(dim, num_embeds_ada_norm)
                 if self.use_ada_layer_norm
-                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
             )
-            self.attn2 = Attention(
-                query_dim=dim,
-                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
-                heads=num_attention_heads,
-                dim_head=attention_head_dim,
-                dropout=dropout,
-                bias=attention_bias,
-                upcast_attention=upcast_attention,
-            )  # is self-attn if encoder_hidden_states is none
         else:
             self.norm2 = None
             self.attn2 = None
 
         # 3. Feed-forward
         if not self.use_ada_layer_norm_single:
-            self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
 
         self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
 
         # 4. Fuser
         if attention_type == "gated" or attention_type == "gated-text-image":
             self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
@@ -1229,6 +1340,9 @@ class KVCompressionTransformerBlock(nn.Module):
             )
         else:
             ff_output = self.ff(norm_hidden_states, scale=lora_scale)
 
         if self.use_ada_layer_norm_zero:
             ff_output = gate_mlp.unsqueeze(1) * ff_output
@@ -1239,61 +1353,4 @@ class KVCompressionTransformerBlock(nn.Module):
         if hidden_states.ndim == 4:
             hidden_states = hidden_states.squeeze(1)
 
-        return hidden_states
-
-
-class FeedForward(nn.Module):
-    r"""
-    A feed-forward layer.
-
-    Parameters:
-        dim (`int`): The number of channels in the input.
-        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
-        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
-        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        dim_out: Optional[int] = None,
-        mult: int = 4,
-        dropout: float = 0.0,
-        activation_fn: str = "geglu",
-        final_dropout: bool = False,
-    ):
-        super().__init__()
-        inner_dim = int(dim * mult)
-        dim_out = dim_out if dim_out is not None else dim
-        linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear
-
-        if activation_fn == "gelu":
-            act_fn = GELU(dim, inner_dim)
-        if activation_fn == "gelu-approximate":
-            act_fn = GELU(dim, inner_dim, approximate="tanh")
-        elif activation_fn == "geglu":
-            act_fn = GEGLU(dim, inner_dim)
-        elif activation_fn == "geglu-approximate":
-            act_fn = ApproximateGELU(dim, inner_dim)
-
-        self.net = nn.ModuleList([])
-        # project in
-        self.net.append(act_fn)
-        # project dropout
-        self.net.append(nn.Dropout(dropout))
-        # project out
-        self.net.append(linear_cls(inner_dim, dim_out))
-        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
-        if final_dropout:
-            self.net.append(nn.Dropout(dropout))
-
-    def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
-        compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear)
-        for module in self.net:
-            if isinstance(module, compatible_cls):
-                hidden_states = module(hidden_states, scale)
-            else:
-                hidden_states = module(hidden_states)
-        return hidden_states
@@ -11,17 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import Any, Dict, Optional
 
+import diffusers
+import pkg_resources
 import torch
 import torch.nn.functional as F
 import torch.nn.init as init
+
+installed_version = diffusers.__version__
+
+if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2"):
+    from diffusers.models.attention_processor import (Attention,
+                                                      AttnProcessor2_0,
+                                                      HunyuanAttnProcessor2_0)
+else:
+    from diffusers.models.attention_processor import Attention, AttnProcessor2_0
+
 from diffusers.models.attention import AdaLayerNorm, FeedForward
 from diffusers.models.embeddings import SinusoidalPositionalEmbedding
 from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormZero
 from diffusers.utils import USE_PEFT_BACKEND
 from diffusers.utils.import_utils import is_xformers_available
@@ -29,7 +37,8 @@ from diffusers.utils.torch_utils import maybe_allow_in_graph
 from einops import rearrange, repeat
 from torch import nn
 
+from .motion_module import PositionalEncoding, get_motion_module
+from .norm import FP32LayerNorm
 
 if is_xformers_available():
     import xformers
@@ -38,6 +47,13 @@ else:
     xformers = None
 
 
+def zero_module(module):
+    # Zero out the parameters of a module and return it.
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+
+
 @maybe_allow_in_graph
 class GatedSelfAttentionDense(nn.Module):
     r"""
@@ -59,8 +75,8 @@ class GatedSelfAttentionDense(nn.Module):
         self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
         self.ff = FeedForward(query_dim, activation_fn="geglu")
 
+        self.norm1 = FP32LayerNorm(query_dim)
+        self.norm2 = FP32LayerNorm(query_dim)
 
         self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
         self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
@@ -80,14 +96,6 @@ class GatedSelfAttentionDense(nn.Module):
         return x
 
 
             
            class KVCompressionCrossAttention(nn.Module):
         
     | 
| 100 | 
         
             
                r"""
         
     | 
| 101 | 
         
             
                A cross attention layer.
         
     | 
| 
         | 
|
| 162 | 
         
             
                        stride=2,
         
     | 
| 163 | 
         
             
                        bias=True
         
     | 
| 164 | 
         
             
                    )
         
     | 
| 165 | 
         
            +
                    self.kv_compression_norm = FP32LayerNorm(query_dim)
         
     | 
| 166 | 
         
             
                    init.constant_(self.kv_compression.weight, 1 / 4)
         
     | 
| 167 | 
         
             
                    if self.kv_compression.bias is not None:
         
     | 
| 168 | 
         
             
                        init.constant_(self.kv_compression.bias, 0)
         
     | 
| 
         | 
|
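FP32LayerNorm is imported from the new easyanimate/models/norm.py, which is not part of this file's diff. As a hedged illustration only, the usual pattern behind such a class is to compute the LayerNorm statistics in float32 and cast the result back to the input dtype, which keeps fp16/bf16 training numerically stable. A minimal sketch under that assumption:

import torch
import torch.nn as nn
import torch.nn.functional as F

class FP32LayerNormSketch(nn.LayerNorm):
    # Assumed behaviour: run LayerNorm in float32, return the original dtype.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        origin_dtype = x.dtype
        weight = self.weight.float() if self.weight is not None else None
        bias = self.bias.float() if self.bias is not None else None
        out = F.layer_norm(x.float(), self.normalized_shape, weight, bias, self.eps)
        return out.to(origin_dtype)

norm = FP32LayerNormSketch(16)
x = torch.randn(2, 8, 16, dtype=torch.float16)
print(norm(x).dtype)  # torch.float16, but the statistics were computed in float32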
@@ -410,6 +418,8 @@ class TemporalTransformerBlock(nn.Module):
         # motion module kwargs
         motion_module_type = "VanillaGrid",
         motion_module_kwargs = None,
+        qk_norm = False,
+        after_norm = False,
     ):
         super().__init__()
         self.only_cross_attention = only_cross_attention
@@ -442,7 +452,7 @@ class TemporalTransformerBlock(nn.Module):
         elif self.use_ada_layer_norm_zero:
             self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
         else:
+            self.norm1 = FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
 
         self.kvcompression = kvcompression
         if kvcompression:
@@ -456,16 +466,28 @@ class TemporalTransformerBlock(nn.Module):
                 upcast_attention=upcast_attention,
             )
         else:
+            if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2"):
+                self.attn1 = Attention(
+                    query_dim=dim,
+                    heads=num_attention_heads,
+                    dim_head=attention_head_dim,
+                    dropout=dropout,
+                    bias=attention_bias,
+                    cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+                    upcast_attention=upcast_attention,
+                    qk_norm="layer_norm" if qk_norm else None,
+                    processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
+                )
+            else:
+                self.attn1 = Attention(
+                    query_dim=dim,
+                    heads=num_attention_heads,
+                    dim_head=attention_head_dim,
+                    dropout=dropout,
+                    bias=attention_bias,
+                    cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+                    upcast_attention=upcast_attention,
+                )
 
         self.attn_temporal = get_motion_module(
             in_channels = dim,
@@ -481,27 +503,45 @@ class TemporalTransformerBlock(nn.Module):
             self.norm2 = (
                 AdaLayerNorm(dim, num_embeds_ada_norm)
                 if self.use_ada_layer_norm
+                else FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
             )
+            if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2"):
+                self.attn2 = Attention(
+                    query_dim=dim,
+                    cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                    heads=num_attention_heads,
+                    dim_head=attention_head_dim,
+                    dropout=dropout,
+                    bias=attention_bias,
+                    upcast_attention=upcast_attention,
+                    qk_norm="layer_norm" if qk_norm else None,
+                    processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
+                )  # is self-attn if encoder_hidden_states is none
+            else:
+                self.attn2 = Attention(
+                    query_dim=dim,
+                    cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                    heads=num_attention_heads,
+                    dim_head=attention_head_dim,
+                    dropout=dropout,
+                    bias=attention_bias,
+                    upcast_attention=upcast_attention,
+                )  # is self-attn if encoder_hidden_states is none
         else:
             self.norm2 = None
             self.attn2 = None
 
         # 3. Feed-forward
         if not self.use_ada_layer_norm_single:
+            self.norm3 = FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
 
         self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
 
+        if after_norm:
+            self.norm4 = FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
+        else:
+            self.norm4 = None
+
         # 4. Fuser
         if attention_type == "gated" or attention_type == "gated-text-image":
             self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
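The two new constructor flags are wired as follows: qk_norm=True asks diffusers' Attention for qk_norm="layer_norm" and switches the processor to HunyuanAttnProcessor2_0 (hence the 0.28.2 version gate at the top of the file), while after_norm=True appends an extra FP32LayerNorm behind the feed-forward. A condensed, standalone sketch of the qk_norm wiring, assuming a diffusers release that ships HunyuanAttnProcessor2_0 (the helper name and the dimensions below are illustrative only):

import torch
from diffusers.models.attention_processor import Attention, AttnProcessor2_0, HunyuanAttnProcessor2_0

def build_self_attention(dim: int, heads: int, head_dim: int, qk_norm: bool) -> Attention:
    # With qk_norm=True, queries and keys are LayerNorm-ed before the dot product,
    # and the Hunyuan processor is the one that applies those norms.
    return Attention(
        query_dim=dim,
        heads=heads,
        dim_head=head_dim,
        qk_norm="layer_norm" if qk_norm else None,
        processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
    )

attn = build_self_attention(dim=64, heads=8, head_dim=8, qk_norm=True)
hidden_states = torch.randn(1, 16, 64)
print(attn(hidden_states).shape)  # torch.Size([1, 16, 64])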
@@ -654,6 +694,9 @@ class TemporalTransformerBlock(nn.Module):
             )
         else:
             ff_output = self.ff(norm_hidden_states, scale=lora_scale)
+
+        if self.norm4 is not None:
+            ff_output = self.norm4(ff_output)
 
         if self.use_ada_layer_norm_zero:
             ff_output = gate_mlp.unsqueeze(1) * ff_output
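For reference, a toy sketch of where the optional norm4 sits in the forward pass when after_norm is enabled (plain nn.LayerNorm and a simple MLP stand in for FP32LayerNorm and the block's FeedForward here):

import torch
import torch.nn as nn

dim = 64
ff = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))
norm4 = nn.LayerNorm(dim)  # would be None when after_norm=False

hidden_states = torch.randn(1, 16, dim)
ff_output = ff(hidden_states)
if norm4 is not None:
    ff_output = norm4(ff_output)          # extra normalization on the feed-forward output
hidden_states = ff_output + hidden_states  # residual connection, as in the block's forward
print(hidden_states.shape)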
| 766 | 
         
             
                    attention_type: str = "default",
         
     | 
| 767 | 
         
             
                    positional_embeddings: Optional[str] = None,
         
     | 
| 768 | 
         
             
                    num_positional_embeddings: Optional[int] = None,
         
     | 
| 769 | 
         
            +
                    qk_norm = False,
         
     | 
| 770 | 
         
            +
                    after_norm = False,
         
     | 
| 771 | 
         
             
                ):
         
     | 
| 772 | 
         
             
                    super().__init__()
         
     | 
| 773 | 
         
             
                    self.only_cross_attention = only_cross_attention
         
     | 
| 
         | 
|
| 800 | 
         
             
                    elif self.use_ada_layer_norm_zero:
         
     | 
| 801 | 
         
             
                        self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
         
     | 
| 802 | 
         
             
                    else:
         
     | 
| 803 | 
         
            +
                        self.norm1 = FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
         
     | 
| 804 | 
         | 
| 805 | 
         
            +
                    if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2"):
         
     | 
| 806 | 
         
            +
                        self.attn1 = Attention(
         
     | 
| 807 | 
         
            +
                            query_dim=dim,
         
     | 
| 808 | 
         
            +
                            heads=num_attention_heads,
         
     | 
| 809 | 
         
            +
                            dim_head=attention_head_dim,
         
     | 
| 810 | 
         
            +
                            dropout=dropout,
         
     | 
| 811 | 
         
            +
                            bias=attention_bias,
         
     | 
| 812 | 
         
            +
                            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
         
     | 
| 813 | 
         
            +
                            upcast_attention=upcast_attention,
         
     | 
| 814 | 
         
            +
                            qk_norm="layer_norm" if qk_norm else None,
         
     | 
| 815 | 
         
            +
                            processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
         
     | 
| 816 | 
         
            +
                        )
         
     | 
| 817 | 
         
            +
                    else:
         
     | 
| 818 | 
         
            +
                        self.attn1 = Attention(
         
     | 
| 819 | 
         
            +
                            query_dim=dim,
         
     | 
| 820 | 
         
            +
                            heads=num_attention_heads,
         
     | 
| 821 | 
         
            +
                            dim_head=attention_head_dim,
         
     | 
| 822 | 
         
            +
                            dropout=dropout,
         
     | 
| 823 | 
         
            +
                            bias=attention_bias,
         
     | 
| 824 | 
         
            +
                            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
         
     | 
| 825 | 
         
            +
                            upcast_attention=upcast_attention,
         
     | 
| 826 | 
         
            +
                        )
         
     | 
| 827 | 
         | 
| 828 | 
         
             
                    # 2. Cross-Attn
         
     | 
| 829 | 
         
             
                    if cross_attention_dim is not None or double_self_attention:
         
     | 
| 
         | 
|
| 833 | 
         
             
                        self.norm2 = (
         
     | 
| 834 | 
         
             
                            AdaLayerNorm(dim, num_embeds_ada_norm)
         
     | 
| 835 | 
         
             
                            if self.use_ada_layer_norm
         
     | 
| 836 | 
         
            +
                            else FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
         
     | 
| 837 | 
         
             
                        )
         
     | 
| 838 | 
         
            +
                        if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2"):
         
     | 
| 839 | 
         
            +
                            self.attn2 = Attention(
         
     | 
| 840 | 
         
            +
                                query_dim=dim,
         
     | 
| 841 | 
         
            +
                                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
         
     | 
| 842 | 
         
            +
                                heads=num_attention_heads,
         
     | 
| 843 | 
         
            +
                                dim_head=attention_head_dim,
         
     | 
| 844 | 
         
            +
                                dropout=dropout,
         
     | 
| 845 | 
         
            +
                                bias=attention_bias,
         
     | 
| 846 | 
         
            +
                                upcast_attention=upcast_attention,
         
     | 
| 847 | 
         
            +
                                qk_norm="layer_norm" if qk_norm else None,
         
     | 
| 848 | 
         
            +
                                processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
         
     | 
| 849 | 
         
            +
                            )  # is self-attn if encoder_hidden_states is none
         
     | 
| 850 | 
         
            +
                        else:
         
     | 
| 851 | 
         
            +
                            self.attn2 = Attention(
         
     | 
| 852 | 
         
            +
                                query_dim=dim,
         
     | 
| 853 | 
         
            +
                                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
         
     | 
| 854 | 
         
            +
                                heads=num_attention_heads,
         
     | 
| 855 | 
         
            +
                                dim_head=attention_head_dim,
         
     | 
| 856 | 
         
            +
                                dropout=dropout,
         
     | 
| 857 | 
         
            +
                                bias=attention_bias,
         
     | 
| 858 | 
         
            +
                                upcast_attention=upcast_attention,
         
     | 
| 859 | 
         
            +
                            )  # is self-attn if encoder_hidden_states is none
         
     | 
| 860 | 
         
             
                    else:
         
     | 
| 861 | 
         
             
                        self.norm2 = None
         
     | 
| 862 | 
         
             
                        self.attn2 = None
         
     | 
| 863 | 
         | 
| 864 | 
         
             
                    # 3. Feed-forward
         
     | 
| 865 | 
         
             
                    if not self.use_ada_layer_norm_single:
         
     | 
| 866 | 
         
            +
                        self.norm3 = FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
         
     | 
| 867 | 
         | 
| 868 | 
         
             
                    self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
         
     | 
| 869 | 
         | 
| 870 | 
         
            +
                    if after_norm:
         
     | 
| 871 | 
         
            +
                        self.norm4 = FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
         
     | 
| 872 | 
         
            +
                    else:
         
     | 
| 873 | 
         
            +
                        self.norm4 = None
         
     | 
| 874 | 
         
            +
             
     | 
| 875 | 
         
             
                    # 4. Fuser
         
     | 
| 876 | 
         
             
                    if attention_type == "gated" or attention_type == "gated-text-image":
         
     | 
| 877 | 
         
             
                        self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
         
     | 
| 
         | 
|
| 1003 | 
         
             
                        )
         
     | 
| 1004 | 
         
             
                    else:
         
     | 
| 1005 | 
         
             
                        ff_output = self.ff(norm_hidden_states, scale=lora_scale)
         
     | 
| 1006 | 
         
            +
                    
         
     | 
| 1007 | 
         
            +
                    if self.norm4 is not None:
         
     | 
| 1008 | 
         
            +
                        ff_output = self.norm4(ff_output)
         
     | 
| 1009 | 
         | 
| 1010 | 
         
             
                    if self.use_ada_layer_norm_zero:
         
     | 
| 1011 | 
         
             
                        ff_output = gate_mlp.unsqueeze(1) * ff_output
         
     | 
| 1076 |          positional_embeddings: Optional[str] = None,
| 1077 |          num_positional_embeddings: Optional[int] = None,
| 1078 |          kvcompression: Optional[bool] = False,
| 1079 | +        qk_norm = False,
| 1080 | +        after_norm = False,
| 1081 |      ):
| 1082 |          super().__init__()
| 1083 |          self.only_cross_attention = only_cross_attention

| 1110 |          elif self.use_ada_layer_norm_zero:
| 1111 |              self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
| 1112 |          else:
| 1113 | +            self.norm1 = FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
| 1114 |
| 1115 |          self.kvcompression = kvcompression
| 1116 |          if kvcompression:

| 1124 |                  upcast_attention=upcast_attention,
| 1125 |              )
| 1126 |          else:
| 1127 | +            if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2"):
| 1128 | +                self.attn1 = Attention(
| 1129 | +                    query_dim=dim,
| 1130 | +                    heads=num_attention_heads,
| 1131 | +                    dim_head=attention_head_dim,
| 1132 | +                    dropout=dropout,
| 1133 | +                    bias=attention_bias,
| 1134 | +                    cross_attention_dim=cross_attention_dim if only_cross_attention else None,
| 1135 | +                    upcast_attention=upcast_attention,
| 1136 | +                    qk_norm="layer_norm" if qk_norm else None,
| 1137 | +                    processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
| 1138 | +                )
| 1139 | +            else:
| 1140 | +                self.attn1 = Attention(
| 1141 | +                    query_dim=dim,
| 1142 | +                    heads=num_attention_heads,
| 1143 | +                    dim_head=attention_head_dim,
| 1144 | +                    dropout=dropout,
| 1145 | +                    bias=attention_bias,
| 1146 | +                    cross_attention_dim=cross_attention_dim if only_cross_attention else None,
| 1147 | +                    upcast_attention=upcast_attention,
| 1148 | +                )
| 1149 |
| 1150 |          # 2. Cross-Attn
| 1151 |          if cross_attention_dim is not None or double_self_attention:

| 1155 |              self.norm2 = (
| 1156 |                  AdaLayerNorm(dim, num_embeds_ada_norm)
| 1157 |                  if self.use_ada_layer_norm
| 1158 | +                else FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
| 1159 |              )
| 1160 | +            if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2"):
| 1161 | +                self.attn2 = Attention(
| 1162 | +                    query_dim=dim,
| 1163 | +                    cross_attention_dim=cross_attention_dim if not double_self_attention else None,
| 1164 | +                    heads=num_attention_heads,
| 1165 | +                    dim_head=attention_head_dim,
| 1166 | +                    dropout=dropout,
| 1167 | +                    bias=attention_bias,
| 1168 | +                    upcast_attention=upcast_attention,
| 1169 | +                    qk_norm="layer_norm" if qk_norm else None,
| 1170 | +                    processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
| 1171 | +                )  # is self-attn if encoder_hidden_states is none
| 1172 | +            else:
| 1173 | +                self.attn2 = Attention(
| 1174 | +                    query_dim=dim,
| 1175 | +                    cross_attention_dim=cross_attention_dim if not double_self_attention else None,
| 1176 | +                    heads=num_attention_heads,
| 1177 | +                    dim_head=attention_head_dim,
| 1178 | +                    dropout=dropout,
| 1179 | +                    bias=attention_bias,
| 1180 | +                    upcast_attention=upcast_attention,
| 1181 | +                )  # is self-attn if encoder_hidden_states is none
| 1182 |          else:
| 1183 |              self.norm2 = None
| 1184 |              self.attn2 = None
| 1185 |
| 1186 |          # 3. Feed-forward
| 1187 |          if not self.use_ada_layer_norm_single:
| 1188 | +            self.norm3 = FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
| 1189 |
| 1190 |          self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
| 1191 |
| 1192 | +        if after_norm:
| 1193 | +            self.norm4 = FP32LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
| 1194 | +        else:
| 1195 | +            self.norm4 = None
| 1196 | +
| 1197 |          # 4. Fuser
| 1198 |          if attention_type == "gated" or attention_type == "gated-text-image":
| 1199 |              self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)

| 1340 |              )
| 1341 |          else:
| 1342 |              ff_output = self.ff(norm_hidden_states, scale=lora_scale)
| 1343 | +
| 1344 | +        if self.norm4 is not None:
| 1345 | +            ff_output = self.norm4(ff_output)
| 1346 |
| 1347 |          if self.use_ada_layer_norm_zero:
| 1348 |              ff_output = gate_mlp.unsqueeze(1) * ff_output

| 1353 |          if hidden_states.ndim == 4:
| 1354 |              hidden_states = hidden_states.squeeze(1)
| 1355 |
| 1356 | +        return hidden_states
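The other recurring change in this file is a diffusers version gate: when the installed diffusers is at least 0.28.2, the blocks build `Attention` with `qk_norm="layer_norm"` and `HunyuanAttnProcessor2_0` (which applies the query/key norms); older installs fall back to a plain `Attention` with `AttnProcessor2_0`. A hedged sketch of that gating in isolation; `build_self_attention` is an illustrative helper, and the keyword arguments assume a diffusers release that provides these classes:

    import diffusers
    import pkg_resources
    from diffusers.models.attention_processor import Attention, AttnProcessor2_0

    installed_version = diffusers.__version__
    NEW_DIFFUSERS = pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2")

    def build_self_attention(dim: int, heads: int, head_dim: int, qk_norm: bool = False) -> Attention:
        # Build one self-attention layer, enabling q/k LayerNorm only when diffusers supports it.
        common = dict(query_dim=dim, heads=heads, dim_head=head_dim, dropout=0.0, bias=True)
        if NEW_DIFFUSERS:
            from diffusers.models.attention_processor import HunyuanAttnProcessor2_0
            return Attention(
                **common,
                qk_norm="layer_norm" if qk_norm else None,
                processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
            )
        # Older diffusers: no qk_norm argument, default processor only.
        return Attention(**common, processor=AttnProcessor2_0())

    attn = build_self_attention(dim=320, heads=8, head_dim=40, qk_norm=True)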
        easyanimate/models/autoencoder_magvit.py
    CHANGED
    
@@ -17,7 +17,12 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.loaders import FromOriginalVAEMixin
+
+try:
+    from diffusers.loaders import FromOriginalVAEMixin
+except:
+    from diffusers.loaders import FromOriginalModelMixin as FromOriginalVAEMixin
+
 from diffusers.models.attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS, CROSS_ATTENTION_PROCESSORS, Attention,
     AttentionProcessor, AttnAddedKVProcessor, AttnProcessor)

@@ -93,6 +98,7 @@ class AutoencoderKLMagvit(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
         norm_num_groups: int = 32,
         scaling_factor: float = 0.1825,
         slice_compression_vae=False,
+        use_tiling=False,
         mini_batch_encoder=9,
         mini_batch_decoder=3,
     ):

@@ -145,8 +151,8 @@ class AutoencoderKLMagvit(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
         self.mini_batch_encoder = mini_batch_encoder
         self.mini_batch_decoder = mini_batch_decoder
         self.use_slicing = False
-        self.use_tiling =
-        self.tile_sample_min_size =
+        self.use_tiling = use_tiling
+        self.tile_sample_min_size = 384
         self.tile_overlap_factor = 0.25
         self.tile_latent_min_size = int(self.tile_sample_min_size / (2 ** (len(ch_mult) - 1)))
         self.scaling_factor = scaling_factor
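Two small changes here: the `FromOriginalVAEMixin` import is wrapped in a try/except because newer diffusers exposes `FromOriginalModelMixin` instead, and tiled encode/decode becomes opt-in through a `use_tiling` constructor argument with a fixed 384-pixel sample tile, from which the latent tile size is derived by dividing by the total downsampling factor `2 ** (len(ch_mult) - 1)`. A minimal sketch of both ideas; `TilingConfig` is an illustrative container rather than the repository's class, and the shim below uses `except ImportError` where the diff uses a bare `except`:

    # Compatibility shim: keep the old mixin name working across diffusers versions.
    try:
        from diffusers.loaders import FromOriginalVAEMixin
    except ImportError:
        from diffusers.loaders import FromOriginalModelMixin as FromOriginalVAEMixin

    class TilingConfig:
        # Illustrative container for the tiling attributes the VAE constructor sets up.
        def __init__(self, ch_mult=(1, 2, 4, 4), use_tiling: bool = False):
            self.use_tiling = use_tiling
            self.tile_sample_min_size = 384            # pixel-space tile edge
            self.tile_overlap_factor = 0.25            # 25% overlap between neighbouring tiles
            # Latent-space tile edge: divide by the total spatial downsampling 2 ** (len(ch_mult) - 1).
            self.tile_latent_min_size = int(self.tile_sample_min_size / (2 ** (len(ch_mult) - 1)))

    cfg = TilingConfig(use_tiling=True)
    print(cfg.tile_latent_min_size)                    # 384 / 8 = 48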
    	
        easyanimate/models/motion_module.py
    CHANGED
    
    | 
         @@ -1,248 +1,33 @@ 
     | 
|
| 1 | 
         
             
            """Modified from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/motion_module.py
         
     | 
| 2 | 
         
             
            """
         
     | 
| 3 | 
         
             
            import math
         
     | 
| 4 | 
         
            -
            from typing import Any, Callable, List, Optional, Tuple, Union
         
     | 
| 5 | 
         | 
| 
         | 
|
| 
         | 
|
| 6 | 
         
             
            import torch
         
     | 
| 7 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 8 | 
         
             
            from diffusers.models.attention import FeedForward
         
     | 
| 9 | 
         
             
            from diffusers.utils.import_utils import is_xformers_available
         
     | 
| 10 | 
         
             
            from einops import rearrange, repeat
         
     | 
| 11 | 
         
             
            from torch import nn
         
     | 
| 12 | 
         | 
| 
         | 
|
| 
         | 
|
| 13 | 
         
             
            if is_xformers_available():
         
     | 
| 14 | 
         
             
                import xformers
         
     | 
| 15 | 
         
             
                import xformers.ops
         
     | 
| 16 | 
         
             
            else:
         
     | 
| 17 | 
         
             
                xformers = None
         
     | 
| 18 | 
         | 
| 19 | 
         
            -
            class CrossAttention(nn.Module):
         
     | 
| 20 | 
         
            -
                r"""
         
     | 
| 21 | 
         
            -
                A cross attention layer.
         
     | 
| 22 | 
         
            -
             
     | 
| 23 | 
         
            -
                Parameters:
         
     | 
| 24 | 
         
            -
                    query_dim (`int`): The number of channels in the query.
         
     | 
| 25 | 
         
            -
                    cross_attention_dim (`int`, *optional*):
         
     | 
| 26 | 
         
            -
                        The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
         
     | 
| 27 | 
         
            -
                    heads (`int`,  *optional*, defaults to 8): The number of heads to use for multi-head attention.
         
     | 
| 28 | 
         
            -
                    dim_head (`int`,  *optional*, defaults to 64): The number of channels in each head.
         
     | 
| 29 | 
         
            -
                    dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
         
     | 
| 30 | 
         
            -
                    bias (`bool`, *optional*, defaults to False):
         
     | 
| 31 | 
         
            -
                        Set to `True` for the query, key, and value linear layers to contain a bias parameter.
         
     | 
| 32 | 
         
            -
                """
         
     | 
| 33 | 
         
            -
             
     | 
| 34 | 
         
            -
                def __init__(
         
     | 
| 35 | 
         
            -
                    self,
         
     | 
| 36 | 
         
            -
                    query_dim: int,
         
     | 
| 37 | 
         
            -
                    cross_attention_dim: Optional[int] = None,
         
     | 
| 38 | 
         
            -
                    heads: int = 8,
         
     | 
| 39 | 
         
            -
                    dim_head: int = 64,
         
     | 
| 40 | 
         
            -
                    dropout: float = 0.0,
         
     | 
| 41 | 
         
            -
                    bias=False,
         
     | 
| 42 | 
         
            -
                    upcast_attention: bool = False,
         
     | 
| 43 | 
         
            -
                    upcast_softmax: bool = False,
         
     | 
| 44 | 
         
            -
                    added_kv_proj_dim: Optional[int] = None,
         
     | 
| 45 | 
         
            -
                    norm_num_groups: Optional[int] = None,
         
     | 
| 46 | 
         
            -
                ):
         
     | 
| 47 | 
         
            -
                    super().__init__()
         
     | 
| 48 | 
         
            -
                    inner_dim = dim_head * heads
         
     | 
| 49 | 
         
            -
                    cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
         
     | 
| 50 | 
         
            -
                    self.upcast_attention = upcast_attention
         
     | 
| 51 | 
         
            -
                    self.upcast_softmax = upcast_softmax
         
     | 
| 52 | 
         
            -
             
     | 
| 53 | 
         
            -
                    self.scale = dim_head**-0.5
         
     | 
| 54 | 
         
            -
             
     | 
| 55 | 
         
            -
                    self.heads = heads
         
     | 
| 56 | 
         
            -
                    # for slice_size > 0 the attention score computation
         
     | 
| 57 | 
         
            -
                    # is split across the batch axis to save memory
         
     | 
| 58 | 
         
            -
                    # You can set slice_size with `set_attention_slice`
         
     | 
| 59 | 
         
            -
                    self.sliceable_head_dim = heads
         
     | 
| 60 | 
         
            -
                    self._slice_size = None
         
     | 
| 61 | 
         
            -
                    self._use_memory_efficient_attention_xformers = False
         
     | 
| 62 | 
         
            -
                    self.added_kv_proj_dim = added_kv_proj_dim
         
     | 
| 63 | 
         
            -
             
     | 
| 64 | 
         
            -
                    if norm_num_groups is not None:
         
     | 
| 65 | 
         
            -
                        self.group_norm = nn.GroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
         
     | 
| 66 | 
         
            -
                    else:
         
     | 
| 67 | 
         
            -
                        self.group_norm = None
         
     | 
| 68 | 
         
            -
             
     | 
| 69 | 
         
            -
                    self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
         
     | 
| 70 | 
         
            -
                    self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
         
     | 
| 71 | 
         
            -
                    self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
         
     | 
| 72 | 
         
            -
             
     | 
| 73 | 
         
            -
                    if self.added_kv_proj_dim is not None:
         
     | 
| 74 | 
         
            -
                        self.add_k_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
         
     | 
| 75 | 
         
            -
                        self.add_v_proj = nn.Linear(added_kv_proj_dim, cross_attention_dim)
         
     | 
| 76 | 
         
            -
             
     | 
| 77 | 
         
            -
                    self.to_out = nn.ModuleList([])
         
     | 
| 78 | 
         
            -
                    self.to_out.append(nn.Linear(inner_dim, query_dim))
         
     | 
| 79 | 
         
            -
                    self.to_out.append(nn.Dropout(dropout))
         
     | 
| 80 | 
         
            -
             
     | 
| 81 | 
         
            -
                def set_use_memory_efficient_attention_xformers(
         
     | 
| 82 | 
         
            -
                    self, valid: bool, attention_op: Optional[Callable] = None
         
     | 
| 83 | 
         
            -
                ) -> None:
         
     | 
| 84 | 
         
            -
                    self._use_memory_efficient_attention_xformers = valid
         
     | 
| 85 | 
         
            -
                    
         
     | 
| 86 | 
         
            -
                def reshape_heads_to_batch_dim(self, tensor):
         
     | 
| 87 | 
         
            -
                    batch_size, seq_len, dim = tensor.shape
         
     | 
| 88 | 
         
            -
                    head_size = self.heads
         
     | 
| 89 | 
         
            -
                    tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
         
     | 
| 90 | 
         
            -
                    tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
         
     | 
| 91 | 
         
            -
                    return tensor
         
     | 
| 92 | 
         
            -
             
     | 
| 93 | 
         
            -
                def reshape_batch_dim_to_heads(self, tensor):
         
     | 
| 94 | 
         
            -
                    batch_size, seq_len, dim = tensor.shape
         
     | 
| 95 | 
         
            -
                    head_size = self.heads
         
     | 
| 96 | 
         
            -
                    tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
         
     | 
| 97 | 
         
            -
                    tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
         
     | 
| 98 | 
         
            -
                    return tensor
         
     | 
| 99 | 
         
            -
             
     | 
| 100 | 
         
            -
                def set_attention_slice(self, slice_size):
         
     | 
| 101 | 
         
            -
                    if slice_size is not None and slice_size > self.sliceable_head_dim:
         
     | 
| 102 | 
         
            -
                        raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
         
     | 
| 103 | 
         
            -
             
     | 
| 104 | 
         
            -
                    self._slice_size = slice_size
         
     | 
| 105 | 
         
            -
             
     | 
| 106 | 
         
            -
                def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None):
         
     | 
| 107 | 
         
            -
                    batch_size, sequence_length, _ = hidden_states.shape
         
     | 
| 108 | 
         
            -
             
     | 
| 109 | 
         
            -
                    encoder_hidden_states = encoder_hidden_states
         
     | 
| 110 | 
         
            -
             
     | 
| 111 | 
         
            -
                    if self.group_norm is not None:
         
     | 
| 112 | 
         
            -
                        hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
         
     | 
| 113 | 
         
            -
             
     | 
| 114 | 
         
            -
                    query = self.to_q(hidden_states)
         
     | 
| 115 | 
         
            -
                    dim = query.shape[-1]
         
     | 
| 116 | 
         
            -
                    query = self.reshape_heads_to_batch_dim(query)
         
     | 
| 117 | 
         
            -
             
     | 
| 118 | 
         
            -
                    if self.added_kv_proj_dim is not None:
         
     | 
| 119 | 
         
            -
                        key = self.to_k(hidden_states)
         
     | 
| 120 | 
         
            -
                        value = self.to_v(hidden_states)
         
     | 
| 121 | 
         
            -
                        encoder_hidden_states_key_proj = self.add_k_proj(encoder_hidden_states)
         
     | 
| 122 | 
         
            -
                        encoder_hidden_states_value_proj = self.add_v_proj(encoder_hidden_states)
         
     | 
| 123 | 
         
            -
             
     | 
| 124 | 
         
            -
                        key = self.reshape_heads_to_batch_dim(key)
         
     | 
| 125 | 
         
            -
                        value = self.reshape_heads_to_batch_dim(value)
         
     | 
| 126 | 
         
            -
                        encoder_hidden_states_key_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_key_proj)
         
     | 
| 127 | 
         
            -
                        encoder_hidden_states_value_proj = self.reshape_heads_to_batch_dim(encoder_hidden_states_value_proj)
         
     | 
| 128 | 
         
            -
             
     | 
| 129 | 
         
            -
                        key = torch.concat([encoder_hidden_states_key_proj, key], dim=1)
         
     | 
| 130 | 
         
            -
                        value = torch.concat([encoder_hidden_states_value_proj, value], dim=1)
         
     | 
| 131 | 
         
            -
                    else:
         
     | 
| 132 | 
         
            -
                        encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
         
     | 
| 133 | 
         
            -
                        key = self.to_k(encoder_hidden_states)
         
     | 
| 134 | 
         
            -
                        value = self.to_v(encoder_hidden_states)
         
     | 
| 135 | 
         
            -
             
     | 
| 136 | 
         
            -
                        key = self.reshape_heads_to_batch_dim(key)
         
     | 
| 137 | 
         
            -
                        value = self.reshape_heads_to_batch_dim(value)
         
     | 
| 138 | 
         
            -
             
     | 
| 139 | 
         
            -
                    if attention_mask is not None:
         
     | 
| 140 | 
         
            -
                        if attention_mask.shape[-1] != query.shape[1]:
         
     | 
| 141 | 
         
            -
                            target_length = query.shape[1]
         
     | 
| 142 | 
         
            -
                            attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
         
     | 
| 143 | 
         
            -
                            attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
         
     | 
| 144 | 
         
            -
             
     | 
| 145 | 
         
            -
                    # attention, what we cannot get enough of
         
     | 
| 146 | 
         
            -
                    if self._use_memory_efficient_attention_xformers:
         
     | 
| 147 | 
         
            -
                        hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
         
     | 
| 148 | 
         
            -
                        # Some versions of xformers return output in fp32, cast it back to the dtype of the input
         
     | 
| 149 | 
         
            -
                        hidden_states = hidden_states.to(query.dtype)
         
     | 
| 150 | 
         
            -
                    else:
         
     | 
| 151 | 
         
            -
                        if self._slice_size is None or query.shape[0] // self._slice_size == 1:
         
     | 
| 152 | 
         
            -
                            hidden_states = self._attention(query, key, value, attention_mask)
         
     | 
| 153 | 
         
            -
                        else:
         
     | 
| 154 | 
         
            -
                            hidden_states = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
         
     | 
| 155 | 
         
            -
             
     | 
| 156 | 
         
            -
                    # linear proj
         
     | 
| 157 | 
         
            -
                    hidden_states = self.to_out[0](hidden_states)
         
     | 
| 158 | 
         
            -
             
     | 
| 159 | 
         
            -
                    # dropout
         
     | 
| 160 | 
         
            -
                    hidden_states = self.to_out[1](hidden_states)
         
     | 
| 161 | 
         
            -
                    return hidden_states
         
     | 
| 162 | 
         
            -
             
     | 
| 163 | 
         
            -
                def _attention(self, query, key, value, attention_mask=None):
         
     | 
| 164 | 
         
            -
                    if self.upcast_attention:
         
     | 
| 165 | 
         
            -
                        query = query.float()
         
     | 
| 166 | 
         
            -
                        key = key.float()
         
     | 
| 167 | 
         
            -
             
     | 
| 168 | 
         
            -
                    attention_scores = torch.baddbmm(
         
     | 
| 169 | 
         
            -
                        torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
         
     | 
| 170 | 
         
            -
                        query,
         
     | 
| 171 | 
         
            -
                        key.transpose(-1, -2),
         
     | 
| 172 | 
         
            -
                        beta=0,
         
     | 
| 173 | 
         
            -
                        alpha=self.scale,
         
     | 
| 174 | 
         
            -
                    )
         
     | 
| 175 | 
         
            -
             
     | 
| 176 | 
         
            -
                    if attention_mask is not None:
         
     | 
| 177 | 
         
            -
                        attention_scores = attention_scores + attention_mask
         
     | 
| 178 | 
         
            -
             
     | 
| 179 | 
         
            -
                    if self.upcast_softmax:
         
     | 
| 180 | 
         
            -
                        attention_scores = attention_scores.float()
         
     | 
| 181 | 
         
            -
             
     | 
| 182 | 
         
            -
                    attention_probs = attention_scores.softmax(dim=-1)
         
     | 
| 183 | 
         
            -
             
     | 
| 184 | 
         
            -
                    # cast back to the original dtype
         
     | 
| 185 | 
         
            -
                    attention_probs = attention_probs.to(value.dtype)
         
     | 
| 186 | 
         
            -
             
     | 
| 187 | 
         
            -
                    # compute attention output
         
     | 
| 188 | 
         
            -
                    hidden_states = torch.bmm(attention_probs, value)
         
     | 
| 189 | 
         
            -
             
     | 
| 190 | 
         
            -
                    # reshape hidden_states
         
     | 
| 191 | 
         
            -
                    hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
         
     | 
| 192 | 
         
            -
                    return hidden_states
         
     | 
| 193 | 
         
            -
             
     | 
| 194 | 
         
            -
                def _sliced_attention(self, query, key, value, sequence_length, dim, attention_mask):
         
     | 
| 195 | 
         
            -
                    batch_size_attention = query.shape[0]
         
     | 
| 196 | 
         
            -
                    hidden_states = torch.zeros(
         
     | 
| 197 | 
         
            -
                        (batch_size_attention, sequence_length, dim // self.heads), device=query.device, dtype=query.dtype
         
     | 
| 198 | 
         
            -
                    )
         
     | 
| 199 | 
         
            -
                    slice_size = self._slice_size if self._slice_size is not None else hidden_states.shape[0]
         
     | 
| 200 | 
         
            -
                    for i in range(hidden_states.shape[0] // slice_size):
         
     | 
| 201 | 
         
            -
                        start_idx = i * slice_size
         
     | 
| 202 | 
         
            -
                        end_idx = (i + 1) * slice_size
         
     | 
| 203 | 
         
            -
             
     | 
| 204 | 
         
            -
                        query_slice = query[start_idx:end_idx]
         
     | 
| 205 | 
         
            -
                        key_slice = key[start_idx:end_idx]
         
     | 
| 206 | 
         
            -
             
     | 
| 207 | 
         
            -
                        if self.upcast_attention:
         
     | 
| 208 | 
         
            -
                            query_slice = query_slice.float()
         
     | 
| 209 | 
         
            -
                            key_slice = key_slice.float()
         
     | 
| 210 | 
         
            -
             
     | 
| 211 | 
         
            -
                        attn_slice = torch.baddbmm(
         
     | 
| 212 | 
         
            -
                            torch.empty(slice_size, query.shape[1], key.shape[1], dtype=query_slice.dtype, device=query.device),
         
     | 
| 213 | 
         
            -
                            query_slice,
         
     | 
| 214 | 
         
            -
                            key_slice.transpose(-1, -2),
         
     | 
| 215 | 
         
            -
                            beta=0,
         
     | 
| 216 | 
         
            -
                            alpha=self.scale,
         
     | 
| 217 | 
         
            -
                        )
         
     | 
| 218 | 
         
            -
             
     | 
| 219 | 
         
            -
                        if attention_mask is not None:
         
     | 
| 220 | 
         
            -
                            attn_slice = attn_slice + attention_mask[start_idx:end_idx]
         
     | 
| 221 | 
         
            -
             
     | 
| 222 | 
         
            -
                        if self.upcast_softmax:
         
     | 
| 223 | 
         
            -
                            attn_slice = attn_slice.float()
         
     | 
| 224 | 
         
            -
             
     | 
| 225 | 
         
            -
                        attn_slice = attn_slice.softmax(dim=-1)
         
     | 
| 226 | 
         
            -
             
     | 
| 227 | 
         
            -
                        # cast back to the original dtype
         
     | 
| 228 | 
         
            -
                        attn_slice = attn_slice.to(value.dtype)
         
     | 
| 229 | 
         
            -
                        attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])
         
     | 
| 230 | 
         
            -
             
     | 
| 231 | 
         
            -
                        hidden_states[start_idx:end_idx] = attn_slice
         
     | 
| 232 | 
         
            -
             
     | 
| 233 | 
         
            -
                    # reshape hidden_states
         
     | 
| 234 | 
         
            -
                    hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
         
     | 
| 235 | 
         
            -
                    return hidden_states
         
     | 
| 236 | 
         
            -
             
     | 
| 237 | 
         
            -
                def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
         
     | 
| 238 | 
         
            -
                    # TODO attention_mask
         
     | 
| 239 | 
         
            -
                    query = query.contiguous()
         
     | 
| 240 | 
         
            -
                    key = key.contiguous()
         
     | 
| 241 | 
         
            -
                    value = value.contiguous()
         
     | 
| 242 | 
         
            -
                    hidden_states = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=attention_mask)
         
     | 
| 243 | 
         
            -
                    hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
         
     | 
| 244 | 
         
            -
                    return hidden_states
         
     | 
| 245 | 
         
            -
             
     | 
| 246 | 
         
             
            def zero_module(module):
         
     | 
| 247 | 
         
             
                # Zero out the parameters of a module and return it.
         
     | 
| 248 | 
         
             
                for p in module.parameters():
         
     | 
| 
         @@ -275,6 +60,11 @@ class VanillaTemporalModule(nn.Module): 
     | 
|
| 275 | 
         
             
                    zero_initialize                    = True,
         
     | 
| 276 | 
         
             
                    block_size                         = 1,
         
     | 
| 277 | 
         
             
                    grid                               = False,
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 278 | 
         
             
                ):
         
     | 
| 279 | 
         
             
                    super().__init__()
         
     | 
| 280 | 
         | 
| 
         @@ -289,17 +79,87 @@ class VanillaTemporalModule(nn.Module): 
     | 
|
| 289 | 
         
             
                        temporal_position_encoding_max_len=temporal_position_encoding_max_len,
         
     | 
| 290 | 
         
             
                        grid=grid,
         
     | 
| 291 | 
         
             
                        block_size=block_size,
         
     | 
| 
         | 
|
| 
         | 
|
| 292 | 
         
             
                    )
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 293 | 
         
             
                    if zero_initialize:
         
     | 
| 294 | 
         
             
                        self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
         
     | 
| 
         | 
|
| 
         | 
|
| 295 | 
         | 
| 296 | 
         
             
                def forward(self, input_tensor, encoder_hidden_states=None, attention_mask=None, anchor_frame_idx=None):
         
     | 
| 297 | 
         
             
                    hidden_states = input_tensor
         
     | 
| 298 | 
         
             
                    hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)
         
     | 
| 
         | 
|
| 
         | 
|
| 299 | 
         | 
| 300 | 
         
             
                    output = hidden_states
         
     | 
| 301 | 
         
             
                    return output
         
     | 
| 302 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 303 | 
         
             
            class TemporalTransformer3DModel(nn.Module):
         
     | 
| 304 | 
         
             
                def __init__(
         
     | 
| 305 | 
         
             
                    self,
         
     | 
| 
         @@ -321,6 +181,8 @@ class TemporalTransformer3DModel(nn.Module): 
     | 
|
| 321 | 
         
             
                    temporal_position_encoding_max_len = 4096,
         
     | 
| 322 | 
         
             
                    grid                               = False,
         
     | 
| 323 | 
         
             
                    block_size                         = 1,
         
     | 
| 
         | 
|
| 
         | 
|
| 324 | 
         
             
                ):
         
     | 
| 325 | 
         
             
                    super().__init__()
         
     | 
| 326 | 
         | 
| 
         @@ -348,6 +210,8 @@ class TemporalTransformer3DModel(nn.Module): 
     | 
|
| 348 | 
         
             
                                temporal_position_encoding_max_len=temporal_position_encoding_max_len,
         
     | 
| 349 | 
         
             
                                block_size=block_size,
         
     | 
| 350 | 
         
             
                                grid=grid,
         
     | 
| 
         | 
|
| 
         | 
|
| 351 | 
         
             
                            )
         
     | 
| 352 | 
         
             
                            for d in range(num_layers)
         
     | 
| 353 | 
         
             
                        ]
         
     | 
| 
         @@ -398,6 +262,8 @@ class TemporalTransformerBlock(nn.Module): 
     | 
|
| 398 | 
         
             
                    temporal_position_encoding_max_len = 4096,
         
     | 
| 399 | 
         
             
                    block_size                         = 1,
         
     | 
| 400 | 
         
             
                    grid                               = False,
         
     | 
| 
         | 
|
| 
         | 
|
| 401 | 
         
             
                ):
         
     | 
| 402 | 
         
             
                    super().__init__()
         
     | 
| 403 | 
         | 
| 
         @@ -422,15 +288,36 @@ class TemporalTransformerBlock(nn.Module): 
     | 
|
| 422 | 
         
             
                                temporal_position_encoding_max_len=temporal_position_encoding_max_len,
         
     | 
| 423 | 
         
             
                                block_size=block_size,
         
     | 
| 424 | 
         
             
                                grid=grid,
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 425 | 
         
             
                            )
         
     | 
| 426 | 
         
             
                        )
         
     | 
| 427 | 
         
            -
                        norms.append( 
     | 
| 428 | 
         | 
| 429 | 
         
             
                    self.attention_blocks = nn.ModuleList(attention_blocks)
         
     | 
| 430 | 
         
             
                    self.norms = nn.ModuleList(norms)
         
     | 
| 431 | 
         | 
| 432 | 
         
             
                    self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
         
     | 
| 433 | 
         
            -
                    self.ff_norm =  
     | 
| 434 | 
         | 
| 435 | 
         
             
                def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None, height=None, weight=None):
         
     | 
| 436 | 
         
             
                    for attention_block, norm in zip(self.attention_blocks, self.norms):
         
     | 
| 
         @@ -468,7 +355,7 @@ class PositionalEncoding(nn.Module): 
     | 
|
| 468 | 
         
             
                    x = x + self.pe[:, :x.size(1)]
         
     | 
| 469 | 
         
             
                    return self.dropout(x)
         
     | 
| 470 | 
         | 
| 471 | 
         
            -
            class VersatileAttention( 
     | 
| 472 | 
         
             
                def __init__(
         
     | 
| 473 | 
         
             
                        self,
         
     | 
| 474 | 
         
             
                        attention_mode                     = None,
         
     | 
| 
         @@ -477,21 +364,23 @@ class VersatileAttention(CrossAttention): 
     | 
|
| 477 | 
         
             
                        temporal_position_encoding_max_len = 4096,  
         
     | 
| 478 | 
         
             
                        grid                               = False,
         
     | 
| 479 | 
         
             
                        block_size                         = 1,
         
     | 
| 
         | 
|
| 480 | 
         
             
                        *args, **kwargs
         
     | 
| 481 | 
         
             
                    ):
         
     | 
| 482 | 
         
             
                    super().__init__(*args, **kwargs)
         
     | 
| 483 | 
         
            -
                    assert attention_mode == "Temporal"
         
     | 
| 484 | 
         | 
| 485 | 
         
             
                    self.attention_mode = attention_mode
         
     | 
| 486 | 
         
             
                    self.is_cross_attention = kwargs["cross_attention_dim"] is not None
         
     | 
| 487 | 
         | 
| 488 | 
         
             
                    self.block_size = block_size
         
     | 
| 489 | 
         
             
                    self.grid = grid
         
     | 
| 
         | 
|
| 490 | 
         
             
                    self.pos_encoder = PositionalEncoding(
         
     | 
| 491 | 
         
             
                        kwargs["query_dim"],
         
     | 
| 492 | 
         
             
                        dropout=0., 
         
     | 
| 493 | 
         
             
                        max_len=temporal_position_encoding_max_len
         
     | 
| 494 | 
         
            -
                    ) if (temporal_position_encoding and attention_mode == "Temporal") else None
         
     | 
| 495 | 
         | 
| 496 | 
         
             
                def extra_repr(self):
         
     | 
| 497 | 
         
             
                    return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}"
         
     | 
| 
         @@ -503,8 +392,13 @@ class VersatileAttention(CrossAttention): 
     | 
|
| 503 | 
         
             
                        # for add pos_encoder 
         
     | 
| 504 | 
         
             
                        _, before_d, _c = hidden_states.size()
         
     | 
| 505 | 
         
             
                        hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
         
     | 
| 506 | 
         
            -
                         
     | 
| 507 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 508 | 
         | 
| 509 | 
         
             
                        if self.grid:
         
     | 
| 510 | 
         
             
                            hidden_states = rearrange(hidden_states, "(b d) f c -> b f d c", f=video_length, d=before_d)
         
     | 
| 
         @@ -515,61 +409,36 @@ class VersatileAttention(CrossAttention): 
     | 
|
| 515 | 
         
             
                        else:
         
     | 
| 516 | 
         
             
                            d = before_d    
         
     | 
| 517 | 
         
             
                        encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d) if encoder_hidden_states is not None else encoder_hidden_states
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 518 | 
         
             
                    else:
         
     | 
| 519 | 
         
             
                        raise NotImplementedError
         
     | 
| 520 | 
         | 
| 521 | 
         
            -
                    encoder_hidden_states = encoder_hidden_states
         
     | 
| 522 | 
         
            -
             
     | 
| 523 | 
         
            -
                    if self.group_norm is not None:
         
     | 
| 524 | 
         
            -
                         hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-                    query = self.to_q(hidden_states)
-                    dim = query.shape[-1]
-                    query = self.reshape_heads_to_batch_dim(query)
-
-                    if self.added_kv_proj_dim is not None:
-                        raise NotImplementedError
-
                     encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
-                    key = self.to_k(encoder_hidden_states)
-                    value = self.to_v(encoder_hidden_states)
-
-                    key = self.reshape_heads_to_batch_dim(key)
-                    value = self.reshape_heads_to_batch_dim(value)
-
-                    if attention_mask is not None:
-                        if attention_mask.shape[-1] != query.shape[1]:
-                            target_length = query.shape[1]
-                            attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
-                            attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)

                     bs = 512
                     new_hidden_states = []
-                    for i in range(0, query.shape[0], bs):
-                        if self._slice_size is None or query[i : i + bs].shape[0] // self._slice_size == 1:
-                            hidden_states = self._attention(query[i : i + bs], key[i : i + bs], value[i : i + bs], attention_mask[i : i + bs] if attention_mask is not None else attention_mask)
-                        else:
-                            hidden_states = self._sliced_attention(query[i : i + bs], key[i : i + bs], value[i : i + bs], sequence_length, dim, attention_mask[i : i + bs] if attention_mask is not None else attention_mask)
-                        new_hidden_states.append(hidden_states)
                     hidden_states = torch.cat(new_hidden_states, dim = 0)

-                    # linear proj
-                    hidden_states = self.to_out[0](hidden_states)
-
-                    # dropout
-                    hidden_states = self.to_out[1](hidden_states)
-
                     if self.attention_mode == "Temporal":
                         hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
                         if self.grid:
                             hidden_states = rearrange(hidden_states, "(b f n m) (h w) c -> (b f) h n w m c", f=video_length, n=self.block_size, m=self.block_size, h=height // self.block_size, w=weight // self.block_size)
                             hidden_states = rearrange(hidden_states, "b h n w m c -> b (h n) (w m) c")
                             hidden_states = rearrange(hidden_states, "b h w c -> b (h w) c")

                     return hidden_states
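For context (not part of the commit): the Temporal attention path above relies on einops rearrangements so that attention mixes frames rather than spatial positions. A minimal sketch of that regrouping, with made-up dimensions:

# Minimal sketch of the "(b f) d c <-> (b d) f c" regrouping used by the
# Temporal attention mode (dummy sizes; not code from the repository).
import torch
from einops import rearrange

b, f, d, c = 2, 8, 16, 32              # batch, frames, spatial tokens, channels
x = torch.randn(b * f, d, c)           # tokens arrive frame-major: (b f) d c

x_t = rearrange(x, "(b f) d c -> (b d) f c", f=f)   # sequence axis becomes time
assert x_t.shape == (b * d, f, c)                   # attention now runs across frames

x_back = rearrange(x_t, "(b d) f c -> (b f) d c", d=d)
assert x_back.shape == (b * f, d, c)                # original layout restored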
easyanimate/models/motion_module.py CHANGED

 """Modified from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/motion_module.py
 """
 import math

+import diffusers
+import pkg_resources
 import torch
+
+installed_version = diffusers.__version__
+
+if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2"):
+    from diffusers.models.attention_processor import (Attention,
+                                                      AttnProcessor2_0,
+                                                      HunyuanAttnProcessor2_0)
+else:
+    from diffusers.models.attention_processor import Attention, AttnProcessor2_0
+
 from diffusers.models.attention import FeedForward
 from diffusers.utils.import_utils import is_xformers_available
 from einops import rearrange, repeat
 from torch import nn

+from .norm import FP32LayerNorm
+
 if is_xformers_available():
     import xformers
     import xformers.ops
 else:
     xformers = None
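A brief aside on the version gate above (my reading, not stated in the commit): HunyuanAttnProcessor2_0 and the qk_norm argument are only available in newer diffusers releases, so the module probes the installed version once and otherwise falls back to the plain Attention/AttnProcessor2_0 path. A minimal sketch of how that gate is consumed, mirroring the constructor calls later in this file (the 320/8/40 sizes are placeholders):

# Sketch: pick the attention processor based on the installed diffusers version.
import diffusers
import pkg_resources
from diffusers.models.attention_processor import Attention, AttnProcessor2_0

new_enough = pkg_resources.parse_version(diffusers.__version__) >= pkg_resources.parse_version("0.28.2")

if new_enough:
    from diffusers.models.attention_processor import HunyuanAttnProcessor2_0
    # QK layer-norm plus the Hunyuan processor, as in the constructors below.
    attn = Attention(query_dim=320, heads=8, dim_head=40,
                     qk_norm="layer_norm", processor=HunyuanAttnProcessor2_0())
else:
    # Older diffusers: no qk_norm support, use the default SDPA processor.
    attn = Attention(query_dim=320, heads=8, dim_head=40, processor=AttnProcessor2_0())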
 def zero_module(module):
     # Zero out the parameters of a module and return it.
     for p in module.parameters():

         zero_initialize                    = True,
         block_size                         = 1,
         grid                               = False,
+        remove_time_embedding_in_photo     = False,
+
+        global_num_attention_heads         = 16,
+        global_attention                   = False,
+        qk_norm                            = False,
     ):
         super().__init__()

             temporal_position_encoding_max_len=temporal_position_encoding_max_len,
             grid=grid,
             block_size=block_size,
+            remove_time_embedding_in_photo=remove_time_embedding_in_photo,
+            qk_norm=qk_norm,
         )
+        self.global_transformer = GlobalTransformer3DModel(
+            in_channels=in_channels,
+            num_attention_heads=global_num_attention_heads,
+            attention_head_dim=in_channels // global_num_attention_heads // temporal_attention_dim_div,
+            qk_norm=qk_norm,
+        ) if global_attention else None
         if zero_initialize:
             self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)
+            if global_attention:
+                self.global_transformer.proj_out = zero_module(self.global_transformer.proj_out)

     def forward(self, input_tensor, encoder_hidden_states=None, attention_mask=None, anchor_frame_idx=None):
         hidden_states = input_tensor
         hidden_states = self.temporal_transformer(hidden_states, encoder_hidden_states, attention_mask)
+        if self.global_transformer is not None:
+            hidden_states = self.global_transformer(hidden_states)

         output = hidden_states
         return output
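A side note on zero_initialize (a generic sketch of the idea, not repository code): zeroing proj_out makes the newly added temporal/global branches contribute nothing at initialization, so a residual connection passes the pretrained features through unchanged until the motion weights are trained.

# Sketch: with proj_out zero-initialized, a residual block x + proj_out(f(x))
# starts out as an exact identity mapping.
import torch
from torch import nn

def zero_module(module):
    # Zero out the parameters of a module and return it.
    for p in module.parameters():
        p.detach().zero_()
    return module

proj_out = zero_module(nn.Linear(64, 64))
x = torch.randn(2, 10, 64)
branch = torch.randn(2, 10, 64)       # stand-in for the new attention branch f(x)
out = x + proj_out(branch)            # residual connection around the new branch
assert torch.equal(out, x)            # identity at initialization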
+class GlobalTransformer3DModel(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        num_attention_heads,
+        attention_head_dim,
+        dropout                            = 0.0,
+        attention_bias                     = False,
+        upcast_attention                   = False,
+        qk_norm                            = False,
+    ):
+        super().__init__()
+
+        inner_dim = num_attention_heads * attention_head_dim
+
+        self.norm1 = FP32LayerNorm(inner_dim)
+        self.proj_in = nn.Linear(in_channels, inner_dim)
+        self.norm2 = FP32LayerNorm(inner_dim)
+        if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2"):
+            self.attention = Attention(
+                query_dim=inner_dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+                qk_norm="layer_norm" if qk_norm else None,
+                processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
+            )
+        else:
+            self.attention = Attention(
+                query_dim=inner_dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+            )
+        self.proj_out = nn.Linear(inner_dim, in_channels)
+
+    def forward(self, hidden_states):
+        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
+        video_length, height, width = hidden_states.shape[2], hidden_states.shape[3], hidden_states.shape[4]
+        hidden_states = rearrange(hidden_states, "b c f h w -> b (f h w) c")
+
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.proj_in(hidden_states)
+
+        # Attention Blocks
+        hidden_states = self.norm2(hidden_states)
+        hidden_states = self.attention(hidden_states)
+        hidden_states = self.proj_out(hidden_states)
+
+        output = hidden_states + residual
+        output = rearrange(output, "b (f h w) c -> b c f h w", f=video_length, h=height, w=width)
+        return output
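For orientation (a sketch with made-up sizes, not repository code): GlobalTransformer3DModel flattens the whole clip into one token sequence, so a single attention call can mix information across every frame and spatial position, and then the 5-D layout is restored.

# Sketch of the flatten -> attend -> restore pattern, with an identity stand-in
# for self.attention so the snippet stays self-contained.
import torch
from einops import rearrange

b, c, f, h, w = 1, 8, 4, 6, 6
x = torch.randn(b, c, f, h, w)

tokens = rearrange(x, "b c f h w -> b (f h w) c")    # one sequence of f*h*w tokens
assert tokens.shape == (b, f * h * w, c)

attended = tokens                                     # placeholder for self.attention(tokens)
y = rearrange(attended, "b (f h w) c -> b c f h w", f=f, h=h, w=w)
assert y.shape == x.shape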
 class TemporalTransformer3DModel(nn.Module):
     def __init__(
         self,

         temporal_position_encoding_max_len = 4096,
         grid                               = False,
         block_size                         = 1,
+        remove_time_embedding_in_photo     = False,
+        qk_norm                            = False,
     ):
         super().__init__()

                 temporal_position_encoding_max_len=temporal_position_encoding_max_len,
                 block_size=block_size,
                 grid=grid,
+                remove_time_embedding_in_photo=remove_time_embedding_in_photo,
+                qk_norm=qk_norm
             )
             for d in range(num_layers)
         ]

         temporal_position_encoding_max_len = 4096,
         block_size                         = 1,
         grid                               = False,
+        remove_time_embedding_in_photo     = False,
+        qk_norm                            = False,
     ):
         super().__init__()

                 temporal_position_encoding_max_len=temporal_position_encoding_max_len,
                 block_size=block_size,
                 grid=grid,
+                remove_time_embedding_in_photo=remove_time_embedding_in_photo,
+                qk_norm="layer_norm" if qk_norm else None,
+                processor=HunyuanAttnProcessor2_0() if qk_norm else AttnProcessor2_0(),
+            ) if pkg_resources.parse_version(installed_version) >= pkg_resources.parse_version("0.28.2") else \
+            VersatileAttention(
+                attention_mode=block_name.split("_")[0],
+                cross_attention_dim=cross_attention_dim if block_name.endswith("_Cross") else None,
+
+                query_dim=dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+
+                cross_frame_attention_mode=cross_frame_attention_mode,
+                temporal_position_encoding=temporal_position_encoding,
+                temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+                block_size=block_size,
+                grid=grid,
+                remove_time_embedding_in_photo=remove_time_embedding_in_photo,
             )
         )
+        norms.append(FP32LayerNorm(dim))

         self.attention_blocks = nn.ModuleList(attention_blocks)
         self.norms = nn.ModuleList(norms)

         self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
+        self.ff_norm = FP32LayerNorm(dim)

     def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None, height=None, weight=None):
         for attention_block, norm in zip(self.attention_blocks, self.norms):

         x = x + self.pe[:, :x.size(1)]
         return self.dropout(x)

+class VersatileAttention(Attention):
     def __init__(
             self,
             attention_mode                     = None,

             temporal_position_encoding_max_len = 4096,
             grid                               = False,
             block_size                         = 1,
+            remove_time_embedding_in_photo     = False,
             *args, **kwargs
         ):
         super().__init__(*args, **kwargs)
+        assert attention_mode == "Temporal" or attention_mode == "Global"

         self.attention_mode = attention_mode
         self.is_cross_attention = kwargs["cross_attention_dim"] is not None

         self.block_size = block_size
         self.grid = grid
+        self.remove_time_embedding_in_photo = remove_time_embedding_in_photo
         self.pos_encoder = PositionalEncoding(
             kwargs["query_dim"],
             dropout=0.,
             max_len=temporal_position_encoding_max_len
+        ) if (temporal_position_encoding and attention_mode == "Temporal") or (temporal_position_encoding and attention_mode == "Global") else None

     def extra_repr(self):
         return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}"

             # for add pos_encoder
             _, before_d, _c = hidden_states.size()
             hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
+
+            if self.remove_time_embedding_in_photo:
+                if self.pos_encoder is not None and video_length > 1:
+                    hidden_states = self.pos_encoder(hidden_states)
+            else:
+                if self.pos_encoder is not None:
+                    hidden_states = self.pos_encoder(hidden_states)

             if self.grid:
                 hidden_states = rearrange(hidden_states, "(b d) f c -> b f d c", f=video_length, d=before_d)

             else:
                 d = before_d
             encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d) if encoder_hidden_states is not None else encoder_hidden_states
+        elif self.attention_mode == "Global":
+            # for add pos_encoder
+            _, d, _c = hidden_states.size()
+            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
+            if self.pos_encoder is not None:
+                hidden_states = self.pos_encoder(hidden_states)
+            hidden_states = rearrange(hidden_states, "(b d) f c -> b (f d) c", f=video_length, d=d)
         else:
             raise NotImplementedError

         encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states

         bs = 512
         new_hidden_states = []
+        for i in range(0, hidden_states.shape[0], bs):
+            __hidden_states = super().forward(
+                hidden_states[i : i + bs],
+                encoder_hidden_states=encoder_hidden_states[i : i + bs],
+                attention_mask=attention_mask
+            )
+            new_hidden_states.append(__hidden_states)
         hidden_states = torch.cat(new_hidden_states, dim = 0)

         if self.attention_mode == "Temporal":
             hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
             if self.grid:
                 hidden_states = rearrange(hidden_states, "(b f n m) (h w) c -> (b f) h n w m c", f=video_length, n=self.block_size, m=self.block_size, h=height // self.block_size, w=weight // self.block_size)
                 hidden_states = rearrange(hidden_states, "b h n w m c -> b (h n) (w m) c")
                 hidden_states = rearrange(hidden_states, "b h w c -> b (h w) c")
+        elif self.attention_mode == "Global":
+            hidden_states = rearrange(hidden_states, "b (f d) c -> (b f) d c", f=video_length, d=d)

         return hidden_states
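An aside on the bs = 512 loop in VersatileAttention.forward above (a generic sketch, not the repository's code): attention is applied to at most 512 rows of the batch dimension at a time and the results are concatenated, which bounds peak activation memory while leaving the output unchanged for any per-sample module.

# Sketch: chunk a batched module call along dim 0 and concatenate the results.
import torch
from torch import nn

def chunked_forward(module: nn.Module, x: torch.Tensor, bs: int = 512) -> torch.Tensor:
    outputs = []
    for i in range(0, x.shape[0], bs):
        outputs.append(module(x[i : i + bs]))   # process one slice of the batch
    return torch.cat(outputs, dim=0)

attn_like = nn.Linear(32, 32)                    # stand-in for an attention module
x = torch.randn(1300, 32)                        # 1300 rows -> chunks of 512, 512, 276
assert torch.allclose(chunked_forward(attn_like, x), attn_like(x), atol=1e-6)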
    	
easyanimate/models/norm.py ADDED
@@ -0,0 +1,97 @@
+from typing import Any, Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+from torch import nn
+
+
+def zero_module(module):
+    # Zero out the parameters of a module and return it.
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+
+
+class FP32LayerNorm(nn.LayerNorm):
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        origin_dtype = inputs.dtype
+        if hasattr(self, 'weight') and self.weight is not None:
+            return F.layer_norm(
+                inputs.float(), self.normalized_shape, self.weight.float(), self.bias.float(), self.eps
+            ).to(origin_dtype)
+        else:
+            return F.layer_norm(
+                inputs.float(), self.normalized_shape, None, None, self.eps
+            ).to(origin_dtype)
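A quick usage note (not part of the commit): FP32LayerNorm exists for mixed-precision runs; the normalization statistics are computed in float32 and the result is cast back to the activation dtype. The underlying idea in two lines:

# Sketch: layer norm computed in fp32, returned in the original half-precision dtype.
import torch
import torch.nn.functional as F

x = torch.randn(2, 10, 64, dtype=torch.bfloat16)
y = F.layer_norm(x.float(), (64,)).to(x.dtype)   # fp32 math, bf16 out
assert y.dtype == torch.bfloat16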
+class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
+    """
+    For PixArt-Alpha.
+
+    Reference:
+    https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
+    """
+
+    def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False):
+        super().__init__()
+
+        self.outdim = size_emb_dim
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+        self.use_additional_conditions = use_additional_conditions
+        if use_additional_conditions:
+            self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+            self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
+            self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
+
+            self.resolution_embedder.linear_2 = zero_module(self.resolution_embedder.linear_2)
+            self.aspect_ratio_embedder.linear_2 = zero_module(self.aspect_ratio_embedder.linear_2)
+
+    def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
+
+        if self.use_additional_conditions:
+            resolution_emb = self.additional_condition_proj(resolution.flatten()).to(hidden_dtype)
+            resolution_emb = self.resolution_embedder(resolution_emb).reshape(batch_size, -1)
+            aspect_ratio_emb = self.additional_condition_proj(aspect_ratio.flatten()).to(hidden_dtype)
+            aspect_ratio_emb = self.aspect_ratio_embedder(aspect_ratio_emb).reshape(batch_size, -1)
+            conditioning = timesteps_emb + torch.cat([resolution_emb, aspect_ratio_emb], dim=1)
+        else:
+            conditioning = timesteps_emb
+
+        return conditioning
+
+class AdaLayerNormSingle(nn.Module):
+    r"""
+    Norm layer adaptive layer norm single (adaLN-single).
+
+    As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
+
+    Parameters:
+        embedding_dim (`int`): The size of each embedding vector.
+        use_additional_conditions (`bool`): To use additional conditions for normalization or not.
+    """
+
+    def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
+        super().__init__()
+
+        self.emb = PixArtAlphaCombinedTimestepSizeEmbeddings(
+            embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
+        )
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
+
+    def forward(
+        self,
+        timestep: torch.Tensor,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        batch_size: Optional[int] = None,
+        hidden_dtype: Optional[torch.dtype] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        # No modulation happening here.
+        embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
+        return self.linear(self.silu(embedded_timestep)), embedded_timestep
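For context (an assumption about downstream use, following the usual PixArt-Alpha adaLN-single recipe rather than anything shown in this diff): the 6 * embedding_dim output of AdaLayerNormSingle.linear is typically chunked into shift/scale/gate triples for the attention and feed-forward sub-blocks.

# Sketch (hypothetical consumer of AdaLayerNormSingle's output): split the
# conditioning vector into six modulation tensors and apply the pre-attention pair.
import torch

dim, batch = 1152, 2
conditioning = torch.randn(batch, 6 * dim)        # stand-in for self.linear(self.silu(emb))

shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = conditioning.chunk(6, dim=1)

x = torch.randn(batch, 16, dim)                   # token features
x_mod = x * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1)   # pre-attention modulation
assert x_mod.shape == x.shape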
    	
easyanimate/models/patch.py CHANGED
@@ -1,10 +1,10 @@
+import math
 from typing import Optional

 import numpy as np
 import torch
 import torch.nn.functional as F
 import torch.nn.init as init
-import math
 from einops import rearrange
 from torch import nn
    	
        easyanimate/models/transformer3d.py
    CHANGED
    
    | 
         @@ -15,26 +15,30 @@ import json 
     | 
|
| 15 | 
         
             
            import math
         
     | 
| 16 | 
         
             
            import os
         
     | 
| 17 | 
         
             
            from dataclasses import dataclass
         
     | 
| 18 | 
         
            -
            from typing import Any, Dict, Optional
         
     | 
| 19 | 
         | 
| 20 | 
         
             
            import numpy as np
         
     | 
| 21 | 
         
             
            import torch
         
     | 
| 22 | 
         
             
            import torch.nn.functional as F
         
     | 
| 23 | 
         
             
            import torch.nn.init as init
         
     | 
| 24 | 
         
             
            from diffusers.configuration_utils import ConfigMixin, register_to_config
         
     | 
| 25 | 
         
            -
            from diffusers.models.attention import BasicTransformerBlock
         
     | 
| 26 | 
         
            -
            from diffusers.models.embeddings import PatchEmbed,  
     | 
| 
         | 
|
| 27 | 
         
             
            from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
         
     | 
| 28 | 
         
             
            from diffusers.models.modeling_utils import ModelMixin
         
     | 
| 29 | 
         
            -
            from diffusers.models.normalization import  
     | 
| 30 | 
         
            -
            from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, is_torch_version
         
     | 
| 
         | 
|
| 
         | 
|
| 31 | 
         
             
            from einops import rearrange
         
     | 
| 32 | 
         
             
            from torch import nn
         
     | 
| 33 | 
         
            -
            from typing import Dict, Optional, Tuple
         
     | 
| 34 | 
         | 
| 35 | 
         
             
            from .attention import (SelfAttentionTemporalTransformerBlock,
         
     | 
| 36 | 
         
             
                                    TemporalTransformerBlock)
         
     | 
| 37 | 
         
            -
            from . 
     | 
| 
         | 
|
| 
         | 
|
| 38 | 
         | 
| 39 | 
         
             
            try:
         
     | 
| 40 | 
         
             
                from diffusers.models.embeddings import PixArtAlphaTextProjection
         
     | 
| 
         @@ -48,77 +52,25 @@ def zero_module(module): 
     | 
|
| 48 | 
         
             
                    p.detach().zero_()
         
     | 
| 49 | 
         
             
                return module
         
     | 
| 50 | 
         | 
| 51 | 
         
            -
            class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module):
         
     | 
| 52 | 
         
            -
                """
         
     | 
| 53 | 
         
            -
                For PixArt-Alpha.
         
     | 
| 54 | 
         | 
| 55 | 
         
            -
             
     | 
| 56 | 
         
            -
                https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
         
     | 
| 57 | 
         
             
                """
         
     | 
| 
         | 
|
| 58 | 
         | 
| 59 | 
         
            -
                 
     | 
| 60 | 
         
            -
                    super().__init__()
         
     | 
| 61 | 
         
            -
             
     | 
| 62 | 
         
            -
                    self.outdim = size_emb_dim
         
     | 
| 63 | 
         
            -
                    self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
         
     | 
| 64 | 
         
            -
                    self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
         
     | 
| 65 | 
         
            -
             
     | 
| 66 | 
         
            -
                    self.use_additional_conditions = use_additional_conditions
         
     | 
| 67 | 
         
            -
                    if use_additional_conditions:
         
     | 
| 68 | 
         
            -
                        self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
         
     | 
| 69 | 
         
            -
                        self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
         
     | 
| 70 | 
         
            -
                        self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
         
     | 
| 71 | 
         
            -
                        
         
     | 
| 72 | 
         
            -
                        self.resolution_embedder.linear_2 = zero_module(self.resolution_embedder.linear_2)
         
     | 
| 73 | 
         
            -
                        self.aspect_ratio_embedder.linear_2 = zero_module(self.aspect_ratio_embedder.linear_2)
         
     | 
| 74 | 
         
            -
             
     | 
| 75 | 
         
            -
                def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
         
     | 
| 76 | 
         
            -
-        timesteps_proj = self.time_proj(timestep)
-        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))  # (N, D)
-
-        if self.use_additional_conditions:
-            resolution_emb = self.additional_condition_proj(resolution.flatten()).to(hidden_dtype)
-            resolution_emb = self.resolution_embedder(resolution_emb).reshape(batch_size, -1)
-            aspect_ratio_emb = self.additional_condition_proj(aspect_ratio.flatten()).to(hidden_dtype)
-            aspect_ratio_emb = self.aspect_ratio_embedder(aspect_ratio_emb).reshape(batch_size, -1)
-            conditioning = timesteps_emb + torch.cat([resolution_emb, aspect_ratio_emb], dim=1)
-        else:
-            conditioning = timesteps_emb
-
-        return conditioning
-
-class AdaLayerNormSingle(nn.Module):
-    r"""
-    Norm layer adaptive layer norm single (adaLN-single).
-
-    As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
-
-    Parameters:
-        embedding_dim (`int`): The size of each embedding vector.
-        use_additional_conditions (`bool`): To use additional conditions for normalization or not.
     """
-    def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
         super().__init__()
-
-        self.
-
-        )
-
-
-
-    def forward(
-        self,
-        timestep: torch.Tensor,
-        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
-        batch_size: Optional[int] = None,
-        hidden_dtype: Optional[torch.dtype] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        # No modulation happening here.
-        embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
-        return self.linear(self.silu(embedded_timestep)), embedded_timestep
-

 class TimePositionalEncoding(nn.Module):
     def __init__(
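Note (not part of the commit): the removed block above is the PixArt-Alpha adaLN-single conditioning path, which this commit relocates into easyanimate/models/norm.py (see the new `from .norm import AdaLayerNormSingle` import further down). As a minimal sketch of what the layer computes, assuming the standard diffusers-style layout (a combined timestep embedder feeding one shared SiLU + linear projection); the `emb` stand-in below is a hypothetical placeholder, not the real embedder:

import torch
import torch.nn as nn

class AdaLayerNormSingleSketch(nn.Module):
    """Sketch of adaLN-single: one shared projection produces the six
    modulation parameters (shift/scale/gate pairs) consumed by every block."""

    def __init__(self, embedding_dim: int):
        super().__init__()
        # Placeholder for the combined timestep(/resolution/aspect-ratio) embedder
        # implemented by the removed code; input size 256 is illustrative only.
        self.emb = nn.Sequential(
            nn.Linear(256, embedding_dim), nn.SiLU(), nn.Linear(embedding_dim, embedding_dim)
        )
        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)

    def forward(self, timestep_features: torch.Tensor):
        embedded_timestep = self.emb(timestep_features)              # (N, D)
        # (N, 6*D) conditioning plus the raw embedded timestep, mirroring the
        # `return self.linear(self.silu(embedded_timestep)), embedded_timestep` line above.
        return self.linear(self.silu(embedded_timestep)), embedded_timestep

layer = AdaLayerNormSingleSketch(embedding_dim=1152)
cond, emb = layer(torch.randn(2, 256))
print(cond.shape, emb.shape)  # torch.Size([2, 6912]) torch.Size([2, 1152])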
@@ -229,9 +181,14 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
         # motion module kwargs
         motion_module_type = "VanillaGrid",
         motion_module_kwargs = None,

         # time position encoding
-        time_position_encoding_before_transformer = False
     ):
         super().__init__()
         self.use_linear_projection = use_linear_projection
@@ -320,6 +277,35 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
                         attention_type=attention_type,
                         motion_module_type=motion_module_type,
                         motion_module_kwargs=motion_module_kwargs,
                     )
                     for d in range(num_layers)
                 ]
@@ -346,6 +332,8 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
                         kvcompression=False if d < 14 else True,
                         motion_module_type=motion_module_type,
                         motion_module_kwargs=motion_module_kwargs,
                     )
                     for d in range(num_layers)
                 ]
@@ -369,6 +357,8 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
                         norm_elementwise_affine=norm_elementwise_affine,
                         norm_eps=norm_eps,
                         attention_type=attention_type,
                     )
                     for d in range(num_layers)
                 ]
@@ -438,8 +428,11 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
             self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)

         self.caption_projection = None
         if caption_channels is not None:
             self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)

         self.gradient_checkpointing = False
@@ -456,12 +449,14 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
         hidden_states: torch.Tensor,
         inpaint_latents: torch.Tensor = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         timestep: Optional[torch.LongTensor] = None,
         added_cond_kwargs: Dict[str, torch.Tensor] = None,
         class_labels: Optional[torch.LongTensor] = None,
         cross_attention_kwargs: Dict[str, Any] = None,
         attention_mask: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ):
         """
@@ -520,6 +515,8 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
             attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
             attention_mask = attention_mask.unsqueeze(1)

         # convert encoder_attention_mask to a bias the same way we do for attention_mask
         if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
             encoder_attention_mask = (1 - encoder_attention_mask.to(encoder_hidden_states.dtype)) * -10000.0
@@ -560,6 +557,13 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
             encoder_hidden_states = self.caption_projection(encoder_hidden_states)
             encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])

         skips = []
         skip_index = 0
         for index, block in enumerate(self.transformer_blocks):
@@ -590,7 +594,8 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
                 args = {
                     "basic": [],
                     "motionmodule": [video_length, height, width],
-                    "selfattentiontemporal": [],
                     "kvcompression_motionmodule": [video_length, height, width],
                 }[self.basic_block_type]
                 hidden_states = torch.utils.checkpoint.checkpoint(
@@ -609,7 +614,8 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
                 kwargs = {
                     "basic": {},
                     "motionmodule": {"num_frames":video_length, "height":height, "width":width},
-                    "selfattentiontemporal": {},
                     "kvcompression_motionmodule": {"num_frames":video_length, "height":height, "width":width},
                 }[self.basic_block_type]
                 hidden_states = block(
 import math
 import os
 from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple

 import numpy as np
 import torch
 import torch.nn.functional as F
 import torch.nn.init as init
 from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.attention import BasicTransformerBlock, FeedForward
+from diffusers.models.embeddings import (PatchEmbed, PixArtAlphaTextProjection,
+                                         TimestepEmbedding, Timesteps)
 from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
 from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormContinuous
+from diffusers.utils import (USE_PEFT_BACKEND, BaseOutput, is_torch_version,
+                             logging)
+from diffusers.utils.torch_utils import maybe_allow_in_graph
 from einops import rearrange
 from torch import nn

 from .attention import (SelfAttentionTemporalTransformerBlock,
                         TemporalTransformerBlock)
+from .norm import AdaLayerNormSingle
+from .patch import (CasualPatchEmbed3D, Patch1D, PatchEmbed3D, PatchEmbedF3D,
+                    TemporalUpsampler3D, UnPatch1D)

 try:
     from diffusers.models.embeddings import PixArtAlphaTextProjection

         p.detach().zero_()
     return module

+class CLIPProjection(nn.Module):
     """
+    Projects caption embeddings. Also handles dropout for classifier-free guidance.

+    Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
     """

+    def __init__(self, in_features, hidden_size, num_tokens=120):
         super().__init__()
+        self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True)
+        self.act_1 = nn.GELU(approximate="tanh")
+        self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
+        self.linear_2 = zero_module(self.linear_2)
+    def forward(self, caption):
+        hidden_states = self.linear_1(caption)
+        hidden_states = self.act_1(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
 class TimePositionalEncoding(nn.Module):
     def __init__(

         # motion module kwargs
         motion_module_type = "VanillaGrid",
         motion_module_kwargs = None,
+        motion_module_kwargs_odd = None,
+        motion_module_kwargs_even = None,

         # time position encoding
+        time_position_encoding_before_transformer = False,
+
+        qk_norm = False,
+        after_norm = False,
     ):
         super().__init__()
         self.use_linear_projection = use_linear_projection

                         attention_type=attention_type,
                         motion_module_type=motion_module_type,
                         motion_module_kwargs=motion_module_kwargs,
+                        qk_norm=qk_norm,
+                        after_norm=after_norm,
+                    )
+                    for d in range(num_layers)
+                ]
+            )
+        elif self.basic_block_type == "global_motionmodule":
+            self.transformer_blocks = nn.ModuleList(
+                [
+                    TemporalTransformerBlock(
+                        inner_dim,
+                        num_attention_heads,
+                        attention_head_dim,
+                        dropout=dropout,
+                        cross_attention_dim=cross_attention_dim,
+                        activation_fn=activation_fn,
+                        num_embeds_ada_norm=num_embeds_ada_norm,
+                        attention_bias=attention_bias,
+                        only_cross_attention=only_cross_attention,
+                        double_self_attention=double_self_attention,
+                        upcast_attention=upcast_attention,
+                        norm_type=norm_type,
+                        norm_elementwise_affine=norm_elementwise_affine,
+                        norm_eps=norm_eps,
+                        attention_type=attention_type,
+                        motion_module_type=motion_module_type,
+                        motion_module_kwargs=motion_module_kwargs_even if d % 2 == 0 else motion_module_kwargs_odd,
+                        qk_norm=qk_norm,
+                        after_norm=after_norm,
                     )
                     for d in range(num_layers)
                 ]
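Note (not part of the commit): the new "global_motionmodule" branch alternates two motion-module configurations across depth, giving even-indexed blocks `motion_module_kwargs_even` and odd-indexed blocks `motion_module_kwargs_odd`. A minimal sketch of that per-depth selection; the kwargs values below are purely illustrative, not EasyAnimate's real options:

def build_motion_kwargs(num_layers, kwargs_even, kwargs_odd):
    """Return the kwargs each transformer block at depth d would receive,
    mirroring `kwargs_even if d % 2 == 0 else kwargs_odd` in the diff above."""
    return [kwargs_even if d % 2 == 0 else kwargs_odd for d in range(num_layers)]

per_layer = build_motion_kwargs(
    num_layers=4,
    kwargs_even={"temporal_attention_mode": "global"},       # hypothetical setting
    kwargs_odd={"temporal_attention_mode": "local", "window": 8},  # hypothetical setting
)
print(per_layer[0], per_layer[1])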
                         kvcompression=False if d < 14 else True,
                         motion_module_type=motion_module_type,
                         motion_module_kwargs=motion_module_kwargs,
+                        qk_norm=qk_norm,
+                        after_norm=after_norm,
                     )
                     for d in range(num_layers)
                 ]

                         norm_elementwise_affine=norm_elementwise_affine,
                         norm_eps=norm_eps,
                         attention_type=attention_type,
+                        qk_norm=qk_norm,
+                        after_norm=after_norm,
                     )
                     for d in range(num_layers)
                 ]

             self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)

         self.caption_projection = None
+        self.clip_projection = None
         if caption_channels is not None:
             self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
+            if in_channels == 12:
+                self.clip_projection = CLIPProjection(in_features=768, hidden_size=inner_dim * 8)

         self.gradient_checkpointing = False

         hidden_states: torch.Tensor,
         inpaint_latents: torch.Tensor = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
+        clip_encoder_hidden_states: Optional[torch.Tensor] = None,
         timestep: Optional[torch.LongTensor] = None,
         added_cond_kwargs: Dict[str, torch.Tensor] = None,
         class_labels: Optional[torch.LongTensor] = None,
         cross_attention_kwargs: Dict[str, Any] = None,
         attention_mask: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
+        clip_attention_mask: Optional[torch.Tensor] = None,
         return_dict: bool = True,
     ):
         """

             attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
             attention_mask = attention_mask.unsqueeze(1)

+        if clip_attention_mask is not None:
+            encoder_attention_mask = torch.cat([encoder_attention_mask, clip_attention_mask], dim=1)
         # convert encoder_attention_mask to a bias the same way we do for attention_mask
         if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
             encoder_attention_mask = (1 - encoder_attention_mask.to(encoder_hidden_states.dtype)) * -10000.0

             encoder_hidden_states = self.caption_projection(encoder_hidden_states)
             encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])

+        if clip_encoder_hidden_states is not None and encoder_hidden_states is not None:
+            batch_size = hidden_states.shape[0]
+            clip_encoder_hidden_states = self.clip_projection(clip_encoder_hidden_states)
+            clip_encoder_hidden_states = clip_encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+            encoder_hidden_states = torch.cat([encoder_hidden_states, clip_encoder_hidden_states], dim = 1)
+
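Note (not part of the commit): taken together, the additions above bolt a CLIP-derived conditioning stream onto the existing T5 caption path: 768-dim CLIP features are projected by `CLIPProjection` (hidden size `inner_dim * 8`), reshaped into extra tokens, and concatenated with the projected caption tokens, with the attention masks concatenated to match. A standalone sketch of that concatenation, with made-up tensor sizes and a stand-in projection:

import torch
import torch.nn as nn

batch, text_tokens, inner_dim = 2, 120, 1152      # illustrative sizes only

caption_tokens = torch.randn(batch, text_tokens, inner_dim)   # after caption_projection
clip_features = torch.randn(batch, 1, 768)                    # raw CLIP embedding (assumed pooled)

# Stand-in for CLIPProjection: 768 -> inner_dim * 8, then reshape into 8 extra tokens.
clip_projection = nn.Sequential(nn.Linear(768, inner_dim * 8), nn.GELU(approximate="tanh"))
clip_tokens = clip_projection(clip_features).view(batch, -1, inner_dim)   # (2, 8, 1152)

encoder_hidden_states = torch.cat([caption_tokens, clip_tokens], dim=1)   # (2, 128, 1152)

text_mask = torch.ones(batch, text_tokens)
clip_mask = torch.ones(batch, clip_tokens.shape[1])
encoder_attention_mask = torch.cat([text_mask, clip_mask], dim=1)         # (2, 128)
print(encoder_hidden_states.shape, encoder_attention_mask.shape)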
         
             
         skips = []
         skip_index = 0
         for index, block in enumerate(self.transformer_blocks):

                 args = {
                     "basic": [],
                     "motionmodule": [video_length, height, width],
+                    "global_motionmodule": [video_length, height, width],
+                    "selfattentiontemporal": [],
                     "kvcompression_motionmodule": [video_length, height, width],
                 }[self.basic_block_type]
                 hidden_states = torch.utils.checkpoint.checkpoint(

                 kwargs = {
                     "basic": {},
                     "motionmodule": {"num_frames":video_length, "height":height, "width":width},
+                    "global_motionmodule": {"num_frames":video_length, "height":height, "width":width},
+                    "selfattentiontemporal": {},
                     "kvcompression_motionmodule": {"num_frames":video_length, "height":height, "width":width},
                 }[self.basic_block_type]
                 hidden_states = block(
easyanimate/pipeline/pipeline_easyanimate.py CHANGED

@@ -578,7 +578,7 @@ class EasyAnimatePipeline(DiffusionPipeline):

     def decode_latents(self, latents):
         video_length = latents.shape[2]
-        latents = 1 /
+        latents = 1 / self.vae.config.scaling_factor * latents
         if self.vae.quant_conv.weight.ndim==5:
             mini_batch_encoder = self.vae.mini_batch_encoder
             mini_batch_decoder = self.vae.mini_batch_decoder
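Note (not part of the commit): the only functional change in this file replaces a hard-coded latent scale with the value stored on the VAE config. The usual diffusers-style convention is to multiply by `scaling_factor` after encoding and divide by it before decoding; a minimal sketch under that assumption, with toy stand-ins for the VAE:

import torch

SCALING_FACTOR = 0.18215   # illustrative; the real value comes from vae.config.scaling_factor

def encode_to_latents(vae_encode, pixels, scaling_factor=SCALING_FACTOR):
    # Scale up so the latents are roughly unit-variance for the diffusion model.
    return vae_encode(pixels) * scaling_factor

def decode_from_latents(vae_decode, latents, scaling_factor=SCALING_FACTOR):
    # Undo the scaling before decoding, mirroring
    # `latents = 1 / self.vae.config.scaling_factor * latents` above.
    return vae_decode(latents / scaling_factor)

fake_encode = lambda x: x[:, :4]                     # toy "encoder": keep 4 channels
fake_decode = lambda z: torch.cat([z, z], dim=1)     # toy "decoder"

pixels = torch.randn(1, 8, 16, 16)
z = encode_to_latents(fake_encode, pixels)
out = decode_from_latents(fake_decode, z)
print(z.shape, out.shape)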
    	
easyanimate/pipeline/pipeline_easyanimate_inpaint.py CHANGED

@@ -15,13 +15,16 @@
 import html
 import inspect
 import re
 import copy
 import urllib.parse as ul
 from dataclasses import dataclass
 from typing import Callable, List, Optional, Tuple, Union

 import numpy as np
 import torch
 from diffusers import DiffusionPipeline, ImagePipelineOutput
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.models import AutoencoderKL

@@ -33,6 +36,7 @@ from diffusers.utils.torch_utils import randn_tensor
 from einops import rearrange
 from tqdm import tqdm
 from transformers import T5EncoderModel, T5Tokenizer

 from ..models.transformer3d import Transformer3DModel

@@ -109,11 +113,15 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline):
         vae: AutoencoderKL,
         transformer: Transformer3DModel,
         scheduler: DPMSolverMultistepScheduler,
     ):
         super().__init__()

         self.register_modules(
-            tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer,
         )

         self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
@@ -503,41 +511,64 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline):
         return_video_latents=False,
     ):
         if self.vae.quant_conv.weight.ndim==5:
-
         else:
             shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)

         if isinstance(generator, list) and len(generator) != batch_size:
             raise ValueError(
                 f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                 f" size of {batch_size}. Make sure the batch size matches the length of the generators."
             )
-
         if return_video_latents or (latents is None and not is_strength_max):
-            video = video.to(device=device, dtype=dtype)

-            if video.shape[1] == 4:
-                video_latents = video
             else:
-
-

         if latents is None:
-
-            noise = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
             # if strength is 1. then initialise the latents to noise, else initial to image + noise
             latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
         else:
             noise = latents.to(device)
-
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
-            latents = latents.to(device)

         # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
         outputs = (latents,)

         if return_noise:
@@ -548,33 +579,61 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline):

         return outputs

-    def
-
-
-
             for i in range(0, latents.shape[2], mini_batch_decoder):
                 with torch.no_grad():
                     start_index = i
                     end_index = i + mini_batch_decoder
                     latents_bs = self.vae.decode(latents[:, :, start_index:end_index, :, :])[0]
-
-
-
             video = video.clamp(-1, 1)
         else:
             latents = rearrange(latents, "b c f h w -> (b f) c h w")
-            # video = self.vae.decode(latents).sample
             video = []
             for frame_idx in tqdm(range(latents.shape[0])):
                 video.append(self.vae.decode(latents[frame_idx:frame_idx+1]).sample)
| 
         @@ -599,6 +658,16 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline): 
     | 
|
| 599 | 
         | 
| 600 | 
         
             
                    return image_latents
         
     | 
| 601 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 602 | 
         
             
                def prepare_mask_latents(
         
     | 
| 603 | 
         
             
                    self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
         
     | 
| 604 | 
         
             
                ):
         
     | 
| 
@@ -610,19 +679,26 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline):
         mask = mask.to(device=device, dtype=self.vae.dtype)
         if self.vae.quant_conv.weight.ndim==5:
             bs = 1
             new_mask = []
-
-
-                for j in range(0, mask.shape[2], mini_batch):
-                    mask_bs = mask[i : i + bs, :, j: j + mini_batch, :, :]
                     mask_bs = self.vae.encode(mask_bs)[0]
                     mask_bs = mask_bs.sample()
-
-
             mask = torch.cat(new_mask, dim = 0)
-            mask = mask *

         else:
             if mask.shape[1] == 4:
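Note (not part of the commit): in the 3D-VAE branch, the mask (and, in the next hunk, the masked video) is encoded a few frames at a time: the `(b, c, f, h, w)` tensor is sliced along the frame axis, each slice is passed through `vae.encode`, and the per-chunk latents are concatenated. A sketch of that chunked-encode loop with a dummy encoder; the function name, chunk size, and shapes are illustrative only:

import torch

def encode_video_in_minibatches(encode_fn, video, mini_batch=8):
    """Encode a (b, c, f, h, w) tensor `mini_batch` frames at a time along dim=2,
    mirroring the chunked self.vae.encode(...) loops in prepare_mask_latents."""
    chunks = []
    for j in range(0, video.shape[2], mini_batch):
        chunk = video[:, :, j:j + mini_batch, :, :]
        chunks.append(encode_fn(chunk))
    return torch.cat(chunks, dim=2)

def fake_encode(x):
    # Dummy stand-in: 3 -> 4 channels, spatially downsampled by 8, frames preserved.
    b, c, f, h, w = x.shape
    return torch.randn(b, 4, f, h // 8, w // 8)

mask = torch.rand(1, 3, 16, 64, 64)
latents = encode_video_in_minibatches(fake_encode, mask, mini_batch=8)
print(latents.shape)  # torch.Size([1, 4, 16, 8, 8])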
@@ -636,19 +712,26 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline):
         masked_image = masked_image.to(device=device, dtype=self.vae.dtype)
         if self.vae.quant_conv.weight.ndim==5:
             bs = 1
             new_mask_pixel_values = []
-
-
-                for j in range(0, masked_image.shape[2], mini_batch):
-                    mask_pixel_values_bs = masked_image[i : i + bs, :, j: j + mini_batch, :, :]
                     mask_pixel_values_bs = self.vae.encode(mask_pixel_values_bs)[0]
                     mask_pixel_values_bs = mask_pixel_values_bs.sample()
-
-
             masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)
-            masked_image_latents = masked_image_latents *

         else:
             if masked_image.shape[1] == 4:
@@ -693,7 +776,9 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline):
         callback_steps: int = 1,
         clean_caption: bool = True,
         mask_feature: bool = True,
-        max_sequence_length: int = 120
     ) -> Union[EasyAnimatePipelineOutput, Tuple]:
         """
         Function invoked when calling the pipeline for generation.
| 
         @@ -767,6 +852,8 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline): 
     | 
|
| 767 | 
         
             
                    # 1. Check inputs. Raise error if not correct
         
     | 
| 768 | 
         
             
                    height = height or self.transformer.config.sample_size * self.vae_scale_factor
         
     | 
| 769 | 
         
             
                    width = width or self.transformer.config.sample_size * self.vae_scale_factor
         
     | 
| 
         | 
|
| 
         | 
|
| 770 | 
         | 
| 771 | 
         
             
                    # 2. Default height and width to transformer
         
     | 
| 772 | 
         
             
                    if prompt is not None and isinstance(prompt, str):
         
     | 
| 
         @@ -806,11 +893,13 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline): 
     | 
|
| 806 | 
         
             
                        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
         
     | 
| 807 | 
         
             
                        prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
         
     | 
| 808 | 
         | 
| 809 | 
         
            -
                    # 4.  
     | 
| 810 | 
         
             
                    self.scheduler.set_timesteps(num_inference_steps, device=device)
         
     | 
| 811 | 
         
            -
                    timesteps = self. 
     | 
| 
         | 
|
| 
         | 
|
| 812 | 
         
             
                    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
         
     | 
| 813 | 
         
            -
                    latent_timestep = timesteps[:1].repeat(batch_size)
         
     | 
| 814 | 
         
             
                    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
         
     | 
| 815 | 
         
             
                    is_strength_max = strength == 1.0
         
     | 
| 816 | 
         | 
| 
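Note (not part of the commit): the truncated lines around `set_timesteps` follow the usual img2img/inpaint pattern: after setting the full schedule, only the last `strength` fraction of the timesteps is kept, and the first kept timestep is used to noise the initial latents. A sketch of that truncation, assuming the standard `get_timesteps` helper rather than EasyAnimate's exact code:

import torch

def get_timesteps(timesteps, num_inference_steps, strength, order=1):
    """Keep only the final `strength` fraction of the schedule (img2img convention)."""
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return timesteps[t_start * order:], num_inference_steps - t_start

full = torch.linspace(999, 0, steps=50).long()   # stand-in for scheduler.timesteps
kept, steps = get_timesteps(full, num_inference_steps=50, strength=0.6)
latent_timestep = kept[:1]                       # timestep used to noise the initial latents
print(len(kept), steps, latent_timestep)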
@@ -825,7 +914,7 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline):
         # Prepare latent variables
         num_channels_latents = self.vae.config.latent_channels
         num_channels_transformer = self.transformer.config.in_channels
-        return_image_latents = num_channels_transformer == 4

         # 5. Prepare latents.
         latents_outputs = self.prepare_latents(
| 
         @@ -857,30 +946,83 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline): 
     | 
|
| 857 | 
         
             
                        mask_condition = mask_condition.to(dtype=torch.float32)
         
     | 
| 858 | 
         
             
                        mask_condition = rearrange(mask_condition, "(b f) c h w -> b c f h w", f=video_length)
         
     | 
| 859 | 
         | 
| 860 | 
         
            -
                        if  
     | 
| 861 | 
         
            -
                             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 862 | 
         
             
                        else:
         
     | 
| 863 | 
         
            -
                             
     | 
| 864 | 
         
            -
             
     | 
| 865 | 
         
            -
             
     | 
| 866 | 
         
            -
                             
     | 
| 867 | 
         
            -
             
     | 
| 868 | 
         
            -
             
     | 
| 869 | 
         
            -
                             
     | 
| 870 | 
         
            -
                             
     | 
| 871 | 
         
            -
             
     | 
| 872 | 
         
            -
                             
     | 
| 873 | 
         
            -
                             
     | 
| 874 | 
         
            -
             
     | 
| 875 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 876 | 
         
             
                    else:
         
     | 
| 877 | 
         
            -
                         
     | 
| 878 | 
         
            -
                         
     | 
| 879 | 
         | 
| 880 | 
         
             
                    # Check that sizes of mask, masked image and latents match
         
     | 
| 881 | 
         
             
                    if num_channels_transformer == 12:
         
     | 
| 882 | 
         
             
                        # default case for runwayml/stable-diffusion-inpainting
         
     | 
| 883 | 
         
            -
                        num_channels_mask =  
     | 
| 884 | 
         
             
                        num_channels_masked_image = masked_video_latents.shape[1]
         
     | 
| 885 | 
         
             
                        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.transformer.config.in_channels:
         
     | 
| 886 | 
         
             
                            raise ValueError(
         
     | 
| 
         @@ -890,12 +1032,12 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline): 
     | 
|
| 890 | 
         
             
                                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
         
     | 
| 891 | 
         
             
                                " `pipeline.transformer` or your `mask_image` or `image` input."
         
     | 
| 892 | 
         
             
                            )
         
     | 
| 893 | 
         
            -
                    elif num_channels_transformer  
     | 
| 894 | 
         
             
                        raise ValueError(
         
     | 
| 895 | 
         
             
                            f"The transformer {self.transformer.__class__} should have 9 input channels, not {self.transformer.config.in_channels}."
         
     | 
| 896 | 
         
             
                        )
         
     | 
| 897 | 
         | 
| 898 | 
         
            -
                    #  
     | 
| 899 | 
         
             
                    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
         
     | 
| 900 | 
         | 
| 901 | 
         
             
                    # 6.1 Prepare micro-conditions.
         
     | 
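Note (not part of the commit): the 12-channel check encodes the inpainting input layout: the transformer's `in_channels` must equal the sum of the noise latents, the encoded mask, and the encoded masked-video latents, otherwise the pipeline raises. A quick arithmetic sketch of the check, assuming a 4-channel VAE latent space for each component:

num_channels_latents = 4        # noisy video latents
num_channels_mask = 4           # mask, brought to latent shape (assumed 4 channels here)
num_channels_masked_image = 4   # masked video latents

in_channels = 12                # transformer.config.in_channels for the inpaint model

total = num_channels_latents + num_channels_mask + num_channels_masked_image
if total != in_channels:
    raise ValueError(
        f"Expected {in_channels} input channels but the inpaint conditioning sums to {total}."
    )
print("channel layout OK:", total)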
| 
         @@ -912,21 +1054,25 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline): 
     | 
|
| 912 | 
         | 
| 913 | 
         
             
                        added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}
         
     | 
| 914 | 
         | 
| 915 | 
         
            -
                     
     | 
| 916 | 
         
            -
                     
     | 
| 
         | 
|
| 917 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 918 | 
         
             
                    with self.progress_bar(total=num_inference_steps) as progress_bar:
         
     | 
| 919 | 
         
             
                        for i, t in enumerate(timesteps):
         
     | 
| 920 | 
         
             
                            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
         
     | 
| 921 | 
         
             
                            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
         
     | 
| 922 | 
         | 
| 923 | 
         
            -
                            if  
     | 
| 924 | 
         
            -
                                 
     | 
| 925 | 
         
            -
                                 
     | 
| 926 | 
         
            -
             
     | 
| 927 | 
         
            -
                                 
     | 
| 928 | 
         
            -
                                 
     | 
| 929 | 
         
            -
             
     | 
| 930 | 
         
             
                            current_timestep = t
         
     | 
| 931 | 
         
             
                            if not torch.is_tensor(current_timestep):
         
     | 
| 932 | 
         
             
                                # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
         
     | 
| 
         @@ -949,7 +1095,9 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline): 
     | 
|
| 949 | 
         
             
                                encoder_attention_mask=prompt_attention_mask,
         
     | 
| 950 | 
         
             
                                timestep=current_timestep,
         
     | 
| 951 | 
         
             
                                added_cond_kwargs=added_cond_kwargs,
         
     | 
| 952 | 
         
            -
                                inpaint_latents=inpaint_latents 
     | 
| 
         | 
|
| 
         | 
|
| 953 | 
         
             
                                return_dict=False,
         
     | 
| 954 | 
         
             
                            )[0]
         
     | 
| 955 | 
         | 
| 
         @@ -964,6 +1112,17 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline): 
     | 
|
| 964 | 
         
             
                            # compute previous image: x_t -> x_t-1
         
     | 
| 965 | 
         
             
                            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
         
     | 
| 966 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 967 | 
         
             
                            # call the callback, if provided
         
     | 
| 968 | 
         
             
                            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
         
     | 
| 969 | 
         
             
                                progress_bar.update()
         
     | 
| 
         @@ -971,9 +1130,16 @@ class EasyAnimateInpaintPipeline(DiffusionPipeline): 
     | 
|
| 971 | 
         
             
                                    step_idx = i // getattr(self.scheduler, "order", 1)
         
     | 
| 972 | 
         
             
                                    callback(step_idx, t, latents)
         
     | 
| 973 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 974 | 
         
             
                    # Post-processing
         
     | 
| 975 | 
         
             
                    video = self.decode_latents(latents)
         
     | 
| 976 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 977 | 
         
             
                    # Convert to tensor
         
     | 
| 978 | 
         
             
                    if output_type == "latent":
         
     | 
| 979 | 
         
             
                        video = torch.from_numpy(video)
         
     | 
| 
         | 
|
| 15 | 
         
             
            import html
         
     | 
| 16 | 
         
             
            import inspect
         
     | 
| 17 | 
         
             
            import re
         
     | 
| 18 | 
         
            +
            import gc
         
     | 
| 19 | 
         
             
            import copy
         
     | 
| 20 | 
         
             
            import urllib.parse as ul
         
     | 
| 21 | 
         
             
            from dataclasses import dataclass
         
     | 
| 22 | 
         
            +
            from PIL import Image
         
     | 
| 23 | 
         
             
            from typing import Callable, List, Optional, Tuple, Union
         
     | 
| 24 | 
         | 
| 25 | 
         
             
            import numpy as np
         
     | 
| 26 | 
         
             
            import torch
         
     | 
| 27 | 
         
            +
            import torch.nn.functional as F
         
     | 
| 28 | 
         
             
            from diffusers import DiffusionPipeline, ImagePipelineOutput
         
     | 
| 29 | 
         
             
            from diffusers.image_processor import VaeImageProcessor
         
     | 
| 30 | 
         
             
            from diffusers.models import AutoencoderKL
         
     | 
| 
         | 
|
| 36 | 
         
             
            from einops import rearrange
         
     | 
| 37 | 
         
             
            from tqdm import tqdm
         
     | 
| 38 | 
         
             
            from transformers import T5EncoderModel, T5Tokenizer
         
     | 
| 39 | 
         
            +
            from transformers import CLIPVisionModelWithProjection,  CLIPImageProcessor
         
     | 
| 40 | 
         | 
| 41 | 
         
             
            from ..models.transformer3d import Transformer3DModel
         
     | 
| 42 | 
         | 
| 
         | 
|
| 113 | 
         
             
                    vae: AutoencoderKL,
         
     | 
| 114 | 
         
             
                    transformer: Transformer3DModel,
         
     | 
| 115 | 
         
             
                    scheduler: DPMSolverMultistepScheduler,
         
     | 
| 116 | 
         
            +
                    clip_image_processor:CLIPImageProcessor = None,
         
     | 
| 117 | 
         
            +
                    clip_image_encoder:CLIPVisionModelWithProjection = None,
         
     | 
| 118 | 
         
             
                ):
         
     | 
| 119 | 
         
             
                    super().__init__()
         
     | 
| 120 | 
         | 
| 121 | 
         
             
                    self.register_modules(
         
     | 
| 122 | 
         
            +
                        tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, 
         
     | 
| 123 | 
         
            +
                        scheduler=scheduler,
         
     | 
| 124 | 
         
            +
                        clip_image_processor=clip_image_processor, clip_image_encoder=clip_image_encoder,
         
     | 
| 125 | 
         
             
                    )
         
     | 
| 126 | 
         | 
| 127 | 
         
             
                    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
         
     | 
| 
         | 
|
| 511 | 
         
             
                    return_video_latents=False,
         
     | 
| 512 | 
         
             
                ):
         
     | 
| 513 | 
         
             
                    if self.vae.quant_conv.weight.ndim==5:
         
     | 
| 514 | 
         
            +
                        mini_batch_encoder = self.vae.mini_batch_encoder
         
     | 
| 515 | 
         
            +
                        mini_batch_decoder = self.vae.mini_batch_decoder
         
     | 
| 516 | 
         
            +
                        shape = (batch_size, num_channels_latents, int(video_length // mini_batch_encoder * mini_batch_decoder) if video_length != 1 else 1, height // self.vae_scale_factor, width // self.vae_scale_factor)
         
     | 
| 517 | 
         
             
                    else:
         
     | 
| 518 | 
         
             
                        shape = (batch_size, num_channels_latents, video_length, height // self.vae_scale_factor, width // self.vae_scale_factor)
         
     | 
| 519 | 
         
            +
             
     | 
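The `shape` computed in the magvit branch above compresses the frame axis: the 3D VAE consumes `mini_batch_encoder` input frames per step and emits `mini_batch_decoder` latent frames. A minimal sketch of the arithmetic, with illustrative values (the concrete mini-batch sizes come from the loaded VAE, not from this diff):

    # Illustrative numbers only; real values come from self.vae.
    video_length = 72
    mini_batch_encoder = 9
    mini_batch_decoder = 3
    latent_frames = video_length // mini_batch_encoder * mini_batch_decoder  # 72 // 9 * 3 = 24
    latent_frames = latent_frames if video_length != 1 else 1                # a single image keeps one latent frame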
| 520 | 
         
             
                    if isinstance(generator, list) and len(generator) != batch_size:
         
     | 
| 521 | 
         
             
                        raise ValueError(
         
     | 
| 522 | 
         
             
                            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
         
     | 
| 523 | 
         
             
                            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
         
     | 
| 524 | 
         
             
                        )
         
     | 
| 525 | 
         
            +
             
     | 
| 526 | 
         
             
                    if return_video_latents or (latents is None and not is_strength_max):
         
     | 
| 527 | 
         
            +
                        video = video.to(device=device, dtype=self.vae.dtype)
         
     | 
| 528 | 
         
            +
                        if self.vae.quant_conv.weight.ndim==5:
         
     | 
| 529 | 
         
            +
                            bs = 1
         
     | 
| 530 | 
         
            +
                            mini_batch_encoder = self.vae.mini_batch_encoder
         
     | 
| 531 | 
         
            +
                            new_video = []
         
     | 
| 532 | 
         
            +
                            if self.vae.slice_compression_vae:
         
     | 
| 533 | 
         
            +
                                for i in range(0, video.shape[0], bs):
         
     | 
| 534 | 
         
            +
                                    video_bs = video[i : i + bs]
         
     | 
| 535 | 
         
            +
                                    video_bs = self.vae.encode(video_bs)[0]
         
     | 
| 536 | 
         
            +
                                    video_bs = video_bs.sample()
         
     | 
| 537 | 
         
            +
                                    new_video.append(video_bs)
         
     | 
| 538 | 
         
            +
                            else:
         
     | 
| 539 | 
         
            +
                                for i in range(0, video.shape[0], bs):
         
     | 
| 540 | 
         
            +
                                    new_video_mini_batch = []
         
     | 
| 541 | 
         
            +
                                    for j in range(0, video.shape[2], mini_batch_encoder):
         
     | 
| 542 | 
         
            +
                                        video_bs = video[i : i + bs, :, j: j + mini_batch_encoder, :, :]
         
     | 
| 543 | 
         
            +
                                        video_bs = self.vae.encode(video_bs)[0]
         
     | 
| 544 | 
         
            +
                                        video_bs = video_bs.sample()
         
     | 
| 545 | 
         
            +
                                        new_video_mini_batch.append(video_bs)
         
     | 
| 546 | 
         
            +
                                    new_video_mini_batch = torch.cat(new_video_mini_batch, dim = 2)
         
     | 
| 547 | 
         
            +
                                    new_video.append(new_video_mini_batch)
         
     | 
| 548 | 
         
            +
                            video = torch.cat(new_video, dim = 0)
         
     | 
| 549 | 
         
            +
                            video = video * self.vae.config.scaling_factor
         
     | 
| 550 | 
         | 
| 551 | 
         
             
                        else:
         
     | 
| 552 | 
         
            +
                            if video.shape[1] == 4:
         
     | 
| 553 | 
         
            +
                                video = video
         
     | 
| 554 | 
         
            +
                            else:
         
     | 
| 555 | 
         
            +
                                video_length = video.shape[2]
         
     | 
| 556 | 
         
            +
                                video = rearrange(video, "b c f h w -> (b f) c h w")
         
     | 
| 557 | 
         
            +
                                video = self._encode_vae_image(video, generator=generator)
         
     | 
| 558 | 
         
            +
                                video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length)
         
     | 
| 559 | 
         
            +
                        video_latents = video.repeat(batch_size // video.shape[0], 1, 1, 1, 1)
         
     | 
| 560 | 
         | 
| 561 | 
         
             
                    if latents is None:
         
     | 
| 562 | 
         
            +
                        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         
     | 
| 563 | 
         
             
                        # if strength is 1. then initialise the latents to noise, else initial to image + noise
         
     | 
| 564 | 
         
             
                        latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
         
     | 
| 565 | 
         
            +
                        # if pure noise then scale the initial latents by the  Scheduler's init sigma
         
     | 
| 566 | 
         
            +
                        latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
         
     | 
| 567 | 
         
             
                    else:
         
     | 
| 568 | 
         
             
                        noise = latents.to(device)
         
     | 
| 569 | 
         
            +
                        latents = noise * self.scheduler.init_noise_sigma
         
     | 
| 570 | 
         | 
| 571 | 
         
             
                    # scale the initial noise by the standard deviation required by the scheduler
         
     | 
| 572 | 
         
             
                    outputs = (latents,)
         
     | 
| 573 | 
         | 
| 574 | 
         
             
                    if return_noise:
         
     | 
| 
         | 
|
| 579 | 
         | 
| 580 | 
         
             
                    return outputs
         
     | 
| 581 | 
         | 
| 582 | 
         
            +
                def smooth_output(self, video, mini_batch_encoder, mini_batch_decoder):
         
     | 
| 583 | 
         
            +
                    if video.size()[2] <= mini_batch_encoder:
         
     | 
| 584 | 
         
            +
                        return video
         
     | 
| 585 | 
         
            +
                    prefix_index_before = mini_batch_encoder // 2
         
     | 
| 586 | 
         
            +
                    prefix_index_after = mini_batch_encoder - prefix_index_before
         
     | 
| 587 | 
         
            +
                    pixel_values = video[:, :, prefix_index_before:-prefix_index_after]
         
     | 
| 588 | 
         
            +
                    
         
     | 
| 589 | 
         
            +
                    if self.vae.slice_compression_vae:
         
     | 
| 590 | 
         
            +
                        latents = self.vae.encode(pixel_values)[0]
         
     | 
| 591 | 
         
            +
                        latents = latents.sample()
         
     | 
| 592 | 
         
            +
                    else:
         
     | 
| 593 | 
         
            +
                        new_pixel_values = []
         
     | 
| 594 | 
         
            +
                        for i in range(0, pixel_values.shape[2], mini_batch_encoder):
         
     | 
| 595 | 
         
            +
                            with torch.no_grad():
         
     | 
| 596 | 
         
            +
                                pixel_values_bs = pixel_values[:, :, i: i + mini_batch_encoder, :, :]
         
     | 
| 597 | 
         
            +
                                pixel_values_bs = self.vae.encode(pixel_values_bs)[0]
         
     | 
| 598 | 
         
            +
                                pixel_values_bs = pixel_values_bs.sample()
         
     | 
| 599 | 
         
            +
                                new_pixel_values.append(pixel_values_bs)
         
     | 
| 600 | 
         
            +
                        latents = torch.cat(new_pixel_values, dim = 2)
         
     | 
| 601 | 
         
            +
                            
         
     | 
| 602 | 
         
            +
                    if self.vae.slice_compression_vae:
         
     | 
| 603 | 
         
            +
                        middle_video = self.vae.decode(latents)[0]
         
     | 
| 604 | 
         
            +
                    else:
         
     | 
| 605 | 
         
            +
                        middle_video = []
         
     | 
| 606 | 
         
             
                        for i in range(0, latents.shape[2], mini_batch_decoder):
         
     | 
| 607 | 
         
             
                            with torch.no_grad():
         
     | 
| 608 | 
         
             
                                start_index = i
         
     | 
| 609 | 
         
             
                                end_index = i + mini_batch_decoder
         
     | 
| 610 | 
         
             
                                latents_bs = self.vae.decode(latents[:, :, start_index:end_index, :, :])[0]
         
     | 
| 611 | 
         
            +
                                middle_video.append(latents_bs)
         
     | 
| 612 | 
         
            +
                        middle_video = torch.cat(middle_video, 2)
         
     | 
| 613 | 
         
            +
                    video[:, :, prefix_index_before:-prefix_index_after] = (video[:, :, prefix_index_before:-prefix_index_after] + middle_video) / 2
         
     | 
| 614 | 
         
            +
                    return video
         
     | 
| 615 | 
         
            +
                
         
     | 
| 616 | 
         
            +
                def decode_latents(self, latents):
         
     | 
| 617 | 
         
            +
                    video_length = latents.shape[2]
         
     | 
| 618 | 
         
            +
                    latents = 1 / self.vae.config.scaling_factor * latents
         
     | 
| 619 | 
         
            +
                    if self.vae.quant_conv.weight.ndim==5:
         
     | 
| 620 | 
         
            +
                        mini_batch_encoder = self.vae.mini_batch_encoder
         
     | 
| 621 | 
         
            +
                        mini_batch_decoder = self.vae.mini_batch_decoder
         
     | 
| 622 | 
         
            +
                        if self.vae.slice_compression_vae:
         
     | 
| 623 | 
         
            +
                            video = self.vae.decode(latents)[0]
         
     | 
| 624 | 
         
            +
                        else:
         
     | 
| 625 | 
         
            +
                            video = []
         
     | 
| 626 | 
         
            +
                            for i in range(0, latents.shape[2], mini_batch_decoder):
         
     | 
| 627 | 
         
            +
                                with torch.no_grad():
         
     | 
| 628 | 
         
            +
                                    start_index = i
         
     | 
| 629 | 
         
            +
                                    end_index = i + mini_batch_decoder
         
     | 
| 630 | 
         
            +
                                    latents_bs = self.vae.decode(latents[:, :, start_index:end_index, :, :])[0]
         
     | 
| 631 | 
         
            +
                                    video.append(latents_bs)
         
     | 
| 632 | 
         
            +
                            video = torch.cat(video, 2)
         
     | 
| 633 | 
         
             
                        video = video.clamp(-1, 1)
         
     | 
| 634 | 
         
            +
                        video = self.smooth_output(video, mini_batch_encoder, mini_batch_decoder).cpu().clamp(-1, 1)
         
     | 
| 635 | 
         
             
                    else:
         
     | 
| 636 | 
         
             
                        latents = rearrange(latents, "b c f h w -> (b f) c h w")
         
     | 
| 637 | 
         
             
                        video = []
         
     | 
| 638 | 
         
             
                        for frame_idx in tqdm(range(latents.shape[0])):
         
     | 
| 639 | 
         
             
                            video.append(self.vae.decode(latents[frame_idx:frame_idx+1]).sample)
         
     | 
| 
         | 
|
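Note on the two helpers added above: `decode_latents` decodes the latent video in chunks of `mini_batch_decoder` frames, and `smooth_output` then re-encodes/decodes a window offset by half a chunk and averages it back in, so frames near chunk boundaries are blended from two decoding passes. A toy sketch of that blend (shapes and chunk size are assumptions, not taken from the model):

    import torch

    mini_batch_encoder = 8                            # assumed chunk size
    prefix = mini_batch_encoder // 2                  # 4
    video = torch.rand(1, 3, 24, 8, 8)                # b c f h w, decoded chunk by chunk
    middle = torch.rand(1, 3, 24 - 2 * prefix, 8, 8)  # second pass over the shifted window
    video[:, :, prefix:-prefix] = (video[:, :, prefix:-prefix] + middle) / 2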
| 658 | 
         | 
| 659 | 
         
             
                    return image_latents
         
     | 
| 660 | 
         | 
| 661 | 
         
            +
                # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
         
     | 
| 662 | 
         
            +
                def get_timesteps(self, num_inference_steps, strength, device):
         
     | 
| 663 | 
         
            +
                    # get the original timestep using init_timestep
         
     | 
| 664 | 
         
            +
                    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
         
     | 
| 665 | 
         
            +
             
     | 
| 666 | 
         
            +
                    t_start = max(num_inference_steps - init_timestep, 0)
         
     | 
| 667 | 
         
            +
                    timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
         
     | 
| 668 | 
         
            +
             
     | 
| 669 | 
         
            +
                    return timesteps, num_inference_steps - t_start
         
     | 
| 670 | 
         
            +
             
     | 
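`get_timesteps` mirrors the diffusers img2img helper: with denoising strength `s` and `N` inference steps, only the last `int(N * s)` timesteps are actually run, so part of the input video survives. A quick worked example with illustrative numbers:

    num_inference_steps = 50
    strength = 0.7
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 35
    t_start = max(num_inference_steps - init_timestep, 0)                          # 15
    # the first 15 scheduler steps are skipped; denoising starts from a noised
    # version of the input rather than from pure noise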
| 671 | 
         
             
                def prepare_mask_latents(
         
     | 
| 672 | 
         
             
                    self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance
         
     | 
| 673 | 
         
             
                ):
         
     | 
| 
         | 
|
| 679 | 
         
             
                    mask = mask.to(device=device, dtype=self.vae.dtype)
         
     | 
| 680 | 
         
             
                    if self.vae.quant_conv.weight.ndim==5:
         
     | 
| 681 | 
         
             
                        bs = 1
         
     | 
| 682 | 
         
            +
                        mini_batch_encoder = self.vae.mini_batch_encoder
         
     | 
| 683 | 
         
             
                        new_mask = []
         
     | 
| 684 | 
         
            +
                        if self.vae.slice_compression_vae:
         
     | 
| 685 | 
         
            +
                            for i in range(0, mask.shape[0], bs):
         
     | 
| 686 | 
         
            +
                                mask_bs = mask[i : i + bs]
         
     | 
| 687 | 
         
             
                                mask_bs = self.vae.encode(mask_bs)[0]
         
     | 
| 688 | 
         
             
                                mask_bs = mask_bs.sample()
         
     | 
| 689 | 
         
            +
                                new_mask.append(mask_bs)
         
     | 
| 690 | 
         
            +
                        else:
         
     | 
| 691 | 
         
            +
                            for i in range(0, mask.shape[0], bs):
         
     | 
| 692 | 
         
            +
                                new_mask_mini_batch = []
         
     | 
| 693 | 
         
            +
                                for j in range(0, mask.shape[2], mini_batch_encoder):
         
     | 
| 694 | 
         
            +
                                    mask_bs = mask[i : i + bs, :, j: j + mini_batch_encoder, :, :]
         
     | 
| 695 | 
         
            +
                                    mask_bs = self.vae.encode(mask_bs)[0]
         
     | 
| 696 | 
         
            +
                                    mask_bs = mask_bs.sample()
         
     | 
| 697 | 
         
            +
                                    new_mask_mini_batch.append(mask_bs)
         
     | 
| 698 | 
         
            +
                                new_mask_mini_batch = torch.cat(new_mask_mini_batch, dim = 2)
         
     | 
| 699 | 
         
            +
                                new_mask.append(new_mask_mini_batch)
         
     | 
| 700 | 
         
             
                        mask = torch.cat(new_mask, dim = 0)
         
     | 
| 701 | 
         
            +
                        mask = mask * self.vae.config.scaling_factor
         
     | 
| 702 | 
         | 
| 703 | 
         
             
                    else:
         
     | 
| 704 | 
         
             
                        if mask.shape[1] == 4:
         
     | 
| 
         | 
|
| 712 | 
         
             
                    masked_image = masked_image.to(device=device, dtype=self.vae.dtype)
         
     | 
| 713 | 
         
             
                    if self.vae.quant_conv.weight.ndim==5:
         
     | 
| 714 | 
         
             
                        bs = 1
         
     | 
| 715 | 
         
            +
                        mini_batch_encoder = self.vae.mini_batch_encoder
         
     | 
| 716 | 
         
             
                        new_mask_pixel_values = []
         
     | 
| 717 | 
         
            +
                        if self.vae.slice_compression_vae:
         
     | 
| 718 | 
         
            +
                            for i in range(0, masked_image.shape[0], bs):
         
     | 
| 719 | 
         
            +
                                mask_pixel_values_bs = masked_image[i : i + bs]
         
     | 
| 720 | 
         
             
                                mask_pixel_values_bs = self.vae.encode(mask_pixel_values_bs)[0]
         
     | 
| 721 | 
         
             
                                mask_pixel_values_bs = mask_pixel_values_bs.sample()
         
     | 
| 722 | 
         
            +
                                new_mask_pixel_values.append(mask_pixel_values_bs)
         
     | 
| 723 | 
         
            +
                        else:
         
     | 
| 724 | 
         
            +
                            for i in range(0, masked_image.shape[0], bs):
         
     | 
| 725 | 
         
            +
                                new_mask_pixel_values_mini_batch = []
         
     | 
| 726 | 
         
            +
                                for j in range(0, masked_image.shape[2], mini_batch_encoder):
         
     | 
| 727 | 
         
            +
                                    mask_pixel_values_bs = masked_image[i : i + bs, :, j: j + mini_batch_encoder, :, :]
         
     | 
| 728 | 
         
            +
                                    mask_pixel_values_bs = self.vae.encode(mask_pixel_values_bs)[0]
         
     | 
| 729 | 
         
            +
                                    mask_pixel_values_bs = mask_pixel_values_bs.sample()
         
     | 
| 730 | 
         
            +
                                    new_mask_pixel_values_mini_batch.append(mask_pixel_values_bs)
         
     | 
| 731 | 
         
            +
                                new_mask_pixel_values_mini_batch = torch.cat(new_mask_pixel_values_mini_batch, dim = 2)
         
     | 
| 732 | 
         
            +
                                new_mask_pixel_values.append(new_mask_pixel_values_mini_batch)
         
     | 
| 733 | 
         
             
                        masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0)
         
     | 
| 734 | 
         
            +
                        masked_image_latents = masked_image_latents * self.vae.config.scaling_factor
         
     | 
| 735 | 
         | 
| 736 | 
         
             
                    else:
         
     | 
| 737 | 
         
             
                        if masked_image.shape[1] == 4:
         
     | 
| 
         | 
|
| 776 | 
         
             
                    callback_steps: int = 1,
         
     | 
| 777 | 
         
             
                    clean_caption: bool = True,
         
     | 
| 778 | 
         
             
                    mask_feature: bool = True,
         
     | 
| 779 | 
         
            +
                    max_sequence_length: int = 120,
         
     | 
| 780 | 
         
            +
                    clip_image: Image = None,
         
     | 
| 781 | 
         
            +
                    clip_apply_ratio: float = 0.50,
         
     | 
| 782 | 
         
             
                ) -> Union[EasyAnimatePipelineOutput, Tuple]:
         
     | 
| 783 | 
         
             
                    """
         
     | 
| 784 | 
         
             
                    Function invoked when calling the pipeline for generation.
         
     | 
| 
         | 
|
| 852 | 
         
             
                    # 1. Check inputs. Raise error if not correct
         
     | 
| 853 | 
         
             
                    height = height or self.transformer.config.sample_size * self.vae_scale_factor
         
     | 
| 854 | 
         
             
                    width = width or self.transformer.config.sample_size * self.vae_scale_factor
         
     | 
| 855 | 
         
            +
                    height = int(height // 16 * 16)
         
     | 
| 856 | 
         
            +
                    width = int(width // 16 * 16)
         
     | 
| 857 | 
         | 
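Rounding the requested resolution down to a multiple of 16 keeps it compatible with the VAE downsampling and the transformer patch grid, e.g.:

    height = 1080
    height = int(height // 16 * 16)   # 1072, the nearest lower multiple of 16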
| 858 | 
         
             
                    # 2. Default height and width to transformer
         
     | 
| 859 | 
         
             
                    if prompt is not None and isinstance(prompt, str):
         
     | 
| 
         | 
|
| 893 | 
         
             
                        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
         
     | 
| 894 | 
         
             
                        prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
         
     | 
| 895 | 
         | 
| 896 | 
         
            +
                    # 4. set timesteps
         
     | 
| 897 | 
         
             
                    self.scheduler.set_timesteps(num_inference_steps, device=device)
         
     | 
| 898 | 
         
            +
                    timesteps, num_inference_steps = self.get_timesteps(
         
     | 
| 899 | 
         
            +
                        num_inference_steps=num_inference_steps, strength=strength, device=device
         
     | 
| 900 | 
         
            +
                    )
         
     | 
| 901 | 
         
             
                    # at which timestep to set the initial noise (n.b. 50% if strength is 0.5)
         
     | 
| 902 | 
         
            +
                    latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
         
     | 
| 903 | 
         
             
                    # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise
         
     | 
| 904 | 
         
             
                    is_strength_max = strength == 1.0
         
     | 
| 905 | 
         | 
| 
         | 
|
| 914 | 
         
             
                    # Prepare latent variables
         
     | 
| 915 | 
         
             
                    num_channels_latents = self.vae.config.latent_channels
         
     | 
| 916 | 
         
             
                    num_channels_transformer = self.transformer.config.in_channels
         
     | 
| 917 | 
         
            +
                    return_image_latents = True # num_channels_transformer == 4
         
     | 
| 918 | 
         | 
| 919 | 
         
             
                    # 5. Prepare latents.
         
     | 
| 920 | 
         
             
                    latents_outputs = self.prepare_latents(
         
     | 
| 
         | 
|
| 946 | 
         
             
                        mask_condition = mask_condition.to(dtype=torch.float32)
         
     | 
| 947 | 
         
             
                        mask_condition = rearrange(mask_condition, "(b f) c h w -> b c f h w", f=video_length)
         
     | 
| 948 | 
         | 
| 949 | 
         
            +
                        if num_channels_transformer == 12:
         
     | 
| 950 | 
         
            +
                            mask_condition_tile = torch.tile(mask_condition, [1, 3, 1, 1, 1])
         
     | 
| 951 | 
         
            +
                            if masked_video_latents is None:
         
     | 
| 952 | 
         
            +
                                masked_video = init_video * (mask_condition_tile < 0.5) + torch.ones_like(init_video) * (mask_condition_tile > 0.5) * -1
         
     | 
| 953 | 
         
            +
                            else:
         
     | 
| 954 | 
         
            +
                                masked_video = masked_video_latents
         
     | 
| 955 | 
         
            +
             
     | 
| 956 | 
         
            +
                            mask_latents, masked_video_latents = self.prepare_mask_latents(
         
     | 
| 957 | 
         
            +
                                mask_condition_tile,
         
     | 
| 958 | 
         
            +
                                masked_video,
         
     | 
| 959 | 
         
            +
                                batch_size,
         
     | 
| 960 | 
         
            +
                                height,
         
     | 
| 961 | 
         
            +
                                width,
         
     | 
| 962 | 
         
            +
                                prompt_embeds.dtype,
         
     | 
| 963 | 
         
            +
                                device,
         
     | 
| 964 | 
         
            +
                                generator,
         
     | 
| 965 | 
         
            +
                                do_classifier_free_guidance,
         
     | 
| 966 | 
         
            +
                            )
         
     | 
| 967 | 
         
            +
                            mask = torch.tile(mask_condition, [1, num_channels_transformer // 3, 1, 1, 1])
         
     | 
| 968 | 
         
            +
                            mask = F.interpolate(mask, size=latents.size()[-3:], mode='trilinear', align_corners=True).to(latents.device, latents.dtype)
         
     | 
| 969 | 
         
            +
                            
         
     | 
| 970 | 
         
            +
                            mask_input = torch.cat([mask_latents] * 2) if do_classifier_free_guidance else mask_latents
         
     | 
| 971 | 
         
            +
                            masked_video_latents_input = (
         
     | 
| 972 | 
         
            +
                                torch.cat([masked_video_latents] * 2) if do_classifier_free_guidance else masked_video_latents
         
     | 
| 973 | 
         
            +
                            )
         
     | 
| 974 | 
         
            +
                            inpaint_latents = torch.cat([mask_input, masked_video_latents_input], dim=1).to(latents.dtype)
         
     | 
| 975 | 
         
             
                        else:
         
     | 
| 976 | 
         
            +
                            mask = torch.tile(mask_condition, [1, num_channels_transformer, 1, 1, 1])
         
     | 
| 977 | 
         
            +
                            mask = F.interpolate(mask, size=latents.size()[-3:], mode='trilinear', align_corners=True).to(latents.device, latents.dtype)
         
     | 
| 978 | 
         
            +
                            
         
     | 
| 979 | 
         
            +
                            inpaint_latents = None
         
     | 
| 980 | 
         
            +
                    else:
         
     | 
| 981 | 
         
            +
                        if num_channels_transformer == 12:
         
     | 
| 982 | 
         
            +
                            mask = torch.zeros_like(latents).to(latents.device, latents.dtype)
         
     | 
| 983 | 
         
            +
                            masked_video_latents = torch.zeros_like(latents).to(latents.device, latents.dtype)
         
     | 
| 984 | 
         
            +
             
     | 
| 985 | 
         
            +
                            mask_input = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
         
     | 
| 986 | 
         
            +
                            masked_video_latents_input = (
         
     | 
| 987 | 
         
            +
                                torch.cat([masked_video_latents] * 2) if do_classifier_free_guidance else masked_video_latents
         
     | 
| 988 | 
         
            +
                            )
         
     | 
| 989 | 
         
            +
                            inpaint_latents = torch.cat([mask_input, masked_video_latents_input], dim=1).to(latents.dtype)
         
     | 
| 990 | 
         
            +
                        else:
         
     | 
| 991 | 
         
            +
                            mask = torch.zeros_like(init_video[:, :1])
         
     | 
| 992 | 
         
            +
                            mask = torch.tile(mask, [1, num_channels_transformer, 1, 1, 1])
         
     | 
| 993 | 
         
            +
                            mask = F.interpolate(mask, size=latents.size()[-3:], mode='trilinear', align_corners=True).to(latents.device, latents.dtype)
         
     | 
| 994 | 
         
            +
             
     | 
| 995 | 
         
            +
                            inpaint_latents = None
         
     | 
| 996 | 
         
            +
                
         
     | 
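For the 12-channel transformer the conditioning is plain channel concatenation: 4 noise-latent channels enter separately, and `inpaint_latents` carries the 4 VAE-encoded mask channels plus the 4 VAE-encoded masked-video channels, which is exactly what the size check further down enforces. A shape-level sketch with illustrative sizes:

    import torch

    mask_latents         = torch.randn(2, 4, 16, 32, 32)   # b c f h w
    masked_video_latents = torch.randn(2, 4, 16, 32, 32)
    inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
    # 8 channels here + 4 noise-latent channels passed alongside = 12 transformer input channels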
| 997 | 
         
            +
                    if clip_image is not None:
         
     | 
| 998 | 
         
            +
                        inputs = self.clip_image_processor(images=clip_image, return_tensors="pt")
         
     | 
| 999 | 
         
            +
                        inputs["pixel_values"] = inputs["pixel_values"].to(latents.device, dtype=latents.dtype)
         
     | 
| 1000 | 
         
            +
                        clip_encoder_hidden_states = self.clip_image_encoder(**inputs).image_embeds
         
     | 
| 1001 | 
         
            +
                        clip_encoder_hidden_states_neg = torch.zeros([batch_size, 768]).to(latents.device, dtype=latents.dtype)
         
     | 
| 1002 | 
         
            +
             
     | 
| 1003 | 
         
            +
                        clip_attention_mask = torch.ones([batch_size, 8]).to(latents.device, dtype=latents.dtype)
         
     | 
| 1004 | 
         
            +
                        clip_attention_mask_neg = torch.zeros([batch_size, 8]).to(latents.device, dtype=latents.dtype)
         
     | 
| 1005 | 
         
            +
             
     | 
| 1006 | 
         
            +
                        clip_encoder_hidden_states_input = torch.cat([clip_encoder_hidden_states_neg, clip_encoder_hidden_states]) if do_classifier_free_guidance else clip_encoder_hidden_states
         
     | 
| 1007 | 
         
            +
                        clip_attention_mask_input = torch.cat([clip_attention_mask_neg, clip_attention_mask]) if do_classifier_free_guidance else clip_attention_mask
         
     | 
| 1008 | 
         
            +
             
     | 
| 1009 | 
         
            +
                    elif clip_image is None and num_channels_transformer == 12:
         
     | 
| 1010 | 
         
            +
                        clip_encoder_hidden_states = torch.zeros([batch_size, 768]).to(latents.device, dtype=latents.dtype)
         
     | 
| 1011 | 
         
            +
             
     | 
| 1012 | 
         
            +
                        clip_attention_mask = torch.zeros([batch_size, 8])
         
     | 
| 1013 | 
         
            +
                        clip_attention_mask = clip_attention_mask.to(latents.device, dtype=latents.dtype)
         
     | 
| 1014 | 
         
            +
             
     | 
| 1015 | 
         
            +
                        clip_encoder_hidden_states_input = torch.cat([clip_encoder_hidden_states] * 2) if do_classifier_free_guidance else clip_encoder_hidden_states
         
     | 
| 1016 | 
         
            +
                        clip_attention_mask_input = torch.cat([clip_attention_mask] * 2) if do_classifier_free_guidance else clip_attention_mask
         
     | 
| 1017 | 
         
            +
             
     | 
| 1018 | 
         
             
                    else:
         
     | 
| 1019 | 
         
            +
                        clip_encoder_hidden_states_input = None
         
     | 
| 1020 | 
         
            +
                        clip_attention_mask_input = None
         
     | 
| 1021 | 
         | 
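When `clip_image` is given, its CLIP image embedding becomes an extra conditioning stream; under classifier-free guidance the negative branch receives an all-zero embedding and mask, so the unconditional pass ignores the reference image. A condensed sketch of the guided case (it only restates the branch above; the 768-dim embedding and 8-token mask come from the diff itself):

    inputs = self.clip_image_processor(images=clip_image, return_tensors="pt")
    image_embeds = self.clip_image_encoder(pixel_values=inputs["pixel_values"]).image_embeds  # (b, 768)
    neg_embeds = torch.zeros_like(image_embeds)                                               # dropped reference
    clip_cond = torch.cat([neg_embeds, image_embeds]) if do_classifier_free_guidance else image_embeds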
| 1022 | 
         
             
                    # Check that sizes of mask, masked image and latents match
         
     | 
| 1023 | 
         
             
                    if num_channels_transformer == 12:
         
     | 
| 1024 | 
         
             
                        # default case for runwayml/stable-diffusion-inpainting
         
     | 
| 1025 | 
         
            +
                        num_channels_mask = mask_latents.shape[1]
         
     | 
| 1026 | 
         
             
                        num_channels_masked_image = masked_video_latents.shape[1]
         
     | 
| 1027 | 
         
             
                        if num_channels_latents + num_channels_mask + num_channels_masked_image != self.transformer.config.in_channels:
         
     | 
| 1028 | 
         
             
                            raise ValueError(
         
     | 
| 1032 | 
         
             
                                f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of"
         
     | 
| 1033 | 
         
             
                                " `pipeline.transformer` or your `mask_image` or `image` input."
         
     | 
| 1034 | 
         
             
                            )
         
     | 
| 1035 | 
         
            +
                    elif num_channels_transformer != 4:
         
     | 
| 1036 | 
         
             
                        raise ValueError(
         
     | 
| 1037 | 
         
             
                            f"The transformer {self.transformer.__class__} should have 9 input channels, not {self.transformer.config.in_channels}."
         
     | 
| 1038 | 
         
             
                        )
         
     | 
| 1039 | 
         | 
| 1040 | 
         
            +
                    # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         
     | 
| 1041 | 
         
             
                    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
         
     | 
| 1042 | 
         | 
| 1043 | 
         
             
                    # 6.1 Prepare micro-conditions.
         
     | 
| 
         | 
|
| 1054 | 
         | 
| 1055 | 
         
             
                        added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}
         
     | 
| 1056 | 
         | 
| 1057 | 
         
            +
                    gc.collect()
         
     | 
| 1058 | 
         
            +
                    torch.cuda.empty_cache()
         
     | 
| 1059 | 
         
            +
                    torch.cuda.ipc_collect()
         
     | 
| 1060 | 
         | 
| 1061 | 
         
            +
                    # 10. Denoising loop
         
     | 
| 1062 | 
         
            +
                    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         
     | 
| 1063 | 
         
            +
                    self._num_timesteps = len(timesteps)
         
     | 
| 1064 | 
         
             
                    with self.progress_bar(total=num_inference_steps) as progress_bar:
         
     | 
| 1065 | 
         
             
                        for i, t in enumerate(timesteps):
         
     | 
| 1066 | 
         
             
                            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
         
     | 
| 1067 | 
         
             
                            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
         
     | 
| 1068 | 
         | 
| 1069 | 
         
            +
                            if i < len(timesteps) * (1 - clip_apply_ratio) and clip_encoder_hidden_states_input is not None:
         
     | 
| 1070 | 
         
            +
                                clip_encoder_hidden_states_actual_input = torch.zeros_like(clip_encoder_hidden_states_input)
         
     | 
| 1071 | 
         
            +
                                clip_attention_mask_actual_input = torch.zeros_like(clip_attention_mask_input)
         
     | 
| 1072 | 
         
            +
                            else:
         
     | 
| 1073 | 
         
            +
                                clip_encoder_hidden_states_actual_input = clip_encoder_hidden_states_input
         
     | 
| 1074 | 
         
            +
                                clip_attention_mask_actual_input = clip_attention_mask_input
         
     | 
| 1075 | 
         
            +
             
     | 
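`clip_apply_ratio` delays the image conditioning: during the first `(1 - clip_apply_ratio)` fraction of the schedule the CLIP inputs are zeroed out, so the text prompt and inpaint latents set the coarse structure and the reference image only steers the later, finer steps. For example:

    num_steps = 50
    clip_apply_ratio = 0.5
    for i in range(num_steps):
        drop_clip = i < num_steps * (1 - clip_apply_ratio)   # True for steps 0-24, False for 25-49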
| 1076 | 
         
             
                            current_timestep = t
         
     | 
| 1077 | 
         
             
                            if not torch.is_tensor(current_timestep):
         
     | 
| 1078 | 
         
             
                                # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
         
     | 
| 
         | 
|
| 1095 | 
         
             
                                encoder_attention_mask=prompt_attention_mask,
         
     | 
| 1096 | 
         
             
                                timestep=current_timestep,
         
     | 
| 1097 | 
         
             
                                added_cond_kwargs=added_cond_kwargs,
         
     | 
| 1098 | 
         
            +
                                inpaint_latents=inpaint_latents,
         
     | 
| 1099 | 
         
            +
                                clip_encoder_hidden_states=clip_encoder_hidden_states_actual_input,
         
     | 
| 1100 | 
         
            +
                                clip_attention_mask=clip_attention_mask_actual_input,
         
     | 
| 1101 | 
         
             
                                return_dict=False,
         
     | 
| 1102 | 
         
             
                            )[0]
         
     | 
| 1103 | 
         | 
| 
         | 
|
| 1112 | 
         
             
                            # compute previous image: x_t -> x_t-1
         
     | 
| 1113 | 
         
             
                            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
         
     | 
| 1114 | 
         | 
| 1115 | 
         
            +
                            if num_channels_transformer == 4:
         
     | 
| 1116 | 
         
            +
                                init_latents_proper = image_latents
         
     | 
| 1117 | 
         
            +
                                init_mask = mask
         
     | 
| 1118 | 
         
            +
                                if i < len(timesteps) - 1:
         
     | 
| 1119 | 
         
            +
                                    noise_timestep = timesteps[i + 1]
         
     | 
| 1120 | 
         
            +
                                    init_latents_proper = self.scheduler.add_noise(
         
     | 
| 1121 | 
         
            +
                                        init_latents_proper, noise, torch.tensor([noise_timestep])
         
     | 
| 1122 | 
         
            +
                                    )
         
     | 
| 1123 | 
         
            +
                                
         
     | 
| 1124 | 
         
            +
                                latents = (1 - init_mask) * init_latents_proper + init_mask * latents
         
     | 
| 1125 | 
         
            +
             
     | 
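When the transformer has only 4 input channels there is no `inpaint_latents` path, so the branch above falls back to latent blending: after each scheduler step the unmasked region is overwritten with a re-noised copy of the original video latents and only the masked region keeps the freshly denoised values. A tiny numeric illustration (mask convention assumed: 1 marks the region to repaint):

    import torch

    init_mask           = torch.tensor([0., 0., 1., 1.])
    init_latents_proper = torch.tensor([5., 5., 5., 5.])   # re-noised original latents
    latents             = torch.tensor([9., 9., 9., 9.])   # freshly denoised latents
    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
    # -> tensor([5., 5., 9., 9.]): known region restored, masked region keeps the new values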
| 1126 | 
         
             
                            # call the callback, if provided
         
     | 
| 1127 | 
         
             
                            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
         
     | 
| 1128 | 
         
             
                                progress_bar.update()
         
     | 
| 
         | 
|
| 1130 | 
         
             
                                    step_idx = i // getattr(self.scheduler, "order", 1)
         
     | 
| 1131 | 
         
             
                                    callback(step_idx, t, latents)
         
     | 
| 1132 | 
         | 
| 1133 | 
         
            +
                    gc.collect()
         
     | 
| 1134 | 
         
            +
                    torch.cuda.empty_cache()
         
     | 
| 1135 | 
         
            +
                    torch.cuda.ipc_collect()
         
     | 
| 1136 | 
         
            +
             
     | 
| 1137 | 
         
             
                    # Post-processing
         
     | 
| 1138 | 
         
             
                    video = self.decode_latents(latents)
         
     | 
| 1139 | 
         
            +
                    
         
     | 
| 1140 | 
         
            +
                    gc.collect()
         
     | 
| 1141 | 
         
            +
                    torch.cuda.empty_cache()
         
     | 
| 1142 | 
         
            +
                    torch.cuda.ipc_collect()
         
     | 
| 1143 | 
         
             
                    # Convert to tensor
         
     | 
| 1144 | 
         
             
                    if output_type == "latent":
         
     | 
| 1145 | 
         
             
                        video = torch.from_numpy(video)
         
     | 
    	
easyanimate/ui/ui.py CHANGED
@@ -1,35 +1,40 @@
| 1 | 
         
             
            """Modified from https://github.com/guoyww/AnimateDiff/blob/main/app.py
         
     | 
| 2 | 
         
             
            """
         
     | 
| 3 | 
         
             
            import gc
         
     | 
| 4 | 
         
             
            import json
         
     | 
| 5 | 
         
             
            import os
         
     | 
| 6 | 
         
             
            import random
         
     | 
| 7 | 
         
            -
            import base64
         
     | 
| 8 | 
         
            -
            import requests
         
     | 
| 9 | 
         
            -
            import pkg_resources
         
     | 
| 10 | 
         
             
            from datetime import datetime
         
     | 
| 11 | 
         
             
            from glob import glob
         
     | 
| 12 | 
         | 
| 13 | 
         
             
            import gradio as gr
         
     | 
| 14 | 
         
            -
            import torch
         
     | 
| 15 | 
         
             
            import numpy as np
         
     | 
| 16 | 
         
             
            from diffusers import (AutoencoderKL, DDIMScheduler,
         
     | 
| 17 | 
         
             
                                   DPMSolverMultistepScheduler,
         
     | 
| 18 | 
         
             
                                   EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
         
     | 
| 19 | 
         
             
                                   PNDMScheduler)
         
     | 
| 20 | 
         
            -
            from easyanimate.models.autoencoder_magvit import AutoencoderKLMagvit
         
     | 
| 21 | 
         
             
            from diffusers.utils.import_utils import is_xformers_available
         
     | 
| 22 | 
         
             
            from omegaconf import OmegaConf
         
     | 
| 23 | 
         
             
            from safetensors import safe_open
         
     | 
| 24 | 
         
            -
            from transformers import  
     | 
| 25 | 
         | 
| 26 | 
         
             
            from easyanimate.models.transformer3d import Transformer3DModel
         
     | 
| 27 | 
         
             
            from easyanimate.pipeline.pipeline_easyanimate import EasyAnimatePipeline
         
     | 
| 28 | 
         
             
            from easyanimate.utils.lora_utils import merge_lora, unmerge_lora
         
     | 
| 29 | 
         
            -
            from easyanimate.utils.utils import  
     | 
| 30 | 
         
            -
             
     | 
| 31 | 
         | 
| 32 | 
         
            -
            sample_idx = 0
         
     | 
| 33 | 
         
             
            scheduler_dict = {
         
     | 
| 34 | 
         
             
                "Euler": EulerDiscreteScheduler,
         
     | 
| 35 | 
         
             
                "Euler A": EulerAncestralDiscreteScheduler,
         
     | 
@@ -60,8 +65,8 @@ class EasyAnimateController:
| 60 | 
         
             
                    self.personalized_model_dir     = os.path.join(self.basedir, "models", "Personalized_Model")
         
     | 
| 61 | 
         
             
                    self.savedir                    = os.path.join(self.basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
         
     | 
| 62 | 
         
             
                    self.savedir_sample             = os.path.join(self.savedir, "sample")
         
     | 
| 63 | 
         
            -
                    self.edition                    = " 
     | 
| 64 | 
         
            -
                    self.inference_config           = OmegaConf.load(os.path.join(self.config_dir, " 
     | 
| 65 | 
         
             
                    os.makedirs(self.savedir, exist_ok=True)
         
     | 
| 66 | 
         | 
| 67 | 
         
             
                    self.diffusion_transformer_list = []
         
     | 
| 
         @@ -85,14 +90,14 @@ class EasyAnimateController: 
     | 
|
| 85 | 
         
             
                    self.weight_dtype = torch.bfloat16
         
     | 
| 86 | 
         | 
| 87 | 
         
             
                def refresh_diffusion_transformer(self):
         
     | 
| 88 | 
         
            -
                    self.diffusion_transformer_list = glob(os.path.join(self.diffusion_transformer_dir, "*/"))
         
     | 
| 89 | 
         | 
| 90 | 
         
             
                def refresh_motion_module(self):
         
     | 
| 91 | 
         
            -
                    motion_module_list = glob(os.path.join(self.motion_module_dir, "*.safetensors"))
         
     | 
| 92 | 
         
             
                    self.motion_module_list = [os.path.basename(p) for p in motion_module_list]
         
     | 
| 93 | 
         | 
| 94 | 
         
             
                def refresh_personalized_model(self):
         
     | 
| 95 | 
         
            -
                    personalized_model_list = glob(os.path.join(self.personalized_model_dir, "*.safetensors"))
         
     | 
| 96 | 
         
             
                    self.personalized_model_list = [os.path.basename(p) for p in personalized_model_list]
         
     | 
| 97 | 
         | 
| 98 | 
         
             
                def update_edition(self, edition):
         
     | 
| 
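The three refresh_* methods above share one pattern: glob a directory and keep either the folder paths or just the .safetensors file names for the dropdowns. A standalone sketch of that pattern (function names and directory arguments are illustrative, not from the commit):

import os
from glob import glob

def list_model_dirs(diffusion_transformer_dir):
    # Each pretrained transformer lives in its own sub-directory, so match
    # trailing "/" only.
    return glob(os.path.join(diffusion_transformer_dir, "*/"))

def list_safetensors_names(model_dir):
    # Motion modules and personalized models are single .safetensors files;
    # the dropdowns show just the file name.
    return [os.path.basename(p) for p in glob(os.path.join(model_dir, "*.safetensors"))]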
@@ -100,19 +105,24 @@ class EasyAnimateController:
        self.edition = edition
        if edition == "v1":
            self.inference_config = OmegaConf.load(os.path.join(self.config_dir, "easyanimate_video_motion_module_v1.yaml"))
-            return gr.…
-                gr.update(…
                gr.update(value=512, minimum=384, maximum=704, step=32), gr.update(value=80, minimum=40, maximum=80, step=1)
-        …
            self.inference_config = OmegaConf.load(os.path.join(self.config_dir, "easyanimate_video_magvit_motion_module_v2.yaml"))
-            return gr.…
-                gr.update(…
                gr.update(value=384, minimum=128, maximum=1280, step=16), gr.update(value=144, minimum=9, maximum=144, step=9)
+ …

    def update_diffusion_transformer(self, diffusion_transformer_dropdown):
        print("Update diffusion transformer")
        if diffusion_transformer_dropdown == "none":
-            return gr.…
        if OmegaConf.to_container(self.inference_config['vae_kwargs'])['enable_magvit']:
            Choosen_AutoencoderKL = AutoencoderKLMagvit
        else:
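update_edition swaps the inference config and re-ranges the existing sliders by returning gr.update(...) objects instead of rebuilding the components. The full return tuple is truncated in the diff, so the sketch below only covers the two sliders whose ranges are visible (the resolution slider and the frame-length slider):

import gradio as gr

def on_edition_change(edition):
    # gr.update(...) adjusts value/minimum/maximum/step on the sliders
    # already present in the layout.
    if edition == "v1":
        return (gr.update(value=512, minimum=384, maximum=704, step=32),
                gr.update(value=80, minimum=40, maximum=80, step=1))
    else:
        return (gr.update(value=384, minimum=128, maximum=1280, step=16),
                gr.update(value=144, minimum=9, maximum=144, step=9))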
@@ -130,25 +140,42 @@ class EasyAnimateController:
        self.text_encoder = T5EncoderModel.from_pretrained(diffusion_transformer_dropdown, subfolder="text_encoder", torch_dtype=self.weight_dtype)

        # Get pipeline
-        self.…
-        …
+ …
        print("Update diffusion transformer done")
-        return gr.…

    def update_motion_module(self, motion_module_dropdown):
        self.motion_module_path = motion_module_dropdown
        print("Update motion module")
        if motion_module_dropdown == "none":
-            return gr.…
        if self.transformer is None:
            gr.Info(f"Please select a pretrained model path.")
-            return gr.…
        else:
            motion_module_dropdown = os.path.join(self.motion_module_dir, motion_module_dropdown)
            if motion_module_dropdown.endswith(".safetensors"):
@@ -160,16 +187,16 @@ class EasyAnimateController:
                motion_module_state_dict = torch.load(motion_module_dropdown, map_location="cpu")
            missing, unexpected = self.transformer.load_state_dict(motion_module_state_dict, strict=False)
            print("Update motion module done.")
-            return gr.…

    def update_base_model(self, base_model_dropdown):
        self.base_model_path = base_model_dropdown
        print("Update base model")
        if base_model_dropdown == "none":
-            return gr.…
        if self.transformer is None:
            gr.Info(f"Please select a pretrained model path.")
-            return gr.…
        else:
            base_model_dropdown = os.path.join(self.personalized_model_dir, base_model_dropdown)
            base_model_state_dict = {}
@@ -178,16 +205,16 @@ class EasyAnimateController:
                base_model_state_dict[key] = f.get_tensor(key)
            self.transformer.load_state_dict(base_model_state_dict, strict=False)
            print("Update base done")
-            return gr.…

    def update_lora_model(self, lora_model_dropdown):
        print("Update lora model")
        if lora_model_dropdown == "none":
            self.lora_model_path = "none"
-            return gr.…
        lora_model_dropdown = os.path.join(self.personalized_model_dir, lora_model_dropdown)
        self.lora_model_path = lora_model_dropdown
-        return gr.…

    def generate(
        self,
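update_motion_module and update_base_model converge on the same loading step: read a .safetensors (or plain torch) checkpoint into a dict and load it with strict=False so a partial checkpoint can be merged into the transformer. A self-contained sketch of that step (the helper name is illustrative):

import torch
from safetensors import safe_open

def load_checkpoint_into(transformer, checkpoint_path):
    if checkpoint_path.endswith(".safetensors"):
        state_dict = {}
        # safe_open streams tensors one key at a time, so the whole file
        # never has to be materialised twice in memory.
        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
            for key in f.keys():
                state_dict[key] = f.get_tensor(key)
    else:
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    # strict=False lets a partial checkpoint (e.g. a motion module) load into
    # the full transformer without raising on missing keys.
    missing, unexpected = transformer.load_state_dict(state_dict, strict=False)
    return missing, unexpected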
@@ -200,15 +227,24 @@ class EasyAnimateController:
        negative_prompt_textbox, 
        sampler_dropdown, 
        sample_step_slider, 
+ …
        width_slider, 
        height_slider, 
-        …
+ …
        length_slider, 
+ …
        cfg_scale_slider, 
+ …
        seed_textbox,
        is_api = False,
    ):
-        …
+ …
        if self.transformer is None:
            raise gr.Error(f"Please select a pretrained model path.")

@@ -221,6 +257,39 @@ class EasyAnimateController:
        if self.lora_model_path != lora_model_dropdown:
            print("Update lora model")
            self.update_lora_model(lora_model_dropdown)
+ …

        if is_xformers_available(): self.transformer.enable_xformers_memory_efficient_attention()

@@ -235,16 +304,98 @@ class EasyAnimateController:
        generator = torch.Generator(device="cuda").manual_seed(int(seed_textbox))

        try:
-            …
+ …
        except Exception as e:
            gc.collect()
            torch.cuda.empty_cache()
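Around the pipeline call, generate seeds both the global RNG and a dedicated CUDA generator, and frees GPU memory if sampling throws. The pipeline arguments themselves are truncated above, so run_pipeline below is a stand-in for the actual EasyAnimatePipeline call, not the repo's code:

import gc

import torch

def generate_with_seed(run_pipeline, seed_textbox):
    # Seed the global RNG only when the user supplied an explicit seed.
    if int(seed_textbox) != -1 and seed_textbox != "":
        torch.manual_seed(int(seed_textbox))
    # A per-request generator keeps sampling reproducible.
    generator = torch.Generator(device="cuda").manual_seed(int(seed_textbox))
    try:
        return run_pipeline(generator=generator)
    except Exception:
        # Release GPU memory before the error reaches the Gradio UI.
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        raise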
@@ -254,7 +405,11 @@ class EasyAnimateController:
            if is_api:
                return "", f"Error. error information is {str(e)}"
            else:
-                return gr.…
+ …

        # lora part
        if self.lora_model_path != "none":
@@ -296,7 +451,10 @@ class EasyAnimateController:
            if is_api:
                return save_sample_path, "Success"
            else:
-                …
+ …
        else:
            save_sample_path = os.path.join(self.savedir_sample, prefix + f".mp4")
            save_videos_grid(sample, save_sample_path, fps=12 if self.edition == "v1" else 24)
@@ -304,7 +462,10 @@ class EasyAnimateController:
            if is_api:
                return save_sample_path, "Success"
            else:
-                …
+ …


def ui():
@@ -325,24 +486,24 @@ def ui():
        with gr.Column(variant="panel"):
            gr.Markdown(
                """
-                ### 1. EasyAnimate Edition (…
                """
            )
            with gr.Row():
                easyanimate_edition_dropdown = gr.Dropdown(
-                    label="The config of EasyAnimate Edition",
-                    choices=["v1", "v2"],
-                    value="…
                    interactive=True,
                )
            gr.Markdown(
                """
-                ### 2. Model checkpoints (…
                """
            )
            with gr.Row():
                diffusion_transformer_dropdown = gr.Dropdown(
-                    label="Pretrained Model Path",
                    choices=controller.diffusion_transformer_list,
                    value="none",
                    interactive=True,
@@ -356,12 +517,12 @@ def ui():
                diffusion_transformer_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton")
                def refresh_diffusion_transformer():
                    controller.refresh_diffusion_transformer()
-                    return gr.…
                diffusion_transformer_refresh_button.click(fn=refresh_diffusion_transformer, inputs=[], outputs=[diffusion_transformer_dropdown])

            with gr.Row():
                motion_module_dropdown = gr.Dropdown(
-                    label="Select motion module",
                    choices=controller.motion_module_list,
                    value="none",
                    interactive=True,
@@ -371,78 +532,139 @@ def ui():
                motion_module_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton", visible=False)
                def update_motion_module():
                    controller.refresh_motion_module()
-                    return gr.…
                motion_module_refresh_button.click(fn=update_motion_module, inputs=[], outputs=[motion_module_dropdown])

                base_model_dropdown = gr.Dropdown(
-                    label="Select base Dreambooth model (…
                    choices=controller.personalized_model_list,
                    value="none",
                    interactive=True,
                )

                lora_model_dropdown = gr.Dropdown(
-                    label="Select LoRA model (…
                    choices=["none"] + controller.personalized_model_list,
                    value="none",
                    interactive=True,
                )

-                lora_alpha_slider = gr.Slider(label="LoRA alpha", value=0.55, minimum=0, maximum=2, interactive=True)

                personalized_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton")
                def update_personalized_model():
                    controller.refresh_personalized_model()
                    return [
-                        gr.…
-                        gr.…
                    ]
                personalized_refresh_button.click(fn=update_personalized_model, inputs=[], outputs=[base_model_dropdown, lora_model_dropdown])

        with gr.Column(variant="panel"):
            gr.Markdown(
                """
-                ### 3. Configs for Generation.
                """
            )

-            prompt_textbox = gr.Textbox(label="Prompt", lines=2, value="…
-            negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=2, value="The video is not of a high quality, it has a low resolution, and the audio quality is not clear. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, rotating camera, blurry camera, shaking camera. Deformation, low-resolution, blurry, ugly, distortion. …

            with gr.Row():
                with gr.Column():
                    with gr.Row():
-                        sampler_dropdown   = gr.Dropdown(label="Sampling method", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0])
-                        sample_step_slider = gr.Slider(label="Sampling steps", value=…

-                    …
+ …

                    with gr.Row():
-                        seed_textbox = gr.Textbox(label="Seed", value=43)
                        seed_button  = gr.Button(value="\U0001F3B2", elem_classes="toolbutton")
-                        seed_button.click(…
+ …

-                    generate_button = gr.Button(value="Generate", variant='primary')

                with gr.Column():
-                    result_image = gr.Image(label="Generated Image", interactive=False, visible=False)
-                    result_video = gr.Video(label="Generated Animation", interactive=False)
                    infer_progress = gr.Textbox(
-                        label="Generation Info",
                        value="No task currently",
                        interactive=False
                    )

-                …
+ …
            )
+ …
            easyanimate_edition_dropdown.change(
                fn=controller.update_edition, 
                inputs=[easyanimate_edition_dropdown], 
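The seed_button.click(...) wiring is cut off in the diff; a randomize-seed callback of the kind this button usually drives would look like the following sketch (the callback body and wiring are assumptions, not taken from the commit):

import random

import gradio as gr

def randomize_seed():
    # Push a fresh random seed back into the seed textbox.
    return gr.update(value=random.randint(0, 2**31 - 1))

# Assumed wiring:
# seed_button.click(fn=randomize_seed, inputs=[], outputs=[seed_textbox])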
@@ -451,7 +673,6 @@ def ui():
                    diffusion_transformer_dropdown, 
                    motion_module_dropdown, 
                    motion_module_refresh_button, 
-                    is_image, 
                    width_slider, 
                    height_slider, 
                    length_slider, 
@@ -469,11 +690,17 @@ def ui():
                negative_prompt_textbox, 
                sampler_dropdown, 
                sample_step_slider, 
+ …
                width_slider, 
                height_slider, 
-                …
+ …
                length_slider, 
+ …
                cfg_scale_slider, 
+ …
                seed_textbox,
            ],
            outputs=[result_image, result_video, infer_progress]
@@ -483,11 +710,18 @@ def ui():

class EasyAnimateController_Modelscope:
    def __init__(self, edition, config_path, model_name, savedir_sample):
-        # …
-        weight_dtype…
-        …
+ …
        os.makedirs(self.savedir_sample, exist_ok=True)

+ …
        self.edition = edition
        self.inference_config = OmegaConf.load(config_path)
        # Get Transformer
@@ -513,32 +747,107 @@ class EasyAnimateController_Modelscope:
            subfolder="text_encoder", 
            torch_dtype=weight_dtype
        )
-        …
+ …
        print("Update diffusion transformer done")

+ …
    def generate(
        self,
+ …
        prompt_textbox, 
        negative_prompt_textbox, 
        sampler_dropdown, 
        sample_step_slider, 
+ …
        width_slider, 
        height_slider, 
-        …
+ …
        length_slider, 
        cfg_scale_slider, 
-        …
+ …
    ):    
+ …
        if is_xformers_available(): self.transformer.enable_xformers_memory_efficient_attention()

        self.pipeline.scheduler = scheduler_dict[sampler_dropdown](**OmegaConf.to_container(self.inference_config.noise_scheduler_kwargs))
+ …
        self.pipeline.to("cuda")

        if int(seed_textbox) != -1 and seed_textbox != "": torch.manual_seed(int(seed_textbox))
@@ -546,21 +855,52 @@ class EasyAnimateController_Modelscope:
        generator = torch.Generator(device="cuda").manual_seed(int(seed_textbox))

        try:
-            …
+ …
        except Exception as e:
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
-            …
+ …

        if not os.path.exists(self.savedir_sample):
            os.makedirs(self.savedir_sample, exist_ok=True)
@@ -578,11 +918,23 @@ class EasyAnimateController_Modelscope:
            image = (image * 255).numpy().astype(np.uint8)
            image = Image.fromarray(image)
            image.save(save_sample_path)
-            …
+ …
        else:
            save_sample_path = os.path.join(self.savedir_sample, prefix + f".mp4")
            save_videos_grid(sample, save_sample_path, fps=12 if self.edition == "v1" else 24)
-            …
+ …


def ui_modelscope(edition, config_path, model_name, savedir_sample):
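The single-frame branch above turns the sampled tensor into a uint8 image before writing it, while longer samples go through the repo's save_videos_grid helper at 12 fps for v1 and 24 fps otherwise. A sketch of just the image conversion, assuming a CPU tensor in [0, 1] with HxWxC layout:

import numpy as np
import torch
from PIL import Image

def save_frame(image_tensor: torch.Tensor, save_path: str) -> None:
    # Mirrors the visible lines: scale to 0-255, cast to uint8, write to disk.
    frame = (image_tensor * 255).numpy().astype(np.uint8)
    Image.fromarray(frame).save(save_path)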
@@ -601,71 +953,197 @@ def ui_modelscope(edition, config_path, model_name, savedir_sample):
                """
            )
            with gr.Column(variant="panel"):
-                …
+ …

                with gr.Row():
                    with gr.Column():
                        with gr.Row():
-                            sampler_dropdown   = gr.Dropdown(label="Sampling method", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0])
-                            sample_step_slider = gr.Slider(label="Sampling steps", value=…

                        if edition == "v1":
-                            width_slider     = gr.Slider(label="Width",            value=512, minimum=384, maximum=704, step=32)
-                            height_slider    = gr.Slider(label="Height",           value=512, minimum=384, maximum=704, step=32)
-                            …
+ …
                        else:
-                            …
+ …
                            with gr.Column():
                                gr.Markdown(
                                    """                    
-                                    …
+ …
                                    """
                                )
+ …
                            with gr.Row():
-                                …
+ …

                        with gr.Row():
-                            seed_textbox = gr.Textbox(label="Seed", value=43)
                            seed_button  = gr.Button(value="\U0001F3B2", elem_classes="toolbutton")
-                            seed_button.click(…
+ …

-                        generate_button = gr.Button(value="Generate", variant='primary')

                    with gr.Column():
-                        result_image = gr.Image(label="Generated Image", interactive=False, visible=False)
-                        result_video = gr.Video(label="Generated Animation", interactive=False)
                        infer_progress = gr.Textbox(
-                            label="Generation Info",
                            value="No task currently",
                            interactive=False
                        )

-                …
+ …
            )

            generate_button.click(
                fn=controller.generate,
                inputs=[
+ …
                    prompt_textbox, 
                    negative_prompt_textbox, 
                    sampler_dropdown, 
                    sample_step_slider, 
+ …
                    width_slider, 
                    height_slider, 
-                    …
+ …
                    length_slider, 
                    cfg_scale_slider, 
+ …
                    seed_textbox,
                ],
                outputs=[result_image, result_video, infer_progress]
@@ -674,31 +1152,51 @@ def ui_modelscope(edition, config_path, model_name, savedir_sample):


def post_eas(
+ …
    prompt_textbox, negative_prompt_textbox, 
-    sampler_dropdown, sample_step_slider, width_slider, height_slider,
-    …
+ …
):
+ …
    datas = {
-        "base_model_path": …
-        "motion_module_path": …
-        "lora_model_path": …
-        "lora_alpha_slider": …
        "prompt_textbox": prompt_textbox, 
        "negative_prompt_textbox": negative_prompt_textbox, 
        "sampler_dropdown": sampler_dropdown, 
        "sample_step_slider": sample_step_slider, 
+ …
        "width_slider": width_slider, 
        "height_slider": height_slider, 
-        "…
+ …
        "length_slider": length_slider,
        "cfg_scale_slider": cfg_scale_slider,
+ …
        "seed_textbox": seed_textbox,
    }
-    …
    session = requests.session()
    session.headers.update({"Authorization": os.environ.get("EAS_TOKEN")})

-    response = session.post(url=f'{os.environ.get("EAS_URL")}/easyanimate/infer_forward', json=datas)
+ …
    outputs = response.json()
    return outputs
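post_eas packages the UI state as JSON, authenticates with the EAS_TOKEN header, and posts to the /easyanimate/infer_forward endpoint; the EAS controller then base64-decodes the returned sample. A condensed sketch of that round trip; the "base64_encoding" response key is inferred from the decode step below and may differ:

import base64
import os

import requests

def post_eas_infer(payload: dict) -> dict:
    # Token and service URL come from the environment, as in the diff.
    session = requests.session()
    session.headers.update({"Authorization": os.environ.get("EAS_TOKEN")})
    url = f'{os.environ.get("EAS_URL")}/easyanimate/infer_forward'
    return session.post(url=url, json=payload).json()

def save_returned_sample(outputs: dict, save_path: str) -> None:
    # Decode the base64 payload and write it next to the local samples.
    decoded_data = base64.b64decode(outputs["base64_encoding"])
    with open(save_path, "wb") as f:
        f.write(decoded_data)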
         @@ -710,23 +1208,42 @@ class EasyAnimateController_EAS: 
     | 
|
| 710 | 
         | 
| 711 | 
         
             
                def generate(
         
     | 
| 712 | 
         
             
                    self,
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 713 | 
         
             
                    prompt_textbox, 
         
     | 
| 714 | 
         
             
                    negative_prompt_textbox, 
         
     | 
| 715 | 
         
             
                    sampler_dropdown, 
         
     | 
| 716 | 
         
             
                    sample_step_slider, 
         
     | 
| 
         | 
|
| 717 | 
         
             
                    width_slider, 
         
     | 
| 718 | 
         
             
                    height_slider, 
         
     | 
| 719 | 
         
            -
                     
     | 
| 
         | 
|
| 720 | 
         
             
                    length_slider, 
         
     | 
| 721 | 
         
             
                    cfg_scale_slider, 
         
     | 
| 
         | 
|
| 
         | 
|
| 722 | 
         
             
                    seed_textbox
         
     | 
| 723 | 
         
             
                ):
         
     | 
| 
         | 
|
| 
         | 
|
| 724 | 
         
             
                    outputs = post_eas(
         
     | 
| 
         | 
|
| 
         | 
|
| 725 | 
         
             
                        prompt_textbox, negative_prompt_textbox, 
         
     | 
| 726 | 
         
            -
                        sampler_dropdown, sample_step_slider, width_slider, height_slider,
         
     | 
| 727 | 
         
            -
                         
     | 
| 
         | 
|
| 
         | 
|
| 728 | 
         
             
                    )
         
     | 
| 729 | 
         
            -
                     
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 730 | 
         
             
                    decoded_data = base64.b64decode(base64_encoding)
         
     | 
| 731 | 
         | 
| 732 | 
         
             
                    if not os.path.exists(self.savedir_sample):
         
@@ -768,35 +1285,134 @@ def ui_eas(edition, config_path, model_name, savedir_sample):
                         """
                     )
                     with gr.Column(variant="panel"):
                         negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=2, value="The video is not of a high quality, it has a low resolution, and the audio quality is not clear. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, rotating camera, blurry camera, shaking camera. Deformation, low-resolution, blurry, ugly, distortion. " )
 
                         with gr.Row():
                             with gr.Column():
                                 with gr.Row():
                                     sampler_dropdown   = gr.Dropdown(label="Sampling method", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0])
-                                    sample_step_slider = gr.Slider(label="Sampling steps", value=
 
                                 if edition == "v1":
                                     width_slider     = gr.Slider(label="Width",            value=512, minimum=384, maximum=704, step=32)
                                     height_slider    = gr.Slider(label="Height",           value=512, minimum=384, maximum=704, step=32)
                                     cfg_scale_slider = gr.Slider(label="CFG Scale",        value=6.0, minimum=0,   maximum=20)
                                 else:
                                     with gr.Column():
                                         gr.Markdown(
                                             """
                                             """
                                         )
                                     cfg_scale_slider = gr.Slider(label="CFG Scale",        value=7.0, minimum=0,   maximum=20)
 
                                 with gr.Row():

@@ -819,24 +1435,45 @@ def ui_eas(edition, config_path, model_name, savedir_sample):
                                     interactive=False
                                 )
 
                         )
 
                         generate_button.click(
                             fn=controller.generate,
                             inputs=[
                                 prompt_textbox, 
                                 negative_prompt_textbox, 
                                 sampler_dropdown, 
                                 sample_step_slider, 
                                 width_slider, 
                                 height_slider, 
                                 length_slider, 
                                 cfg_scale_slider, 
                                 seed_textbox,
                             ],
                             outputs=[result_image, result_video, infer_progress]

 """Modified from https://github.com/guoyww/AnimateDiff/blob/main/app.py
 """
+import base64
 import gc
 import json
 import os
 import random
 from datetime import datetime
 from glob import glob
 
 import gradio as gr
 import numpy as np
+import pkg_resources
+import requests
+import torch
 from diffusers import (AutoencoderKL, DDIMScheduler,
                        DPMSolverMultistepScheduler,
                        EulerAncestralDiscreteScheduler, EulerDiscreteScheduler,
                        PNDMScheduler)
 from diffusers.utils.import_utils import is_xformers_available
 from omegaconf import OmegaConf
+from PIL import Image
 from safetensors import safe_open
+from transformers import (CLIPImageProcessor, CLIPVisionModelWithProjection,
+                          T5EncoderModel, T5Tokenizer)
 
+from easyanimate.data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio
+from easyanimate.models.autoencoder_magvit import AutoencoderKLMagvit
 from easyanimate.models.transformer3d import Transformer3DModel
 from easyanimate.pipeline.pipeline_easyanimate import EasyAnimatePipeline
+from easyanimate.pipeline.pipeline_easyanimate_inpaint import \
+    EasyAnimateInpaintPipeline
 from easyanimate.utils.lora_utils import merge_lora, unmerge_lora
+from easyanimate.utils.utils import (
+    get_image_to_video_latent,
+    get_width_and_height_from_image_and_base_resolution, save_videos_grid)
 
 scheduler_dict = {
     "Euler": EulerDiscreteScheduler,
     "Euler A": EulerAncestralDiscreteScheduler,
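Illustrative only: how the sampler name chosen in the UI becomes a scheduler instance via scheduler_dict. The kwargs normally come from inference_config.noise_scheduler_kwargs; the values below are placeholders, not the commit's config.

# Minimal sketch, assuming placeholder scheduler kwargs.
from diffusers import DDIMScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler

scheduler_dict = {
    "Euler": EulerDiscreteScheduler,
    "Euler A": EulerAncestralDiscreteScheduler,
    "DDIM": DDIMScheduler,
}
noise_scheduler_kwargs = {"num_train_timesteps": 1000, "beta_start": 0.00085, "beta_end": 0.012, "beta_schedule": "linear"}  # placeholder values

sampler_name = "Euler A"
scheduler = scheduler_dict[sampler_name](**noise_scheduler_kwargs)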
         self.personalized_model_dir     = os.path.join(self.basedir, "models", "Personalized_Model")
         self.savedir                    = os.path.join(self.basedir, "samples", datetime.now().strftime("Gradio-%Y-%m-%dT%H-%M-%S"))
         self.savedir_sample             = os.path.join(self.savedir, "sample")
+        self.edition                    = "v3"
+        self.inference_config           = OmegaConf.load(os.path.join(self.config_dir, "easyanimate_video_slicevae_motion_module_v3.yaml"))
         os.makedirs(self.savedir, exist_ok=True)
 
         self.diffusion_transformer_list = []

         self.weight_dtype = torch.bfloat16
 
     def refresh_diffusion_transformer(self):
+        self.diffusion_transformer_list = sorted(glob(os.path.join(self.diffusion_transformer_dir, "*/")))
 
     def refresh_motion_module(self):
+        motion_module_list = sorted(glob(os.path.join(self.motion_module_dir, "*.safetensors")))
         self.motion_module_list = [os.path.basename(p) for p in motion_module_list]
 
     def refresh_personalized_model(self):
+        personalized_model_list = sorted(glob(os.path.join(self.personalized_model_dir, "*.safetensors")))
         self.personalized_model_list = [os.path.basename(p) for p in personalized_model_list]
 
     def update_edition(self, edition):

         self.edition = edition
         if edition == "v1":
             self.inference_config = OmegaConf.load(os.path.join(self.config_dir, "easyanimate_video_motion_module_v1.yaml"))
+            return gr.update(), gr.update(value="none"), gr.update(visible=True), gr.update(visible=True), \
+                gr.update(value=512, minimum=384, maximum=704, step=32), \
                 gr.update(value=512, minimum=384, maximum=704, step=32), gr.update(value=80, minimum=40, maximum=80, step=1)
+        elif edition == "v2":
             self.inference_config = OmegaConf.load(os.path.join(self.config_dir, "easyanimate_video_magvit_motion_module_v2.yaml"))
+            return gr.update(), gr.update(value="none"), gr.update(visible=False), gr.update(visible=False), \
+                gr.update(value=672, minimum=128, maximum=1280, step=16), \
                 gr.update(value=384, minimum=128, maximum=1280, step=16), gr.update(value=144, minimum=9, maximum=144, step=9)
+        else:
+            self.inference_config = OmegaConf.load(os.path.join(self.config_dir, "easyanimate_video_slicevae_motion_module_v3.yaml"))
+            return gr.update(), gr.update(value="none"), gr.update(visible=False), gr.update(visible=False), \
+                gr.update(value=672, minimum=128, maximum=1280, step=16), \
+                gr.update(value=384, minimum=128, maximum=1280, step=16), gr.update(value=144, minimum=8, maximum=144, step=8)
 
     def update_diffusion_transformer(self, diffusion_transformer_dropdown):
         print("Update diffusion transformer")
         if diffusion_transformer_dropdown == "none":
+            return gr.update()
         if OmegaConf.to_container(self.inference_config['vae_kwargs'])['enable_magvit']:
             Choosen_AutoencoderKL = AutoencoderKLMagvit
         else:
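The tuple of gr.update() values returned by update_edition above is matched positionally against the outputs of the edition dropdown's change event; that wiring is not captured in this view, so the sketch below is an assumption about the component order (model dropdown, motion module dropdown, two visibility toggles, width, height, length), with placeholder component names.

# Hedged sketch, not the commit's exact wiring: how the 7 gr.update() values
# map onto the outputs list of the edition dropdown's .change() handler.
import gradio as gr

def update_edition(edition):
    if edition == "v1":
        return (gr.update(), gr.update(value="none"),
                gr.update(visible=True), gr.update(visible=True),
                gr.update(value=512, minimum=384, maximum=704, step=32),
                gr.update(value=512, minimum=384, maximum=704, step=32),
                gr.update(value=80, minimum=40, maximum=80, step=1))
    return (gr.update(), gr.update(value="none"),
            gr.update(visible=False), gr.update(visible=False),
            gr.update(value=672, minimum=128, maximum=1280, step=16),
            gr.update(value=384, minimum=128, maximum=1280, step=16),
            gr.update(value=144, minimum=8, maximum=144, step=8))

with gr.Blocks() as demo:
    edition = gr.Dropdown(choices=["v1", "v2", "v3"], value="v3")
    model_dropdown  = gr.Dropdown(choices=["none"], value="none")
    motion_dropdown = gr.Dropdown(choices=["none"], value="none")
    motion_toggle   = gr.Checkbox(visible=True)   # placeholder for a visibility-toggled component
    base_toggle     = gr.Checkbox(visible=True)   # placeholder for a visibility-toggled component
    width  = gr.Slider(value=672, minimum=128, maximum=1280, step=16)
    height = gr.Slider(value=384, minimum=128, maximum=1280, step=16)
    length = gr.Slider(value=144, minimum=8, maximum=144, step=8)
    edition.change(update_edition, inputs=[edition],
                   outputs=[model_dropdown, motion_dropdown, motion_toggle, base_toggle, width, height, length])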
         
         self.text_encoder = T5EncoderModel.from_pretrained(diffusion_transformer_dropdown, subfolder="text_encoder", torch_dtype=self.weight_dtype)
 
         # Get pipeline
+        if self.transformer.config.in_channels != 12:
+            self.pipeline = EasyAnimatePipeline(
+                vae=self.vae, 
+                text_encoder=self.text_encoder, 
+                tokenizer=self.tokenizer, 
+                transformer=self.transformer,
+                scheduler=scheduler_dict["Euler"](**OmegaConf.to_container(self.inference_config.noise_scheduler_kwargs))
+            )
+        else:
+            clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+                diffusion_transformer_dropdown, subfolder="image_encoder"
+            ).to("cuda", self.weight_dtype)
+            clip_image_processor = CLIPImageProcessor.from_pretrained(
+                diffusion_transformer_dropdown, subfolder="image_encoder"
+            )
+            self.pipeline = EasyAnimateInpaintPipeline(
+                vae=self.vae, 
+                text_encoder=self.text_encoder, 
+                tokenizer=self.tokenizer, 
+                transformer=self.transformer,
+                scheduler=scheduler_dict["Euler"](**OmegaConf.to_container(self.inference_config.noise_scheduler_kwargs)),
+                clip_image_encoder=clip_image_encoder,
+                clip_image_processor=clip_image_processor,
+            )
+
         print("Update diffusion transformer done")
+        return gr.update()
 
     def update_motion_module(self, motion_module_dropdown):
         self.motion_module_path = motion_module_dropdown
         print("Update motion module")
         if motion_module_dropdown == "none":
+            return gr.update()
         if self.transformer is None:
             gr.Info(f"Please select a pretrained model path.")
+            return gr.update(value=None)
         else:
             motion_module_dropdown = os.path.join(self.motion_module_dir, motion_module_dropdown)
             if motion_module_dropdown.endswith(".safetensors"):
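The body of this ".safetensors" branch is not captured in this rendering; it presumably reads the tensors with safetensors' safe_open (imported at the top of the file), mirroring the torch.load fallback shown just below. A hedged sketch with an illustrative path:

# Assumed shape of the elided branch; the file path is illustrative only.
from safetensors import safe_open

motion_module_state_dict = {}
with safe_open("motion_module.safetensors", framework="pt", device="cpu") as f:
    for key in f.keys():
        motion_module_state_dict[key] = f.get_tensor(key)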
         
                 motion_module_state_dict = torch.load(motion_module_dropdown, map_location="cpu")
             missing, unexpected = self.transformer.load_state_dict(motion_module_state_dict, strict=False)
             print("Update motion module done.")
+            return gr.update()
 
     def update_base_model(self, base_model_dropdown):
         self.base_model_path = base_model_dropdown
         print("Update base model")
         if base_model_dropdown == "none":
+            return gr.update()
         if self.transformer is None:
             gr.Info(f"Please select a pretrained model path.")
+            return gr.update(value=None)
         else:
             base_model_dropdown = os.path.join(self.personalized_model_dir, base_model_dropdown)
             base_model_state_dict = {}

                     base_model_state_dict[key] = f.get_tensor(key)
             self.transformer.load_state_dict(base_model_state_dict, strict=False)
             print("Update base done")
+            return gr.update()
 
     def update_lora_model(self, lora_model_dropdown):
         print("Update lora model")
         if lora_model_dropdown == "none":
             self.lora_model_path = "none"
+            return gr.update()
         lora_model_dropdown = os.path.join(self.personalized_model_dir, lora_model_dropdown)
         self.lora_model_path = lora_model_dropdown
+        return gr.update()
 
     def generate(
         self,

         negative_prompt_textbox, 
         sampler_dropdown, 
         sample_step_slider, 
+        resize_method,
         width_slider, 
         height_slider, 
+        base_resolution, 
+        generation_method, 
         length_slider, 
+        overlap_video_length, 
+        partial_video_length, 
         cfg_scale_slider, 
+        start_image, 
+        end_image, 
         seed_textbox,
         is_api = False,
     ):
+        gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+
         if self.transformer is None:
             raise gr.Error(f"Please select a pretrained model path.")

         if self.lora_model_path != lora_model_dropdown:
             print("Update lora model")
             self.update_lora_model(lora_model_dropdown)
+
+        if resize_method == "Resize to the Start Image":
+            if start_image is None:
+                if is_api:
+                    return "", f"Please upload an image when using \"Resize to the Start Image\"."
+                else:
+                    raise gr.Error(f"Please upload an image when using \"Resize to the Start Image\".")
+
+            aspect_ratio_sample_size    = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
+
+            original_width, original_height = start_image[0].size if type(start_image) is list else Image.open(start_image).size
+            closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
+            height_slider, width_slider = [int(x / 16) * 16 for x in closest_size]
+
+        if self.transformer.config.in_channels != 12 and start_image is not None:
+            if is_api:
+                return "", f"Please select an image to video pretrained model while using image to video."
+            else:
+                raise gr.Error(f"Please select an image to video pretrained model while using image to video.")
+
+        if self.transformer.config.in_channels != 12 and generation_method == "Long Video Generation":
+            if is_api:
+                return "", f"Please select an image to video pretrained model while using long video generation."
+            else:
+                raise gr.Error(f"Please select an image to video pretrained model while using long video generation.")
+
+        if start_image is None and end_image is not None:
+            if is_api:
+                return "", f"If specifying the ending image of the video, please specify a starting image of the video."
+            else:
+                raise gr.Error(f"If specifying the ending image of the video, please specify a starting image of the video.")
+
+        is_image = True if generation_method == "Image Generation" else False
 
         if is_xformers_available(): self.transformer.enable_xformers_memory_efficient_attention()
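A worked example of the "Resize to the Start Image" math added above: the start image's aspect ratio picks the closest bucket, the bucket is scaled by base_resolution, and the result is snapped to a multiple of 16. The tiny bucket table and the stand-in get_closest_ratio below only mirror the assumed layout of ASPECT_RATIO_512 (keys are height/width ratios, values are [height, width]); the real table in easyanimate.data.bucket_sampler has many more entries.

# Stand-in bucket table and helper, assumptions for illustration only.
ASPECT_RATIO_512 = {
    "0.57": [384.0, 672.0],
    "1.0":  [512.0, 512.0],
    "1.75": [672.0, 384.0],
}

def get_closest_ratio(height, width, ratios):
    # Pick the bucket whose height/width ratio is closest to the image's.
    aspect = height / width
    closest = min(ratios.keys(), key=lambda r: abs(float(r) - aspect))
    return ratios[closest], float(closest)

base_resolution = 512
buckets = {k: [x / 512 * base_resolution for x in v] for k, v in ASPECT_RATIO_512.items()}

original_width, original_height = 1920, 1080   # e.g. a 16:9 start image
closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=buckets)
height, width = [int(x / 16) * 16 for x in closest_size]
print(height, width)   # -> 384 672, the chosen bucket snapped to multiples of 16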
         
         generator = torch.Generator(device="cuda").manual_seed(int(seed_textbox))
 
         try:
+            if self.transformer.config.in_channels == 12:
+                if generation_method == "Long Video Generation":
+                    init_frames = 0
+                    last_frames = init_frames + partial_video_length
+                    while init_frames < length_slider:
+                        if last_frames >= length_slider:
+                            if self.pipeline.vae.quant_conv.weight.ndim==5:
+                                mini_batch_encoder = self.pipeline.vae.mini_batch_encoder
+                                _partial_video_length = length_slider - init_frames
+                                _partial_video_length = int(_partial_video_length // mini_batch_encoder * mini_batch_encoder)
+                            else:
+                                _partial_video_length = length_slider - init_frames
+
+                            if _partial_video_length <= 0:
+                                break
+                        else:
+                            _partial_video_length = partial_video_length
+
+                        if last_frames >= length_slider:
+                            input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, end_image, video_length=_partial_video_length, sample_size=(height_slider, width_slider))
+                        else:
+                            input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, None, video_length=_partial_video_length, sample_size=(height_slider, width_slider))
+
+                        with torch.no_grad():
+                            sample = self.pipeline(
+                                prompt_textbox, 
+                                negative_prompt     = negative_prompt_textbox,
+                                num_inference_steps = sample_step_slider,
+                                guidance_scale      = cfg_scale_slider,
+                                width               = width_slider,
+                                height              = height_slider,
+                                video_length        = _partial_video_length,
+                                generator           = generator,
+
+                                video        = input_video,
+                                mask_video   = input_video_mask,
+                                clip_image   = clip_image, 
+                                strength     = 1,
+                            ).videos
+
+                        if init_frames != 0:
+                            mix_ratio = torch.from_numpy(
+                                np.array([float(_index) / float(overlap_video_length) for _index in range(overlap_video_length)], np.float32)
+                            ).unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+
+                            new_sample[:, :, -overlap_video_length:] = new_sample[:, :, -overlap_video_length:] * (1 - mix_ratio) + \
+                                sample[:, :, :overlap_video_length] * mix_ratio
+                            new_sample = torch.cat([new_sample, sample[:, :, overlap_video_length:]], dim = 2)
+
+                            sample = new_sample
+                        else:
+                            new_sample = sample
+
+                        if last_frames >= length_slider:
+                            break
+
+                        start_image = [
+                            Image.fromarray(
+                                (sample[0, :, _index].transpose(0, 1).transpose(1, 2) * 255).numpy().astype(np.uint8)
+                            ) for _index in range(-overlap_video_length, 0)
+                        ]
+
+                        init_frames = init_frames + _partial_video_length - overlap_video_length
+                        last_frames = init_frames + _partial_video_length
+                else:
+                    input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, end_image, length_slider if not is_image else 1, sample_size=(height_slider, width_slider))
+
+                    sample = self.pipeline(
+                        prompt_textbox,
+                        negative_prompt     = negative_prompt_textbox,
+                        num_inference_steps = sample_step_slider,
+                        guidance_scale      = cfg_scale_slider,
+                        width               = width_slider,
+                        height              = height_slider,
+                        video_length        = length_slider if not is_image else 1,
+                        generator           = generator,
+
+                        video        = input_video,
+                        mask_video   = input_video_mask,
+                        clip_image   = clip_image, 
+                    ).videos
+            else:
+                sample = self.pipeline(
+                    prompt_textbox,
+                    negative_prompt     = negative_prompt_textbox,
+                    num_inference_steps = sample_step_slider,
+                    guidance_scale      = cfg_scale_slider,
+                    width               = width_slider,
+                    height              = height_slider,
+                    video_length        = length_slider if not is_image else 1,
+                    generator           = generator
+                ).videos
         except Exception as e:
             gc.collect()
             torch.cuda.empty_cache()
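The long-video loop above stitches clips together by linearly cross-fading the last overlap_video_length frames of the running result into the first frames of the newly generated clip. A self-contained illustration of just that blending step, using placeholder tensors shaped like the pipeline's (batch, channels, frames, height, width) output:

# Illustration of the overlap cross-fade; tensor contents are placeholders.
import numpy as np
import torch

overlap = 4
new_sample = torch.ones(1, 3, 16, 8, 8)    # frames accumulated so far
sample     = torch.zeros(1, 3, 24, 8, 8)   # freshly generated clip

mix_ratio = torch.from_numpy(
    np.array([i / overlap for i in range(overlap)], np.float32)
).unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)   # shape (1, 1, overlap, 1, 1)

new_sample[:, :, -overlap:] = new_sample[:, :, -overlap:] * (1 - mix_ratio) + sample[:, :, :overlap] * mix_ratio
merged = torch.cat([new_sample, sample[:, :, overlap:]], dim=2)
print(merged.shape)   # torch.Size([1, 3, 36, 8, 8]) -> 16 + (24 - 4) frames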
         
             if is_api:
                 return "", f"Error. error information is {str(e)}"
             else:
+                return gr.update(), gr.update(), f"Error. error information is {str(e)}"
+
+        gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
 
         # lora part
         if self.lora_model_path != "none":

             if is_api:
                 return save_sample_path, "Success"
             else:
+                if gradio_version_is_above_4:
+                    return gr.Image(value=save_sample_path, visible=True), gr.Video(value=None, visible=False), "Success"
+                else:
+                    return gr.Image.update(value=save_sample_path, visible=True), gr.Video.update(value=None, visible=False), "Success"
         else:
             save_sample_path = os.path.join(self.savedir_sample, prefix + f".mp4")
             save_videos_grid(sample, save_sample_path, fps=12 if self.edition == "v1" else 24)

             if is_api:
                 return save_sample_path, "Success"
             else:
+                if gradio_version_is_above_4:
+                    return gr.Image(visible=False, value=None), gr.Video(value=save_sample_path, visible=True), "Success"
+                else:
+                    return gr.Image.update(visible=False, value=None), gr.Video.update(value=save_sample_path, visible=True), "Success"
 
 
 def ui():
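gradio_version_is_above_4 is referenced above but its definition is not captured in this rendering. Given the pkg_resources import added at the top of the file, it is presumably derived along these lines (an assumption, not the commit's exact code); the flag selects between gradio 4.x component constructors and the 3.x gr.Image.update / gr.Video.update API.

# Assumed definition of the version flag used in the return branches above.
import pkg_resources

gradio_version = pkg_resources.get_distribution("gradio").version
gradio_version_is_above_4 = int(gradio_version.split(".")[0]) >= 4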
         
                    with gr.Column(variant="panel"):
         
     | 
| 487 | 
         
             
                        gr.Markdown(
         
     | 
| 488 | 
         
             
                            """
         
     | 
| 489 | 
         
            +
                            ### 1. EasyAnimate Edition (EasyAnimate版本).
         
     | 
| 490 | 
         
             
                            """
         
     | 
| 491 | 
         
             
                        )
         
     | 
| 492 | 
         
             
                        with gr.Row():
         
     | 
| 493 | 
         
             
                            easyanimate_edition_dropdown = gr.Dropdown(
         
     | 
| 494 | 
         
            +
                                label="The config of EasyAnimate Edition (EasyAnimate版本配置)",
         
     | 
| 495 | 
         
            +
                                choices=["v1", "v2", "v3"],
         
     | 
| 496 | 
         
            +
                                value="v3",
         
     | 
| 497 | 
         
             
                                interactive=True,
         
     | 
| 498 | 
         
             
                            )
         
     | 
| 499 | 
         
             
                        gr.Markdown(
         
     | 
| 500 | 
         
             
                            """
         
     | 
| 501 | 
         
            +
                            ### 2. Model checkpoints (模型路径).
         
     | 
| 502 | 
         
             
                            """
         
     | 
| 503 | 
         
             
                        )
         
     | 
| 504 | 
         
             
                        with gr.Row():
         
     | 
| 505 | 
         
             
                            diffusion_transformer_dropdown = gr.Dropdown(
         
     | 
| 506 | 
         
            +
                                label="Pretrained Model Path (预训练模型路径)",
         
     | 
| 507 | 
         
             
                                choices=controller.diffusion_transformer_list,
         
     | 
| 508 | 
         
             
                                value="none",
         
     | 
| 509 | 
         
             
                                interactive=True,
         
     | 
| 
         | 
|
| 517 | 
         
             
                            diffusion_transformer_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton")
         
     | 
| 518 | 
         
             
                            def refresh_diffusion_transformer():
         
     | 
| 519 | 
         
             
                                controller.refresh_diffusion_transformer()
         
     | 
| 520 | 
         
            +
                                return gr.update(choices=controller.diffusion_transformer_list)
         
     | 
| 521 | 
         
             
                            diffusion_transformer_refresh_button.click(fn=refresh_diffusion_transformer, inputs=[], outputs=[diffusion_transformer_dropdown])
         
     | 
| 522 | 
         | 
| 523 | 
         
             
                        with gr.Row():
         
     | 
| 524 | 
         
             
                            motion_module_dropdown = gr.Dropdown(
         
     | 
| 525 | 
         
            +
                                label="Select motion module (选择运动模块[非必需])",
         
     | 
| 526 | 
         
             
                                choices=controller.motion_module_list,
         
     | 
| 527 | 
         
             
                                value="none",
         
     | 
| 528 | 
         
             
                                interactive=True,
         
     | 
| 
         | 
|
| 532 | 
         
             
                            motion_module_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton", visible=False)
         
     | 
| 533 | 
         
             
                            def update_motion_module():
         
     | 
| 534 | 
         
             
                                controller.refresh_motion_module()
         
     | 
| 535 | 
         
            +
                                return gr.update(choices=controller.motion_module_list)
         
     | 
| 536 | 
         
             
                            motion_module_refresh_button.click(fn=update_motion_module, inputs=[], outputs=[motion_module_dropdown])
         
     | 
| 537 | 
         | 
| 538 | 
         
             
                            base_model_dropdown = gr.Dropdown(
         
     | 
| 539 | 
         
            +
                                label="Select base Dreambooth model (选择基模型[非必需])",
         
     | 
| 540 | 
         
             
                                choices=controller.personalized_model_list,
         
     | 
| 541 | 
         
             
                                value="none",
         
     | 
| 542 | 
         
             
                                interactive=True,
         
     | 
| 543 | 
         
             
                            )
         
     | 
| 544 | 
         | 
| 545 | 
         
             
                            lora_model_dropdown = gr.Dropdown(
         
     | 
| 546 | 
         
            +
                                label="Select LoRA model (选择LoRA模型[非必需])",
         
     | 
| 547 | 
         
             
                                choices=["none"] + controller.personalized_model_list,
         
     | 
| 548 | 
         
             
                                value="none",
         
     | 
| 549 | 
         
             
                                interactive=True,
         
     | 
| 550 | 
         
             
                            )
         
     | 
| 551 | 
         | 
| 552 | 
         
            +
                            lora_alpha_slider = gr.Slider(label="LoRA alpha (LoRA权重)", value=0.55, minimum=0, maximum=2, interactive=True)
         
     | 
| 553 | 
         | 
| 554 | 
         
             
                            personalized_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton")
         
     | 
| 555 | 
         
             
                            def update_personalized_model():
         
     | 
| 556 | 
         
             
                                controller.refresh_personalized_model()
         
     | 
| 557 | 
         
             
                                return [
         
     | 
| 558 | 
         
            +
                                    gr.update(choices=controller.personalized_model_list),
         
     | 
| 559 | 
         
            +
                                    gr.update(choices=["none"] + controller.personalized_model_list)
         
     | 
| 560 | 
         
             
                                ]
         
     | 
| 561 | 
         
             
                            personalized_refresh_button.click(fn=update_personalized_model, inputs=[], outputs=[base_model_dropdown, lora_model_dropdown])
         
     | 
| 562 | 
         | 
| 563 | 
         
             
                    with gr.Column(variant="panel"):
         
     | 
| 564 | 
         
             
                        gr.Markdown(
         
     | 
| 565 | 
         
             
                            """
         
     | 
| 566 | 
         
            +
                            ### 3. Configs for Generation (生成参数配置).
         
     | 
| 567 | 
         
             
                            """
         
     | 
| 568 | 
         
             
            )

            prompt_textbox = gr.Textbox(label="Prompt (正向提示词)", lines=2, value="A young woman with beautiful and clear eyes and blonde hair standing and white dress in a forest wearing a crown. She seems to be lost in thought, and the camera focuses on her face. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.")
            negative_prompt_textbox = gr.Textbox(label="Negative prompt (负向提示词)", lines=2, value="The video is not of a high quality, it has a low resolution, and the audio quality is not clear. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, rotating camera, blurry camera, shaking camera. Deformation, low-resolution, blurry, ugly, distortion.")

            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        sampler_dropdown   = gr.Dropdown(label="Sampling method (采样器种类)", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0])
                        sample_step_slider = gr.Slider(label="Sampling steps (生成步数)", value=30, minimum=10, maximum=100, step=1)

                    resize_method = gr.Radio(
                        ["Generate by", "Resize to the Start Image"],
                        value="Generate by",
                        show_label=False,
                    )
                    width_slider     = gr.Slider(label="Width (视频宽度)", value=672, minimum=128, maximum=1280, step=16)
                    height_slider    = gr.Slider(label="Height (视频高度)", value=384, minimum=128, maximum=1280, step=16)
                    base_resolution  = gr.Radio(label="Base Resolution of Pretrained Models", value=512, choices=[512, 768, 960], visible=False)

                    with gr.Group():
                        generation_method = gr.Radio(
                            ["Video Generation", "Image Generation", "Long Video Generation"],
                            value="Video Generation",
                            show_label=False,
                        )
                        with gr.Row():
                            length_slider = gr.Slider(label="Animation length (视频帧数)", value=144, minimum=8, maximum=144, step=8)
                            overlap_video_length = gr.Slider(label="Overlap length (视频续写的重叠帧数)", value=4, minimum=1, maximum=4, step=1, visible=False)
                            partial_video_length = gr.Slider(label="Partial video generation length (每个部分的视频生成帧数)", value=72, minimum=8, maximum=144, step=8, visible=False)

                    with gr.Accordion("Image to Video (图片到视频)", open=False):
                        start_image = gr.Image(label="The image at the beginning of the video (图片到视频的开始图片)", show_label=True, elem_id="i2v_start", sources="upload", type="filepath")

                        template_gallery_path = ["asset/1.png", "asset/2.png", "asset/3.png", "asset/4.png", "asset/5.png"]
                        def select_template(evt: gr.SelectData):
                            text = {
                                "asset/1.png": "The dog is looking at camera and smiling. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                                "asset/2.png": "a sailboat sailing in rough seas with a dramatic sunset. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                                "asset/3.png": "a beautiful woman with long hair and a dress blowing in the wind. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                                "asset/4.png": "a man in an astronaut suit playing a guitar. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                                "asset/5.png": "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                            }[template_gallery_path[evt.index]]
                            return template_gallery_path[evt.index], text

                        template_gallery = gr.Gallery(
                            template_gallery_path,
                            columns=5, rows=1,
                            height=140,
                            allow_preview=False,
                            container=False,
                            label="Template Examples",
                        )
                        template_gallery.select(select_template, None, [start_image, prompt_textbox])

                        with gr.Accordion("The image at the ending of the video (图片到视频的结束图片[非必需, Optional])", open=False):
                            end_image   = gr.Image(label="The image at the ending of the video (图片到视频的结束图片[非必需, Optional])", show_label=False, elem_id="i2v_end", sources="upload", type="filepath")

                    cfg_scale_slider  = gr.Slider(label="CFG Scale (引导系数)", value=7.0, minimum=0, maximum=20)

                    with gr.Row():
                        seed_textbox = gr.Textbox(label="Seed (随机种子)", value=43)
                        seed_button  = gr.Button(value="\U0001F3B2", elem_classes="toolbutton")
                        seed_button.click(
                            fn=lambda: gr.Textbox(value=random.randint(1, 1e8)) if gradio_version_is_above_4 else gr.Textbox.update(value=random.randint(1, 1e8)),
                            inputs=[],
                            outputs=[seed_textbox]
                        )
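                    # Gradio compatibility note: on Gradio 4.x an event handler updates a
                    # component by returning a new instance (gr.Textbox(value=...)), while
                    # Gradio 3.x expects gr.Textbox.update(value=...). The lambda above picks
                    # the right form through gradio_version_is_above_4; generate() reuses the
                    # same idiom for the gr.Image / gr.Video results further down.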

                    generate_button = gr.Button(value="Generate (生成)", variant='primary')

                with gr.Column():
                    result_image = gr.Image(label="Generated Image (生成图片)", interactive=False, visible=False)
                    result_video = gr.Video(label="Generated Animation (生成视频)", interactive=False)
                    infer_progress = gr.Textbox(
                        label="Generation Info (生成信息)",
                        value="No task currently",
                        interactive=False
                    )

        def upload_generation_method(generation_method):
            if generation_method == "Video Generation":
                return [gr.update(visible=True, maximum=144, value=144), gr.update(visible=False), gr.update(visible=False)]
            elif generation_method == "Image Generation":
                return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)]
            else:
                return [gr.update(visible=True, maximum=1440), gr.update(visible=True), gr.update(visible=True)]
        generation_method.change(
            upload_generation_method, generation_method, [length_slider, overlap_video_length, partial_video_length]
        )
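        # The three gr.update(...) objects returned above map positionally onto the outputs
        # list [length_slider, overlap_video_length, partial_video_length]: "Video Generation"
        # caps the length at 144 frames and hides the long-video controls, "Image Generation"
        # hides all three, and "Long Video Generation" raises the cap to 1440 frames and
        # reveals the overlap / partial-length sliders.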

        def upload_resize_method(resize_method):
            if resize_method == "Generate by":
                return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)]
            else:
                return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)]
        resize_method.change(
            upload_resize_method, resize_method, [width_slider, height_slider, base_resolution]
        )

        easyanimate_edition_dropdown.change(
            fn=controller.update_edition,
            inputs=[easyanimate_edition_dropdown],
            ...
                diffusion_transformer_dropdown,
                motion_module_dropdown,
                motion_module_refresh_button,
                ...
                width_slider,
                height_slider,
                length_slider,
                ...
                negative_prompt_textbox,
                sampler_dropdown,
                sample_step_slider,
                resize_method,
                width_slider,
                height_slider,
                base_resolution,
                generation_method,
                length_slider,
                overlap_video_length,
                partial_video_length,
                cfg_scale_slider,
                start_image,
                end_image,
                seed_textbox,
            ],
            outputs=[result_image, result_video, infer_progress]
        ...

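The wiring above is positional: Gradio passes the components listed in inputs=[...] to controller.generate in that exact order, and the values generate returns fill outputs=[result_image, result_video, infer_progress] in order. A minimal, self-contained sketch of the same pattern (hypothetical component and function names, not code from this commit):

import gradio as gr

def fake_generate(prompt, steps, seed):
    # Arguments arrive in the order of the inputs list below.
    info = f"prompt={prompt!r}, steps={int(steps)}, seed={seed}"
    # Returned values fill the outputs list in order: (video, info text).
    return gr.update(value=None), info

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Prompt")
    steps_sl   = gr.Slider(10, 100, value=30, step=1, label="Steps")
    seed_box   = gr.Textbox(value="43", label="Seed")
    video_out  = gr.Video(label="Result")
    info_box   = gr.Textbox(label="Info")
    gr.Button("Generate").click(
        fn=fake_generate,
        inputs=[prompt_box, steps_sl, seed_box],
        outputs=[video_out, info_box],
    )
# demo.launch()
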

class EasyAnimateController_Modelscope:
    def __init__(self, edition, config_path, model_name, savedir_sample):
        # Weight Dtype
        weight_dtype                    = torch.bfloat16

        # Basic dir
        self.basedir                    = os.getcwd()
        self.personalized_model_dir     = os.path.join(self.basedir, "models", "Personalized_Model")
        self.lora_model_path            = "none"
        self.savedir_sample             = savedir_sample
        self.refresh_personalized_model()
        os.makedirs(self.savedir_sample, exist_ok=True)

        # Config and model path
        self.edition = edition
        self.inference_config = OmegaConf.load(config_path)
        # Get Transformer
        ...
            subfolder="text_encoder",
            torch_dtype=weight_dtype
        )
        # Get pipeline
        if self.transformer.config.in_channels != 12:
            self.pipeline = EasyAnimatePipeline(
                vae=self.vae,
                text_encoder=self.text_encoder,
                tokenizer=self.tokenizer,
                transformer=self.transformer,
                scheduler=scheduler_dict["Euler"](**OmegaConf.to_container(self.inference_config.noise_scheduler_kwargs))
            )
        else:
            clip_image_encoder = CLIPVisionModelWithProjection.from_pretrained(
                model_name, subfolder="image_encoder"
            ).to("cuda", weight_dtype)
            clip_image_processor = CLIPImageProcessor.from_pretrained(
                model_name, subfolder="image_encoder"
            )
            self.pipeline = EasyAnimateInpaintPipeline(
                vae=self.vae,
                text_encoder=self.text_encoder,
                tokenizer=self.tokenizer,
                transformer=self.transformer,
                scheduler=scheduler_dict["Euler"](**OmegaConf.to_container(self.inference_config.noise_scheduler_kwargs)),
                clip_image_encoder=clip_image_encoder,
                clip_image_processor=clip_image_processor,
            )

        print("Update diffusion transformer done")
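        # Dispatch note: a transformer whose config.in_channels is 12 is the image-to-video
        # (inpaint) variant, which takes masked-video conditioning, so it is wrapped in
        # EasyAnimateInpaintPipeline together with a CLIP vision encoder/processor for the
        # reference image; any other channel count gets the plain text-to-video
        # EasyAnimatePipeline.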

    def refresh_personalized_model(self):
        personalized_model_list = sorted(glob(os.path.join(self.personalized_model_dir, "*.safetensors")))
        self.personalized_model_list = [os.path.basename(p) for p in personalized_model_list]

    def update_lora_model(self, lora_model_dropdown):
        print("Update lora model")
        if lora_model_dropdown == "none":
            self.lora_model_path = "none"
            return gr.update()
        lora_model_dropdown = os.path.join(self.personalized_model_dir, lora_model_dropdown)
        self.lora_model_path = lora_model_dropdown
        return gr.update()

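    # update_lora_model above only records the selected LoRA path; generate() below merges
    # the weights into the pipeline just before sampling (merge_lora, scaled by
    # lora_alpha_slider) and merges them back out afterwards with unmerge_lora, so the base
    # model is restored after every run.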
    def generate(
        self,
        diffusion_transformer_dropdown,
        motion_module_dropdown,
        base_model_dropdown,
        lora_model_dropdown,
        lora_alpha_slider,
        prompt_textbox,
        negative_prompt_textbox,
        sampler_dropdown,
        sample_step_slider,
        resize_method,
        width_slider,
        height_slider,
        base_resolution,
        generation_method,
        length_slider,
        cfg_scale_slider,
        start_image,
        end_image,
        seed_textbox,
        is_api = False,
    ):
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

        if self.transformer is None:
            raise gr.Error(f"Please select a pretrained model path.")

        if self.lora_model_path != lora_model_dropdown:
            print("Update lora model")
            self.update_lora_model(lora_model_dropdown)

        if resize_method == "Resize to the Start Image":
            if start_image is None:
                raise gr.Error(f"Please upload an image when using \"Resize to the Start Image\".")

            aspect_ratio_sample_size    = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
            original_width, original_height = start_image[0].size if type(start_image) is list else Image.open(start_image).size
            closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
            height_slider, width_slider = [int(x / 16) * 16 for x in closest_size]
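            # Worked example (illustrative numbers): with base_resolution=512, the scaled
            # ASPECT_RATIO_512 buckets keep roughly 512*512 pixels per frame, so a
            # 1920x1080 start image (height/width ratio 0.5625) lands in a wide bucket of
            # about 384x672; int(x / 16) * 16 then snaps both sides to multiples of 16,
            # matching the step used by the width/height sliders.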

        if self.transformer.config.in_channels != 12 and start_image is not None:
            raise gr.Error(f"Please select an image to video pretrained model while using image to video.")

        if start_image is None and end_image is not None:
            raise gr.Error(f"If specifying the ending image of the video, please specify a starting image of the video.")

        is_image = True if generation_method == "Image Generation" else False

        if is_xformers_available(): self.transformer.enable_xformers_memory_efficient_attention()

        self.pipeline.scheduler = scheduler_dict[sampler_dropdown](**OmegaConf.to_container(self.inference_config.noise_scheduler_kwargs))
        if self.lora_model_path != "none":
            # lora part
            self.pipeline = merge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
        self.pipeline.to("cuda")

        if int(seed_textbox) != -1 and seed_textbox != "": torch.manual_seed(int(seed_textbox))
        ...
        generator = torch.Generator(device="cuda").manual_seed(int(seed_textbox))
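        # Reproducibility: both the global torch RNG and the dedicated CUDA generator passed
        # to the pipeline are seeded from seed_textbox, so a fixed seed (e.g. the default 43)
        # should reproduce the same sample for identical settings.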

        try:
            if self.transformer.config.in_channels == 12:
                input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, end_image, length_slider if not is_image else 1, sample_size=(height_slider, width_slider))

                sample = self.pipeline(
                    prompt_textbox,
                    negative_prompt     = negative_prompt_textbox,
                    num_inference_steps = sample_step_slider,
                    guidance_scale      = cfg_scale_slider,
                    width               = width_slider,
                    height              = height_slider,
                    video_length        = length_slider if not is_image else 1,
                    generator           = generator,

                    video        = input_video,
                    mask_video   = input_video_mask,
                    clip_image   = clip_image,
                ).videos
            else:
                sample = self.pipeline(
                    prompt_textbox,
                    negative_prompt     = negative_prompt_textbox,
                    num_inference_steps = sample_step_slider,
                    guidance_scale      = cfg_scale_slider,
                    width               = width_slider,
                    height              = height_slider,
                    video_length        = length_slider if not is_image else 1,
                    generator           = generator
                ).videos
        except Exception as e:
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
            if self.lora_model_path != "none":
                self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
            if is_api:
                return "", f"Error. error information is {str(e)}"
            else:
                return gr.update(), gr.update(), f"Error. error information is {str(e)}"

        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

        # lora part
        if self.lora_model_path != "none":
            self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)

        if not os.path.exists(self.savedir_sample):
            os.makedirs(self.savedir_sample, exist_ok=True)
        ...
            image = (image * 255).numpy().astype(np.uint8)
            image = Image.fromarray(image)
            image.save(save_sample_path)
            if is_api:
                return save_sample_path, "Success"
            else:
                if gradio_version_is_above_4:
                    return gr.Image(value=save_sample_path, visible=True), gr.Video(value=None, visible=False), "Success"
                else:
                    return gr.Image.update(value=save_sample_path, visible=True), gr.Video.update(value=None, visible=False), "Success"
        else:
            save_sample_path = os.path.join(self.savedir_sample, prefix + f".mp4")
            save_videos_grid(sample, save_sample_path, fps=12 if self.edition == "v1" else 24)
            if is_api:
                return save_sample_path, "Success"
            else:
                if gradio_version_is_above_4:
                    return gr.Image(visible=False, value=None), gr.Video(value=save_sample_path, visible=True), "Success"
                else:
                    return gr.Image.update(visible=False, value=None), gr.Video.update(value=save_sample_path, visible=True), "Success"

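save_videos_grid writes the result at 12 fps for the v1 edition and 24 fps otherwise, so the frame-count sliders translate directly into clip duration. A quick check of that arithmetic (a standalone sketch with a hypothetical helper, not code from this commit):

def clip_seconds(num_frames: int, edition: str = "v3") -> float:
    """Clip duration in seconds, using the fps convention of generate() above."""
    fps = 12 if edition == "v1" else 24
    return num_frames / fps

assert clip_seconds(48) == 2.0    # the hosted demo cap of 48 frames is about 2 seconds
assert clip_seconds(144) == 6.0   # the full UI default of 144 frames is 6 seconds
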
def ui_modelscope(edition, config_path, model_name, savedir_sample):
    ...
            """
        )
        with gr.Column(variant="panel"):
            gr.Markdown(
                """
                ### 1. Model checkpoints (模型路径).
                """
            )
            with gr.Row():
                diffusion_transformer_dropdown = gr.Dropdown(
                    label="Pretrained Model Path (预训练模型路径)",
                    choices=[model_name],
                    value=model_name,
                    interactive=False,
                )
            with gr.Row():
                motion_module_dropdown = gr.Dropdown(
                    label="Select motion module (选择运动模块[非必需])",
                    choices=["none"],
                    value="none",
                    interactive=False,
                    visible=False
                )
                base_model_dropdown = gr.Dropdown(
                    label="Select base Dreambooth model (选择基模型[非必需])",
                    choices=["none"],
                    value="none",
                    interactive=False,
                    visible=False
                )
                with gr.Column(visible=False):
                    gr.Markdown(
                        """
                        ### Minimalism is an example portrait of Lora, triggered by specific prompt words. More details can be found on [Wiki](https://github.com/aigc-apps/EasyAnimate/wiki/Training-Lora).
                        """
                    )
                    with gr.Row():
                        lora_model_dropdown = gr.Dropdown(
                            label="Select LoRA model",
                            choices=["none", "easyanimatev2_minimalism_lora.safetensors"],
                            value="none",
                            interactive=True,
                        )

                        lora_alpha_slider = gr.Slider(label="LoRA alpha (LoRA权重)", value=0.55, minimum=0, maximum=2, interactive=True)

        with gr.Column(variant="panel"):
            gr.Markdown(
                """
                ### 2. Configs for Generation (生成参数配置).
                """
            )

            prompt_textbox = gr.Textbox(label="Prompt (正向提示词)", lines=2, value="A young woman with beautiful and clear eyes and blonde hair standing and white dress in a forest wearing a crown. She seems to be lost in thought, and the camera focuses on her face. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.")
            negative_prompt_textbox = gr.Textbox(label="Negative prompt (负向提示词)", lines=2, value="The video is not of a high quality, it has a low resolution, and the audio quality is not clear. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, rotating camera, blurry camera, shaking camera. Deformation, low-resolution, blurry, ugly, distortion.")

            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        sampler_dropdown   = gr.Dropdown(label="Sampling method (采样器种类)", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0])
                        sample_step_slider = gr.Slider(label="Sampling steps (生成步数)", value=20, minimum=10, maximum=30, step=1, interactive=False)

                    if edition == "v1":
                        width_slider     = gr.Slider(label="Width (视频宽度)", value=512, minimum=384, maximum=704, step=32)
                        height_slider    = gr.Slider(label="Height (视频高度)", value=512, minimum=384, maximum=704, step=32)

                        with gr.Group():
                            generation_method = gr.Radio(
                                ["Video Generation", "Image Generation"],
                                value="Video Generation",
                                show_label=False,
                                visible=False,
                            )
                            length_slider = gr.Slider(label="Animation length (视频帧数)", value=80, minimum=40, maximum=96, step=1)
                        cfg_scale_slider = gr.Slider(label="CFG Scale (引导系数)", value=6.0, minimum=0, maximum=20)
                    else:
                        resize_method = gr.Radio(
                            ["Generate by", "Resize to the Start Image"],
                            value="Generate by",
                            show_label=False,
                        )
                        with gr.Column():
                            gr.Markdown(
                                """
                                We support video generation up to 720p with 144 frames, but for the trial experience, we have set certain limitations. We fix the max resolution of video to 384x672x48 (2s).

                                If the start image you uploaded does not match this resolution, you can use the "Resize to the Start Image" option above.

                                If you want to experience longer and larger video generation, you can go to our [Github](https://github.com/aigc-apps/EasyAnimate/).
                                """
                            )
                        width_slider     = gr.Slider(label="Width (视频宽度)", value=672, minimum=128, maximum=1280, step=16, interactive=False)
                        height_slider    = gr.Slider(label="Height (视频高度)", value=384, minimum=128, maximum=1280, step=16, interactive=False)
                        base_resolution  = gr.Radio(label="Base Resolution of Pretrained Models", value=512, choices=[512, 768, 960], interactive=False, visible=False)

                        with gr.Group():
                            generation_method = gr.Radio(
                                ["Video Generation", "Image Generation"],
                                value="Video Generation",
                                show_label=False,
                                visible=True,
                            )
                            length_slider = gr.Slider(label="Animation length (视频帧数)", value=48, minimum=8, maximum=48, step=8)

                        with gr.Accordion("Image to Video (图片到视频)", open=True):
                            with gr.Row():
                                start_image = gr.Image(label="The image at the beginning of the video (图片到视频的开始图片)", show_label=True, elem_id="i2v_start", sources="upload", type="filepath")

                            template_gallery_path = ["asset/1.png", "asset/2.png", "asset/3.png", "asset/4.png", "asset/5.png"]
                            def select_template(evt: gr.SelectData):
                                text = {
                                    "asset/1.png": "The dog is looking at camera and smiling. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                                    "asset/2.png": "a sailboat sailing in rough seas with a dramatic sunset. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                                    "asset/3.png": "a beautiful woman with long hair and a dress blowing in the wind. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                                    "asset/4.png": "a man in an astronaut suit playing a guitar. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                                    "asset/5.png": "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
                                }[template_gallery_path[evt.index]]
                                return template_gallery_path[evt.index], text

                            template_gallery = gr.Gallery(
                                template_gallery_path,
                                columns=5, rows=1,
                                height=140,
                                allow_preview=False,
                                container=False,
                                label="Template Examples",
                            )
                            template_gallery.select(select_template, None, [start_image, prompt_textbox])

                            with gr.Accordion("The image at the ending of the video (图片到视频的结束图片[非必需, Optional])", open=False):
                                end_image   = gr.Image(label="The image at the ending of the video (图片到视频的结束图片[非必需, Optional])", show_label=False, elem_id="i2v_end", sources="upload", type="filepath")

                        cfg_scale_slider = gr.Slider(label="CFG Scale (引导系数)", value=7.0, minimum=0, maximum=20)

                    with gr.Row():
                        seed_textbox = gr.Textbox(label="Seed (随机种子)", value=43)
                        seed_button  = gr.Button(value="\U0001F3B2", elem_classes="toolbutton")
                        seed_button.click(
                            fn=lambda: gr.Textbox(value=random.randint(1, 1e8)) if gradio_version_is_above_4 else gr.Textbox.update(value=random.randint(1, 1e8)),
                            inputs=[],
                            outputs=[seed_textbox]
                        )

                    generate_button = gr.Button(value="Generate (生成)", variant='primary')

                with gr.Column():
                    result_image = gr.Image(label="Generated Image (生成图片)", interactive=False, visible=False)
                    result_video = gr.Video(label="Generated Animation (生成视频)", interactive=False)
                    infer_progress = gr.Textbox(
                        label="Generation Info (生成信息)",
                        value="No task currently",
                        interactive=False
                    )

        def upload_generation_method(generation_method):
            if generation_method == "Video Generation":
                return gr.update(visible=True, minimum=8, maximum=48, value=48, interactive=True)
            elif generation_method == "Image Generation":
                return gr.update(minimum=1, maximum=1, value=1, interactive=False)
        generation_method.change(
            upload_generation_method, generation_method, [length_slider]
        )
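        # In the hosted trial demo only the frame-count slider is toggled here: "Video
        # Generation" re-enables it with the 8-48 frame range, while "Image Generation"
        # pins it to a single frame. The long-video controls from the full ui() above are
        # not exposed in this UI.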
| 1116 | 
         
            +
             
     | 
| 1117 | 
         
            +
                        def upload_resize_method(resize_method):
         
     | 
| 1118 | 
         
            +
                            if resize_method == "Generate by":
         
     | 
| 1119 | 
         
            +
                                return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)]
         
     | 
| 1120 | 
         
            +
                            else:
         
     | 
| 1121 | 
         
            +
                                return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)]
         
     | 
| 1122 | 
         
            +
                        resize_method.change(
         
     | 
| 1123 | 
         
            +
                            upload_resize_method, resize_method, [width_slider, height_slider, base_resolution]
         
     | 
| 1124 | 
         
             
                        )
         
     | 
| 1125 | 
         | 
| 1126 | 
         
             
                        generate_button.click(
         
     | 
| 1127 | 
         
             
                            fn=controller.generate,
         
     | 
| 1128 | 
         
             
                            inputs=[
         
     | 
| 1129 | 
         
            +
                                diffusion_transformer_dropdown,
         
     | 
| 1130 | 
         
            +
                                motion_module_dropdown,
         
     | 
| 1131 | 
         
            +
                                base_model_dropdown,
         
     | 
| 1132 | 
         
            +
                                lora_model_dropdown, 
         
     | 
| 1133 | 
         
            +
                                lora_alpha_slider,
         
     | 
| 1134 | 
         
             
                                prompt_textbox, 
         
     | 
| 1135 | 
         
             
                                negative_prompt_textbox, 
         
     | 
| 1136 | 
         
             
                                sampler_dropdown, 
         
     | 
| 1137 | 
         
             
                                sample_step_slider, 
         
     | 
| 1138 | 
         
            +
                                resize_method,
         
     | 
| 1139 | 
         
             
                                width_slider, 
         
     | 
| 1140 | 
         
             
                                height_slider, 
         
     | 
| 1141 | 
         
            +
                                base_resolution, 
         
     | 
| 1142 | 
         
            +
                                generation_method, 
         
     | 
| 1143 | 
         
             
                                length_slider, 
         
     | 
| 1144 | 
         
             
                                cfg_scale_slider, 
         
     | 
| 1145 | 
         
            +
                                start_image, 
         
     | 
| 1146 | 
         
            +
                                end_image, 
         
     | 
| 1147 | 
         
             
                                seed_textbox,
         
     | 
| 1148 | 
         
             
                            ],
         
     | 
| 1149 | 
         
             
                            outputs=[result_image, result_video, infer_progress]
         
     | 
| 
         | 
|
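
The two handlers above follow the standard Gradio pattern: a .change callback returns gr.update(...) objects that reconfigure other components in place, so switching to "Image Generation" pins length_slider to a single frame, and "Resize to the Start Image" swaps the width/height sliders for base_resolution. A minimal, self-contained sketch of the same pattern (component names here are illustrative, not taken from ui.py):

import gradio as gr

def toggle_length(generation_method):
    # Mirror upload_generation_method above: reconfigure the slider
    # according to the selected mode by returning a gr.update object.
    if generation_method == "Video Generation":
        return gr.update(visible=True, minimum=8, maximum=48, value=48, interactive=True)
    return gr.update(minimum=1, maximum=1, value=1, interactive=False)

with gr.Blocks() as demo:
    generation_method = gr.Radio(
        ["Video Generation", "Image Generation"], value="Video Generation"
    )
    length_slider = gr.Slider(label="Animation length", minimum=8, maximum=48, value=48, step=8)
    # The radio's .change event feeds its current value into the callback and
    # applies the returned update to length_slider.
    generation_method.change(toggle_length, generation_method, [length_slider])

if __name__ == "__main__":
    demo.launch()
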
 def post_eas(
+    diffusion_transformer_dropdown, motion_module_dropdown,
+    base_model_dropdown, lora_model_dropdown, lora_alpha_slider,
     prompt_textbox, negative_prompt_textbox,
+    sampler_dropdown, sample_step_slider, resize_method, width_slider, height_slider,
+    base_resolution, generation_method, length_slider, cfg_scale_slider,
+    start_image, end_image, seed_textbox,
 ):
+    if start_image is not None:
+        with open(start_image, 'rb') as file:
+            file_content = file.read()
+            start_image_encoded_content = base64.b64encode(file_content)
+            start_image = start_image_encoded_content.decode('utf-8')
+
+    if end_image is not None:
+        with open(end_image, 'rb') as file:
+            file_content = file.read()
+            end_image_encoded_content = base64.b64encode(file_content)
+            end_image = end_image_encoded_content.decode('utf-8')
+
     datas = {
+        "base_model_path": base_model_dropdown,
+        "motion_module_path": motion_module_dropdown,
+        "lora_model_path": lora_model_dropdown,
+        "lora_alpha_slider": lora_alpha_slider,
         "prompt_textbox": prompt_textbox,
         "negative_prompt_textbox": negative_prompt_textbox,
         "sampler_dropdown": sampler_dropdown,
         "sample_step_slider": sample_step_slider,
+        "resize_method": resize_method,
         "width_slider": width_slider,
         "height_slider": height_slider,
+        "base_resolution": base_resolution,
+        "generation_method": generation_method,
         "length_slider": length_slider,
         "cfg_scale_slider": cfg_scale_slider,
+        "start_image": start_image,
+        "end_image": end_image,
         "seed_textbox": seed_textbox,
     }
+
     session = requests.session()
     session.headers.update({"Authorization": os.environ.get("EAS_TOKEN")})
 
+    response = session.post(url=f'{os.environ.get("EAS_URL")}/easyanimate/infer_forward', json=datas, timeout=300)
+
     outputs = response.json()
     return outputs
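
post_eas is a thin JSON client for the hosted EAS service: any start/end image is read from disk and base64-encoded into a string, the whole UI state is packed into one dict, and that dict is POSTed to the /easyanimate/infer_forward route with the EAS_TOKEN header for authentication. A sketch of the same request issued outside the UI; the sampler name and seed below are placeholder values, and the response is assumed to carry either a base64_encoding field or a message field, as the generate method further down expects:

import base64
import os

import requests

def encode_image(path):
    # Read an image file and return its base64 representation as a UTF-8
    # string, matching how post_eas prepares start_image / end_image.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

payload = {
    "base_model_path": "none",
    "motion_module_path": "none",
    "lora_model_path": "none",
    "lora_alpha_slider": 0.55,
    "prompt_textbox": "a sailboat sailing in rough seas with a dramatic sunset.",
    "negative_prompt_textbox": "Low resolution, blurry, ugly, distortion.",
    "sampler_dropdown": "Euler",          # assumed sampler name
    "sample_step_slider": 20,
    "resize_method": "Resize to the Start Image",
    "width_slider": 672,
    "height_slider": 384,
    "base_resolution": 512,
    "generation_method": "Video Generation",
    "length_slider": 48,
    "cfg_scale_slider": 7.0,
    "start_image": encode_image("asset/2.png"),
    "end_image": None,
    "seed_textbox": 43,                   # placeholder seed
}

session = requests.session()
session.headers.update({"Authorization": os.environ.get("EAS_TOKEN")})
response = session.post(
    url=f'{os.environ.get("EAS_URL")}/easyanimate/infer_forward',
    json=payload,
    timeout=300,
)
outputs = response.json()
print(outputs.get("message", "request accepted"))
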
     def generate(
         self,
+        diffusion_transformer_dropdown,
+        motion_module_dropdown,
+        base_model_dropdown,
+        lora_model_dropdown,
+        lora_alpha_slider,
         prompt_textbox,
         negative_prompt_textbox,
         sampler_dropdown,
         sample_step_slider,
+        resize_method,
         width_slider,
         height_slider,
+        base_resolution,
+        generation_method,
         length_slider,
         cfg_scale_slider,
+        start_image,
+        end_image,
         seed_textbox
     ):
+        is_image = True if generation_method == "Image Generation" else False
+
         outputs = post_eas(
+            diffusion_transformer_dropdown, motion_module_dropdown,
+            base_model_dropdown, lora_model_dropdown, lora_alpha_slider,
             prompt_textbox, negative_prompt_textbox,
+            sampler_dropdown, sample_step_slider, resize_method, width_slider, height_slider,
+            base_resolution, generation_method, length_slider, cfg_scale_slider,
+            start_image, end_image,
+            seed_textbox
         )
+        try:
+            base64_encoding = outputs["base64_encoding"]
+        except:
+            return gr.Image(visible=False, value=None), gr.Video(None, visible=True), outputs["message"]
+
         decoded_data = base64.b64decode(base64_encoding)
 
         if not os.path.exists(self.savedir_sample):
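
The hunk is cut just before the saving step, but the flow is clear: generate forwards the UI state to post_eas, surfaces outputs["message"] if no base64_encoding comes back, and otherwise base64-decodes the payload and, judging by the check on self.savedir_sample, writes it to the sample directory. A rough sketch of that final step; the file naming and extension choice are assumptions, not the repository's code:

import base64
import os

def save_decoded_sample(base64_encoding, savedir_sample, is_image):
    # Decode the base64 payload returned by the EAS service and write it to
    # the sample directory; the ".png" / ".mp4" split is an assumption.
    decoded_data = base64.b64decode(base64_encoding)
    os.makedirs(savedir_sample, exist_ok=True)
    extension = ".png" if is_image else ".mp4"
    save_path = os.path.join(savedir_sample, "sample" + extension)
    with open(save_path, "wb") as f:
        f.write(decoded_data)
    return save_path
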
             """
         )
         with gr.Column(variant="panel"):
+            gr.Markdown(
+                """
+                ### 1. Model checkpoints.
+                """
+            )
+            with gr.Row():
+                diffusion_transformer_dropdown = gr.Dropdown(
+                    label="Pretrained Model Path",
+                    choices=[model_name],
+                    value=model_name,
+                    interactive=False,
+                )
+            with gr.Row():
+                motion_module_dropdown = gr.Dropdown(
+                    label="Select motion module",
+                    choices=["none"],
+                    value="none",
+                    interactive=False,
+                    visible=False
+                )
+                base_model_dropdown = gr.Dropdown(
+                    label="Select base Dreambooth model",
+                    choices=["none"],
+                    value="none",
+                    interactive=False,
+                    visible=False
+                )
+                with gr.Column(visible=False):
+                    gr.Markdown(
+                        """
+                        ### Minimalism is an example portrait of Lora, triggered by specific prompt words. More details can be found on [Wiki](https://github.com/aigc-apps/EasyAnimate/wiki/Training-Lora).
+                        """
+                    )
+                    with gr.Row():
+                        lora_model_dropdown = gr.Dropdown(
+                            label="Select LoRA model",
+                            choices=["none", "easyanimatev2_minimalism_lora.safetensors"],
+                            value="none",
+                            interactive=True,
+                        )
+
+                        lora_alpha_slider = gr.Slider(label="LoRA alpha (LoRA权重)", value=0.55, minimum=0, maximum=2, interactive=True)
+
+        with gr.Column(variant="panel"):
+            gr.Markdown(
+                """
+                ### 2. Configs for Generation.
+                """
+            )
+
+            prompt_textbox = gr.Textbox(label="Prompt", lines=2, value="A young woman with beautiful and clear eyes and blonde hair standing and white dress in a forest wearing a crown. She seems to be lost in thought, and the camera focuses on her face. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.")
             negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=2, value="The video is not of a high quality, it has a low resolution, and the audio quality is not clear. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, rotating camera, blurry camera, shaking camera. Deformation, low-resolution, blurry, ugly, distortion. ")
 
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
                         sampler_dropdown   = gr.Dropdown(label="Sampling method", choices=list(scheduler_dict.keys()), value=list(scheduler_dict.keys())[0])
+                        sample_step_slider = gr.Slider(label="Sampling steps", value=20, minimum=10, maximum=30, step=1, interactive=False)
 
                     if edition == "v1":
                         width_slider     = gr.Slider(label="Width",            value=512, minimum=384, maximum=704, step=32)
                         height_slider    = gr.Slider(label="Height",           value=512, minimum=384, maximum=704, step=32)
+
+                        with gr.Group():
+                            generation_method = gr.Radio(
+                                ["Video Generation", "Image Generation"],
+                                value="Video Generation",
+                                show_label=False,
+                                visible=False,
+                            )
+                            length_slider    = gr.Slider(label="Animation length", value=80,  minimum=40,  maximum=96,   step=1)
                         cfg_scale_slider = gr.Slider(label="CFG Scale",        value=6.0, minimum=0,   maximum=20)
                     else:
+                        resize_method = gr.Radio(
+                            ["Generate by", "Resize to the Start Image"],
+                            value="Generate by",
+                            show_label=False,
+                        )
                         with gr.Column():
                             gr.Markdown(
                                 """
+                                We support video generation up to 720p with 144 frames, but for the trial experience, we have set certain limitations. We fix the max resolution of video to 384x672x48 (2s).
+
+                                If the start image you uploaded does not match this resolution, you can use the "Resize to the Start Image" option above.
+
+                                If you want to experience longer and larger video generation, you can go to our [Github](https://github.com/aigc-apps/EasyAnimate/).
                                 """
                             )
+                        width_slider     = gr.Slider(label="Width (视频宽度)",            value=672, minimum=128, maximum=1280, step=16, interactive=False)
+                        height_slider    = gr.Slider(label="Height (视频高度)",           value=384, minimum=128, maximum=1280, step=16, interactive=False)
+                        base_resolution  = gr.Radio(label="Base Resolution of Pretrained Models", value=512, choices=[512, 768, 960], interactive=False, visible=False)
+
+                        with gr.Group():
+                            generation_method = gr.Radio(
+                                ["Video Generation", "Image Generation"],
+                                value="Video Generation",
+                                show_label=False,
+                                visible=True,
+                            )
+                            length_slider = gr.Slider(label="Animation length (视频帧数)", value=48, minimum=8,   maximum=48,  step=8)
+
+                        with gr.Accordion("Image to Video", open=True):
+                            start_image = gr.Image(label="The image at the beginning of the video", show_label=True, elem_id="i2v_start", sources="upload", type="filepath")
+
+                            template_gallery_path = ["asset/1.png", "asset/2.png", "asset/3.png", "asset/4.png", "asset/5.png"]
+                            def select_template(evt: gr.SelectData):
+                                text = {
+                                    "asset/1.png": "The dog is looking at camera and smiling. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
+                                    "asset/2.png": "a sailboat sailing in rough seas with a dramatic sunset. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
+                                    "asset/3.png": "a beautiful woman with long hair and a dress blowing in the wind. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
+                                    "asset/4.png": "a man in an astronaut suit playing a guitar. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
+                                    "asset/5.png": "fireworks display over night city. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
+                                }[template_gallery_path[evt.index]]
+                                return template_gallery_path[evt.index], text
+
+                            template_gallery = gr.Gallery(
+                                template_gallery_path,
+                                columns=5, rows=1,
+                                height=140,
+                                allow_preview=False,
+                                container=False,
+                                label="Template Examples",
+                            )
+                            template_gallery.select(select_template, None, [start_image, prompt_textbox])
+
+                            with gr.Accordion("The image at the ending of the video (Optional)", open=False):
+                                end_image   = gr.Image(label="The image at the ending of the video (Optional)", show_label=True, elem_id="i2v_end", sources="upload", type="filepath")
+
                         cfg_scale_slider = gr.Slider(label="CFG Scale",        value=7.0, minimum=0,   maximum=20)
 
                 with gr.Row():
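
template_gallery.select passes a gr.SelectData event into select_template, so clicking one of the five thumbnails fills both the start image and the prompt textbox in a single step. A stripped-down sketch of the same select-event pattern (the image paths and captions below are placeholders, not the ones used above):

import gradio as gr

images = ["asset/1.png", "asset/2.png"]
captions = {
    "asset/1.png": "A dog looking at the camera.",   # placeholder prompts
    "asset/2.png": "A sailboat at sunset.",
}

def on_select(evt: gr.SelectData):
    # evt.index is the position of the clicked thumbnail in the gallery.
    path = images[evt.index]
    return path, captions[path]

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    start_image = gr.Image(type="filepath")
    gallery = gr.Gallery(images, columns=2, allow_preview=False)
    # Gradio injects the SelectData event because on_select annotates it.
    gallery.select(on_select, None, [start_image, prompt])
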
                     interactive=False
                 )
 
+            def upload_generation_method(generation_method):
+                if generation_method == "Video Generation":
+                    return gr.update(visible=True, minimum=8, maximum=48, value=48, interactive=True)
+                elif generation_method == "Image Generation":
+                    return gr.update(minimum=1, maximum=1, value=1, interactive=False)
+            generation_method.change(
+                upload_generation_method, generation_method, [length_slider]
+            )
+
+            def upload_resize_method(resize_method):
+                if resize_method == "Generate by":
+                    return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)]
+                else:
+                    return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)]
+            resize_method.change(
+                upload_resize_method, resize_method, [width_slider, height_slider, base_resolution]
             )
 
             generate_button.click(
                 fn=controller.generate,
                 inputs=[
+                    diffusion_transformer_dropdown,
+                    motion_module_dropdown,
+                    base_model_dropdown,
+                    lora_model_dropdown,
+                    lora_alpha_slider,
                     prompt_textbox,
                     negative_prompt_textbox,
                     sampler_dropdown,
                     sample_step_slider,
+                    resize_method,
                     width_slider,
                     height_slider,
+                    base_resolution,
+                    generation_method,
                     length_slider,
                     cfg_scale_slider,
+                    start_image,
+                    end_image,
                     seed_textbox,
                 ],
                 outputs=[result_image, result_video, infer_progress]
easyanimate/utils/utils.py
CHANGED

@@ -8,6 +8,13 @@ import cv2
 from einops import rearrange
 from PIL import Image
 
+def get_width_and_height_from_image_and_base_resolution(image, base_resolution):
+    target_pixels = int(base_resolution) * int(base_resolution)
+    original_width, original_height = Image.open(image).size
+    ratio = (target_pixels / (original_width * original_height)) ** 0.5
+    width_slider = round(original_width * ratio)
+    height_slider = round(original_height * ratio)
+    return height_slider, width_slider
 
 def color_transfer(sc, dc):
     """
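
get_width_and_height_from_image_and_base_resolution rescales an image's dimensions so that width * height is roughly base_resolution squared while preserving the aspect ratio, and returns (height, width) in that order. For a 1920x1080 image with base_resolution=512, the ratio is sqrt(262144 / 2073600) ≈ 0.356, so the call returns (384, 683). A quick check (the temporary file path is only for illustration):

from PIL import Image

from easyanimate.utils.utils import get_width_and_height_from_image_and_base_resolution

# 512 * 512 = 262144 target pixels, 1920 * 1080 = 2073600 source pixels.
Image.new("RGB", (1920, 1080)).save("/tmp/example.png")
height, width = get_width_and_height_from_image_and_base_resolution("/tmp/example.png", 512)
print(height, width)  # 384 683
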
@@ -62,3 +69,103 @@
         if path.endswith("mp4"):
             path = path.replace('.mp4', '.gif')
         outputs[0].save(path, format='GIF', append_images=outputs, save_all=True, duration=100, loop=0)
+
+def get_image_to_video_latent(validation_image_start, validation_image_end, video_length, sample_size):
+    if validation_image_start is not None and validation_image_end is not None:
+        if type(validation_image_start) is str and os.path.isfile(validation_image_start):
+            image_start = clip_image = Image.open(validation_image_start)
+        else:
+            image_start = clip_image = validation_image_start
+        if type(validation_image_end) is str and os.path.isfile(validation_image_end):
+            image_end = Image.open(validation_image_end)
+        else:
+            image_end = validation_image_end
+
+        if type(image_start) is list:
+            clip_image = clip_image[0]
+            start_video = torch.cat(
+                [torch.from_numpy(np.array(_image_start)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0) for _image_start in image_start],
+                dim=2
+            )
+            input_video = torch.tile(start_video[:, :, :1], [1, 1, video_length, 1, 1])
+            input_video[:, :, :len(image_start)] = start_video
+
+            input_video_mask = torch.zeros_like(input_video[:, :1])
+            input_video_mask[:, :, len(image_start):] = 255
+        else:
+            input_video = torch.tile(
+                torch.from_numpy(np.array(image_start)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0),
+                [1, 1, video_length, 1, 1]
+            )
+            input_video_mask = torch.zeros_like(input_video[:, :1])
+            input_video_mask[:, :, 1:] = 255
+
+        if type(image_end) is list:
+            image_end = [_image_end.resize(image_start[0].size if type(image_start) is list else image_start.size) for _image_end in image_end]
+            end_video = torch.cat(
+                [torch.from_numpy(np.array(_image_end)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0) for _image_end in image_end],
+                dim=2
+            )
+            input_video[:, :, -len(end_video):] = end_video
+
+            input_video_mask[:, :, -len(image_end):] = 0
+        else:
+            image_end = image_end.resize(image_start[0].size if type(image_start) is list else image_start.size)
+            input_video[:, :, -1:] = torch.from_numpy(np.array(image_end)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0)
+            input_video_mask[:, :, -1:] = 0
+
+        input_video = input_video / 255
+
+    elif validation_image_start is not None:
+        if type(validation_image_start) is str and os.path.isfile(validation_image_start):
+            image_start = clip_image = Image.open(validation_image_start).convert("RGB")
+        else:
+            image_start = clip_image = validation_image_start
+
+        if type(image_start) is list:
+            clip_image = clip_image[0]
+            start_video = torch.cat(
+                [torch.from_numpy(np.array(_image_start)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0) for _image_start in image_start],
+                dim=2
+            )
+            input_video = torch.tile(start_video[:, :, :1], [1, 1, video_length, 1, 1])
+            input_video[:, :, :len(image_start)] = start_video
+            input_video = input_video / 255
+
+            input_video_mask = torch.zeros_like(input_video[:, :1])
+            input_video_mask[:, :, len(image_start):] = 255
+        else:
+            input_video = torch.tile(
+                torch.from_numpy(np.array(image_start)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0),
+                [1, 1, video_length, 1, 1]
+            ) / 255
+            input_video_mask = torch.zeros_like(input_video[:, :1])
+            input_video_mask[:, :, 1:, ] = 255
+    else:
+        input_video = torch.zeros([1, 3, video_length, sample_size[0], sample_size[1]])
+        input_video_mask = torch.ones([1, 1, video_length, sample_size[0], sample_size[1]]) * 255
+        clip_image = None
+
+    return  input_video, input_video_mask, clip_image
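
get_image_to_video_latent builds the pixel-space conditioning for the inpaint pipeline: a [1, 3, video_length, H, W] video tensor scaled to [0, 1] with the supplied frames filled in, a [1, 1, video_length, H, W] mask in which 0 marks given frames and 255 marks frames to be generated, and the image that will be handed to the CLIP encoder. A small shape-checking sketch (the frame count and sample size are arbitrary):

import numpy as np
from PIL import Image

from easyanimate.utils.utils import get_image_to_video_latent

video_length = 48
sample_size = (384, 672)  # (height, width)

# A synthetic start image standing in for an uploaded file.
start = Image.fromarray(np.zeros((sample_size[0], sample_size[1], 3), dtype=np.uint8))

input_video, input_video_mask, clip_image = get_image_to_video_latent(
    start, None, video_length, sample_size
)
print(input_video.shape)       # torch.Size([1, 3, 48, 384, 672]), values in [0, 1]
print(input_video_mask.shape)  # torch.Size([1, 1, 48, 384, 672]); frame 0 is 0, the rest 255
print(clip_image is start)     # True: the start image is reused for the CLIP encoder
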
+
+def video_frames(input_video_path):
+    cap = cv2.VideoCapture(input_video_path)
+    frames = []
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+    cap.release()
+    cv2.destroyAllWindows()
+    return frames
+
+def get_video_to_video_latent(validation_videos, video_length):
+    input_video = video_frames(validation_videos)
+    input_video = torch.from_numpy(np.array(input_video))[:video_length]
+    input_video = input_video.permute([3, 0, 1, 2]).unsqueeze(0) / 255
+
+    input_video_mask = torch.zeros_like(input_video[:, :1])
+    input_video_mask[:, :, :] = 255
+
+    return  input_video, input_video_mask, None
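
get_video_to_video_latent decodes a clip with OpenCV via video_frames, keeps at most video_length frames converted to RGB, and returns a [1, 3, T, H, W] tensor scaled to [0, 1] together with an all-255 mask (every frame is to be regenerated) and no CLIP image. A short usage sketch (the input path is a placeholder):

from easyanimate.utils.utils import get_video_to_video_latent

# "input.mp4" is a placeholder path; any clip readable by OpenCV works.
input_video, input_video_mask, clip_image = get_video_to_video_latent("input.mp4", video_length=48)

print(input_video.shape)            # [1, 3, T, H, W] with T <= 48, values in [0, 1]
print(int(input_video_mask.max()))  # 255: every frame is marked for regeneration
print(clip_image)                   # None
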