Spaces commit: Upload 4 files

Files changed:
- README.md +1 -1
- app.py +23 -4
- diffusers_vdm/improved_clip_vision.py +1 -1
- requirements.txt +2 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🎨
 colorFrom: gray
 colorTo: green
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.46.1
 app_file: app.py
 pinned: false
 ---
app.py CHANGED
@@ -23,6 +23,14 @@ from transformers import CLIPTextModel, CLIPTokenizer
 from diffusers_vdm.pipeline import LatentVideoDiffusionPipeline
 from diffusers_vdm.utils import resize_and_center_crop, save_bcthw_as_mp4
 
+IS_ZERO_GPU = bool(os.getenv("SPACES_ZERO_GPU"))
+IS_GPU_MODE = True if IS_ZERO_GPU else (True if torch.cuda.is_available() else False)
+if IS_ZERO_GPU:
+    import subprocess
+    subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
+torch.set_float32_matmul_precision("high")
+torch.backends.cuda.matmul.allow_tf32 = True
+
 # Disable gradients globally
 torch.set_grad_enabled(False)
 
@@ -34,7 +42,6 @@ class ModifiedUNet(UNet2DConditionModel):
         unet_add_coded_conds(unet=m, added_number_count=1)
         return m
 
-
 model_name = 'lllyasviel/paints_undo_single_frame'
 tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
 text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(torch.float16).to("cuda")
@@ -57,7 +64,6 @@ k_sampler = KDiffusionSampler(
     linear=True
 )
 
-
 def find_best_bucket(h, w, options):
     min_metric = float('inf')
     best_bucket = None
@@ -106,7 +112,13 @@ def interrogator_process(x):
     return image_description
 
 
-
+def process_get_duration(input_fg, prompt, input_undo_steps, image_width, image_height, seed, steps, n_prompt, cfg, progres):
+    def_duration = 15.
+    def_steps = 50.
+    return int(def_duration * steps / def_steps)
+
+
+@spaces.GPU(duration=process_get_duration)
 def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed, steps, n_prompt, cfg,
             progress=gr.Progress()):
     rng = torch.Generator(device="cuda").manual_seed(int(seed))
@@ -192,7 +204,14 @@ def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=
     return video, image_1, image_2
 
 
-
+def process_video_get_duration(keyframes, prompt, steps, cfg, fps, seed, progress):
+    def_duration = 180.
+    def_steps = 50.
+    def_fps = 4.
+    return int(def_duration * steps / def_steps * fps / def_fps)
+
+
+@spaces.GPU(duration=process_video_get_duration)
 def process_video(keyframes, prompt, steps, cfg, fps, seed, progress=gr.Progress()):
     result_frames = []
     cropped_images = []
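Note: the new module-level block at the top of app.py detects whether the app is running on a ZeroGPU Space via the SPACES_ZERO_GPU environment variable set by the Spaces runtime, clears any stale offload cache, and enables TF32 matmuls. A minimal sketch of how such flags are typically consumed later in an app; the get_device helper below is illustrative and not part of this commit:

    import os
    import torch

    IS_ZERO_GPU = bool(os.getenv("SPACES_ZERO_GPU"))        # set by the ZeroGPU runtime
    IS_GPU_MODE = IS_ZERO_GPU or torch.cuda.is_available()  # local CUDA also counts as GPU mode

    def get_device() -> str:
        # Illustrative helper: choose the device string used for .to(...) calls.
        return "cuda" if IS_GPU_MODE else "cpu"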
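Note: the two *_get_duration helpers are wired into the spaces.GPU decorators. On ZeroGPU, the duration argument may be a callable that receives the same arguments as the decorated function and returns the number of seconds of GPU time to request, so the allocation scales with the selected step count (and with FPS on the video path). A minimal sketch of the pattern, assuming the spaces package is installed; the function names here are illustrative:

    import spaces

    def estimate_duration(image, steps):
        # Scale the requested GPU window with the sampling step count,
        # anchored at 15 s for the reference 50 steps (mirrors process_get_duration).
        return int(15.0 * steps / 50.0)

    @spaces.GPU(duration=estimate_duration)  # the callable receives the same args as run()
    def run(image, steps):
        ...  # GPU work happens inside the allocated window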
diffusers_vdm/improved_clip_vision.py CHANGED
@@ -35,7 +35,7 @@ def arbitrary_positional_encoding(p, H, W):
     return weight
 
 
-def improved_clipvision_embedding_forward(self, pixel_values):
+def improved_clipvision_embedding_forward(self, pixel_values, interpolate_pos_encoding=False, *args, **kwargs):
     pixel_values = pixel_values * 0.5 + 0.5
     pixel_values = preprocess(pixel_values)
     batch_size = pixel_values.shape[0]
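Note: the widened signature keeps the replacement embedding forward compatible with newer transformers releases, which pass an interpolate_pos_encoding keyword when calling the CLIP vision embedding layer. A minimal sketch of the same defensive pattern for a monkey-patched method; the patch target and wiring below are illustrative, not this repository's code:

    from transformers.models.clip.modeling_clip import CLIPVisionEmbeddings

    _original_forward = CLIPVisionEmbeddings.forward

    def patched_forward(self, pixel_values, interpolate_pos_encoding=False, *args, **kwargs):
        # Accept (and here ignore) extra keywords that newer callers may pass,
        # so the patch keeps working when the upstream call site changes.
        return _original_forward(self, pixel_values)

    CLIPVisionEmbeddings.forward = patched_forward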
requirements.txt CHANGED
@@ -13,4 +13,5 @@ xformers
 onnxruntime
 av
 torchvision
-spaces
+spaces
+pydantic==2.10.6