PhyscalX committed
Commit 0cb8e29 · 0 Parent(s)

Initial commit

Files changed (6)
  1. .flake8 +21 -0
  2. .gitattributes +35 -0
  3. .gitignore +55 -0
  4. README.md +14 -0
  5. app.py +204 -0
  6. requirements.txt +7 -0
.flake8 ADDED
@@ -0,0 +1,21 @@
+ [flake8]
+ max-line-length = 100
+ ignore =
+     # whitespace before ':' (conflicts with Black)
+     E203,
+     # ambiguous variable name
+     E741,
+     # 'from module import *' used; unable to detect undefined names
+     F403,
+     # name may be undefined, or defined from star imports: module
+     F405,
+     # redefinition of unused name from line N
+     F811,
+     # undefined name
+     F821,
+     # line break before binary operator
+     W503,
+     # line break after binary operator
+     W504
+ # module imported but unused
+ per-file-ignores = __init__.py: F401
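The per-file-ignores entry exists for the usual re-export pattern in package __init__.py files. A hypothetical snippet this config accepts without warnings (the module path is illustrative):

# Hypothetical package __init__.py: re-exporting a symbol normally triggers
# F401 ("module imported but unused"); per-file-ignores waives it in this file.
from diffnext.pipelines import URSAPipeline

l = URSAPipeline  # E741 ("ambiguous variable name") is also ignored repo-wide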
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
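To confirm a given path actually resolves to the LFS filter, git's own check-attr can be queried; a small sketch (run from the repository root; the file path is hypothetical):

# Sketch: verify that a weights file matches one of the LFS rules above.
import subprocess

result = subprocess.run(
    ["git", "check-attr", "filter", "--", "model.safetensors"],
    capture_output=True, text=True, check=True,
)
print(result.stdout.strip())  # expected: "model.safetensors: filter: lfs"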
.gitignore ADDED
@@ -0,0 +1,55 @@
+ # Compiled Object files
+ *.slo
+ *.lo
+ *.o
+ *.cuo
+
+ # Compiled Dynamic libraries
+ *.so
+ *.dll
+ *.dylib
+
+ # Compiled Static libraries
+ *.lai
+ *.la
+ *.a
+ *.lib
+
+ # Compiled python
+ *.pyc
+ __pycache__
+
+ # Compiled MATLAB
+ *.mex*
+
+ # IPython notebook checkpoints
+ .ipynb_checkpoints
+
+ # Editor temporaries
+ *.swp
+ *~
+
+ # Sublime Text settings
+ *.sublime-workspace
+ *.sublime-project
+
+ # Eclipse Project settings
+ *.*project
+ .settings
+
+ # QtCreator files
+ *.user
+
+ # VSCode files
+ .vscode
+
+ # IDEA files
+ .idea
+
+ # OSX dir files
+ .DS_Store
+
+ # Android files
+ .gradle
+ *.iml
+ local.properties
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: URSA-1.7B-FSQ320
+ emoji: 🎞️
+ colorFrom: yellow
+ colorTo: gray
+ sdk: gradio
+ sdk_version: 5.21.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ short_description: 'URSA Text-to-Image-to-Video'
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,204 @@
+ # Copyright (c) 2024-present, BAAI. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ------------------------------------------------------------------------
+ """URSA TI2V application."""
+
+ import argparse
+ import os
+
+ import gradio as gr
+ import numpy as np
+ import PIL.Image
+ import torch
+
+ from diffnext.pipelines import URSAPipeline
+ from diffnext.utils import export_to_image, export_to_video
+
+ # Fix tokenizer fork issue.
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
+ # Switch to the allocator optimized for dynamic shape.
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
+
+ def parse_args():
+     """Parse arguments."""
+     parser = argparse.ArgumentParser(description="Serve URSA TI2V application")
+     parser.add_argument("--model", default="BAAI/URSA-1.7B-FSQ320", help="model path")
+     parser.add_argument("--device", type=int, default=0, help="device index")
+     parser.add_argument("--precision", default="float16", help="compute precision")
+     return parser.parse_args()
+
+
+ def crop_image(image, target_h, target_w):
+     """Center crop image to target size."""
+     h, w = image.height, image.width
+     aspect_ratio_target, aspect_ratio = target_w / target_h, w / h
+     if aspect_ratio > aspect_ratio_target:
+         new_w = int(h * aspect_ratio_target)
+         x_start = (w - new_w) // 2
+         image = image.crop((x_start, 0, x_start + new_w, h))
+     else:
+         new_h = int(w / aspect_ratio_target)
+         y_start = (h - new_h) // 2
+         image = image.crop((0, y_start, w, y_start + new_h))
+     return np.array(image.resize((target_w, target_h), PIL.Image.Resampling.BILINEAR))
+
+
+ def generate_image(
+     prompt,
+     negative_prompt,
+     seed,
+     randomize_seed,
+     guidance_scale,
+     num_inference_steps=25,
+ ):
+     """Generate an image."""
+     args = {**locals(), **video_presets["t2i"]}
+     seed = np.random.randint(2147483647) if randomize_seed else seed
+     device = getattr(pipe, "_offload_device", pipe.device)
+     generator = torch.Generator(device=device).manual_seed(seed)
+     images = pipe(generator=generator, **args).frames
+     return [export_to_image(image, quality=95) for image in images] + [seed]
+
+
+ def generate_video(
+     prompt,
+     negative_prompt,
+     image,
+     motion_score,
+     seed,
+     randomize_seed,
+     guidance_scale,
+     num_inference_steps,
+     output_type="np",
+ ):
+     """Generate a video."""
+     args = {**locals(), **video_presets["ti2v"]}
+     args["prompt"] = f"motion={motion_score:.1f}, {prompt}"
+     args["image"] = crop_image(image, args["height"], args["width"]) if image else None
+     seed = np.random.randint(2147483647) if randomize_seed else seed
+     device = getattr(pipe, "_offload_device", pipe.device)
+     generator = torch.Generator(device=device).manual_seed(seed)
+     frames = pipe(generator=generator, **args).frames[0]
+     return export_to_video(frames, fps=12), seed
+
+
+ css = """#col-container {margin: 0 auto; max-width: 1366px}"""
+ title = "Uniform Discrete Diffusion with Metric Path for Video Generation"
+ header = (
+     "<div align='center'>"
+     "<h2>Uniform Discrete Diffusion with Metric Path for Video Generation</h2>"
+     "<h3><a href='https://arxiv.org/abs/2510.24717' target='_blank' rel='noopener'>[paper]</a>"
+     "<a href='https://github.com/baaivision/URSA' target='_blank' rel='noopener'>[code]</a></h3>"
+     "</div>"
+ )
+
+ video_presets = {
+     "t2i": {"width": 512, "height": 320, "num_frames": 1},
+     "ti2v": {"width": 512, "height": 320, "num_frames": 49},
+ }
+
+ prompts = [
+     "a lone grizzly bear walks through a misty forest at dawn, sunlight catching its fur.",
+     "Many spotted jellyfish pulsating under water. Their bodies are transparent and glowing in deep ocean.",  # noqa
+     "An intense close-up of a soldier’s face, covered in dirt and sweat, his eyes filled with determination as he surveys the battlefield.",  # noqa
+     "a close-up shot of a woman standing in a dimly lit room. she is wearing a traditional chinese outfit, which includes a red and gold dress with intricate designs and a matching headpiece. the woman has her hair styled in an updo, adorned with a gold accessory. her makeup is done in a way that accentuates her features, with red lipstick and dark eyeshadow. she is looking directly at the camera with a neutral expression. the room has a rustic feel, with wooden beams and a stone wall visible in the background. the lighting in the room is soft and warm, creating a contrast with the woman's vibrant attire. there are no texts or other objects in the video. the style of the video is a portrait, focusing on the woman and her attire.",  # noqa
+     "The camera slowly rotates around a massive stack of vintage televisions that are placed within a large New York museum gallery. Each of the televisions is showing a different program. There are 1950s sci-fi movies with their distinctive visuals, horror movies with their creepy scenes, news broadcasts with moving images and words, static on some screens, and a 1970s sitcom with its characteristic look. The televisions are of various sizes and designs, some with rounded edges and others with more angular shapes. The gallery is well-lit, with light falling on the stack of televisions and highlighting the different programs being shown. There are no people visible in the immediate vicinity, only the stack of televisions and the surrounding gallery space.",  # noqa
+ ]
+ motion_scores = [9, 9, 9, 9, 9]
+ videos = ["", "", "", "", ""]
+ examples = [list(x) for x in zip(prompts, motion_scores)]
+
+
+ if __name__ == "__main__":
+     args = parse_args()
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu", args.device)
+     model_args = {"torch_dtype": getattr(torch, args.precision.lower()), "trust_remote_code": True}
+     pipe = URSAPipeline.from_pretrained(args.model, **model_args).to(device)
+
+     # Application.
+     app = gr.Blocks(css=css, theme="origin").__enter__()
+     container = gr.Column(elem_id="col-container").__enter__()
+     _, main_row = gr.Markdown(header), gr.Row().__enter__()
+
+     # Input.
+     input_col = gr.Column().__enter__()
+     prompt = gr.Text(
+         label="Prompt",
+         placeholder="Describe the video you want to generate",
+         value="A lone grizzly bear walks through a misty forest at dawn, sunlight catching its fur.",  # noqa
+         lines=5,
+     )
+     negative_prompt = gr.Text(
+         label="Negative Prompt",
+         placeholder="Describe what you don't want in the video",
+         value="worst quality, low quality, inconsistent motion, static, still, blurry, jittery, distorted, ugly",  # noqa
+         lines=1,
+     )
+     with gr.Row():
+         generate_image_btn = gr.Button("Generate Image Prompt", variant="primary", size="lg")
+         generate_video_btn = gr.Button("Generate Video", variant="primary", size="lg")
+     image_prompt = gr.Image(label="Image Prompt", height=480, type="pil")
+
+     # fmt: off
+     options = gr.Accordion("Options", open=False).__enter__()
+     seed = gr.Slider(label="Seed", maximum=2147483647, step=1, value=0)
+     randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+     guidance_scale = gr.Slider(label="Guidance scale", minimum=1, maximum=10.0, step=0.1, value=7.0)
+     with gr.Row():
+         num_inference_steps = gr.Slider(label="Inference steps", minimum=1, maximum=100, step=1, value=50)  # noqa
+     options.__exit__(), input_col.__exit__()
+
+     # Results.
+     result_col = gr.Column().__enter__()
+     motion = gr.Slider(label="Motion Score", minimum=1, maximum=10, step=1, value=9)
+     result = gr.Video(label="Result", height=480, show_label=False, autoplay=True)
+     result_col.__exit__(), main_row.__exit__()
+     # fmt: on
+
+     # Examples.
+     with gr.Row():
+         gr.Examples(examples=examples, inputs=[prompt, motion])
+
+     # Events.
+     container.__exit__()
+     gr.on(
+         triggers=[generate_image_btn.click, prompt.submit, negative_prompt.submit],
+         fn=generate_image,
+         inputs=[
+             prompt,
+             negative_prompt,
+             seed,
+             randomize_seed,
+             guidance_scale,
+         ],
+         outputs=[image_prompt, seed],
+     )
+     gr.on(
+         triggers=[generate_video_btn.click, prompt.submit, negative_prompt.submit],
+         fn=generate_video,
+         inputs=[
+             prompt,
+             negative_prompt,
+             image_prompt,
+             motion,
+             seed,
+             randomize_seed,
+             guidance_scale,
+             num_inference_steps,
+         ],
+         outputs=[result, seed],
+     )
+     app.__exit__(), app.launch(share=False)
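The same pipeline can also be driven without the Gradio UI. A minimal sketch assembled from the calls in app.py above — the motion-score prompt prefix, the "ti2v" preset sizes, and fps=12 all mirror the app; the trimmed argument list and the literal prompt are illustrative:

# Headless sketch of the TI2V flow in app.py (assumptions noted inline).
import torch

from diffnext.pipelines import URSAPipeline
from diffnext.utils import export_to_video

# Mirrors the app's model setup: fp16 weights, remote code, CUDA device.
pipe = URSAPipeline.from_pretrained(
    "BAAI/URSA-1.7B-FSQ320", torch_dtype=torch.float16, trust_remote_code=True
).to("cuda")

# As in generate_video(), the motion score is encoded as a prompt prefix.
prompt = "motion=9.0, a lone grizzly bear walks through a misty forest at dawn."
generator = torch.Generator(device=pipe.device).manual_seed(0)
frames = pipe(
    prompt=prompt,
    negative_prompt="worst quality, low quality, inconsistent motion",
    image=None,  # or a 320x512 RGB array to condition on an image prompt
    width=512, height=320, num_frames=49,  # the "ti2v" preset
    guidance_scale=7.0, num_inference_steps=50,
    generator=generator, output_type="np",
).frames[0]
video_path = export_to_video(frames, fps=12)  # playable file path, as in the app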
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ einops
+ torch
+ diffusers
+ transformers
+ accelerate
+ imageio[ffmpeg]
+ git+https://github.com/baaivision/URSA.git
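A quick import check can catch a broken install before launching app.py; a minimal sketch (the import names happen to match the pip names here, and diffnext is provided by the git requirement):

# Sanity-check the environment assembled from requirements.txt.
import accelerate
import diffusers
import einops
import imageio
import torch
import transformers

import diffnext  # installed from the git+ URSA requirement

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("diffusers:", diffusers.__version__, "| transformers:", transformers.__version__)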