rahul7star committed on
Commit 0084610 · verified · 1 Parent(s): 7d46bab

Migrated from GitHub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +30 -0
  2. LICENSE +201 -0
  3. ORIGINAL_README.md +381 -0
  4. __init__.py +1 -0
  5. assets/KANDINSKY_LOGO_1_BLACK.png +0 -0
  6. assets/KANDINSKY_LOGO_1_WHITE.png +0 -0
  7. assets/comfyui_kandinsky5.png +3 -0
  8. assets/generation_examples/1036335634.mp4 +3 -0
  9. assets/generation_examples/1512407739 (1).mp4 +3 -0
  10. assets/generation_examples/1512407739.mp4 +3 -0
  11. assets/generation_examples/642423904 (1).mp4 +3 -0
  12. assets/generation_examples/642423904 (2).mp4 +3 -0
  13. assets/generation_examples/642423904.mp4 +3 -0
  14. assets/generation_examples/68941856 (1).mp4 +3 -0
  15. assets/generation_examples/68941856.mp4 +3 -0
  16. assets/generation_examples/distill/1.mp4 +3 -0
  17. assets/generation_examples/distill/2.mp4 +3 -0
  18. assets/generation_examples/distill/3.mp4 +3 -0
  19. assets/generation_examples/distill/4.mp4 +3 -0
  20. assets/generation_examples/sft/1.mp4 +3 -0
  21. assets/generation_examples/sft/2.mp4 +3 -0
  22. assets/generation_examples/sft/3.mp4 +3 -0
  23. assets/generation_examples/sft/4.mp4 +3 -0
  24. assets/generation_examples/sft/5.mp4 +3 -0
  25. assets/generation_examples/sft/6.mp4 +3 -0
  26. assets/generation_examples/test (1) (1).mp4 +3 -0
  27. assets/generation_examples/test2 (1).mp4 +3 -0
  28. assets/generation_examples/video5237959401997893857.mp4 +3 -0
  29. assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg +3 -0
  30. assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg +3 -0
  31. assets/sbs/kandinsky_5_video_lite_vs_sora.jpg +3 -0
  32. assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg +3 -0
  33. assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg +3 -0
  34. assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg +3 -0
  35. assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg +3 -0
  36. assets/vbench.png +3 -0
  37. benchmark/moviegen_bench.csv +0 -0
  38. comfyui/README.md +76 -0
  39. comfyui/kandisnky5_lite_T2V.json +541 -0
  40. comfyui/nodes_kandinsky.py +286 -0
  41. configs/config_10s_distil.yaml +54 -0
  42. configs/config_10s_nocfg.yaml +56 -0
  43. configs/config_10s_pretrain.yaml +54 -0
  44. configs/config_10s_sft.yaml +56 -0
  45. configs/config_5s_distil.yaml +47 -0
  46. configs/config_5s_nocfg.yaml +49 -0
  47. configs/config_5s_pretrain.yaml +47 -0
  48. configs/config_5s_sft.yaml +49 -0
  49. download_models.py +74 -0
  50. inference_example.ipynb +192 -0
.gitattributes CHANGED
@@ -33,3 +33,33 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/comfyui_kandinsky5.png filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/1036335634.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/1512407739[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/1512407739.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/642423904[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/642423904[[:space:]](2).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/642423904.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/68941856[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/68941856.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/distill/1.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/distill/2.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/distill/3.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/distill/4.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/1.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/2.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/3.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/4.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/5.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/6.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/test[[:space:]](1)[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/test2[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/video5237959401997893857.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_sora.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg filter=lfs diff=lfs merge=lfs -text
+assets/vbench.png filter=lfs diff=lfs merge=lfs -text
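
Note: the `[[:space:]]` sequences above are not literal filename characters; `git lfs track` escapes spaces in tracked paths with this POSIX character class when it writes `.gitattributes`. An illustrative example, assuming Git LFS is installed:

```sh
git lfs track "assets/generation_examples/1512407739 (1).mp4"
# writes: assets/generation_examples/1512407739[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
git add .gitattributes
```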
LICENSE ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,381 @@
<div align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="assets/KANDINSKY_LOGO_1_WHITE.png">
    <source media="(prefers-color-scheme: light)" srcset="assets/KANDINSKY_LOGO_1_BLACK.png">
    <img alt="Kandinsky 5.0 logo" src="assets/KANDINSKY_LOGO_1_BLACK.png">
  </picture>
</div>

<div align="center">
  <a href="https://habr.com/ru/companies/sberbank/articles/951800/">Habr</a> | <a href="https://ai-forever.github.io/Kandinsky-5/">Project Page</a> | Technical Report (soon) | <a href="https://huggingface.co/collections/ai-forever/kandisnky-50-t2v-lite-68d71892d2cc9b02177e5ae5">Models🤗</a> | <a href="https://github.com/ai-forever/Kandinsky-5/blob/main/comfyui/README.md">ComfyUI README</a>
</div>

<h1>Kandinsky 5.0: A family of diffusion models for Video & Image generation</h1>

In this repository, we provide a family of diffusion models that generate a video or an image (<em>Coming Soon</em>) from a textual prompt, along with distilled models for faster generation.

https://github.com/user-attachments/assets/b9ff0417-02a4-4f6b-aacc-60c44e7fe6f1

## Project Updates

- 🔥 **Source**: ```2025/09/29```: We have open-sourced `Kandinsky 5.0 T2V Lite`, a lite (2B-parameter) version of the `Kandinsky 5.0 Video` text-to-video generation model. The released checkpoints `kandinsky5lite_t2v_pretrain_5s`, `kandinsky5lite_t2v_pretrain_10s`, `kandinsky5lite_t2v_sft_5s`, `kandinsky5lite_t2v_sft_10s`, `kandinsky5lite_t2v_nocfg_5s`, `kandinsky5lite_t2v_nocfg_10s`, `kandinsky5lite_t2v_distilled16steps_5s`, and `kandinsky5lite_t2v_distilled16steps_10s` contain weights from pretraining, supervised fine-tuning, CFG distillation, and diffusion distillation into 16 steps. The 5s checkpoints can generate videos up to 5 seconds long; the 10s checkpoints are faster models trained with the [NABLA](https://huggingface.co/ai-forever/Wan2.1-T2V-14B-NABLA-0.7) algorithm and can generate videos up to 10 seconds long.
- 🔥 **Source**: ```2025/10/7```: The ComfyUI README has been updated. SDPA support has been added, so our code now runs without Flash Attention. MagCache support has been added for the sft and nocfg checkpoints. Memory consumption in the VAE has been reduced, and the entire pipeline now runs in 24 GB with offloading.

## Kandinsky 5.0 T2V Lite

Kandinsky 5.0 T2V Lite is a lightweight video generation model (2B parameters) that ranks #1 among open-source models in its class. It outperforms larger Wan models (5B and 14B) and offers the best understanding of Russian concepts in the open-source ecosystem.

We provide 8 model variants, each optimized for a different use case:

* SFT model — delivers the highest generation quality;

* CFG-distilled — runs 2× faster;

* Diffusion-distilled — enables low-latency generation with minimal quality loss (6× faster);

* Pretrain model — designed for fine-tuning by researchers and enthusiasts.

All models are available in two versions, for generating 5-second and 10-second videos.

## Pipeline

**Latent diffusion pipeline** with **Flow Matching**.

**Diffusion Transformer (DiT)** as the main generative backbone with **cross-attention to text embeddings**.

- **Qwen2.5-VL** and **CLIP** provide text embeddings.

- **HunyuanVideo 3D VAE** encodes/decodes video into a latent space.

- **DiT** is the main generative module, using cross-attention to condition on text.
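Under flow matching, sampling reduces to integrating an ODE from noise toward the video latent. The sketch below illustrates the general idea with a plain Euler integrator; it is a minimal illustration, not the repository's actual sampler, and `dit` / `text_emb` stand in for the real DiT and text-embedding objects:

```python
import torch

@torch.no_grad()
def flow_matching_sample(dit, text_emb, latent_shape, steps=50, device="cuda"):
    # Start from pure Gaussian noise in the VAE latent space.
    x = torch.randn(latent_shape, device=device)
    # Uniform time grid from t=0 (noise) to t=1 (data); real schedulers
    # typically warp this grid (cf. the "scheduler scale" parameter).
    ts = torch.linspace(0.0, 1.0, steps + 1, device=device)
    for i in range(steps):
        t, t_next = ts[i], ts[i + 1]
        # The DiT predicts the velocity field v(x, t | text) via
        # cross-attention to the text embeddings.
        v = dit(x, t.expand(x.shape[0]), text_emb)
        # One explicit Euler step along the probability-flow ODE.
        x = x + (t_next - t) * v
    return x  # decode with the HunyuanVideo 3D VAE afterwards
```

Each NFE corresponds to one DiT evaluation, and with classifier-free guidance every step costs two evaluations (conditional and unconditional), which is consistent with the NFE column in the Model Zoo table below.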
<img width="1600" height="477" alt="Kandinsky 5.0 pipeline overview" src="https://github.com/user-attachments/assets/17fc2eb5-05e3-4591-9ec6-0f6e1ca397b3" />

<img width="800" height="406" alt="Kandinsky 5.0 DiT architecture" src="https://github.com/user-attachments/assets/f3006742-e261-4c39-b7dc-e39330be9a09" />


## Model Zoo

| Model | Config | Video duration | NFE | Checkpoint | Latency* |
|-------|--------|----------------|-----|------------|----------|
| Kandinsky 5.0 T2V Lite SFT 5s | configs/config_5s_sft.yaml | 5s | 100 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s) | 139 s |
| Kandinsky 5.0 T2V Lite SFT 10s | configs/config_10s_sft.yaml | 10s | 100 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-10s) | 224 s |
| Kandinsky 5.0 T2V Lite pretrain 5s | configs/config_5s_pretrain.yaml | 5s | 100 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-5s) | 139 s |
| Kandinsky 5.0 T2V Lite pretrain 10s | configs/config_10s_pretrain.yaml | 10s | 100 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-10s) | 224 s |
| Kandinsky 5.0 T2V Lite no-CFG 5s | configs/config_5s_nocfg.yaml | 5s | 50 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s) | 77 s |
| Kandinsky 5.0 T2V Lite no-CFG 10s | configs/config_10s_nocfg.yaml | 10s | 50 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-10s) | 124 s |
| Kandinsky 5.0 T2V Lite distill 5s | configs/config_5s_distil.yaml | 5s | 16 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s) | 35 s |
| Kandinsky 5.0 T2V Lite distill 10s | configs/config_10s_distil.yaml | 10s | 16 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-10s) | 61 s |

*Latency was measured after the second inference run; the first run can be slower due to compilation. Inference was measured on an NVIDIA H100 GPU with 80 GB of memory, using CUDA 12.8.1 and PyTorch 2.8. For the 5-second models, Flash Attention 3 was used.

### Examples

#### Kandinsky 5.0 T2V Lite SFT

<table border="0" style="width: 200; text-align: left; margin-top: 20px;">
  <tr>
    <td><video src="https://github.com/user-attachments/assets/bc38821b-f9f1-46db-885f-1f70464669eb" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/9f64c940-4df8-4c51-bd81-a05de8e70fc3" width=200 controls autoplay loop></video></td>
  </tr>
  <tr>
    <td><video src="https://github.com/user-attachments/assets/77dd417f-e0bf-42bd-8d80-daffcd054add" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/385a0076-f01c-4663-aa46-6ce50352b9ed" width=200 controls autoplay loop></video></td>
  </tr>
  <tr>
    <td><video src="https://github.com/user-attachments/assets/7c1bcb31-cc7d-4385-9a33-2b0cc28393dd" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/990a8a0b-2df1-4bbc-b2e3-2859b6f1eea6" width=200 controls autoplay loop></video></td>
  </tr>
</table>


#### Kandinsky 5.0 T2V Lite Distill

<table border="0" style="width: 200; text-align: left; margin-top: 20px;">
  <tr>
    <td><video src="https://github.com/user-attachments/assets/861342f9-f576-4083-8a3b-94570a970d58" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/302e4e7d-781d-4a58-9b10-8c473d469c4b" width=200 controls autoplay loop></video></td>
  </tr>
  <tr>
    <td><video src="https://github.com/user-attachments/assets/3e70175c-40e5-4aec-b506-38006fe91a76" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/b7da85f7-8b62-4d46-9460-7f0e505de810" width=200 controls autoplay loop></video></td>
  </tr>
</table>

### Results

#### Side-by-side evaluation

The evaluation is based on the expanded prompts from the [Movie Gen benchmark](https://github.com/facebookresearch/MovieGenBench), which are available in the expanded_prompt column of the benchmark/moviegen_bench.csv file.

<table border="0" style="width: 400; text-align: left; margin-top: 20px;">
  <tr>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_sora.jpg" width=400></td>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg" width=400></td>
  </tr>
  <tr>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg" width=400></td>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg" width=400></td>
  </tr>
  <tr>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg" width=400></td>
  </tr>
</table>

#### Distill side-by-side evaluation

<table border="0" style="width: 400; text-align: left; margin-top: 20px;">
  <tr>
    <td><img src="assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg" width=400></td>
    <td><img src="assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg" width=400></td>
  </tr>
</table>

#### VBench results

<div align="center">
  <img src="assets/vbench.png">
</div>

## Quickstart

#### Installation
Clone the repo:
```sh
git clone https://github.com/ai-forever/Kandinsky-5.git
cd Kandinsky-5
```

Install dependencies:
```sh
pip install -r requirements.txt
```

To improve inference performance on NVIDIA Hopper GPUs, we recommend installing [Flash Attention 3](https://github.com/Dao-AILab/flash-attention/?tab=readme-ov-file#flashattention-3-beta-release).

#### Model Download
```sh
python download_models.py
```

#### Run Kandinsky 5.0 T2V Lite SFT 5s

```sh
python test.py --prompt "A dog in red hat"
```

#### Run Kandinsky 5.0 T2V Lite SFT 10s

```sh
python test.py --config ./configs/config_10s_sft.yaml --prompt "A dog in red hat" --video_duration 10
```

#### Run Kandinsky 5.0 T2V Lite pretrain 5s

```sh
python test.py --config ./configs/config_5s_pretrain.yaml --prompt "A dog in red hat"
```

#### Run Kandinsky 5.0 T2V Lite pretrain 10s

```sh
python test.py --config ./configs/config_10s_pretrain.yaml --prompt "A dog in red hat" --video_duration 10
```

#### Run Kandinsky 5.0 T2V Lite no-CFG 5s

```sh
python test.py --config ./configs/config_5s_nocfg.yaml --prompt "A dog in red hat"
```

#### Run Kandinsky 5.0 T2V Lite no-CFG 10s

```sh
python test.py --config ./configs/config_10s_nocfg.yaml --prompt "A dog in red hat" --video_duration 10
```

#### Run Kandinsky 5.0 T2V Lite distill 5s

```sh
python test.py --config ./configs/config_5s_distil.yaml --prompt "A dog in red hat"
```

#### Run Kandinsky 5.0 T2V Lite distill 10s

```sh
python test.py --config ./configs/config_10s_distil.yaml --prompt "A dog in red hat" --video_duration 10
```

### Inference

```python
import torch
from IPython.display import Video
from kandinsky import get_T2V_pipeline

# Place all three pipeline components on the same GPU.
device_map = {
    "dit": torch.device("cuda:0"),
    "vae": torch.device("cuda:0"),
    "text_embedder": torch.device("cuda:0"),
}

pipe = get_T2V_pipeline(device_map, conf_path="configs/config_5s_sft.yaml")

images = pipe(
    seed=42,
    time_length=5,      # video duration in seconds
    width=768,
    height=512,
    save_path="./test.mp4",
    text="A cat in a red hat",
)

Video("./test.mp4")
```
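
The `device_map` argument lets you place the three components on different devices. As an untested sketch, assuming two GPUs are available:

```python
device_map = {
    "dit": torch.device("cuda:0"),           # generative backbone
    "vae": torch.device("cuda:1"),           # video encode/decode
    "text_embedder": torch.device("cuda:1"), # Qwen2.5-VL + CLIP
}
pipe = get_T2V_pipeline(device_map, conf_path="configs/config_5s_sft.yaml")
```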

Please refer to the [inference_example.ipynb](inference_example.ipynb) notebook for more usage details.

### Distributed Inference

For faster inference, we also provide the capability to run inference in a distributed way:
```sh
NUMBER_OF_NODES=1
NUMBER_OF_DEVICES_PER_NODE=2   # 1, 2, or 4
python -m torch.distributed.launch --nnodes $NUMBER_OF_NODES --nproc-per-node $NUMBER_OF_DEVICES_PER_NODE test.py
```
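
On recent PyTorch versions, `torch.distributed.launch` is deprecated in favour of `torchrun`, which accepts the same flags; the equivalent invocation would be:

```sh
torchrun --nnodes $NUMBER_OF_NODES --nproc-per-node $NUMBER_OF_DEVICES_PER_NODE test.py
```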

### Optimized Inference

#### Offloading
To reduce memory consumption, you can **offload** the models:
```sh
python test.py --prompt "A dog in red hat" --offload
```
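
Offloading keeps only the component that is currently needed on the GPU. A minimal sketch of the idea follows (not the repository's actual implementation; `text_embedder`, `dit`, and `vae` stand in for the loaded modules):

```python
import torch

def run_stage(module, fn, device="cuda"):
    """Move a module to the GPU, run one pipeline stage, then evict it."""
    module.to(device)
    out = fn()
    module.to("cpu")
    torch.cuda.empty_cache()  # release the evicted weights' VRAM
    return out

# Text encoding, denoising, and decoding each hold only one model on GPU:
# emb    = run_stage(text_embedder, lambda: text_embedder(prompt))
# latent = run_stage(dit, lambda: sample(dit, emb))
# video  = run_stage(vae, lambda: vae.decode(latent))
```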

#### MagCache
We also provide [MagCache](https://github.com/Zehong-Ma/MagCache) inference for faster generation (currently available for the sft 5s and sft 10s checkpoints):

```sh
python test.py --prompt "A dog in red hat" --magcache
```
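
MagCache exploits the observation that DiT outputs change slowly in magnitude across neighbouring denoising steps, so some evaluations can be replaced by cached results. Very roughly, and only as an illustration of the caching idea (`dit` and `ratio_estimate` are placeholders; see the MagCache repository for the real criterion):

```python
cached_out, skipped = None, 0

def dit_step(x, t, emb, ratio_estimate, max_skips=2, threshold=0.05):
    """Reuse the previous DiT output when the predicted relative change
    in output magnitude is small; otherwise run the full forward pass."""
    global cached_out, skipped
    if cached_out is not None and abs(1 - ratio_estimate) < threshold and skipped < max_skips:
        skipped += 1
        return cached_out  # skip the expensive DiT evaluation
    cached_out, skipped = dit(x, t, emb), 0
    return cached_out
```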

### ComfyUI

See the instructions [here](comfyui).

### Beta testing
You can apply to participate in the beta testing of Kandinsky Video Lite via the [Telegram bot](https://t.me/kandinsky_access_bot).

## 📑 Todo List
- Kandinsky 5.0 Lite Text-to-Video
    - [x] Multi-GPU inference code for the 2B models
    - [ ] Checkpoints of the 2B models
        - [x] pretrain
        - [x] sft
        - [ ] rl
        - [x] cfg distil
        - [x] distil 16 steps
        - [ ] autoregressive generation
    - [x] ComfyUI integration
    - [ ] Diffusers integration
    - [ ] Caching acceleration support
- Kandinsky 5.0 Lite Image-to-Video
    - [ ] Multi-GPU inference code for the 2B model
    - [ ] Checkpoints of the 2B model
    - [ ] ComfyUI integration
    - [ ] Diffusers integration
- Kandinsky 5.0 Pro Text-to-Video
    - [ ] Multi-GPU inference code for the models
    - [ ] Checkpoints of the model
    - [ ] ComfyUI integration
    - [ ] Diffusers integration
- Kandinsky 5.0 Pro Image-to-Video
    - [ ] Multi-GPU inference code for the model
    - [ ] Checkpoints of the model
    - [ ] ComfyUI integration
    - [ ] Diffusers integration
- [ ] Technical report

# Authors
<b>Project Leader:</b> Denis Dimitrov<br/>

<b>Team Leads:</b> Vladimir Arkhipkin, Vladimir Korviakov, Nikolai Gerasimenko, Denis Parkhomenko<br/>

<b>Core Contributors:</b> Alexey Letunovskiy, Maria Kovaleva, Ivan Kirillov, Lev Novitskiy, Denis Koposov, Dmitrii Mikhailov, Anna Averchenkova, Andrey Shutkin, Julia Agafonova, Olga Kim, Anastasiia Kargapoltseva, Nikita Kiselev<br/>

<b>Contributors:</b> Anna Dmitrienko, Anastasia Maltseva, Kirill Chernyshev, Ilia Vasiliev, Viacheslav Vasilev, Vladimir Polovnikov, Yury Kolabushin, Alexander Belykh, Mikhail Mamaev, Anastasia Aliaskina, Tatiana Nikulina, Polina Gavrilova<br/>

# Citation

```
@misc{kandinsky2025,
    author = {Alexey Letunovskiy and Maria Kovaleva and Ivan Kirillov and Lev Novitskiy and Denis Koposov and
              Dmitrii Mikhailov and Anna Averchenkova and Andrey Shutkin and Julia Agafonova and Olga Kim and
              Anastasiia Kargapoltseva and Nikita Kiselev and Vladimir Arkhipkin and Vladimir Korviakov and
              Nikolai Gerasimenko and Denis Parkhomenko and Anna Dmitrienko and Anastasia Maltseva and
              Kirill Chernyshev and Ilia Vasiliev and Viacheslav Vasilev and Vladimir Polovnikov and
              Yury Kolabushin and Alexander Belykh and Mikhail Mamaev and Anastasia Aliaskina and
              Tatiana Nikulina and Polina Gavrilova and Denis Dimitrov},
    title = {Kandinsky 5.0: A family of diffusion models for Video & Image generation},
    howpublished = {\url{https://github.com/ai-forever/Kandinsky-5}},
    year = 2025
}

@misc{mikhailov2025nablanablaneighborhoodadaptiveblocklevel,
    title={$\nabla$NABLA: Neighborhood Adaptive Block-Level Attention},
    author={Dmitrii Mikhailov and Aleksey Letunovskiy and Maria Kovaleva and Vladimir Arkhipkin
            and Vladimir Korviakov and Vladimir Polovnikov and Viacheslav Vasilev
            and Evelina Sidorova and Denis Dimitrov},
    year={2025},
    eprint={2507.13546},
    archivePrefix={arXiv},
    primaryClass={cs.CV},
    url={https://arxiv.org/abs/2507.13546},
}
```

# Acknowledgements

We gratefully acknowledge the open-source projects and research that made Kandinsky 5.0 possible:

- [PyTorch](https://pytorch.org/) — for model training and inference.
- [FlashAttention 3](https://github.com/Dao-AILab/flash-attention) — for efficient attention and faster inference.
- [Qwen2.5-VL](https://github.com/QwenLM/Qwen3-VL) — for providing high-quality text embeddings.
- [CLIP](https://github.com/openai/CLIP) — for robust text–image alignment.
- [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo) — for video latent encoding and decoding.
- [MagCache](https://github.com/Zehong-Ma/MagCache) — for accelerated inference.
- [ComfyUI](https://github.com/comfyanonymous/ComfyUI) — for integration into node-based workflows.

We deeply appreciate the contributions of these communities and researchers to the open-source ecosystem.
__init__.py ADDED
@@ -0,0 +1 @@
from .comfyui.nodes_kandinsky import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
assets/KANDINSKY_LOGO_1_BLACK.png ADDED
assets/KANDINSKY_LOGO_1_WHITE.png ADDED
assets/comfyui_kandinsky5.png ADDED

Git LFS Details

  • SHA256: 4c91961abe51a1fcbd3a35d438ea3b4f652f61a4d9f035c9f10e91dc5c9b79cd
  • Pointer size: 131 Bytes
  • Size of remote file: 474 kB
assets/generation_examples/1036335634.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0d3657c36760e1694a4d3533b04e0d28ddd16d8d8e6373953e8f754742e2a54b
size 4199589
assets/generation_examples/1512407739 (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:54b887c2c5cca6a4b5d8cc7f433a01a1c72c2592f07187b3c530126ac77aa601
size 7227407
assets/generation_examples/1512407739.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:00f48b1d0ddd97c7d802136d1a9090eb397ce34d95569fd4c4d6eb64eb46d06f
size 6778347
assets/generation_examples/642423904 (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:612d2a1340475f79b20f90cc5e85a5b9e79193af631e8bd7fa50cdc5fc47dee8
size 3038994
assets/generation_examples/642423904 (2).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:55bed6367e8c4de29f082cfbaa8af357fef81dc339e8abbadea50901ac635d10
size 3092127
assets/generation_examples/642423904.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:464da305ce36af41a1fb7fa842de1357b601008d128f1693f12f9674c906243c
size 2466511
assets/generation_examples/68941856 (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8274aa1cbc087465f6a1b72842c8d1cb1860c61232dd29ee0f17ea7ff2d2ac08
size 5856930
assets/generation_examples/68941856.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:759e825e4b6bd05ab92621390b1f5aa97240cd18a76312395ff39fd636dc8a9d
size 6942715
assets/generation_examples/distill/1.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:06953c45987011d08aef79f7ae1368f9a69d480a509554fb55d0d46d4498255f
size 6916245
assets/generation_examples/distill/2.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d199141bd7cffb4cb30ac84f2879da330b83cd2429aa7a95c3406f8dce49134a
size 5384100
assets/generation_examples/distill/3.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9bfdc9c8cf2ada22de4ffc1d0281e56ed7c0d61a66bdff28697cbe6a2a8e97f5
size 3957258
assets/generation_examples/distill/4.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9bbebdd5de15721f082785bbc02dc7422b0d9d9a6ac244c02e1bfbecc22ba22e
size 6328091
assets/generation_examples/sft/1.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1bfc00348d1fdb09c43687e086cd912ba15097ecf3f85b6302827a54eafab3a2
size 4486280
assets/generation_examples/sft/2.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:612eb63051314d50bb333ee1c797e95a5df7522ca5df3dc853266b135a27ce06
size 4600755
assets/generation_examples/sft/3.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c8cc13e558964138ef53a098d4b1174db5a58979f7dbf021788a39ad41c2fcff
size 8193301
assets/generation_examples/sft/4.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:462f02343590e07e5ac2a919f566438d2ddd990d0c5ddcd8c92e15feae63eb11
size 7697517
assets/generation_examples/sft/5.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4e89d9d0ae048df05fc2f00e9acc009a261b38810f5792967a969bb3147a5e6d
size 3528986
assets/generation_examples/sft/6.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8226569ad637d3b179ce895685e52cfcf81372bbbab623386fddde8e0352c9db
size 4109417
assets/generation_examples/test (1) (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:949f1a6c6056307026430aa0809fc7b02ac6deb3732973b305aaad20e6754c76
size 1592811
assets/generation_examples/test2 (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0d626185c2e14dcca33b4e315529dd9555fc34f9dc96a327a28935bdb659406b
size 1103150
assets/generation_examples/video5237959401997893857.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4df397dc2d8032875aaccb58ef563f90c6cd25d50ce73bae40c5776dc9818ac0
size 5977768
assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg ADDED

Git LFS Details

  • SHA256: a6f53623b3c1e1f45ea6872f3afa4b3f71d79377bc89065b12e590c8a1a60f1d
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg ADDED

Git LFS Details

  • SHA256: 81d9aa99a224f3b1ce7262edf0c969bebcb7b95349cb5b57be5cc7aecbcc15d9
  • Pointer size: 131 Bytes
  • Size of remote file: 192 kB
assets/sbs/kandinsky_5_video_lite_vs_sora.jpg ADDED

Git LFS Details

  • SHA256: 2a5c838cb53a026a57d3037361ad4ed74bae4b31f4d1b11e6474956eca42d412
  • Pointer size: 131 Bytes
  • Size of remote file: 195 kB
assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg ADDED

Git LFS Details

  • SHA256: 74fa68588e7e24fd817cc8e96d63f4e5b623ff193c71a644c0ce42ebb9b49dac
  • Pointer size: 131 Bytes
  • Size of remote file: 170 kB
assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg ADDED

Git LFS Details

  • SHA256: 80bc261b9afcaf1446228a24a96afe3b5c24b4780f3e2f43e27496077611ec6f
  • Pointer size: 131 Bytes
  • Size of remote file: 196 kB
assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg ADDED

Git LFS Details

  • SHA256: d01f4a73b287541487228939fd505a947b78b6325f76421b2ee5f1523188e08e
  • Pointer size: 131 Bytes
  • Size of remote file: 192 kB
assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg ADDED

Git LFS Details

  • SHA256: 4f053f7d996112f40e8b49f6440ea75a40f71c02e60d467cff479ced0b54444a
  • Pointer size: 131 Bytes
  • Size of remote file: 198 kB
assets/vbench.png ADDED

Git LFS Details

  • SHA256: 27131bac1ccb83d3d28e8f558c6a7a91ed92816c0814583299b8584f0cda6546
  • Pointer size: 131 Bytes
  • Size of remote file: 170 kB
benchmark/moviegen_bench.csv ADDED
The diff for this file is too large to render. See raw diff
 
comfyui/README.md ADDED
@@ -0,0 +1,76 @@
# Kandinsky 5 Video for ComfyUI

![Kandinsky 5 ComfyUI Workflow](../assets/comfyui_kandinsky5.png)


## Description

This project provides a workflow for generating videos using the Kandinsky 5 model within the ComfyUI environment.

## Installation and Setup

### 1. Install ComfyUI

If you don't have ComfyUI installed yet, follow these steps:

```bash
# Clone the ComfyUI repository
git clone https://github.com/comfyanonymous/ComfyUI.git
cd ComfyUI

# Install dependencies
pip install -r requirements.txt

# Launch ComfyUI
python main.py
```

### 2. Clone this repository into the ComfyUI custom_nodes folder
```bash
# Navigate to the ComfyUI custom_nodes folder
cd ComfyUI/custom_nodes

# Clone this repository and install the model's requirements
git clone https://github.com/ai-forever/Kandinsky-5.git kandinsky
cd kandinsky
pip install -r requirements.txt
```

### 3. Load the Workflow
Launch ComfyUI (typically available at http://127.0.0.1:8188).

In the ComfyUI interface, click the "Load" button.

Select the kandisnky5_lite_T2V.json file from the comfyui folder of this repository.

The workflow will load into the ComfyUI interface.

### 4. Download Checkpoints

Download the required models and place them in the appropriate folders.

1. Run download_models.py. It will download the models and encoders to the ./weights directory.
2. Move them into the corresponding ComfyUI model folders (text_encoders / diffusion_models / vae), as sketched after the tree below.

```file-tree
ComfyUI/
├── models/
│   ├── text_encoders/      # For text_encoder and text_encoder2 models
│   ├── diffusion_models/   # For kandinsky5lite_t2v_*.safetensors models
│   └── vae/                # For vae model
```
+
64
+ ### 5. Configure Parameters
65
+ After loading the workflow, configure the following parameters:
66
+
67
+ ### Main Parameters
68
+
69
+ | Parameter | Description | Recommended Value |
70
+ |-----------|-------------|-------------------|
71
+ | **Prompt** | Text description for video generation | Your descriptive text |
72
+ | **Negative Prompt** | What to exclude from generation | Unwanted elements description |
73
+ | **Width/Height/Length** | Output video size | 768x512x121 for 5s or 768x512x241 for 10s, Width and Height should be divisisble by 128 for 10s model |
74
+ | **Steps** | Number of generation steps | 50, 16 for distilled version|
75
+ | **CFG Scale** | | 1.0 for distilled16steps and nocfg, 5.0 for sft and pretrain models |
76
+ | **Scheduler Scale** | Noise scheduler scale | 5.0 for 5s, 10.0 for 10s |
comfyui/kandisnky5_lite_T2V.json ADDED
@@ -0,0 +1,541 @@
{
  "id": "12380645-f2b5-4537-88ad-dd959a44e47c",
  "revision": 0,
  "last_node_id": 77,
  "last_link_id": 163,
  "nodes": [
    {
      "id": 77,
      "type": "Kandinsky5VAEDecode",
      "pos": [635.5618896484375, 529.5596313476562],
      "size": [239.40000915527344, 46],
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 155},
        {"name": "latent", "type": "LATENT", "link": 156}
      ],
      "outputs": [
        {"name": "IMAGE", "type": "IMAGE", "slot_index": 0, "links": [157]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5VAEDecode",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": []
    },
    {
      "id": 54,
      "type": "SaveAnimatedWEBP",
      "pos": [953.2196044921875, 91.47215270996094],
      "size": [487.7459716796875, 484.7098693847656],
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {"name": "images", "type": "IMAGE", "link": 157}
      ],
      "outputs": [],
      "properties": {"cnr_id": "comfy-core", "ver": "0.3.26"},
      "widgets_values": ["ComfyUI", 24, false, 100, "default"]
    },
    {
      "id": 72,
      "type": "Kandinsky5TextEncode",
      "pos": [175.8029327392578, 158.080322265625],
      "size": [400, 200],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 143},
        {"name": "extended_text", "shape": 7, "type": "PROMPT", "link": 163}
      ],
      "outputs": [
        {"name": "TEXT", "type": "CONDITION", "slot_index": 0, "links": [161]},
        {"name": "POOLED", "type": "CONDITION", "slot_index": 1, "links": [160]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5TextEncode",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": [""]
    },
    {
      "id": 73,
      "type": "Kandinsky5TextEncode",
      "pos": [179.4368133544922, 406.9901428222656],
      "size": [400, 200],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 147},
        {"name": "extended_text", "shape": 7, "type": "PROMPT", "link": null}
      ],
      "outputs": [
        {"name": "TEXT", "type": "CONDITION", "slot_index": 0, "links": [159]},
        {"name": "POOLED", "type": "CONDITION", "slot_index": 1, "links": [158]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5TextEncode",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": ["Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"]
    },
    {
      "id": 75,
      "type": "Kandinsky5Generate",
      "pos": [610.8881225585938, 160.90586853027344],
      "size": [315, 282],
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 149},
        {"name": "config", "type": "CONFIG", "link": 162},
        {"name": "positive_emb", "type": "CONDITION", "link": 161},
        {"name": "positive_clip", "type": "CONDITION", "link": 160},
        {"name": "negative_emb", "type": "CONDITION", "link": 159},
        {"name": "negative_clip", "type": "CONDITION", "link": 158}
      ],
      "outputs": [
        {"name": "latent", "type": "LATENT", "slot_index": 0, "links": [156]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5Generate",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": [50, 768, 512, 121, 5, 10]
    },
    {
      "id": 68,
      "type": "expand_prompt",
      "pos": [-273.5207824707031, 268.80975341796875],
      "size": [400, 200],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 139}
      ],
      "outputs": [
        {"name": "exp_prompt", "type": "PROMPT", "slot_index": 0, "links": [163]},
        {"name": "log", "type": "STRING", "slot_index": 1, "links": []}
      ],
      "properties": {
        "Node name for S&R": "expand_prompt",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "ef383d80876b498f553b13c8ae99d423308b0aa8"
      },
      "widgets_values": ["A heroic astronaut in a sleek, futuristic white-and-silver space suit with reflective visor down, galloping on a majestic black stallion through rugged, snow-capped mountain peaks at golden hour. The horse’s mane flows wildly as it leaps over a rocky ridge, kicking up dust, while the astronaut grips the reins tightly, their suit subtly illuminated by the warm glow of the setting sun. Dramatic low-angle shot, with vast misty valleys below and a sky shifting from deep orange to twilight purple. Cinematic lighting, hyper-detailed, with a sense of epic adventure and sci-fi wonder."]
    },
    {
      "id": 71,
      "type": "Kandinsky5LoadTextEmbedders",
      "pos": [-241.9403839111328, 25.038286209106445],
      "size": [340.20001220703125, 82],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {"name": "model", "type": "MODEL", "slot_index": 0, "links": [139, 143, 147]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5LoadTextEmbedders",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": ["text_encoder", "text_encoder2"]
    },
    {
      "id": 74,
      "type": "Kandinsky5LoadDiT",
      "pos": [208.43588256835938, 27.57303810119629],
      "size": [315, 78],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {"name": "model", "type": "MODEL", "slot_index": 0, "links": [149]},
        {"name": "conf", "type": "CONFIG", "slot_index": 1, "links": [162]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5LoadDiT",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": ["kandinsky5lite_t2v_sft_5s.safetensors"]
    },
    {
      "id": 76,
      "type": "Kandinsky5LoadVAE",
      "pos": [612.3971557617188, 23.200862884521484],
      "size": [315, 58],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {"name": "model", "type": "MODEL", "slot_index": 0, "links": [155]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5LoadVAE",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": ["vae"]
    }
  ],
  "links": [
    [139, 71, 0, 68, 0, "MODEL"],
    [143, 71, 0, 72, 0, "MODEL"],
    [147, 71, 0, 73, 0, "MODEL"],
    [149, 74, 0, 75, 0, "MODEL"],
    [155, 76, 0, 77, 0, "MODEL"],
    [156, 75, 0, 77, 1, "LATENT"],
    [157, 77, 0, 54, 0, "IMAGE"],
    [158, 73, 1, 75, 5, "CONDITION"],
    [159, 73, 0, 75, 4, "CONDITION"],
    [160, 72, 1, 75, 3, "CONDITION"],
    [161, 72, 0, 75, 2, "CONDITION"],
    [162, 74, 1, 75, 1, "CONFIG"],
    [163, 68, 0, 72, 1, "PROMPT"]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.6611570247933911,
      "offset": [286.65909097964607, 84.06209500649425]
    },
    "frontendVersion": "1.26.13"
  },
  "version": 0.4
}
comfyui/nodes_kandinsky.py ADDED
@@ -0,0 +1,286 @@
+import torch
+import os
+from omegaconf import OmegaConf
+from omegaconf.dictconfig import DictConfig
+from pathlib import Path
+from safetensors.torch import load_file
+
+import folder_paths
+from comfy.comfy_types import ComfyNodeABC
+
+from ..kandinsky.models.vae import build_vae
+from ..kandinsky.models.text_embedders import Kandinsky5TextEmbedder
+from ..kandinsky.models.dit import get_dit
+from ..kandinsky.generation_utils import generate
+
+
+class Kandinsky5LoadTextEmbedders:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "qwen": (os.listdir(folder_paths.get_folder_paths("text_encoders")[0]), {"default": "qwen2_5_vl_7b_instruct"}),
+                "clip": (os.listdir(folder_paths.get_folder_paths("text_encoders")[0]), {"default": "clip_text"}),
+            }
+        }
+
+    RETURN_TYPES = ("MODEL",)
+    RETURN_NAMES = ("model",)
+    FUNCTION = "load_te"
+    CATEGORY = "advanced/loaders"
+    DESCRIPTION = "Returns the Qwen and CLIP text embedders."
+
+    def load_te(self, qwen, clip):
+        qwen_path = os.path.join(folder_paths.get_folder_paths("text_encoders")[0], qwen)
+        clip_path = os.path.join(folder_paths.get_folder_paths("text_encoders")[0], clip)
+        conf = {
+            'qwen': {'checkpoint_path': qwen_path, 'max_length': 256},
+            'clip': {'checkpoint_path': clip_path, 'max_length': 77},
+        }
+        return (Kandinsky5TextEmbedder(DictConfig(conf), device='cpu'),)
+
+
+class Kandinsky5LoadDiT:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "dit": (folder_paths.get_filename_list("diffusion_models"),),
+            }
+        }
+
+    RETURN_TYPES = ("MODEL", "CONFIG")
+    RETURN_NAMES = ("model", "conf")
+    FUNCTION = "load_dit"
+    CATEGORY = "advanced/loaders"
+    DESCRIPTION = "Returns the Kandinsky DiT together with its config."
+
+    def load_dit(self, dit):
+        dit_path = folder_paths.get_full_path_or_raise("diffusion_models", dit)
+        parent_directory = Path(__file__).parent.parent
+        # The duration suffix of the checkpoint name ("5s" or "10s") selects the config file.
+        sec = dit.split("_")[-1].split(".")[0]
+        conf = OmegaConf.load(os.path.join(parent_directory, f"configs/config_{sec}_sft.yaml"))
+        dit = get_dit(conf.model.dit_params)
+        state_dict = load_file(dit_path)
+        dit.load_state_dict(state_dict)
+        return (dit, conf)
+
+
+class Kandinsky5TextEncode(ComfyNodeABC):
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("MODEL",),
+                "prompt": ("STRING", {"multiline": True}),
+            },
+            "optional": {
+                "extended_text": ("PROMPT",),
+            },
+        }
+
+    RETURN_TYPES = ("CONDITION", "CONDITION")
+    RETURN_NAMES = ("TEXT", "POOLED")
+    OUTPUT_TOOLTIPS = ("A conditioning containing the embedded text used to guide the diffusion model.",)
+    FUNCTION = "encode"
+    CATEGORY = "conditioning"
+    DESCRIPTION = "Encodes a text prompt with the Qwen2.5-VL and CLIP embedders into conditionings that guide the diffusion model."
+
+    def encode(self, model, prompt, extended_text=None):
+        # Prefer the LLM-expanded prompt when one is supplied.
+        text = extended_text if extended_text is not None else prompt
+        device = 'cuda:0'
+        model = model.to(device)
+        text_embeds = model.embedder([text], type_of_content='video')
+        pooled_embed = model.clip_embedder([text])
+        model = model.to('cpu')
+        return (text_embeds, pooled_embed)
+
+
+class Kandinsky5LoadVAE:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "vae": (os.listdir(folder_paths.get_folder_paths("vae")[0]), {"default": "hunyuan_vae"}),
+            }
+        }
+
+    RETURN_TYPES = ("MODEL",)
+    RETURN_NAMES = ("model",)
+    FUNCTION = "load_vae"
+    CATEGORY = "advanced/loaders"
+    DESCRIPTION = "Returns the VAE."
+
+    def load_vae(self, vae):
+        vae_path = os.path.join(folder_paths.get_folder_paths("vae")[0], vae)
+        vae = build_vae(DictConfig({'checkpoint_path': vae_path, 'name': 'hunyuan'}))
+        vae = vae.eval()
+        return (vae,)
+
+
+class expand_prompt(ComfyNodeABC):
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("MODEL",),
+                "prompt": ("STRING", {"multiline": True}),
+            }
+        }
+
+    RETURN_TYPES = ("PROMPT", "STRING")
+    RETURN_NAMES = ("exp_prompt", "log")
+    OUTPUT_NODE = True
+    OUTPUT_TOOLTIPS = ("expanded prompt",)
+    FUNCTION = "expand_prompt"
+    CATEGORY = "conditioning"
+    DESCRIPTION = "Expands a short user prompt into a rich, detailed one with the Qwen2.5-VL model."
+
+    def expand_prompt(self, model, prompt, device='cuda:0'):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""You are a prompt beautifier that transforms short user video descriptions into rich, detailed English prompts specifically optimized for video generation models.
+Here are some example descriptions from the dataset that the model was trained on:
+1. "In a dimly lit room with a cluttered background, papers are pinned to the wall and various objects rest on a desk. Three men stand present: one wearing a red sweater, another in a black sweater, and the third in a gray shirt. The man in the gray shirt speaks and makes hand gestures, while the other two men look forward. The camera remains stationary, focusing on the three men throughout the sequence. A gritty and realistic visual style prevails, marked by a greenish tint that contributes to a moody atmosphere. Low lighting casts shadows, enhancing the tense mood of the scene."
+2. "In an office setting, a man sits at a desk wearing a gray sweater and seated in a black office chair. A wooden cabinet with framed pictures stands beside him, alongside a small plant and a lit desk lamp. Engaged in a conversation, he makes various hand gestures to emphasize his points. His hands move in different positions, indicating different ideas or points. The camera remains stationary, focusing on the man throughout. Warm lighting creates a cozy atmosphere. The man appears to be explaining something. The overall visual style is professional and polished, suitable for a business or educational context."
+3. "A person works on a wooden object resembling a sunburst pattern, holding it in their left hand while using their right hand to insert a thin wire into the gaps between the wooden pieces. The background features a natural outdoor setting with greenery and a tree trunk visible. The camera stays focused on the hands and the wooden object throughout, capturing the detailed process of assembling the wooden structure. The person carefully threads the wire through the gaps, ensuring the wooden pieces are securely fastened together. The scene unfolds with a naturalistic and instructional style, emphasizing the craftsmanship and the methodical steps taken to complete the task."
+Importantly! These are just examples from a large training dataset of 200 million videos.
+Rewrite Prompt: "{prompt}" to get high-quality video generation. Answer only with the expanded prompt.""",
+                    },
+                ],
+            }
+        ]
+        model = model.to(device)
+        text = model.embedder.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = model.embedder.processor(
+            text=[text],
+            images=None,
+            videos=None,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(model.embedder.model.device)
+        generated_ids = model.embedder.model.generate(**inputs, max_new_tokens=256)
+        # Keep only the newly generated tokens, stripping the prompt part.
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = model.embedder.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        print(output_text[0])
+        model = model.to('cpu')
+        return (output_text[0], str(output_text[0]))
+
+
+class Kandinsky5Generate(ComfyNodeABC):
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("MODEL", {"tooltip": "The model used for denoising the input latent."}),
+                "config": ("CONFIG", {"tooltip": "Config of the model and generation."}),
+                "steps": ("INT", {"default": 50, "min": 1, "max": 10000, "tooltip": "The number of steps used in the denoising process."}),
+                "width": ("INT", {"default": 768, "min": 512, "max": 768, "tooltip": "Width of the video."}),
+                "height": ("INT", {"default": 512, "min": 512, "max": 768, "tooltip": "Height of the video."}),
+                "length": ("INT", {"default": 121, "min": 5, "max": 241, "tooltip": "Length of the video in frames."}),
+                "cfg": ("FLOAT", {"default": 5.0, "min": 0.0, "max": 100.0, "step": 0.1, "round": 0.01, "tooltip": "The classifier-free guidance scale balances creativity and adherence to the prompt. Higher values match the prompt more closely, but values that are too high hurt quality."}),
+                "scheduler_scale": ("FLOAT", {"default": 10.0, "min": 1.0, "max": 25.0, "step": 0.1, "round": 0.01, "tooltip": "Scheduler scale."}),
+                "precision": (["float16", "bfloat16"], {"default": "bfloat16"}),
+                "positive_emb": ("CONDITION", {"tooltip": "Text conditioning describing what to include in the video."}),
+                "positive_clip": ("CONDITION", {"tooltip": "Pooled CLIP conditioning for the positive prompt."}),
+                "negative_emb": ("CONDITION", {"tooltip": "Text conditioning describing what to exclude from the video."}),
+                "negative_clip": ("CONDITION", {"tooltip": "Pooled CLIP conditioning for the negative prompt."}),
+            }
+        }
+
+    RETURN_TYPES = ("LATENT",)
+    RETURN_NAMES = ("latent",)
+    OUTPUT_TOOLTIPS = ("The denoised latent.",)
+    FUNCTION = "sample"
+    CATEGORY = "sampling"
+    DESCRIPTION = "Uses the provided model and the positive and negative conditionings to denoise the latent video."
+
+    def sample(self, model, config, steps, width, height, length, cfg, precision, positive_emb, positive_clip, negative_emb, negative_clip, scheduler_scale):
+        bs = 1
+        device = 'cuda:0'
+        model = model.to(device)
+        patch_size = (1, 2, 2)
+        autocast_type = torch.bfloat16 if precision == 'bfloat16' else torch.float16
+        dim = config.model.dit_params.in_visual_dim
+        # The VAE compresses time 4x and space 8x: convert pixel-space sizes to latent sizes.
+        length, height, width = 1 + (length - 1) // 4, height // 8, width // 8
+        bs_text_embed, text_cu_seqlens = positive_emb
+        bs_null_text_embed, null_text_cu_seqlens = negative_emb
+        text_embed = {"text_embeds": bs_text_embed, "pooled_embed": positive_clip}
+        null_embed = {"text_embeds": bs_null_text_embed, "pooled_embed": negative_clip}
+
+        visual_rope_pos = [
+            torch.arange(length // patch_size[0]),
+            torch.arange(height // patch_size[1]),
+            torch.arange(width // patch_size[2]),
+        ]
+        text_rope_pos = torch.cat([torch.arange(end) for end in torch.diff(text_cu_seqlens).cpu()])
+        null_text_rope_pos = torch.cat([torch.arange(end) for end in torch.diff(null_text_cu_seqlens).cpu()])
+        with torch.no_grad():
+            with torch.autocast(device_type='cuda', dtype=autocast_type):
+                latent_visual = generate(
+                    model, device, (bs * length, height, width, dim), steps,
+                    text_embed, null_embed,
+                    visual_rope_pos, text_rope_pos, null_text_rope_pos,
+                    cfg, scheduler_scale, config
+                )
+        model = model.to('cpu')
+        return (latent_visual,)
+
+
+class Kandinsky5VAEDecode(ComfyNodeABC):
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("MODEL", {"tooltip": "VAE."}),
+                "latent": ("LATENT", {"tooltip": "Latent."}),
+            }
+        }
+
+    RETURN_TYPES = ("IMAGE",)
+    OUTPUT_TOOLTIPS = ("The decoded image.",)
+    FUNCTION = "decode"
+    CATEGORY = "latent"
+    DESCRIPTION = "Decodes latent videos back into pixel-space frames."
+
+    def decode(self, model, latent):
+        device = 'cuda:0'
+        model = model.to(device)
+        with torch.no_grad():
+            with torch.autocast(device_type='cuda', dtype=torch.float16):
+                bs = 1
+                # (bs, t, h, w, c): undo the latent scaling, then go to (bs, c, t, h, w) for the VAE.
+                images = latent.reshape(bs, -1, latent.shape[-3], latent.shape[-2], latent.shape[-1])
+                images = (images / 0.476986).permute(0, 4, 1, 2, 3)
+                images = model.decode(images)
+                if not isinstance(images, torch.Tensor):
+                    images = images.sample
+                images = (images.clamp(-1., 1.) + 1.) * 0.5
+                images = images[0].float().permute(1, 2, 3, 0)
+        model = model.to('cpu')
+        return (images,)
+
+
+NODE_CLASS_MAPPINGS = {
+    "Kandinsky5LoadTextEmbedders": Kandinsky5LoadTextEmbedders,
+    "Kandinsky5TextEncode": Kandinsky5TextEncode,
+    "Kandinsky5Generate": Kandinsky5Generate,
+    "Kandinsky5LoadVAE": Kandinsky5LoadVAE,
+    "Kandinsky5VAEDecode": Kandinsky5VAEDecode,
+    "Kandinsky5LoadDiT": Kandinsky5LoadDiT,
+    "expand_prompt": expand_prompt,
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+    "Kandinsky5LoadTextEmbedders": "Kandinsky5LoadTextEmbedders",
+    "Kandinsky5TextEncode": "Kandinsky5TextEncode",
+    "Kandinsky5Generate": "Kandinsky5Generate",
+    "Kandinsky5LoadVAE": "Kandinsky5LoadVAE",
+    "Kandinsky5VAEDecode": "Kandinsky5VAEDecode",
+    "Kandinsky5LoadDiT": "Kandinsky5LoadDiT",
+    "expand_prompt": "expand_prompt",
+}
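The node classes above are plain Python, so the whole chain can be driven without the ComfyUI UI. A minimal sketch, assuming a ComfyUI install with the Kandinsky weights in place (the folder and checkpoint names below are illustrative):

te = Kandinsky5LoadTextEmbedders().load_te("qwen2_5_vl_7b_instruct", "clip_text")[0]
dit, conf = Kandinsky5LoadDiT().load_dit("kandinsky5lite_t2v_sft_5s.safetensors")
vae = Kandinsky5LoadVAE().load_vae("vae")[0]

positive = Kandinsky5TextEncode().encode(te, "a cat in a red hat")
negative = Kandinsky5TextEncode().encode(te, "")

latent = Kandinsky5Generate().sample(
    dit, conf, steps=50, width=768, height=512, length=121,
    cfg=5.0, precision="bfloat16",
    positive_emb=positive[0], positive_clip=positive[1],
    negative_emb=negative[0], negative_clip=negative[1],
    scheduler_scale=10.0,
)[0]
frames = Kandinsky5VAEDecode().decode(vae, latent)[0]  # (t, h, w, c) floats in [0, 1]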
configs/config_10s_distil.yaml ADDED
@@ -0,0 +1,54 @@
+metrics:
+  scheduler_scale: 5
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_distilled16steps_10s.safetensors"
+  num_steps: 16
+  guidance_weight: 1.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: nabla
+      causal: false
+      local: false
+      glob: false
+      window: 3
+      P: 0.9
+      wT: 11
+      wW: 3
+      wH: 3
+      add_sta: true
+      method: topcdf
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
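These YAML files are read with OmegaConf (as in Kandinsky5LoadDiT above), so fields are accessed with dotted paths. A minimal sketch, assuming the nesting shown above:

from omegaconf import OmegaConf

conf = OmegaConf.load("configs/config_10s_distil.yaml")
print(conf.model.num_steps)        # 16 sampling steps for the distilled checkpoint
print(conf.model.guidance_weight)  # 1.0: the distilled model runs without CFG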
configs/config_10s_nocfg.yaml ADDED
@@ -0,0 +1,56 @@
+metrics:
+  scheduler_scale: 5
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_sft_10s.safetensors"
+  num_steps: 50
+  guidance_weight: 1.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: nabla
+      causal: false
+      local: false
+      glob: false
+      window: 3
+      P: 0.9
+      wT: 11
+      wW: 3
+      wH: 3
+      add_sta: true
+      method: topcdf
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
+  magcache:
+    mag_ratios: [0.8736, 0, 1.12136, 0, 1.07896, 0, 1.06666, 0, 1.06235, 0, 1.03925, 0, 1.04018, 0, 1.0355, 0, 1.0327, 0, 1.02839, 0, 1.02768, 0, 1.02488, 0, 1.02143, 0, 1.02133, 0, 1.01715, 0, 1.01943, 0, 1.02177, 0, 1.01829, 0, 1.01747, 0, 1.01626, 0, 1.01559, 0, 1.01435, 0, 1.01435, 0, 1.01571, 0, 1.01312, 0, 1.01338, 0, 1.01437, 0, 1.01211, 0, 1.01237, 0, 1.01356, 0, 1.0101, 0, 1.01194, 0, 1.00898, 0, 1.0091, 0, 1.0108, 0, 1.00705, 0, 1.0018, 0, 1.01209, 0, 1.00525, 0, 1.00098, 0, 0.99914, 0, 0.99592, 0, 0.99089, 0, 0.98506, 0, 0.97495, 0, 0.9604, 0, 0.93492, 0, 0.89367, 0, 0.79353, 0]
configs/config_10s_pretrain.yaml ADDED
@@ -0,0 +1,54 @@
+metrics:
+  scheduler_scale: 5
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_pretrain_10s.safetensors"
+  num_steps: 50
+  guidance_weight: 5.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: nabla
+      causal: false
+      local: false
+      glob: false
+      window: 3
+      P: 0.9
+      wT: 11
+      wW: 3
+      wH: 3
+      add_sta: true
+      method: topcdf
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
configs/config_10s_sft.yaml ADDED
@@ -0,0 +1,56 @@
+metrics:
+  scheduler_scale: 5
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_sft_10s.safetensors"
+  num_steps: 50
+  guidance_weight: 5.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: nabla
+      causal: false
+      local: false
+      glob: false
+      window: 3
+      P: 0.9
+      wT: 11
+      wW: 3
+      wH: 3
+      add_sta: true
+      method: topcdf
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
+  magcache:
+    mag_ratios: [0.92261, 0.92261, 0.95898, 0.95962, 1.04862, 1.04855, 1.0806, 1.08045, 1.04405, 1.0445, 1.03587, 1.03619, 1.03789, 1.03785, 1.03485, 1.03514, 1.03724, 1.03814, 1.02484, 1.02502, 1.02525, 1.02508, 1.02473, 1.02532, 1.02625, 1.02706, 1.0197, 1.02011, 1.02326, 1.02324, 1.02081, 1.02116, 1.01993, 1.02047, 1.01979, 1.0205, 1.01823, 1.01852, 1.01785, 1.01813, 1.01563, 1.01606, 1.02057, 1.02083, 1.01132, 1.01207, 1.02053, 1.01959, 1.01718, 1.01749, 1.01546, 1.01589, 1.01516, 1.01525, 1.01578, 1.01608, 1.01616, 1.01618, 1.01443, 1.01466, 1.01554, 1.01568, 1.01494, 1.01515, 1.01567, 1.01572, 1.01418, 1.01458, 1.01601, 1.01618, 1.01491, 1.01508, 1.0162, 1.01625, 1.01412, 1.01419, 1.01431, 1.01437, 1.0106, 1.0108, 1.01428, 1.01427, 1.01222, 1.01236, 1.00812, 1.00818, 1.00759, 1.00764, 1.001, 1.00119, 0.98798, 0.98819, 0.9727, 0.97279, 0.93234, 0.93213, 0.83781, 0.83746]
configs/config_5s_distil.yaml ADDED
@@ -0,0 +1,47 @@
+metrics:
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_distilled16steps_5s.safetensors"
+  num_steps: 16
+  guidance_weight: 1.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: flash
+      causal: false
+      local: false
+      glob: false
+      window: 3
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
configs/config_5s_nocfg.yaml ADDED
@@ -0,0 +1,49 @@
+metrics:
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_nocfg_5s.safetensors"
+  num_steps: 50
+  guidance_weight: 1.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: flash
+      causal: false
+      local: false
+      glob: false
+      window: 3
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
+  magcache:
+    mag_ratios: [0.8827, 0, 1.14399, 0, 1.08362, 0, 1.06681, 0, 1.05906, 0, 1.03969, 0, 1.03835, 0, 1.03338, 0, 1.031, 0, 1.02616, 0, 1.02654, 0, 1.02322, 0, 1.02078, 0, 1.02, 0, 1.01673, 0, 1.01353, 0, 1.02175, 0, 1.0156, 0, 1.01616, 0, 1.01557, 0, 1.0131, 0, 1.01264, 0, 1.01378, 0, 1.0147, 0, 1.0109, 0, 1.01178, 0, 1.01248, 0, 1.0111, 0, 1.0099, 0, 1.01248, 0, 1.00721, 0, 1.01134, 0, 1.00752, 0, 1.00837, 0, 1.00817, 0, 1.00475, 0, 0.99937, 0, 1.01171, 0, 1.00434, 0, 0.99868, 0, 0.9969, 0, 0.995, 0, 0.98869, 0, 0.98454, 0, 0.97462, 0, 0.95885, 0, 0.93354, 0, 0.88895, 0, 0.78835, 0]
configs/config_5s_pretrain.yaml ADDED
@@ -0,0 +1,47 @@
+metrics:
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_pretrain_5s.safetensors"
+  num_steps: 50
+  guidance_weight: 5.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: flash
+      causal: false
+      local: false
+      glob: false
+      window: 3
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
configs/config_5s_sft.yaml ADDED
@@ -0,0 +1,49 @@
+metrics:
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_sft_5s.safetensors"
+  num_steps: 50
+  guidance_weight: 5.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: flash
+      causal: false
+      local: false
+      glob: false
+      window: 3
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
+  magcache:
+    mag_ratios: [0.91607, 0.91507, 0.95254, 0.95349, 1.04876, 1.04937, 1.0842, 1.084, 1.04372, 1.04445, 1.03521, 1.03559, 1.03906, 1.03904, 1.03104, 1.03132, 1.03618, 1.03671, 1.02519, 1.02512, 1.02595, 1.02603, 1.02378, 1.02447, 1.02427, 1.02514, 1.01967, 1.01996, 1.02266, 1.02269, 1.01885, 1.01951, 1.01913, 1.01977, 1.01944, 1.02018, 1.01664, 1.01707, 1.01682, 1.01723, 1.0155, 1.01611, 1.01998, 1.02022, 1.01194, 1.01244, 1.01626, 1.01555, 1.01611, 1.01654, 1.01545, 1.01579, 1.01362, 1.01376, 1.01589, 1.01627, 1.01527, 1.01521, 1.01301, 1.01334, 1.01415, 1.01444, 1.0144, 1.01464, 1.01444, 1.01442, 1.01361, 1.01399, 1.01397, 1.01408, 1.01412, 1.01432, 1.01453, 1.01454, 1.01341, 1.01342, 1.01317, 1.01342, 1.01018, 1.01051, 1.01278, 1.0128, 1.01021, 1.01037, 1.00809, 1.00794, 1.00679, 1.00711, 0.99882, 0.99948, 0.98905, 0.98905, 0.9755, 0.97545, 0.93786, 0.93738, 0.84336, 0.84193]
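Kandinsky5LoadDiT above derives the config name from the checkpoint's duration suffix, which is why these sft configs exist in 5s and 10s variants. A minimal sketch of that mapping:

# "kandinsky5lite_t2v_sft_5s.safetensors" -> "5s" -> configs/config_5s_sft.yaml
checkpoint = "kandinsky5lite_t2v_sft_5s.safetensors"
sec = checkpoint.split("_")[-1].split(".")[0]
print(f"configs/config_{sec}_sft.yaml")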
download_models.py ADDED
@@ -0,0 +1,74 @@
+import os
+
+from huggingface_hub import snapshot_download
+
+
+if __name__ == "__main__":
+    cache_dir = "./weights"
+
+    # DiT checkpoints for every released Lite variant (5s and 10s durations).
+    dit_repos = [
+        "ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-5s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-10s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-sft-10s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-10s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-10s",
+    ]
+    for repo_id in dit_repos:
+        snapshot_download(
+            repo_id=repo_id,
+            allow_patterns="model/*",
+            local_dir=cache_dir,
+        )
+
+    # HunyuanVideo VAE.
+    snapshot_download(
+        repo_id="hunyuanvideo-community/HunyuanVideo",
+        allow_patterns="vae/*",
+        local_dir=cache_dir,
+    )
+
+    # Text encoders: Qwen2.5-VL and CLIP.
+    snapshot_download(
+        repo_id="Qwen/Qwen2.5-VL-7B-Instruct",
+        local_dir=os.path.join(cache_dir, "text_encoder/"),
+    )
+    snapshot_download(
+        repo_id="openai/clip-vit-large-patch14",
+        local_dir=os.path.join(cache_dir, "text_encoder2/"),
+    )
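Running the script as-is pulls all eight DiT variants plus the encoders, which is a sizable download. A minimal sketch for fetching a single variant together with the shared VAE and text encoders, using the same huggingface_hub call:

from huggingface_hub import snapshot_download

snapshot_download(repo_id="ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s",
                  allow_patterns="model/*", local_dir="./weights")
snapshot_download(repo_id="hunyuanvideo-community/HunyuanVideo",
                  allow_patterns="vae/*", local_dir="./weights")
snapshot_download(repo_id="Qwen/Qwen2.5-VL-7B-Instruct", local_dir="./weights/text_encoder/")
snapshot_download(repo_id="openai/clip-vit-large-patch14", local_dir="./weights/text_encoder2/")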
inference_example.ipynb ADDED
@@ -0,0 +1,192 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49a3d5bb-3d04-4d11-aba6-043fc5667abd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kandinsky import get_T2V_pipeline\n",
+    "from IPython.display import Video\n",
+    "from PIL import Image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9bcf5ed-f813-47e1-ade6-3f655d35d0e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = get_T2V_pipeline(\n",
+    "    device_map={\"dit\": \"cuda:0\", \"vae\": \"cuda:0\", \"text_embedder\": \"cuda:0\"},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1f1563bb-0ebc-4f7c-97f4-6918a55d774f",
+   "metadata": {},
+   "source": [
+    "# Video"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48bc4ef9-0492-41a1-8133-c7e10a894d88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = pipe(\"a cat in a red hat\", time_length=2, width=768, height=512, save_path='./test.mp4')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3379b1a9-b987-443e-aad0-a7f8ac62b69d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Video('./test.mp4')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c4de2c5e-2784-4827-92b8-6997d3012e79",
+   "metadata": {},
+   "source": [
+    "# Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9cbc0fc-173f-463e-896b-b29b727b0e65",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = pipe(\n",
+    "    \"a dog in red boots\",\n",
+    "    time_length=0, width=768, height=512,\n",
+    "    save_path='./image.png'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a2c7741-9d7e-4396-a434-d65ff9e224e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"./image.png\")\n",
+    "image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea82ab34-ea66-4559-ac86-ec63af0e0f7a",
+   "metadata": {},
+   "source": [
+    "# Distilled model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d42c0c0-92a9-4f91-a263-b3b9e58dbf99",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = get_T2V_pipeline(\n",
+    "    device_map={\"dit\": \"cuda:0\", \"vae\": \"cuda:0\", \"text_embedder\": \"cuda:0\"},\n",
+    "    conf_path=\"./configs/config_5s_distil.yaml\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14e5d407-135b-451c-a092-840bae230524",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = pipe(\"cheburashka in a blue hat\", time_length=5, width=768, height=512, guidance_weight=1.0, num_steps=16, scheduler_scale=5, save_path='./test2.mp4')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53cda76a-a60e-4e62-9cb4-3b4fcf2f0b4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Video('./test2.mp4')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "004cc1da-ceb8-4e11-9774-8aa231b3fece",
+   "metadata": {},
+   "source": [
+    "# 10s video NABLA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea8fb5f8-ad6d-4e04-a4e3-0b381e3259db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = get_T2V_pipeline(\n",
+    "    device_map={\"dit\": \"cuda:0\", \"vae\": \"cuda:0\", \"text_embedder\": \"cuda:0\"},\n",
+    "    conf_path=\"./configs/config_10s_sft.yaml\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ffa470b-5c29-4e05-88af-fe26c4d0ada1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = pipe(\"Shiba Inu is driving a car\", time_length=10, width=768, height=512, save_path='./test3.mp4')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4bbd345-9ae0-48a6-bdb9-24f4c63d48d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Video('./test3.mp4')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "test",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}