rahul7star committed on
Commit 0084610 · verified · 1 Parent(s): 7d46bab

Migrated from GitHub

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +30 -0
  2. LICENSE +201 -0
  3. ORIGINAL_README.md +381 -0
  4. __init__.py +1 -0
  5. assets/KANDINSKY_LOGO_1_BLACK.png +0 -0
  6. assets/KANDINSKY_LOGO_1_WHITE.png +0 -0
  7. assets/comfyui_kandinsky5.png +3 -0
  8. assets/generation_examples/1036335634.mp4 +3 -0
  9. assets/generation_examples/1512407739 (1).mp4 +3 -0
  10. assets/generation_examples/1512407739.mp4 +3 -0
  11. assets/generation_examples/642423904 (1).mp4 +3 -0
  12. assets/generation_examples/642423904 (2).mp4 +3 -0
  13. assets/generation_examples/642423904.mp4 +3 -0
  14. assets/generation_examples/68941856 (1).mp4 +3 -0
  15. assets/generation_examples/68941856.mp4 +3 -0
  16. assets/generation_examples/distill/1.mp4 +3 -0
  17. assets/generation_examples/distill/2.mp4 +3 -0
  18. assets/generation_examples/distill/3.mp4 +3 -0
  19. assets/generation_examples/distill/4.mp4 +3 -0
  20. assets/generation_examples/sft/1.mp4 +3 -0
  21. assets/generation_examples/sft/2.mp4 +3 -0
  22. assets/generation_examples/sft/3.mp4 +3 -0
  23. assets/generation_examples/sft/4.mp4 +3 -0
  24. assets/generation_examples/sft/5.mp4 +3 -0
  25. assets/generation_examples/sft/6.mp4 +3 -0
  26. assets/generation_examples/test (1) (1).mp4 +3 -0
  27. assets/generation_examples/test2 (1).mp4 +3 -0
  28. assets/generation_examples/video5237959401997893857.mp4 +3 -0
  29. assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg +3 -0
  30. assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg +3 -0
  31. assets/sbs/kandinsky_5_video_lite_vs_sora.jpg +3 -0
  32. assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg +3 -0
  33. assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg +3 -0
  34. assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg +3 -0
  35. assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg +3 -0
  36. assets/vbench.png +3 -0
  37. benchmark/moviegen_bench.csv +0 -0
  38. comfyui/README.md +76 -0
  39. comfyui/kandisnky5_lite_T2V.json +541 -0
  40. comfyui/nodes_kandinsky.py +286 -0
  41. configs/config_10s_distil.yaml +54 -0
  42. configs/config_10s_nocfg.yaml +56 -0
  43. configs/config_10s_pretrain.yaml +54 -0
  44. configs/config_10s_sft.yaml +56 -0
  45. configs/config_5s_distil.yaml +47 -0
  46. configs/config_5s_nocfg.yaml +49 -0
  47. configs/config_5s_pretrain.yaml +47 -0
  48. configs/config_5s_sft.yaml +49 -0
  49. download_models.py +74 -0
  50. inference_example.ipynb +192 -0
.gitattributes CHANGED
@@ -33,3 +33,33 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/comfyui_kandinsky5.png filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/1036335634.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/1512407739[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/1512407739.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/642423904[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/642423904[[:space:]](2).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/642423904.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/68941856[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/68941856.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/distill/1.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/distill/2.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/distill/3.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/distill/4.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/1.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/2.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/3.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/4.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/5.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/sft/6.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/test[[:space:]](1)[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/test2[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
+assets/generation_examples/video5237959401997893857.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_sora.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg filter=lfs diff=lfs merge=lfs -text
+assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg filter=lfs diff=lfs merge=lfs -text
+assets/vbench.png filter=lfs diff=lfs merge=lfs -text
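
Note: the `[[:space:]]` sequences above are not literal filename characters; `git lfs track` escapes spaces in tracked paths with this POSIX character class when it writes `.gitattributes`. An illustrative example, assuming Git LFS is installed:

```sh
git lfs track "assets/generation_examples/1512407739 (1).mp4"
# writes: assets/generation_examples/1512407739[[:space:]](1).mp4 filter=lfs diff=lfs merge=lfs -text
git add .gitattributes
```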
LICENSE ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,381 @@
<div align="center">
  <picture>
    <source media="(prefers-color-scheme: dark)" srcset="assets/KANDINSKY_LOGO_1_WHITE.png">
    <source media="(prefers-color-scheme: light)" srcset="assets/KANDINSKY_LOGO_1_BLACK.png">
    <img alt="Kandinsky 5.0 logo" src="assets/KANDINSKY_LOGO_1_BLACK.png">
  </picture>
</div>

<div align="center">
  <a href="https://habr.com/ru/companies/sberbank/articles/951800/">Habr</a> | <a href="https://ai-forever.github.io/Kandinsky-5/">Project Page</a> | Technical Report (soon) | <a href="https://huggingface.co/collections/ai-forever/kandisnky-50-t2v-lite-68d71892d2cc9b02177e5ae5">Models🤗</a> | <a href="https://github.com/ai-forever/Kandinsky-5/blob/main/comfyui/README.md">ComfyUI README</a>
</div>

<h1>Kandinsky 5.0: A family of diffusion models for Video & Image generation</h1>

In this repository, we provide a family of diffusion models that generate a video or an image (<em>Coming Soon</em>) from a textual prompt, along with distilled models for faster generation.

https://github.com/user-attachments/assets/b9ff0417-02a4-4f6b-aacc-60c44e7fe6f1

## Project Updates

- 🔥 **Source**: ```2025/09/29```: We have open-sourced `Kandinsky 5.0 T2V Lite`, a lite (2B-parameter) version of the `Kandinsky 5.0 Video` text-to-video generation model. The released checkpoints `kandinsky5lite_t2v_pretrain_5s`, `kandinsky5lite_t2v_pretrain_10s`, `kandinsky5lite_t2v_sft_5s`, `kandinsky5lite_t2v_sft_10s`, `kandinsky5lite_t2v_nocfg_5s`, `kandinsky5lite_t2v_nocfg_10s`, `kandinsky5lite_t2v_distilled16steps_5s`, and `kandinsky5lite_t2v_distilled16steps_10s` contain weights from pretraining, supervised fine-tuning, CFG distillation, and diffusion distillation into 16 steps. The 5s checkpoints can generate videos up to 5 seconds long; the 10s checkpoints are faster models trained with the [NABLA](https://huggingface.co/ai-forever/Wan2.1-T2V-14B-NABLA-0.7) algorithm and can generate videos up to 10 seconds long.
- 🔥 **Source**: ```2025/10/7```: The ComfyUI README has been updated. SDPA support has been added, so our code now runs without Flash Attention. MagCache support has been added for the sft and nocfg checkpoints. Memory consumption in the VAE has been reduced, and the entire pipeline now runs in 24 GB with offloading.

## Kandinsky 5.0 T2V Lite

Kandinsky 5.0 T2V Lite is a lightweight video generation model (2B parameters) that ranks #1 among open-source models in its class. It outperforms larger Wan models (5B and 14B) and offers the best understanding of Russian concepts in the open-source ecosystem.

We provide 8 model variants, each optimized for a different use case:

* SFT model — delivers the highest generation quality;

* CFG-distilled — runs 2× faster;

* Diffusion-distilled — enables low-latency generation with minimal quality loss (6× faster);

* Pretrain model — designed for fine-tuning by researchers and enthusiasts.

All models are available in two versions, for generating 5-second and 10-second videos.

## Pipeline

**Latent diffusion pipeline** with **Flow Matching**.

**Diffusion Transformer (DiT)** as the main generative backbone with **cross-attention to text embeddings**.

- **Qwen2.5-VL** and **CLIP** provide text embeddings.

- **HunyuanVideo 3D VAE** encodes/decodes video into a latent space.

- **DiT** is the main generative module, using cross-attention to condition on text.
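Under flow matching, sampling reduces to integrating an ODE from noise toward the video latent. The sketch below illustrates the general idea with a plain Euler integrator; it is a minimal illustration, not the repository's actual sampler, and `dit` / `text_emb` stand in for the real DiT and text-embedding objects:

```python
import torch

@torch.no_grad()
def flow_matching_sample(dit, text_emb, latent_shape, steps=50, device="cuda"):
    # Start from pure Gaussian noise in the VAE latent space.
    x = torch.randn(latent_shape, device=device)
    # Uniform time grid from t=0 (noise) to t=1 (data); real schedulers
    # typically warp this grid (cf. the "scheduler scale" parameter).
    ts = torch.linspace(0.0, 1.0, steps + 1, device=device)
    for i in range(steps):
        t, t_next = ts[i], ts[i + 1]
        # The DiT predicts the velocity field v(x, t | text) via
        # cross-attention to the text embeddings.
        v = dit(x, t.expand(x.shape[0]), text_emb)
        # One explicit Euler step along the probability-flow ODE.
        x = x + (t_next - t) * v
    return x  # decode with the HunyuanVideo 3D VAE afterwards
```

Each NFE corresponds to one DiT evaluation, and with classifier-free guidance every step costs two evaluations (conditional and unconditional), which is consistent with the NFE column in the Model Zoo table below.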
<img width="1600" height="477" alt="Kandinsky 5.0 pipeline overview" src="https://github.com/user-attachments/assets/17fc2eb5-05e3-4591-9ec6-0f6e1ca397b3" />

<img width="800" height="406" alt="Kandinsky 5.0 DiT architecture" src="https://github.com/user-attachments/assets/f3006742-e261-4c39-b7dc-e39330be9a09" />


## Model Zoo

| Model | Config | Video duration | NFE | Checkpoint | Latency* |
|-------|--------|----------------|-----|------------|----------|
| Kandinsky 5.0 T2V Lite SFT 5s | configs/config_5s_sft.yaml | 5s | 100 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s) | 139 s |
| Kandinsky 5.0 T2V Lite SFT 10s | configs/config_10s_sft.yaml | 10s | 100 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-sft-10s) | 224 s |
| Kandinsky 5.0 T2V Lite pretrain 5s | configs/config_5s_pretrain.yaml | 5s | 100 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-5s) | 139 s |
| Kandinsky 5.0 T2V Lite pretrain 10s | configs/config_10s_pretrain.yaml | 10s | 100 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-10s) | 224 s |
| Kandinsky 5.0 T2V Lite no-CFG 5s | configs/config_5s_nocfg.yaml | 5s | 50 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s) | 77 s |
| Kandinsky 5.0 T2V Lite no-CFG 10s | configs/config_10s_nocfg.yaml | 10s | 50 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-10s) | 124 s |
| Kandinsky 5.0 T2V Lite distill 5s | configs/config_5s_distil.yaml | 5s | 16 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s) | 35 s |
| Kandinsky 5.0 T2V Lite distill 10s | configs/config_10s_distil.yaml | 10s | 16 | 🤗 [HF](https://huggingface.co/ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-10s) | 61 s |

*Latency was measured after the second inference run; the first run can be slower due to compilation. Inference was measured on an NVIDIA H100 GPU with 80 GB of memory, using CUDA 12.8.1 and PyTorch 2.8. For the 5-second models, Flash Attention 3 was used.

### Examples

#### Kandinsky 5.0 T2V Lite SFT

<table border="0" style="width: 200; text-align: left; margin-top: 20px;">
  <tr>
    <td><video src="https://github.com/user-attachments/assets/bc38821b-f9f1-46db-885f-1f70464669eb" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/9f64c940-4df8-4c51-bd81-a05de8e70fc3" width=200 controls autoplay loop></video></td>
  </tr>
  <tr>
    <td><video src="https://github.com/user-attachments/assets/77dd417f-e0bf-42bd-8d80-daffcd054add" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/385a0076-f01c-4663-aa46-6ce50352b9ed" width=200 controls autoplay loop></video></td>
  </tr>
  <tr>
    <td><video src="https://github.com/user-attachments/assets/7c1bcb31-cc7d-4385-9a33-2b0cc28393dd" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/990a8a0b-2df1-4bbc-b2e3-2859b6f1eea6" width=200 controls autoplay loop></video></td>
  </tr>
</table>


#### Kandinsky 5.0 T2V Lite Distill

<table border="0" style="width: 200; text-align: left; margin-top: 20px;">
  <tr>
    <td><video src="https://github.com/user-attachments/assets/861342f9-f576-4083-8a3b-94570a970d58" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/302e4e7d-781d-4a58-9b10-8c473d469c4b" width=200 controls autoplay loop></video></td>
  </tr>
  <tr>
    <td><video src="https://github.com/user-attachments/assets/3e70175c-40e5-4aec-b506-38006fe91a76" width=200 controls autoplay loop></video></td>
    <td><video src="https://github.com/user-attachments/assets/b7da85f7-8b62-4d46-9460-7f0e505de810" width=200 controls autoplay loop></video></td>
  </tr>
</table>

### Results

#### Side-by-side evaluation

The evaluation is based on the expanded prompts from the [Movie Gen benchmark](https://github.com/facebookresearch/MovieGenBench), which are available in the expanded_prompt column of the benchmark/moviegen_bench.csv file.

<table border="0" style="width: 400; text-align: left; margin-top: 20px;">
  <tr>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_sora.jpg" width=400></td>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg" width=400></td>
  </tr>
  <tr>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg" width=400></td>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg" width=400></td>
  </tr>
  <tr>
    <td><img src="assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg" width=400></td>
  </tr>
</table>

#### Distill side-by-side evaluation

<table border="0" style="width: 400; text-align: left; margin-top: 20px;">
  <tr>
    <td><img src="assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg" width=400></td>
    <td><img src="assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg" width=400></td>
  </tr>
</table>

#### VBench results

<div align="center">
  <img src="assets/vbench.png">
</div>

## Quickstart

#### Installation
Clone the repo:
```sh
git clone https://github.com/ai-forever/Kandinsky-5.git
cd Kandinsky-5
```

Install dependencies:
```sh
pip install -r requirements.txt
```

To improve inference performance on NVIDIA Hopper GPUs, we recommend installing [Flash Attention 3](https://github.com/Dao-AILab/flash-attention/?tab=readme-ov-file#flashattention-3-beta-release).

#### Model Download
```sh
python download_models.py
```

#### Run Kandinsky 5.0 T2V Lite SFT 5s

```sh
python test.py --prompt "A dog in red hat"
```

#### Run Kandinsky 5.0 T2V Lite SFT 10s

```sh
python test.py --config ./configs/config_10s_sft.yaml --prompt "A dog in red hat" --video_duration 10
```

#### Run Kandinsky 5.0 T2V Lite pretrain 5s

```sh
python test.py --config ./configs/config_5s_pretrain.yaml --prompt "A dog in red hat"
```

#### Run Kandinsky 5.0 T2V Lite pretrain 10s

```sh
python test.py --config ./configs/config_10s_pretrain.yaml --prompt "A dog in red hat" --video_duration 10
```

#### Run Kandinsky 5.0 T2V Lite no-CFG 5s

```sh
python test.py --config ./configs/config_5s_nocfg.yaml --prompt "A dog in red hat"
```

#### Run Kandinsky 5.0 T2V Lite no-CFG 10s

```sh
python test.py --config ./configs/config_10s_nocfg.yaml --prompt "A dog in red hat" --video_duration 10
```

#### Run Kandinsky 5.0 T2V Lite distill 5s

```sh
python test.py --config ./configs/config_5s_distil.yaml --prompt "A dog in red hat"
```

#### Run Kandinsky 5.0 T2V Lite distill 10s

```sh
python test.py --config ./configs/config_10s_distil.yaml --prompt "A dog in red hat" --video_duration 10
```

### Inference

```python
import torch
from IPython.display import Video
from kandinsky import get_T2V_pipeline

# Place all three pipeline components on the same GPU.
device_map = {
    "dit": torch.device("cuda:0"),
    "vae": torch.device("cuda:0"),
    "text_embedder": torch.device("cuda:0"),
}

pipe = get_T2V_pipeline(device_map, conf_path="configs/config_5s_sft.yaml")

images = pipe(
    seed=42,
    time_length=5,      # video duration in seconds
    width=768,
    height=512,
    save_path="./test.mp4",
    text="A cat in a red hat",
)

Video("./test.mp4")
```
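
The `device_map` argument lets you place the three components on different devices. As an untested sketch, assuming two GPUs are available:

```python
device_map = {
    "dit": torch.device("cuda:0"),           # generative backbone
    "vae": torch.device("cuda:1"),           # video encode/decode
    "text_embedder": torch.device("cuda:1"), # Qwen2.5-VL + CLIP
}
pipe = get_T2V_pipeline(device_map, conf_path="configs/config_5s_sft.yaml")
```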

Please refer to the [inference_example.ipynb](inference_example.ipynb) notebook for more usage details.

### Distributed Inference

For faster inference, we also provide the capability to run inference in a distributed way:
```sh
NUMBER_OF_NODES=1
NUMBER_OF_DEVICES_PER_NODE=2   # 1, 2, or 4
python -m torch.distributed.launch --nnodes $NUMBER_OF_NODES --nproc-per-node $NUMBER_OF_DEVICES_PER_NODE test.py
```
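
On recent PyTorch versions, `torch.distributed.launch` is deprecated in favour of `torchrun`, which accepts the same flags; the equivalent invocation would be:

```sh
torchrun --nnodes $NUMBER_OF_NODES --nproc-per-node $NUMBER_OF_DEVICES_PER_NODE test.py
```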

### Optimized Inference

#### Offloading
To reduce memory consumption, you can **offload** the models:
```sh
python test.py --prompt "A dog in red hat" --offload
```
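
Offloading keeps only the component that is currently needed on the GPU. A minimal sketch of the idea follows (not the repository's actual implementation; `text_embedder`, `dit`, and `vae` stand in for the loaded modules):

```python
import torch

def run_stage(module, fn, device="cuda"):
    """Move a module to the GPU, run one pipeline stage, then evict it."""
    module.to(device)
    out = fn()
    module.to("cpu")
    torch.cuda.empty_cache()  # release the evicted weights' VRAM
    return out

# Text encoding, denoising, and decoding each hold only one model on GPU:
# emb    = run_stage(text_embedder, lambda: text_embedder(prompt))
# latent = run_stage(dit, lambda: sample(dit, emb))
# video  = run_stage(vae, lambda: vae.decode(latent))
```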

#### MagCache
We also provide [MagCache](https://github.com/Zehong-Ma/MagCache) inference for faster generation (currently available for the sft 5s and sft 10s checkpoints):

```sh
python test.py --prompt "A dog in red hat" --magcache
```
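
MagCache exploits the observation that DiT outputs change slowly in magnitude across neighbouring denoising steps, so some evaluations can be replaced by cached results. Very roughly, and only as an illustration of the caching idea (`dit` and `ratio_estimate` are placeholders; see the MagCache repository for the real criterion):

```python
cached_out, skipped = None, 0

def dit_step(x, t, emb, ratio_estimate, max_skips=2, threshold=0.05):
    """Reuse the previous DiT output when the predicted relative change
    in output magnitude is small; otherwise run the full forward pass."""
    global cached_out, skipped
    if cached_out is not None and abs(1 - ratio_estimate) < threshold and skipped < max_skips:
        skipped += 1
        return cached_out  # skip the expensive DiT evaluation
    cached_out, skipped = dit(x, t, emb), 0
    return cached_out
```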

### ComfyUI

See the instructions [here](comfyui).

### Beta testing
You can apply to participate in the beta testing of Kandinsky Video Lite via the [Telegram bot](https://t.me/kandinsky_access_bot).

## 📑 Todo List
- Kandinsky 5.0 Lite Text-to-Video
    - [x] Multi-GPU inference code for the 2B models
    - [ ] Checkpoints of the 2B models
        - [x] pretrain
        - [x] sft
        - [ ] rl
        - [x] cfg distil
        - [x] distil 16 steps
        - [ ] autoregressive generation
    - [x] ComfyUI integration
    - [ ] Diffusers integration
    - [ ] Caching acceleration support
- Kandinsky 5.0 Lite Image-to-Video
    - [ ] Multi-GPU inference code for the 2B model
    - [ ] Checkpoints of the 2B model
    - [ ] ComfyUI integration
    - [ ] Diffusers integration
- Kandinsky 5.0 Pro Text-to-Video
    - [ ] Multi-GPU inference code for the models
    - [ ] Checkpoints of the model
    - [ ] ComfyUI integration
    - [ ] Diffusers integration
- Kandinsky 5.0 Pro Image-to-Video
    - [ ] Multi-GPU inference code for the model
    - [ ] Checkpoints of the model
    - [ ] ComfyUI integration
    - [ ] Diffusers integration
- [ ] Technical report

# Authors
<b>Project Leader:</b> Denis Dimitrov<br/>

<b>Team Leads:</b> Vladimir Arkhipkin, Vladimir Korviakov, Nikolai Gerasimenko, Denis Parkhomenko<br/>

<b>Core Contributors:</b> Alexey Letunovskiy, Maria Kovaleva, Ivan Kirillov, Lev Novitskiy, Denis Koposov, Dmitrii Mikhailov, Anna Averchenkova, Andrey Shutkin, Julia Agafonova, Olga Kim, Anastasiia Kargapoltseva, Nikita Kiselev<br/>

<b>Contributors:</b> Anna Dmitrienko, Anastasia Maltseva, Kirill Chernyshev, Ilia Vasiliev, Viacheslav Vasilev, Vladimir Polovnikov, Yury Kolabushin, Alexander Belykh, Mikhail Mamaev, Anastasia Aliaskina, Tatiana Nikulina, Polina Gavrilova<br/>

# Citation

```
@misc{kandinsky2025,
    author = {Alexey Letunovskiy and Maria Kovaleva and Ivan Kirillov and Lev Novitskiy and Denis Koposov and
              Dmitrii Mikhailov and Anna Averchenkova and Andrey Shutkin and Julia Agafonova and Olga Kim and
              Anastasiia Kargapoltseva and Nikita Kiselev and Vladimir Arkhipkin and Vladimir Korviakov and
              Nikolai Gerasimenko and Denis Parkhomenko and Anna Dmitrienko and Anastasia Maltseva and
              Kirill Chernyshev and Ilia Vasiliev and Viacheslav Vasilev and Vladimir Polovnikov and
              Yury Kolabushin and Alexander Belykh and Mikhail Mamaev and Anastasia Aliaskina and
              Tatiana Nikulina and Polina Gavrilova and Denis Dimitrov},
    title = {Kandinsky 5.0: A family of diffusion models for Video & Image generation},
    howpublished = {\url{https://github.com/ai-forever/Kandinsky-5}},
    year = 2025
}

@misc{mikhailov2025nablanablaneighborhoodadaptiveblocklevel,
    title={$\nabla$NABLA: Neighborhood Adaptive Block-Level Attention},
    author={Dmitrii Mikhailov and Aleksey Letunovskiy and Maria Kovaleva and Vladimir Arkhipkin
            and Vladimir Korviakov and Vladimir Polovnikov and Viacheslav Vasilev
            and Evelina Sidorova and Denis Dimitrov},
    year={2025},
    eprint={2507.13546},
    archivePrefix={arXiv},
    primaryClass={cs.CV},
    url={https://arxiv.org/abs/2507.13546},
}
```

# Acknowledgements

We gratefully acknowledge the open-source projects and research that made Kandinsky 5.0 possible:

- [PyTorch](https://pytorch.org/) — for model training and inference.
- [FlashAttention 3](https://github.com/Dao-AILab/flash-attention) — for efficient attention and faster inference.
- [Qwen2.5-VL](https://github.com/QwenLM/Qwen3-VL) — for providing high-quality text embeddings.
- [CLIP](https://github.com/openai/CLIP) — for robust text–image alignment.
- [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo) — for video latent encoding and decoding.
- [MagCache](https://github.com/Zehong-Ma/MagCache) — for accelerated inference.
- [ComfyUI](https://github.com/comfyanonymous/ComfyUI) — for integration into node-based workflows.

We deeply appreciate the contributions of these communities and researchers to the open-source ecosystem.
__init__.py ADDED
@@ -0,0 +1 @@
from .comfyui.nodes_kandinsky import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
assets/KANDINSKY_LOGO_1_BLACK.png ADDED
assets/KANDINSKY_LOGO_1_WHITE.png ADDED
assets/comfyui_kandinsky5.png ADDED

Git LFS Details

  • SHA256: 4c91961abe51a1fcbd3a35d438ea3b4f652f61a4d9f035c9f10e91dc5c9b79cd
  • Pointer size: 131 Bytes
  • Size of remote file: 474 kB
assets/generation_examples/1036335634.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0d3657c36760e1694a4d3533b04e0d28ddd16d8d8e6373953e8f754742e2a54b
size 4199589
assets/generation_examples/1512407739 (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:54b887c2c5cca6a4b5d8cc7f433a01a1c72c2592f07187b3c530126ac77aa601
size 7227407
assets/generation_examples/1512407739.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:00f48b1d0ddd97c7d802136d1a9090eb397ce34d95569fd4c4d6eb64eb46d06f
size 6778347
assets/generation_examples/642423904 (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:612d2a1340475f79b20f90cc5e85a5b9e79193af631e8bd7fa50cdc5fc47dee8
size 3038994
assets/generation_examples/642423904 (2).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:55bed6367e8c4de29f082cfbaa8af357fef81dc339e8abbadea50901ac635d10
size 3092127
assets/generation_examples/642423904.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:464da305ce36af41a1fb7fa842de1357b601008d128f1693f12f9674c906243c
size 2466511
assets/generation_examples/68941856 (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8274aa1cbc087465f6a1b72842c8d1cb1860c61232dd29ee0f17ea7ff2d2ac08
size 5856930
assets/generation_examples/68941856.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:759e825e4b6bd05ab92621390b1f5aa97240cd18a76312395ff39fd636dc8a9d
size 6942715
assets/generation_examples/distill/1.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:06953c45987011d08aef79f7ae1368f9a69d480a509554fb55d0d46d4498255f
size 6916245
assets/generation_examples/distill/2.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d199141bd7cffb4cb30ac84f2879da330b83cd2429aa7a95c3406f8dce49134a
size 5384100
assets/generation_examples/distill/3.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9bfdc9c8cf2ada22de4ffc1d0281e56ed7c0d61a66bdff28697cbe6a2a8e97f5
size 3957258
assets/generation_examples/distill/4.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9bbebdd5de15721f082785bbc02dc7422b0d9d9a6ac244c02e1bfbecc22ba22e
size 6328091
assets/generation_examples/sft/1.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1bfc00348d1fdb09c43687e086cd912ba15097ecf3f85b6302827a54eafab3a2
size 4486280
assets/generation_examples/sft/2.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:612eb63051314d50bb333ee1c797e95a5df7522ca5df3dc853266b135a27ce06
size 4600755
assets/generation_examples/sft/3.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c8cc13e558964138ef53a098d4b1174db5a58979f7dbf021788a39ad41c2fcff
size 8193301
assets/generation_examples/sft/4.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:462f02343590e07e5ac2a919f566438d2ddd990d0c5ddcd8c92e15feae63eb11
size 7697517
assets/generation_examples/sft/5.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4e89d9d0ae048df05fc2f00e9acc009a261b38810f5792967a969bb3147a5e6d
size 3528986
assets/generation_examples/sft/6.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8226569ad637d3b179ce895685e52cfcf81372bbbab623386fddde8e0352c9db
size 4109417
assets/generation_examples/test (1) (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:949f1a6c6056307026430aa0809fc7b02ac6deb3732973b305aaad20e6754c76
size 1592811
assets/generation_examples/test2 (1).mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0d626185c2e14dcca33b4e315529dd9555fc34f9dc96a327a28935bdb659406b
size 1103150
assets/generation_examples/video5237959401997893857.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4df397dc2d8032875aaccb58ef563f90c6cd25d50ce73bae40c5776dc9818ac0
size 5977768
assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg ADDED

Git LFS Details

  • SHA256: a6f53623b3c1e1f45ea6872f3afa4b3f71d79377bc89065b12e590c8a1a60f1d
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg ADDED

Git LFS Details

  • SHA256: 81d9aa99a224f3b1ce7262edf0c969bebcb7b95349cb5b57be5cc7aecbcc15d9
  • Pointer size: 131 Bytes
  • Size of remote file: 192 kB
assets/sbs/kandinsky_5_video_lite_vs_sora.jpg ADDED

Git LFS Details

  • SHA256: 2a5c838cb53a026a57d3037361ad4ed74bae4b31f4d1b11e6474956eca42d412
  • Pointer size: 131 Bytes
  • Size of remote file: 195 kB
assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg ADDED

Git LFS Details

  • SHA256: 74fa68588e7e24fd817cc8e96d63f4e5b623ff193c71a644c0ce42ebb9b49dac
  • Pointer size: 131 Bytes
  • Size of remote file: 170 kB
assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg ADDED

Git LFS Details

  • SHA256: 80bc261b9afcaf1446228a24a96afe3b5c24b4780f3e2f43e27496077611ec6f
  • Pointer size: 131 Bytes
  • Size of remote file: 196 kB
assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg ADDED

Git LFS Details

  • SHA256: d01f4a73b287541487228939fd505a947b78b6325f76421b2ee5f1523188e08e
  • Pointer size: 131 Bytes
  • Size of remote file: 192 kB
assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg ADDED

Git LFS Details

  • SHA256: 4f053f7d996112f40e8b49f6440ea75a40f71c02e60d467cff479ced0b54444a
  • Pointer size: 131 Bytes
  • Size of remote file: 198 kB
assets/vbench.png ADDED

Git LFS Details

  • SHA256: 27131bac1ccb83d3d28e8f558c6a7a91ed92816c0814583299b8584f0cda6546
  • Pointer size: 131 Bytes
  • Size of remote file: 170 kB
benchmark/moviegen_bench.csv ADDED
The diff for this file is too large to render. See raw diff
 
comfyui/README.md ADDED
@@ -0,0 +1,76 @@
# Kandinsky 5 Video for ComfyUI

![Kandinsky 5 ComfyUI Workflow](../assets/comfyui_kandinsky5.png)


## Description

This project provides a workflow for generating videos using the Kandinsky 5 model within the ComfyUI environment.

## Installation and Setup

### 1. Install ComfyUI

If you don't have ComfyUI installed yet, follow these steps:

```bash
# Clone the ComfyUI repository
git clone https://github.com/comfyanonymous/ComfyUI.git
cd ComfyUI

# Install dependencies
pip install -r requirements.txt

# Launch ComfyUI
python main.py
```

### 2. Clone this repository into the ComfyUI custom_nodes folder
```bash
# Navigate to the ComfyUI custom_nodes folder
cd ComfyUI/custom_nodes

# Clone this repository and install the model's requirements
git clone https://github.com/ai-forever/Kandinsky-5.git kandinsky
cd kandinsky
pip install -r requirements.txt
```

### 3. Load the Workflow
Launch ComfyUI (typically available at http://127.0.0.1:8188).

In the ComfyUI interface, click the "Load" button.

Select the kandisnky5_lite_T2V.json file from the comfyui folder of this repository.

The workflow will load into the ComfyUI interface.

### 4. Download Checkpoints

Download the required models and place them in the appropriate folders.

1. Run download_models.py. It will download the models and encoders to the ./weights directory.
2. Move them into the corresponding ComfyUI model folders (text_encoders / diffusion_models / vae), as sketched after the tree below.

```file-tree
ComfyUI/
├── models/
│   ├── text_encoders/      # For text_encoder and text_encoder2 models
│   ├── diffusion_models/   # For kandinsky5lite_t2v_*.safetensors models
│   └── vae/                # For vae model
```
+
64
+ ### 5. Configure Parameters
65
+ After loading the workflow, configure the following parameters:
66
+
67
+ ### Main Parameters
68
+
69
+ | Parameter | Description | Recommended Value |
70
+ |-----------|-------------|-------------------|
71
+ | **Prompt** | Text description for video generation | Your descriptive text |
72
+ | **Negative Prompt** | What to exclude from generation | Unwanted elements description |
73
+ | **Width/Height/Length** | Output video size | 768x512x121 for 5s or 768x512x241 for 10s, Width and Height should be divisisble by 128 for 10s model |
74
+ | **Steps** | Number of generation steps | 50, 16 for distilled version|
75
+ | **CFG Scale** | | 1.0 for distilled16steps and nocfg, 5.0 for sft and pretrain models |
76
+ | **Scheduler Scale** | Noise scheduler scale | 5.0 for 5s, 10.0 for 10s |
comfyui/kandisnky5_lite_T2V.json ADDED
@@ -0,0 +1,541 @@
{
  "id": "12380645-f2b5-4537-88ad-dd959a44e47c",
  "revision": 0,
  "last_node_id": 77,
  "last_link_id": 163,
  "nodes": [
    {
      "id": 77,
      "type": "Kandinsky5VAEDecode",
      "pos": [635.5618896484375, 529.5596313476562],
      "size": [239.40000915527344, 46],
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 155},
        {"name": "latent", "type": "LATENT", "link": 156}
      ],
      "outputs": [
        {"name": "IMAGE", "type": "IMAGE", "slot_index": 0, "links": [157]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5VAEDecode",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": []
    },
    {
      "id": 54,
      "type": "SaveAnimatedWEBP",
      "pos": [953.2196044921875, 91.47215270996094],
      "size": [487.7459716796875, 484.7098693847656],
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {"name": "images", "type": "IMAGE", "link": 157}
      ],
      "outputs": [],
      "properties": {"cnr_id": "comfy-core", "ver": "0.3.26"},
      "widgets_values": ["ComfyUI", 24, false, 100, "default"]
    },
    {
      "id": 72,
      "type": "Kandinsky5TextEncode",
      "pos": [175.8029327392578, 158.080322265625],
      "size": [400, 200],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 143},
        {"name": "extended_text", "shape": 7, "type": "PROMPT", "link": 163}
      ],
      "outputs": [
        {"name": "TEXT", "type": "CONDITION", "slot_index": 0, "links": [161]},
        {"name": "POOLED", "type": "CONDITION", "slot_index": 1, "links": [160]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5TextEncode",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": [""]
    },
    {
      "id": 73,
      "type": "Kandinsky5TextEncode",
      "pos": [179.4368133544922, 406.9901428222656],
      "size": [400, 200],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 147},
        {"name": "extended_text", "shape": 7, "type": "PROMPT", "link": null}
      ],
      "outputs": [
        {"name": "TEXT", "type": "CONDITION", "slot_index": 0, "links": [159]},
        {"name": "POOLED", "type": "CONDITION", "slot_index": 1, "links": [158]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5TextEncode",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": ["Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"]
    },
    {
      "id": 75,
      "type": "Kandinsky5Generate",
      "pos": [610.8881225585938, 160.90586853027344],
      "size": [315, 282],
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 149},
        {"name": "config", "type": "CONFIG", "link": 162},
        {"name": "positive_emb", "type": "CONDITION", "link": 161},
        {"name": "positive_clip", "type": "CONDITION", "link": 160},
        {"name": "negative_emb", "type": "CONDITION", "link": 159},
        {"name": "negative_clip", "type": "CONDITION", "link": 158}
      ],
      "outputs": [
        {"name": "latent", "type": "LATENT", "slot_index": 0, "links": [156]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5Generate",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": [50, 768, 512, 121, 5, 10]
    },
    {
      "id": 68,
      "type": "expand_prompt",
      "pos": [-273.5207824707031, 268.80975341796875],
      "size": [400, 200],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [
        {"name": "model", "type": "MODEL", "link": 139}
      ],
      "outputs": [
        {"name": "exp_prompt", "type": "PROMPT", "slot_index": 0, "links": [163]},
        {"name": "log", "type": "STRING", "slot_index": 1, "links": []}
      ],
      "properties": {
        "Node name for S&R": "expand_prompt",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "ef383d80876b498f553b13c8ae99d423308b0aa8"
      },
      "widgets_values": ["A heroic astronaut in a sleek, futuristic white-and-silver space suit with reflective visor down, galloping on a majestic black stallion through rugged, snow-capped mountain peaks at golden hour. The horse’s mane flows wildly as it leaps over a rocky ridge, kicking up dust, while the astronaut grips the reins tightly, their suit subtly illuminated by the warm glow of the setting sun. Dramatic low-angle shot, with vast misty valleys below and a sky shifting from deep orange to twilight purple. Cinematic lighting, hyper-detailed, with a sense of epic adventure and sci-fi wonder."]
    },
    {
      "id": 71,
      "type": "Kandinsky5LoadTextEmbedders",
      "pos": [-241.9403839111328, 25.038286209106445],
      "size": [340.20001220703125, 82],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {"name": "model", "type": "MODEL", "slot_index": 0, "links": [139, 143, 147]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5LoadTextEmbedders",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": ["text_encoder", "text_encoder2"]
    },
    {
      "id": 74,
      "type": "Kandinsky5LoadDiT",
      "pos": [208.43588256835938, 27.57303810119629],
      "size": [315, 78],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {"name": "model", "type": "MODEL", "slot_index": 0, "links": [149]},
        {"name": "conf", "type": "CONFIG", "slot_index": 1, "links": [162]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5LoadDiT",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": ["kandinsky5lite_t2v_sft_5s.safetensors"]
    },
    {
      "id": 76,
      "type": "Kandinsky5LoadVAE",
      "pos": [612.3971557617188, 23.200862884521484],
      "size": [315, 58],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {"name": "model", "type": "MODEL", "slot_index": 0, "links": [155]}
      ],
      "properties": {
        "Node name for S&R": "Kandinsky5LoadVAE",
        "aux_id": "gen-ai-team/kandinsky-5-inference",
        "ver": "8ad75bc185e5d43004e1468789a11fc6450cdb2b"
      },
      "widgets_values": ["vae"]
    }
  ],
  "links": [
    [139, 71, 0, 68, 0, "MODEL"],
    [143, 71, 0, 72, 0, "MODEL"],
    [147, 71, 0, 73, 0, "MODEL"],
    [149, 74, 0, 75, 0, "MODEL"],
    [155, 76, 0, 77, 0, "MODEL"],
    [156, 75, 0, 77, 1, "LATENT"],
    [157, 77, 0, 54, 0, "IMAGE"],
    [158, 73, 1, 75, 5, "CONDITION"],
    [159, 73, 0, 75, 4, "CONDITION"],
    [160, 72, 1, 75, 3, "CONDITION"],
    [161, 72, 0, 75, 2, "CONDITION"],
    [162, 74, 1, 75, 1, "CONFIG"],
    [163, 68, 0, 72, 1, "PROMPT"]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.6611570247933911,
      "offset": [286.65909097964607, 84.06209500649425]
    },
    "frontendVersion": "1.26.13"
  },
  "version": 0.4
}
comfyui/nodes_kandinsky.py ADDED
@@ -0,0 +1,286 @@
+import torch
+import os
+from omegaconf import OmegaConf
+from omegaconf.dictconfig import DictConfig
+from pathlib import Path
+from safetensors.torch import load_file
+
+import folder_paths
+from comfy.comfy_types import ComfyNodeABC
+
+from ..kandinsky.models.vae import build_vae
+from ..kandinsky.models.text_embedders import Kandinsky5TextEmbedder
+from ..kandinsky.models.dit import get_dit
+from ..kandinsky.generation_utils import generate
+
+
+class Kandinsky5LoadTextEmbedders:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "qwen": (os.listdir(folder_paths.get_folder_paths("text_encoders")[0]), {"default": "qwen2_5_vl_7b_instruct"}),
+                "clip": (os.listdir(folder_paths.get_folder_paths("text_encoders")[0]), {"default": "clip_text"}),
+            }
+        }
+
+    RETURN_TYPES = ("MODEL",)
+    RETURN_NAMES = ("model",)
+    FUNCTION = "load_te"
+    CATEGORY = "advanced/loaders"
+    DESCRIPTION = "Returns the Qwen and CLIP text embedders."
+
+    def load_te(self, qwen, clip):
+        qwen_path = os.path.join(folder_paths.get_folder_paths("text_encoders")[0], qwen)
+        clip_path = os.path.join(folder_paths.get_folder_paths("text_encoders")[0], clip)
+        conf = {
+            'qwen': {'checkpoint_path': qwen_path, 'max_length': 256},
+            'clip': {'checkpoint_path': clip_path, 'max_length': 77},
+        }
+        return (Kandinsky5TextEmbedder(DictConfig(conf), device='cpu'),)
+
+
+class Kandinsky5LoadDiT:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "dit": (folder_paths.get_filename_list("diffusion_models"),),
+            }
+        }
+
+    RETURN_TYPES = ("MODEL", "CONFIG")
+    RETURN_NAMES = ("model", "conf")
+    FUNCTION = "load_dit"
+    CATEGORY = "advanced/loaders"
+    DESCRIPTION = "Returns the Kandinsky DiT together with its config."
+
+    def load_dit(self, dit):
+        dit_path = folder_paths.get_full_path_or_raise("diffusion_models", dit)
+        parent_directory = Path(__file__).parent.parent
+        # The duration suffix of the checkpoint name ("5s" or "10s") selects the config file.
+        sec = dit.split("_")[-1].split(".")[0]
+        conf = OmegaConf.load(os.path.join(parent_directory, f"configs/config_{sec}_sft.yaml"))
+        dit = get_dit(conf.model.dit_params)
+        state_dict = load_file(dit_path)
+        dit.load_state_dict(state_dict)
+        return (dit, conf)
+
+
+class Kandinsky5TextEncode(ComfyNodeABC):
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("MODEL",),
+                "prompt": ("STRING", {"multiline": True}),
+            },
+            "optional": {
+                "extended_text": ("PROMPT",),
+            },
+        }
+
+    RETURN_TYPES = ("CONDITION", "CONDITION")
+    RETURN_NAMES = ("TEXT", "POOLED")
+    OUTPUT_TOOLTIPS = ("A conditioning containing the embedded text used to guide the diffusion model.",)
+    FUNCTION = "encode"
+    CATEGORY = "conditioning"
+    DESCRIPTION = "Encodes a text prompt with the Qwen2.5-VL and CLIP embedders into conditionings that guide the diffusion model."
+
+    def encode(self, model, prompt, extended_text=None):
+        # Prefer the LLM-expanded prompt when one is supplied.
+        text = extended_text if extended_text is not None else prompt
+        device = 'cuda:0'
+        model = model.to(device)
+        text_embeds = model.embedder([text], type_of_content='video')
+        pooled_embed = model.clip_embedder([text])
+        model = model.to('cpu')
+        return (text_embeds, pooled_embed)
+
+
+class Kandinsky5LoadVAE:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "vae": (os.listdir(folder_paths.get_folder_paths("vae")[0]), {"default": "hunyuan_vae"}),
+            }
+        }
+
+    RETURN_TYPES = ("MODEL",)
+    RETURN_NAMES = ("model",)
+    FUNCTION = "load_vae"
+    CATEGORY = "advanced/loaders"
+    DESCRIPTION = "Returns the VAE."
+
+    def load_vae(self, vae):
+        vae_path = os.path.join(folder_paths.get_folder_paths("vae")[0], vae)
+        vae = build_vae(DictConfig({'checkpoint_path': vae_path, 'name': 'hunyuan'}))
+        vae = vae.eval()
+        return (vae,)
+
+
+class expand_prompt(ComfyNodeABC):
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("MODEL",),
+                "prompt": ("STRING", {"multiline": True}),
+            }
+        }
+
+    RETURN_TYPES = ("PROMPT", "STRING")
+    RETURN_NAMES = ("exp_prompt", "log")
+    OUTPUT_NODE = True
+    OUTPUT_TOOLTIPS = ("expanded prompt",)
+    FUNCTION = "expand_prompt"
+    CATEGORY = "conditioning"
+    DESCRIPTION = "Expands a short user prompt into a rich, detailed one with the Qwen2.5-VL model."
+
+    def expand_prompt(self, model, prompt, device='cuda:0'):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""You are a prompt beautifier that transforms short user video descriptions into rich, detailed English prompts specifically optimized for video generation models.
+Here are some example descriptions from the dataset that the model was trained on:
+1. "In a dimly lit room with a cluttered background, papers are pinned to the wall and various objects rest on a desk. Three men stand present: one wearing a red sweater, another in a black sweater, and the third in a gray shirt. The man in the gray shirt speaks and makes hand gestures, while the other two men look forward. The camera remains stationary, focusing on the three men throughout the sequence. A gritty and realistic visual style prevails, marked by a greenish tint that contributes to a moody atmosphere. Low lighting casts shadows, enhancing the tense mood of the scene."
+2. "In an office setting, a man sits at a desk wearing a gray sweater and seated in a black office chair. A wooden cabinet with framed pictures stands beside him, alongside a small plant and a lit desk lamp. Engaged in a conversation, he makes various hand gestures to emphasize his points. His hands move in different positions, indicating different ideas or points. The camera remains stationary, focusing on the man throughout. Warm lighting creates a cozy atmosphere. The man appears to be explaining something. The overall visual style is professional and polished, suitable for a business or educational context."
+3. "A person works on a wooden object resembling a sunburst pattern, holding it in their left hand while using their right hand to insert a thin wire into the gaps between the wooden pieces. The background features a natural outdoor setting with greenery and a tree trunk visible. The camera stays focused on the hands and the wooden object throughout, capturing the detailed process of assembling the wooden structure. The person carefully threads the wire through the gaps, ensuring the wooden pieces are securely fastened together. The scene unfolds with a naturalistic and instructional style, emphasizing the craftsmanship and the methodical steps taken to complete the task."
+Importantly! These are just examples from a large training dataset of 200 million videos.
+Rewrite Prompt: "{prompt}" to get high-quality video generation. Answer only with the expanded prompt.""",
+                    },
+                ],
+            }
+        ]
+        model = model.to(device)
+        text = model.embedder.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = model.embedder.processor(
+            text=[text],
+            images=None,
+            videos=None,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(model.embedder.model.device)
+        generated_ids = model.embedder.model.generate(**inputs, max_new_tokens=256)
+        # Keep only the newly generated tokens, stripping the prompt part.
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = model.embedder.processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        print(output_text[0])
+        model = model.to('cpu')
+        return (output_text[0], str(output_text[0]))
+
+
+class Kandinsky5Generate(ComfyNodeABC):
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("MODEL", {"tooltip": "The model used for denoising the input latent."}),
+                "config": ("CONFIG", {"tooltip": "Config of the model and generation."}),
+                "steps": ("INT", {"default": 50, "min": 1, "max": 10000, "tooltip": "The number of steps used in the denoising process."}),
+                "width": ("INT", {"default": 768, "min": 512, "max": 768, "tooltip": "Width of the video."}),
+                "height": ("INT", {"default": 512, "min": 512, "max": 768, "tooltip": "Height of the video."}),
+                "length": ("INT", {"default": 121, "min": 5, "max": 241, "tooltip": "Length of the video in frames."}),
+                "cfg": ("FLOAT", {"default": 5.0, "min": 0.0, "max": 100.0, "step": 0.1, "round": 0.01, "tooltip": "The classifier-free guidance scale balances creativity and adherence to the prompt. Higher values match the prompt more closely, but values that are too high hurt quality."}),
+                "scheduler_scale": ("FLOAT", {"default": 10.0, "min": 1.0, "max": 25.0, "step": 0.1, "round": 0.01, "tooltip": "Scheduler scale."}),
+                "precision": (["float16", "bfloat16"], {"default": "bfloat16"}),
+                "positive_emb": ("CONDITION", {"tooltip": "Text conditioning describing what to include in the video."}),
+                "positive_clip": ("CONDITION", {"tooltip": "Pooled CLIP conditioning for the positive prompt."}),
+                "negative_emb": ("CONDITION", {"tooltip": "Text conditioning describing what to exclude from the video."}),
+                "negative_clip": ("CONDITION", {"tooltip": "Pooled CLIP conditioning for the negative prompt."}),
+            }
+        }
+
+    RETURN_TYPES = ("LATENT",)
+    RETURN_NAMES = ("latent",)
+    OUTPUT_TOOLTIPS = ("The denoised latent.",)
+    FUNCTION = "sample"
+    CATEGORY = "sampling"
+    DESCRIPTION = "Uses the provided model and the positive and negative conditionings to denoise the latent video."
+
+    def sample(self, model, config, steps, width, height, length, cfg, precision, positive_emb, positive_clip, negative_emb, negative_clip, scheduler_scale):
+        bs = 1
+        device = 'cuda:0'
+        model = model.to(device)
+        patch_size = (1, 2, 2)
+        autocast_type = torch.bfloat16 if precision == 'bfloat16' else torch.float16
+        dim = config.model.dit_params.in_visual_dim
+        # The VAE compresses time 4x and space 8x: convert pixel-space sizes to latent sizes.
+        length, height, width = 1 + (length - 1) // 4, height // 8, width // 8
+        bs_text_embed, text_cu_seqlens = positive_emb
+        bs_null_text_embed, null_text_cu_seqlens = negative_emb
+        text_embed = {"text_embeds": bs_text_embed, "pooled_embed": positive_clip}
+        null_embed = {"text_embeds": bs_null_text_embed, "pooled_embed": negative_clip}
+
+        visual_rope_pos = [
+            torch.arange(length // patch_size[0]),
+            torch.arange(height // patch_size[1]),
+            torch.arange(width // patch_size[2]),
+        ]
+        text_rope_pos = torch.cat([torch.arange(end) for end in torch.diff(text_cu_seqlens).cpu()])
+        null_text_rope_pos = torch.cat([torch.arange(end) for end in torch.diff(null_text_cu_seqlens).cpu()])
+        with torch.no_grad():
+            with torch.autocast(device_type='cuda', dtype=autocast_type):
+                latent_visual = generate(
+                    model, device, (bs * length, height, width, dim), steps,
+                    text_embed, null_embed,
+                    visual_rope_pos, text_rope_pos, null_text_rope_pos,
+                    cfg, scheduler_scale, config
+                )
+        model = model.to('cpu')
+        return (latent_visual,)
+
+
+class Kandinsky5VAEDecode(ComfyNodeABC):
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("MODEL", {"tooltip": "VAE."}),
+                "latent": ("LATENT", {"tooltip": "Latent."}),
+            }
+        }
+
+    RETURN_TYPES = ("IMAGE",)
+    OUTPUT_TOOLTIPS = ("The decoded image.",)
+    FUNCTION = "decode"
+    CATEGORY = "latent"
+    DESCRIPTION = "Decodes latent videos back into pixel-space frames."
+
+    def decode(self, model, latent):
+        device = 'cuda:0'
+        model = model.to(device)
+        with torch.no_grad():
+            with torch.autocast(device_type='cuda', dtype=torch.float16):
+                bs = 1
+                # (bs, t, h, w, c): undo the latent scaling, then go to (bs, c, t, h, w) for the VAE.
+                images = latent.reshape(bs, -1, latent.shape[-3], latent.shape[-2], latent.shape[-1])
+                images = (images / 0.476986).permute(0, 4, 1, 2, 3)
+                images = model.decode(images)
+                if not isinstance(images, torch.Tensor):
+                    images = images.sample
+                images = (images.clamp(-1., 1.) + 1.) * 0.5
+                images = images[0].float().permute(1, 2, 3, 0)
+        model = model.to('cpu')
+        return (images,)
+
+
+NODE_CLASS_MAPPINGS = {
+    "Kandinsky5LoadTextEmbedders": Kandinsky5LoadTextEmbedders,
+    "Kandinsky5TextEncode": Kandinsky5TextEncode,
+    "Kandinsky5Generate": Kandinsky5Generate,
+    "Kandinsky5LoadVAE": Kandinsky5LoadVAE,
+    "Kandinsky5VAEDecode": Kandinsky5VAEDecode,
+    "Kandinsky5LoadDiT": Kandinsky5LoadDiT,
+    "expand_prompt": expand_prompt,
+}
+NODE_DISPLAY_NAME_MAPPINGS = {
+    "Kandinsky5LoadTextEmbedders": "Kandinsky5LoadTextEmbedders",
+    "Kandinsky5TextEncode": "Kandinsky5TextEncode",
+    "Kandinsky5Generate": "Kandinsky5Generate",
+    "Kandinsky5LoadVAE": "Kandinsky5LoadVAE",
+    "Kandinsky5VAEDecode": "Kandinsky5VAEDecode",
+    "Kandinsky5LoadDiT": "Kandinsky5LoadDiT",
+    "expand_prompt": "expand_prompt",
+}
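The node classes above are plain Python, so the whole chain can be driven without the ComfyUI UI. A minimal sketch, assuming a ComfyUI install with the Kandinsky weights in place (the folder and checkpoint names below are illustrative):

te = Kandinsky5LoadTextEmbedders().load_te("qwen2_5_vl_7b_instruct", "clip_text")[0]
dit, conf = Kandinsky5LoadDiT().load_dit("kandinsky5lite_t2v_sft_5s.safetensors")
vae = Kandinsky5LoadVAE().load_vae("vae")[0]

positive = Kandinsky5TextEncode().encode(te, "a cat in a red hat")
negative = Kandinsky5TextEncode().encode(te, "")

latent = Kandinsky5Generate().sample(
    dit, conf, steps=50, width=768, height=512, length=121,
    cfg=5.0, precision="bfloat16",
    positive_emb=positive[0], positive_clip=positive[1],
    negative_emb=negative[0], negative_clip=negative[1],
    scheduler_scale=10.0,
)[0]
frames = Kandinsky5VAEDecode().decode(vae, latent)[0]  # (t, h, w, c) floats in [0, 1]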
configs/config_10s_distil.yaml ADDED
@@ -0,0 +1,54 @@
+metrics:
+  scheduler_scale: 5
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_distilled16steps_10s.safetensors"
+  num_steps: 16
+  guidance_weight: 1.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: nabla
+      causal: false
+      local: false
+      glob: false
+      window: 3
+      P: 0.9
+      wT: 11
+      wW: 3
+      wH: 3
+      add_sta: true
+      method: topcdf
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
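These YAML files are read with OmegaConf (as in Kandinsky5LoadDiT above), so fields are accessed with dotted paths. A minimal sketch, assuming the nesting shown above:

from omegaconf import OmegaConf

conf = OmegaConf.load("configs/config_10s_distil.yaml")
print(conf.model.num_steps)        # 16 sampling steps for the distilled checkpoint
print(conf.model.guidance_weight)  # 1.0: the distilled model runs without CFG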
configs/config_10s_nocfg.yaml ADDED
@@ -0,0 +1,56 @@
+metrics:
+  scheduler_scale: 5
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_sft_10s.safetensors"
+  num_steps: 50
+  guidance_weight: 1.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: nabla
+      causal: false
+      local: false
+      glob: false
+      window: 3
+      P: 0.9
+      wT: 11
+      wW: 3
+      wH: 3
+      add_sta: true
+      method: topcdf
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
+  magcache:
+    mag_ratios: [0.8736, 0, 1.12136, 0, 1.07896, 0, 1.06666, 0, 1.06235, 0, 1.03925, 0, 1.04018, 0, 1.0355, 0, 1.0327, 0, 1.02839, 0, 1.02768, 0, 1.02488, 0, 1.02143, 0, 1.02133, 0, 1.01715, 0, 1.01943, 0, 1.02177, 0, 1.01829, 0, 1.01747, 0, 1.01626, 0, 1.01559, 0, 1.01435, 0, 1.01435, 0, 1.01571, 0, 1.01312, 0, 1.01338, 0, 1.01437, 0, 1.01211, 0, 1.01237, 0, 1.01356, 0, 1.0101, 0, 1.01194, 0, 1.00898, 0, 1.0091, 0, 1.0108, 0, 1.00705, 0, 1.0018, 0, 1.01209, 0, 1.00525, 0, 1.00098, 0, 0.99914, 0, 0.99592, 0, 0.99089, 0, 0.98506, 0, 0.97495, 0, 0.9604, 0, 0.93492, 0, 0.89367, 0, 0.79353, 0]
configs/config_10s_pretrain.yaml ADDED
@@ -0,0 +1,54 @@
+metrics:
+  scheduler_scale: 5
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_pretrain_10s.safetensors"
+  num_steps: 50
+  guidance_weight: 5.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: nabla
+      causal: false
+      local: false
+      glob: false
+      window: 3
+      P: 0.9
+      wT: 11
+      wW: 3
+      wH: 3
+      add_sta: true
+      method: topcdf
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
configs/config_10s_sft.yaml ADDED
@@ -0,0 +1,56 @@
+metrics:
+  scheduler_scale: 5
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_sft_10s.safetensors"
+  num_steps: 50
+  guidance_weight: 5.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: nabla
+      causal: false
+      local: false
+      glob: false
+      window: 3
+      P: 0.9
+      wT: 11
+      wW: 3
+      wH: 3
+      add_sta: true
+      method: topcdf
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
+  magcache:
+    mag_ratios: [0.92261, 0.92261, 0.95898, 0.95962, 1.04862, 1.04855, 1.0806, 1.08045, 1.04405, 1.0445, 1.03587, 1.03619, 1.03789, 1.03785, 1.03485, 1.03514, 1.03724, 1.03814, 1.02484, 1.02502, 1.02525, 1.02508, 1.02473, 1.02532, 1.02625, 1.02706, 1.0197, 1.02011, 1.02326, 1.02324, 1.02081, 1.02116, 1.01993, 1.02047, 1.01979, 1.0205, 1.01823, 1.01852, 1.01785, 1.01813, 1.01563, 1.01606, 1.02057, 1.02083, 1.01132, 1.01207, 1.02053, 1.01959, 1.01718, 1.01749, 1.01546, 1.01589, 1.01516, 1.01525, 1.01578, 1.01608, 1.01616, 1.01618, 1.01443, 1.01466, 1.01554, 1.01568, 1.01494, 1.01515, 1.01567, 1.01572, 1.01418, 1.01458, 1.01601, 1.01618, 1.01491, 1.01508, 1.0162, 1.01625, 1.01412, 1.01419, 1.01431, 1.01437, 1.0106, 1.0108, 1.01428, 1.01427, 1.01222, 1.01236, 1.00812, 1.00818, 1.00759, 1.00764, 1.001, 1.00119, 0.98798, 0.98819, 0.9727, 0.97279, 0.93234, 0.93213, 0.83781, 0.83746]
configs/config_5s_distil.yaml ADDED
@@ -0,0 +1,47 @@
+metrics:
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_distilled16steps_5s.safetensors"
+  num_steps: 16
+  guidance_weight: 1.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: flash
+      causal: false
+      local: false
+      glob: false
+      window: 3
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
configs/config_5s_nocfg.yaml ADDED
@@ -0,0 +1,49 @@
+metrics:
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_nocfg_5s.safetensors"
+  num_steps: 50
+  guidance_weight: 1.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: flash
+      causal: false
+      local: false
+      glob: false
+      window: 3
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
+  magcache:
+    mag_ratios: [0.8827, 0, 1.14399, 0, 1.08362, 0, 1.06681, 0, 1.05906, 0, 1.03969, 0, 1.03835, 0, 1.03338, 0, 1.031, 0, 1.02616, 0, 1.02654, 0, 1.02322, 0, 1.02078, 0, 1.02, 0, 1.01673, 0, 1.01353, 0, 1.02175, 0, 1.0156, 0, 1.01616, 0, 1.01557, 0, 1.0131, 0, 1.01264, 0, 1.01378, 0, 1.0147, 0, 1.0109, 0, 1.01178, 0, 1.01248, 0, 1.0111, 0, 1.0099, 0, 1.01248, 0, 1.00721, 0, 1.01134, 0, 1.00752, 0, 1.00837, 0, 1.00817, 0, 1.00475, 0, 0.99937, 0, 1.01171, 0, 1.00434, 0, 0.99868, 0, 0.9969, 0, 0.995, 0, 0.98869, 0, 0.98454, 0, 0.97462, 0, 0.95885, 0, 0.93354, 0, 0.88895, 0, 0.78835, 0]
configs/config_5s_pretrain.yaml ADDED
@@ -0,0 +1,47 @@
+metrics:
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_pretrain_5s.safetensors"
+  num_steps: 50
+  guidance_weight: 5.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: flash
+      causal: false
+      local: false
+      glob: false
+      window: 3
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
configs/config_5s_sft.yaml ADDED
@@ -0,0 +1,49 @@
+metrics:
+  scale_factor:
+  - 1.0
+  - 2.0
+  - 2.0
+  resolution: 512
+model:
+  checkpoint_path: "./weights/model/kandinsky5lite_t2v_sft_5s.safetensors"
+  num_steps: 50
+  guidance_weight: 5.0
+  dit_params:
+    in_visual_dim: 16
+    out_visual_dim: 16
+    time_dim: 512
+    patch_size:
+    - 1
+    - 2
+    - 2
+    model_dim: 1792
+    ff_dim: 7168
+    num_text_blocks: 2
+    num_visual_blocks: 32
+    axes_dims:
+    - 16
+    - 24
+    - 24
+    visual_cond: true
+    in_text_dim: 3584
+    in_text_dim2: 768
+    attention:
+      type: flash
+      causal: false
+      local: false
+      glob: false
+      window: 3
+  vae:
+    checkpoint_path: "./weights/vae/"
+    name: "hunyuan"
+  text_embedder:
+    qwen:
+      emb_size: 3584
+      checkpoint_path: "./weights/text_encoder/"
+      max_length: 256
+    clip:
+      checkpoint_path: "./weights/text_encoder2/"
+      emb_size: 768
+      max_length: 77
+  magcache:
+    mag_ratios: [0.91607, 0.91507, 0.95254, 0.95349, 1.04876, 1.04937, 1.0842, 1.084, 1.04372, 1.04445, 1.03521, 1.03559, 1.03906, 1.03904, 1.03104, 1.03132, 1.03618, 1.03671, 1.02519, 1.02512, 1.02595, 1.02603, 1.02378, 1.02447, 1.02427, 1.02514, 1.01967, 1.01996, 1.02266, 1.02269, 1.01885, 1.01951, 1.01913, 1.01977, 1.01944, 1.02018, 1.01664, 1.01707, 1.01682, 1.01723, 1.0155, 1.01611, 1.01998, 1.02022, 1.01194, 1.01244, 1.01626, 1.01555, 1.01611, 1.01654, 1.01545, 1.01579, 1.01362, 1.01376, 1.01589, 1.01627, 1.01527, 1.01521, 1.01301, 1.01334, 1.01415, 1.01444, 1.0144, 1.01464, 1.01444, 1.01442, 1.01361, 1.01399, 1.01397, 1.01408, 1.01412, 1.01432, 1.01453, 1.01454, 1.01341, 1.01342, 1.01317, 1.01342, 1.01018, 1.01051, 1.01278, 1.0128, 1.01021, 1.01037, 1.00809, 1.00794, 1.00679, 1.00711, 0.99882, 0.99948, 0.98905, 0.98905, 0.9755, 0.97545, 0.93786, 0.93738, 0.84336, 0.84193]
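Kandinsky5LoadDiT above derives the config name from the checkpoint's duration suffix, which is why these sft configs exist in 5s and 10s variants. A minimal sketch of that mapping:

# "kandinsky5lite_t2v_sft_5s.safetensors" -> "5s" -> configs/config_5s_sft.yaml
checkpoint = "kandinsky5lite_t2v_sft_5s.safetensors"
sec = checkpoint.split("_")[-1].split(".")[0]
print(f"configs/config_{sec}_sft.yaml")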
download_models.py ADDED
@@ -0,0 +1,74 @@
+import os
+
+from huggingface_hub import snapshot_download
+
+
+if __name__ == "__main__":
+    cache_dir = "./weights"
+
+    # DiT checkpoints for every released Lite variant (5s and 10s durations).
+    dit_repos = [
+        "ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-5s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-10s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-sft-10s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-10s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s",
+        "ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-10s",
+    ]
+    for repo_id in dit_repos:
+        snapshot_download(
+            repo_id=repo_id,
+            allow_patterns="model/*",
+            local_dir=cache_dir,
+        )
+
+    # HunyuanVideo VAE.
+    snapshot_download(
+        repo_id="hunyuanvideo-community/HunyuanVideo",
+        allow_patterns="vae/*",
+        local_dir=cache_dir,
+    )
+
+    # Text encoders: Qwen2.5-VL and CLIP.
+    snapshot_download(
+        repo_id="Qwen/Qwen2.5-VL-7B-Instruct",
+        local_dir=os.path.join(cache_dir, "text_encoder/"),
+    )
+    snapshot_download(
+        repo_id="openai/clip-vit-large-patch14",
+        local_dir=os.path.join(cache_dir, "text_encoder2/"),
+    )
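Running the script as-is pulls all eight DiT variants plus the encoders, which is a sizable download. A minimal sketch for fetching a single variant together with the shared VAE and text encoders, using the same huggingface_hub call:

from huggingface_hub import snapshot_download

snapshot_download(repo_id="ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s",
                  allow_patterns="model/*", local_dir="./weights")
snapshot_download(repo_id="hunyuanvideo-community/HunyuanVideo",
                  allow_patterns="vae/*", local_dir="./weights")
snapshot_download(repo_id="Qwen/Qwen2.5-VL-7B-Instruct", local_dir="./weights/text_encoder/")
snapshot_download(repo_id="openai/clip-vit-large-patch14", local_dir="./weights/text_encoder2/")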
inference_example.ipynb ADDED
@@ -0,0 +1,192 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49a3d5bb-3d04-4d11-aba6-043fc5667abd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kandinsky import get_T2V_pipeline\n",
+    "from IPython.display import Video\n",
+    "from PIL import Image"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9bcf5ed-f813-47e1-ade6-3f655d35d0e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = get_T2V_pipeline(\n",
+    "    device_map={\"dit\": \"cuda:0\", \"vae\": \"cuda:0\", \"text_embedder\": \"cuda:0\"},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1f1563bb-0ebc-4f7c-97f4-6918a55d774f",
+   "metadata": {},
+   "source": [
+    "# Video"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48bc4ef9-0492-41a1-8133-c7e10a894d88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = pipe(\"a cat in a red hat\", time_length=2, width=768, height=512, save_path='./test.mp4')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3379b1a9-b987-443e-aad0-a7f8ac62b69d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Video('./test.mp4')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c4de2c5e-2784-4827-92b8-6997d3012e79",
+   "metadata": {},
+   "source": [
+    "# Images"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9cbc0fc-173f-463e-896b-b29b727b0e65",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = pipe(\n",
+    "    \"a dog in red boots\",\n",
+    "    time_length=0, width=768, height=512,\n",
+    "    save_path='./image.png'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a2c7741-9d7e-4396-a434-d65ff9e224e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image = Image.open(\"./image.png\")\n",
+    "image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea82ab34-ea66-4559-ac86-ec63af0e0f7a",
+   "metadata": {},
+   "source": [
+    "# Distilled model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d42c0c0-92a9-4f91-a263-b3b9e58dbf99",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = get_T2V_pipeline(\n",
+    "    device_map={\"dit\": \"cuda:0\", \"vae\": \"cuda:0\", \"text_embedder\": \"cuda:0\"},\n",
+    "    conf_path=\"./configs/config_5s_distil.yaml\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14e5d407-135b-451c-a092-840bae230524",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = pipe(\"cheburashka in a blue hat\", time_length=5, width=768, height=512, guidance_weight=1.0, num_steps=16, scheduler_scale=5, save_path='./test2.mp4')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53cda76a-a60e-4e62-9cb4-3b4fcf2f0b4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Video('./test2.mp4')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "004cc1da-ceb8-4e11-9774-8aa231b3fece",
+   "metadata": {},
+   "source": [
+    "# 10s video NABLA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea8fb5f8-ad6d-4e04-a4e3-0b381e3259db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipe = get_T2V_pipeline(\n",
+    "    device_map={\"dit\": \"cuda:0\", \"vae\": \"cuda:0\", \"text_embedder\": \"cuda:0\"},\n",
+    "    conf_path=\"./configs/config_10s_sft.yaml\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ffa470b-5c29-4e05-88af-fe26c4d0ada1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "out = pipe(\"Shiba Inu is driving a car\", time_length=10, width=768, height=512, save_path='./test3.mp4')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4bbd345-9ae0-48a6-bdb9-24f4c63d48d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Video('./test3.mp4')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "test",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}