Julian Bilcke committed
Commit · c90af3c
1 Parent(s): a529bb7

working on UI improvements

Files changed:
- app.py (+43 -2)
- config.py (+98 -24)
- training_service.py (+11 -4)
app.py
CHANGED

@@ -661,6 +661,26 @@ class VideoTrainerUI:
                 training_dataset
             )
 
+    def update_training_params(self, preset_name: str) -> Dict:
+        """Update UI components based on selected preset"""
+        preset = TRAINING_PRESETS[preset_name]
+
+        # Get preset description for display
+        description = preset.get("description", "")
+        bucket_info = f"\nBucket configuration: {len(preset['training_buckets'])} buckets"
+        info_text = f"{description}{bucket_info}"
+
+        return {
+            "model_type": gr.Dropdown(value=MODEL_TYPES[preset["model_type"]]),
+            "lora_rank": gr.Dropdown(value=preset["lora_rank"]),
+            "lora_alpha": gr.Dropdown(value=preset["lora_alpha"]),
+            "num_epochs": gr.Number(value=preset["num_epochs"]),
+            "batch_size": gr.Number(value=preset["batch_size"]),
+            "learning_rate": gr.Number(value=preset["learning_rate"]),
+            "save_iterations": gr.Number(value=preset["save_iterations"]),
+            "preset_info": gr.Markdown(value=info_text)
+        }
+
     def create_ui(self):
         """Create Gradio interface"""
 

@@ -820,6 +840,15 @@ class VideoTrainerUI:
             with gr.Row():
                 train_title = gr.Markdown("## 0 files available for training (0 bytes)")
 
+            with gr.Row():
+                with gr.Column():
+                    training_preset = gr.Dropdown(
+                        choices=list(TRAINING_PRESETS.keys()),
+                        label="Training Preset",
+                        value=list(TRAINING_PRESETS.keys())[0]
+                    )
+                    preset_info = gr.Markdown()
+
             with gr.Row():
                 with gr.Column():
                     model_type = gr.Dropdown(

@@ -1096,16 +1125,28 @@ class VideoTrainerUI:
                 outputs=[training_dataset]
             )
 
+            training_preset.change(
+                fn=self.update_training_params,
+                inputs=[training_preset],
+                outputs=[
+                    model_type, lora_rank, lora_alpha,
+                    num_epochs, batch_size, learning_rate,
+                    save_iterations, preset_info
+                ]
+            )
+
             # Training control events
             start_btn.click(
-                fn=lambda model_type, *args: (
+                fn=lambda preset, model_type, *args: (
                     self.log_parser.reset(),
                     self.trainer.start_training(
                         MODEL_TYPES[model_type],
-                        *args
+                        *args,
+                        preset_name=preset
                     )
                 ),
                 inputs=[
+                    training_preset,
                     model_type,
                     lora_rank,
                     lora_alpha,
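For context on the wiring above: a Gradio change event routes one handler's return value onto the listed outputs, and a handler may return a dict of updates keyed by output components. The sketch below is illustrative only, not code from this commit; the preset names, component labels, and DEMO_PRESETS mapping are made-up placeholders standing in for TRAINING_PRESETS and the real training fields.

import gradio as gr

# Illustrative presets only -- not the TRAINING_PRESETS from config.py.
DEMO_PRESETS = {
    "small": {"lora_rank": "128", "learning_rate": 2e-5},
    "large": {"lora_rank": "256", "learning_rate": 3e-5},
}

with gr.Blocks() as demo:
    preset = gr.Dropdown(choices=list(DEMO_PRESETS.keys()), value="small", label="Preset")
    lora_rank = gr.Dropdown(choices=["64", "128", "256"], value="128", label="LoRA rank")
    learning_rate = gr.Number(value=2e-5, label="Learning rate")

    def apply_preset(name):
        # Return a dict keyed by output components; Gradio applies each update.
        p = DEMO_PRESETS[name]
        return {
            lora_rank: gr.update(value=p["lora_rank"]),
            learning_rate: gr.update(value=p["learning_rate"]),
        }

    preset.change(fn=apply_preset, inputs=[preset], outputs=[lora_rank, learning_rate])

if __name__ == "__main__":
    demo.launch()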
config.py
CHANGED

@@ -55,8 +55,8 @@ MODEL_TYPES = {
 # it is best to use resolutions that are powers of 8
 # The resolution should be divisible by 32
 # so we cannot use 1080, 540 etc as they are not divisible by 32
-
-
+MEDIUM_19_9_RATIO_WIDTH = 768   # 32 * 24
+MEDIUM_19_9_RATIO_HEIGHT = 512  # 32 * 16
 
 # 1920 = 32 * 60 (divided by 2: 960 = 32 * 30)
 # 1920 = 32 * 60 (divided by 2: 960 = 32 * 30)

@@ -65,26 +65,100 @@ TRAINING_HEIGHT = 512  # 32 * 16
 # it is important that the resolution buckets properly cover the training dataset,
 # or else that we exclude from the dataset videos that are out of this range
 # right now, finetrainers will crash if that happens, so the workaround is to have more buckets in here
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+NB_FRAMES_1 = 1             # 1
+NB_FRAMES_9 = 8 + 1         # 8 + 1
+NB_FRAMES_17 = 8 * 2 + 1    # 16 + 1
+NB_FRAMES_32 = 8 * 4 + 1    # 32 + 1
+NB_FRAMES_48 = 8 * 6 + 1    # 48 + 1
+NB_FRAMES_64 = 8 * 8 + 1    # 64 + 1
+NB_FRAMES_80 = 8 * 10 + 1   # 80 + 1
+NB_FRAMES_96 = 8 * 12 + 1   # 96 + 1
+NB_FRAMES_112 = 8 * 14 + 1  # 112 + 1
+NB_FRAMES_128 = 8 * 16 + 1  # 128 + 1
+NB_FRAMES_144 = 8 * 18 + 1  # 144 + 1
+NB_FRAMES_160 = 8 * 20 + 1  # 160 + 1
+NB_FRAMES_176 = 8 * 22 + 1  # 176 + 1
+NB_FRAMES_192 = 8 * 24 + 1  # 192 + 1
+NB_FRAMES_224 = 8 * 28 + 1  # 224 + 1
+NB_FRAMES_256 = 8 * 32 + 1  # 256 + 1
+# 256 isn't a lot by the way, especially with 60 FPS videos..
+# can we crank it and put more frames in here?
+
+SMALL_TRAINING_BUCKETS = [
+    (NB_FRAMES_1, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),    # 1
+    (NB_FRAMES_9, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),    # 8 + 1
+    (NB_FRAMES_17, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 16 + 1
+    (NB_FRAMES_32, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 32 + 1
+    (NB_FRAMES_48, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 48 + 1
+    (NB_FRAMES_64, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 64 + 1
+    (NB_FRAMES_80, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 80 + 1
+    (NB_FRAMES_96, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 96 + 1
+    (NB_FRAMES_112, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 112 + 1
+    (NB_FRAMES_128, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 128 + 1
+    (NB_FRAMES_144, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 144 + 1
+    (NB_FRAMES_160, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 160 + 1
+    (NB_FRAMES_176, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 176 + 1
+    (NB_FRAMES_192, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 192 + 1
+    (NB_FRAMES_224, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 224 + 1
+    (NB_FRAMES_256, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 256 + 1
+]
+
+MEDIUM_19_9_RATIO_WIDTH = 928   # 32 * 29
+MEDIUM_19_9_RATIO_HEIGHT = 512  # 32 * 16
+
+MEDIUM_19_9_RATIO_BUCKETS = [
+    (NB_FRAMES_1, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),    # 1
+    (NB_FRAMES_9, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),    # 8 + 1
+    (NB_FRAMES_17, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 16 + 1
+    (NB_FRAMES_32, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 32 + 1
+    (NB_FRAMES_48, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 48 + 1
+    (NB_FRAMES_64, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 64 + 1
+    (NB_FRAMES_80, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 80 + 1
+    (NB_FRAMES_96, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),   # 96 + 1
+    (NB_FRAMES_112, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 112 + 1
+    (NB_FRAMES_128, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 128 + 1
+    (NB_FRAMES_144, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 144 + 1
+    (NB_FRAMES_160, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 160 + 1
+    (NB_FRAMES_176, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 176 + 1
+    (NB_FRAMES_192, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 192 + 1
+    (NB_FRAMES_224, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 224 + 1
+    (NB_FRAMES_256, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH),  # 256 + 1
 ]
 
+TRAINING_PRESETS = {
+    "HunyuanVideo (normal)": {
+        "model_type": "hunyuan_video",
+        "lora_rank": "128",
+        "lora_alpha": "128",
+        "num_epochs": 70,
+        "batch_size": 1,
+        "learning_rate": 2e-5,
+        "save_iterations": 500,
+        "training_buckets": SMALL_TRAINING_BUCKETS,
+    },
+    "LTX-Video (normal)": {
+        "model_type": "ltx_video",
+        "lora_rank": "128",
+        "lora_alpha": "128",
+        "num_epochs": 70,
+        "batch_size": 1,
+        "learning_rate": 3e-5,
+        "save_iterations": 500,
+        "training_buckets": SMALL_TRAINING_BUCKETS,
+    },
+    "LTX-Video (16:9, HQ)": {
+        "model_type": "ltx_video",
+        "lora_rank": "256",
+        "lora_alpha": "128",
+        "num_epochs": 50,
+        "batch_size": 1,
+        "learning_rate": 3e-5,
+        "save_iterations": 200,
+        "training_buckets": MEDIUM_19_9_RATIO_BUCKETS,
+    }
+}
+
 @dataclass
 class TrainingConfig:
     """Configuration class for finetrainers training"""

@@ -159,7 +233,7 @@ class TrainingConfig:
     nccl_timeout: int = 1800
 
     @classmethod
-    def hunyuan_video_lora(cls, data_path: str, output_path: str) -> 'TrainingConfig':
+    def hunyuan_video_lora(cls, data_path: str, output_path: str, buckets=None) -> 'TrainingConfig':
         """Configuration for Hunyuan video-to-video LoRA training"""
         return cls(
             model_name="hunyuan_video",

@@ -174,13 +248,13 @@ class TrainingConfig:
             gradient_accumulation_steps=1,
             lora_rank=128,
             lora_alpha=128,
-            video_resolution_buckets=
+            video_resolution_buckets=buckets or SMALL_TRAINING_BUCKETS,
             caption_dropout_p=0.05,
             flow_weighting_scheme="none"  # Hunyuan specific
         )
 
     @classmethod
-    def ltx_video_lora(cls, data_path: str, output_path: str) -> 'TrainingConfig':
+    def ltx_video_lora(cls, data_path: str, output_path: str, buckets=None) -> 'TrainingConfig':
         """Configuration for LTX-Video LoRA training"""
         return cls(
             model_name="ltx_video",

@@ -195,7 +269,7 @@ class TrainingConfig:
             gradient_accumulation_steps=4,
             lora_rank=128,
             lora_alpha=128,
-            video_resolution_buckets=
+            video_resolution_buckets=buckets or SMALL_TRAINING_BUCKETS,
             caption_dropout_p=0.05,
             flow_weighting_scheme="logit_normal"  # LTX specific
         )
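The comments in config.py encode two conventions for the buckets: spatial sizes must be divisible by 32 (so 1080 or 540 are rejected), and frame counts follow the 8*k + 1 pattern (1, 9, 17, ..., 257). A quick standalone sanity check of those constraints, using a few hand-picked example buckets rather than the full lists from this commit:

# Hand-picked example buckets as (frames, height, width); illustrative only.
example_buckets = [(1, 512, 768), (49, 512, 768), (257, 512, 928)]

for frames, height, width in example_buckets:
    # Spatial sizes must be divisible by 32.
    assert height % 32 == 0 and width % 32 == 0, (height, width)
    # Frame counts are 1 or of the form 8 * k + 1.
    assert frames == 1 or (frames - 1) % 8 == 0, frames

print("all example buckets satisfy the divisibility conventions")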
training_service.py
CHANGED

@@ -257,18 +257,25 @@ class TrainingService:
             logger.error(error_msg)
             return error_msg, "No training data available"
 
-
+
+        # Get preset configuration
+        preset = TRAINING_PRESETS[preset_name]
+        training_buckets = preset["training_buckets"]
+
+        # Get config for selected model type with preset buckets
         if model_type == "hunyuan_video":
             config = TrainingConfig.hunyuan_video_lora(
                 data_path=str(TRAINING_PATH),
-                output_path=str(OUTPUT_PATH)
+                output_path=str(OUTPUT_PATH),
+                buckets=training_buckets
             )
         else:  # ltx_video
             config = TrainingConfig.ltx_video_lora(
                 data_path=str(TRAINING_PATH),
-                output_path=str(OUTPUT_PATH)
+                output_path=str(OUTPUT_PATH),
+                buckets=training_buckets
             )
-
+
         # Update with UI parameters
         config.train_epochs = int(num_epochs)
         config.lora_rank = int(lora_rank)
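Taken together, the three files route a preset from the UI dropdown into the training configuration. A small usage sketch of that flow, assuming the names introduced in config.py by this commit and using placeholder paths:

from config import TRAINING_PRESETS, TrainingConfig

preset = TRAINING_PRESETS["LTX-Video (16:9, HQ)"]

config = TrainingConfig.ltx_video_lora(
    data_path="/path/to/training",        # placeholder path
    output_path="/path/to/output",        # placeholder path
    buckets=preset["training_buckets"],   # MEDIUM_19_9_RATIO_BUCKETS for this preset
)

print(len(config.video_resolution_buckets), "resolution buckets selected")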
|