v1.2 edit

Files changed:
- app.py (+5 -1)
- diffrhythm/model/cfm.py (+10 -16)
- diffrhythm/model/dit.py (+7 -3)
app.py
CHANGED

```diff
@@ -232,8 +232,12 @@ with gr.Blocks(css=css) as demo:
 3. **Supported Languages**
 - **Chinese and English**
 - More languages coming soon
 
-4. **Others**
+4. **Editing Function in Advanced Settings**
+- Using full-length audio as reference is recommended for best results.
+- Use -1 to represent the start/end of audio (e.g. [[-1,25], [50,-1]] means "from start to 25s" and "from 50s to end").
+
+5. **Others**
 - If loading audio result is slow, you can select Output Format as mp3 in Advanced Settings.
 
 """)
```
diffrhythm/model/cfm.py
CHANGED

```diff
@@ -208,27 +208,21 @@ class CFM(nn.Module):
         negative_style_prompt = negative_style_prompt.repeat(batch_infer_num, 1)
         start_time = start_time.repeat(batch_infer_num)
         fixed_span_mask = fixed_span_mask.repeat(batch_infer_num, 1, 1)
-
-        start_time_embed, positive_text_embed, positive_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=False, start_time=start_time)
-        _, negative_text_embed, negative_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=True, start_time=start_time)
-
-        text_embed = torch.cat([positive_text_embed, negative_text_embed], 0)
-        text_residuals = [torch.cat([a, b], 0) for a, b in zip(positive_text_residuals, negative_text_residuals)]
-        step_cond = torch.cat([step_cond, step_cond], 0)
-        style_prompt = torch.cat([style_prompt, negative_style_prompt], 0)
-        start_time_embed = torch.cat([start_time_embed, start_time_embed], 0)
 
         def fn(t, x):
-
+            # predict flow
             pred = self.transformer(
-                x=x,
-                ...
+                x=x, cond=step_cond, text=text, time=t, drop_audio_cond=False, drop_text=False, drop_prompt=False,
+                style_prompt=style_prompt, start_time=start_time
             )
+            if cfg_strength < 1e-5:
+                return pred
 
-            ...
+            null_pred = self.transformer(
+                x=x, cond=step_cond, text=text, time=t, drop_audio_cond=True, drop_text=True, drop_prompt=False,
+                style_prompt=negative_style_prompt, start_time=start_time
+            )
+            return pred + (pred - null_pred) * cfg_strength
 
         # noise input
         # to make sure batch inference result is same with different batch size, and for sure single inference
```
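This hunk replaces the old batched classifier-free guidance (positive and negative conditions concatenated into one doubled batch, see the removed torch.cat block) with two explicit transformer passes combined inside fn. A standalone sketch of that combination rule, with `model` as a stand-in callable rather than the actual CFM transformer:

```python
import torch

def guided_pred(model, x, t, cond_kwargs, null_kwargs, cfg_strength):
    """Classifier-free guidance as restructured in fn(): two passes, one combine."""
    pred = model(x, t, **cond_kwargs)       # conditional prediction (text + audio cond kept)
    if cfg_strength < 1e-5:                 # guidance effectively off: skip the null pass
        return pred
    null_pred = model(x, t, **null_kwargs)  # unconditional prediction (conditions dropped)
    return pred + (pred - null_pred) * cfg_strength

# Toy usage with a stand-in model: guidance pushes the output away from the null prediction.
model = lambda x, t, scale=1.0: x * scale
x = torch.ones(2, 4)
out = guided_pred(model, x, t=0.5, cond_kwargs={"scale": 2.0}, null_kwargs={"scale": 1.0}, cfg_strength=2.0)
print(out)  # 2*x + (2*x - 1*x) * 2 = 4*x
```

The retained comment suggests the motivation: two separate passes keep results identical whether inference is batched or single, at the cost of a second forward call per step.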
diffrhythm/model/dit.py
CHANGED

```diff
@@ -162,21 +162,25 @@ class DiT(nn.Module):
     def forward(
         self,
         x: float["b n d"],  # noised input audio  # noqa: F722
-        text_embed: int["b nt"],  # text  # noqa: F722
-        text_residuals,
         cond: float["b n d"],  # masked cond audio  # noqa: F722
+        text: int["b nt"],  # text  # noqa: F722
         time: float["b"] | float[""],  # time step  # noqa: F821 F722
         drop_audio_cond,  # cfg for cond audio
+        drop_text,  # cfg for text
         drop_prompt=False,
         style_prompt=None,  # [b d t]
         start_time=None,
     ):
+
         batch, seq_len = x.shape[0], x.shape[1]
         if time.ndim == 0:
             time = time.repeat(batch)
 
+        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
         t = self.time_embed(time)
-
+        s_t = self.start_time_embed(start_time)
+        c = t + s_t
+        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
 
         if drop_prompt:
             style_prompt = torch.zeros_like(style_prompt)
```