Spaces:

tencent
/

SongGeneration

Running on L40S

App Files Files Community

root committed 8 days ago

Commit

45a12c7

1 Parent(s): 1ca8a57

update params

Browse files

Files changed (2) hide show

app.py +13 -13
codeclm/tokenizer/Flow1dVAE/model_septoken.py +43 -32

app.py CHANGED Viewed

@@ -89,7 +89,7 @@ def save_as_flac(sample_rate, audio_data):
 # 模拟歌曲生成函数
-def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_coef=None, temperature=None, top_k=None, gen_type="mixed", progress=gr.Progress(track_tqdm=True)):
     global MODEL
     global STRUCTS
     params = {'cfg_coef':cfg_coef, 'temperature':temperature, 'top_k':top_k}
@@ -225,19 +225,19 @@ lyrics
                     minimum=0.1,
                     maximum=2.0,
                     step=0.1,
-                    value=0.9,
                     interactive=True,
                     elem_id="temperature",
                 )
-                top_k = gr.Slider(
-                    label="Top-K",
-                    minimum=1,
-                    maximum=100,
-                    step=1,
-                    value=50,
-                    interactive=True,
-                    elem_id="top_k",
-                )
             with gr.Row():
                 generate_btn = gr.Button("Generate Song", variant="primary")
                 generate_bgm_btn = gr.Button("Generate Pure Music", variant="primary")
@@ -268,12 +268,12 @@ lyrics
     # 生成按钮点击事件
     generate_btn.click(
         fn=generate_song,
-        inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, top_k],
         outputs=[output_audio, output_json]
     )
     generate_bgm_btn.click(
         fn=generate_song,
-        inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, top_k, gr.State("bgm")],
         outputs=[output_audio, output_json]
     )

 # 模拟歌曲生成函数
+def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_coef=None, temperature=0.1, top_k=-1, gen_type="mixed", progress=gr.Progress(track_tqdm=True)):
     global MODEL
     global STRUCTS
     params = {'cfg_coef':cfg_coef, 'temperature':temperature, 'top_k':top_k}
                     minimum=0.1,
                     maximum=2.0,
                     step=0.1,
+                    value=0.75,
                     interactive=True,
                     elem_id="temperature",
                 )
+                # top_k = gr.Slider(
+                #     label="Top-K",
+                #     minimum=1,
+                #     maximum=100,
+                #     step=1,
+                #     value=50,
+                #     interactive=True,
+                #     elem_id="top_k",
+                # )
             with gr.Row():
                 generate_btn = gr.Button("Generate Song", variant="primary")
                 generate_bgm_btn = gr.Button("Generate Pure Music", variant="primary")
     # 生成按钮点击事件
     generate_btn.click(
         fn=generate_song,
+        inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, gr.State(-1)],
         outputs=[output_audio, output_json]
     )
     generate_bgm_btn.click(
         fn=generate_song,
+        inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, gr.State(-1), gr.State("bgm")],
         outputs=[output_audio, output_json]
     )

codeclm/tokenizer/Flow1dVAE/model_septoken.py CHANGED Viewed

@@ -146,41 +146,52 @@ class BASECFM(torch.nn.Module, ABC):
             mu (torch.Tensor): output of encoder
                 shape: (batch_size, n_channels, mel_timesteps, n_feats)
         """
-        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
         noise = x.clone()
-        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
-        # Or in future might add like a return_all_steps flag
-        sol = []
-        for step in tqdm(range(1, len(t_span))):
-            x[:,0:incontext_length,:] = (1 - (1 - self.sigma_min) * t) * noise[:,0:incontext_length,:] + t * incontext_x[:,0:incontext_length,:]
-            if(guidance_scale > 1.0):
-                model_input = torch.cat([ \
-                    torch.cat([latent_mask_input, latent_mask_input], 0), \
-                    torch.cat([incontext_x, incontext_x], 0), \
-                    torch.cat([torch.zeros_like(mu), mu], 0), \
-                    torch.cat([x, x], 0), \
-                    ], 2)
-                timestep=t.unsqueeze(-1).repeat(2)
-                dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
-                dphi_dt_uncond, dhpi_dt_cond = dphi_dt.chunk(2,0)
-                dphi_dt = dphi_dt_uncond + guidance_scale * (dhpi_dt_cond - dphi_dt_uncond)
             else:
-                model_input = torch.cat([latent_mask_input, incontext_x, mu, x], 2)
-                timestep=t.unsqueeze(-1)
-                dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
-            dphi_dt = dphi_dt[: ,:, -x.shape[2]:]
-            x = x + dt * dphi_dt
-            t = t + dt
-            sol.append(x)
-            if step < len(t_span) - 1:
-                dt = t_span[step + 1] - t
-        return sol[-1]
     def projection_loss(self,hidden_proj, bestrq_emb):
         bsz = hidden_proj.shape[0]

             mu (torch.Tensor): output of encoder
                 shape: (batch_size, n_channels, mel_timesteps, n_feats)
         """
+        dt = t_span[1:] - t_span[:-1]
+        t = t_span[:-1]
+        B = x.shape[0]
+        if guidance_scale > 1.0:
+            def double(z):
+                return torch.cat([z, z], 0) if z is not None else None
+            attention_mask = double(attention_mask)
+        x_next = x.clone()
         noise = x.clone()
+        for i in tqdm(range(len(dt))):
+            ti = t[i]
+            x_next[:, :incontext_length] = (
+                (1 - (1 - self.sigma_min) * ti) * noise[:, :incontext_length] +
+                ti * incontext_x[:, :incontext_length]
+            )
+            if guidance_scale > 1.0:
+                model_input = torch.cat([
+                    double(latent_mask_input),
+                    double(incontext_x),
+                    torch.cat([torch.zeros_like(mu), mu], 0),
+                    double(x_next),
+                ], dim=2)
+                timestep = ti.expand(2 * B)
             else:
+                model_input = torch.cat([
+                    latent_mask_input, incontext_x, mu, x_next
+                ], dim=2)
+                timestep = ti.expand(B)
+            v = self.estimator(inputs_embeds=model_input,
+                            attention_mask=attention_mask,
+                            time_step=timestep).last_hidden_state
+            v = v[..., -x.shape[2]:]
+            if guidance_scale > 1.0:
+                v_uncond, v_cond = v.chunk(2, 0)
+                v = v_uncond + guidance_scale * (v_cond - v_uncond)
+            x_next = x_next + dt[i] * v
+        return x_next
     def projection_loss(self,hidden_proj, bestrq_emb):
         bsz = hidden_proj.shape[0]