Spaces:
Running
on
L40S
Running
on
L40S
root
committed on
Commit
·
45a12c7
1
Parent(s):
1ca8a57
update params
Browse files
- app.py +13 -13
- codeclm/tokenizer/Flow1dVAE/model_septoken.py +43 -32
app.py
CHANGED
|
@@ -89,7 +89,7 @@ def save_as_flac(sample_rate, audio_data):
|
|
| 89 |
|
| 90 |
|
| 91 |
# 模拟歌曲生成函数
|
| 92 |
-
def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_coef=None, temperature=
|
| 93 |
global MODEL
|
| 94 |
global STRUCTS
|
| 95 |
params = {'cfg_coef':cfg_coef, 'temperature':temperature, 'top_k':top_k}
|
|
@@ -225,19 +225,19 @@ lyrics
|
|
| 225 |
minimum=0.1,
|
| 226 |
maximum=2.0,
|
| 227 |
step=0.1,
|
| 228 |
-
value=0.
|
| 229 |
interactive=True,
|
| 230 |
elem_id="temperature",
|
| 231 |
)
|
| 232 |
-
top_k = gr.Slider(
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
)
|
| 241 |
with gr.Row():
|
| 242 |
generate_btn = gr.Button("Generate Song", variant="primary")
|
| 243 |
generate_bgm_btn = gr.Button("Generate Pure Music", variant="primary")
|
|
@@ -268,12 +268,12 @@ lyrics
|
|
| 268 |
# 生成按钮点击事件
|
| 269 |
generate_btn.click(
|
| 270 |
fn=generate_song,
|
| 271 |
-
inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature,
|
| 272 |
outputs=[output_audio, output_json]
|
| 273 |
)
|
| 274 |
generate_bgm_btn.click(
|
| 275 |
fn=generate_song,
|
| 276 |
-
inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature,
|
| 277 |
outputs=[output_audio, output_json]
|
| 278 |
)
|
| 279 |
|
|
|
|
| 89 |
|
| 90 |
|
| 91 |
# 模拟歌曲生成函数
|
| 92 |
+
def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_coef=None, temperature=0.1, top_k=-1, gen_type="mixed", progress=gr.Progress(track_tqdm=True)):
|
| 93 |
global MODEL
|
| 94 |
global STRUCTS
|
| 95 |
params = {'cfg_coef':cfg_coef, 'temperature':temperature, 'top_k':top_k}
|
|
|
|
| 225 |
minimum=0.1,
|
| 226 |
maximum=2.0,
|
| 227 |
step=0.1,
|
| 228 |
+
value=0.75,
|
| 229 |
interactive=True,
|
| 230 |
elem_id="temperature",
|
| 231 |
)
|
| 232 |
+
# top_k = gr.Slider(
|
| 233 |
+
# label="Top-K",
|
| 234 |
+
# minimum=1,
|
| 235 |
+
# maximum=100,
|
| 236 |
+
# step=1,
|
| 237 |
+
# value=50,
|
| 238 |
+
# interactive=True,
|
| 239 |
+
# elem_id="top_k",
|
| 240 |
+
# )
|
| 241 |
with gr.Row():
|
| 242 |
generate_btn = gr.Button("Generate Song", variant="primary")
|
| 243 |
generate_bgm_btn = gr.Button("Generate Pure Music", variant="primary")
|
|
|
|
| 268 |
# 生成按钮点击事件
|
| 269 |
generate_btn.click(
|
| 270 |
fn=generate_song,
|
| 271 |
+
inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, gr.State(-1)],
|
| 272 |
outputs=[output_audio, output_json]
|
| 273 |
)
|
| 274 |
generate_bgm_btn.click(
|
| 275 |
fn=generate_song,
|
| 276 |
+
inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, gr.State(-1), gr.State("bgm")],
|
| 277 |
outputs=[output_audio, output_json]
|
| 278 |
)
|
| 279 |
|
codeclm/tokenizer/Flow1dVAE/model_septoken.py
CHANGED
|
@@ -146,41 +146,52 @@ class BASECFM(torch.nn.Module, ABC):
|
|
| 146 |
mu (torch.Tensor): output of encoder
|
| 147 |
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
| 148 |
"""
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
noise = x.clone()
|
| 151 |
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
torch.cat([
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
dphi_dt = self.estimator(inputs_embeds=model_input, attention_mask=attention_mask,time_step=timestep).last_hidden_state
|
| 169 |
-
dphi_dt_uncond, dhpi_dt_cond = dphi_dt.chunk(2,0)
|
| 170 |
-
dphi_dt = dphi_dt_uncond + guidance_scale * (dhpi_dt_cond - dphi_dt_uncond)
|
| 171 |
else:
|
| 172 |
-
model_input = torch.cat([
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
def projection_loss(self,hidden_proj, bestrq_emb):
|
| 186 |
bsz = hidden_proj.shape[0]
|
|
|
|
| 146 |
mu (torch.Tensor): output of encoder
|
| 147 |
shape: (batch_size, n_channels, mel_timesteps, n_feats)
|
| 148 |
"""
|
| 149 |
+
dt = t_span[1:] - t_span[:-1]
|
| 150 |
+
t = t_span[:-1]
|
| 151 |
+
B = x.shape[0]
|
| 152 |
+
|
| 153 |
+
if guidance_scale > 1.0:
|
| 154 |
+
def double(z):
|
| 155 |
+
return torch.cat([z, z], 0) if z is not None else None
|
| 156 |
+
attention_mask = double(attention_mask)
|
| 157 |
+
|
| 158 |
+
x_next = x.clone()
|
| 159 |
noise = x.clone()
|
| 160 |
|
| 161 |
+
for i in tqdm(range(len(dt))):
|
| 162 |
+
ti = t[i]
|
| 163 |
+
|
| 164 |
+
x_next[:, :incontext_length] = (
|
| 165 |
+
(1 - (1 - self.sigma_min) * ti) * noise[:, :incontext_length] +
|
| 166 |
+
ti * incontext_x[:, :incontext_length]
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
if guidance_scale > 1.0:
|
| 170 |
+
model_input = torch.cat([
|
| 171 |
+
double(latent_mask_input),
|
| 172 |
+
double(incontext_x),
|
| 173 |
+
torch.cat([torch.zeros_like(mu), mu], 0),
|
| 174 |
+
double(x_next),
|
| 175 |
+
], dim=2)
|
| 176 |
+
timestep = ti.expand(2 * B)
|
|
|
|
|
|
|
|
|
|
| 177 |
else:
|
| 178 |
+
model_input = torch.cat([
|
| 179 |
+
latent_mask_input, incontext_x, mu, x_next
|
| 180 |
+
], dim=2)
|
| 181 |
+
timestep = ti.expand(B)
|
| 182 |
+
|
| 183 |
+
v = self.estimator(inputs_embeds=model_input,
|
| 184 |
+
attention_mask=attention_mask,
|
| 185 |
+
time_step=timestep).last_hidden_state
|
| 186 |
+
v = v[..., -x.shape[2]:]
|
| 187 |
+
|
| 188 |
+
if guidance_scale > 1.0:
|
| 189 |
+
v_uncond, v_cond = v.chunk(2, 0)
|
| 190 |
+
v = v_uncond + guidance_scale * (v_cond - v_uncond)
|
| 191 |
+
|
| 192 |
+
x_next = x_next + dt[i] * v
|
| 193 |
+
|
| 194 |
+
return x_next
|
| 195 |
|
| 196 |
def projection_loss(self,hidden_proj, bestrq_emb):
|
| 197 |
bsz = hidden_proj.shape[0]
|