update
Browse files
app.py
CHANGED
|
@@ -35,8 +35,8 @@ def inference_gradio(input, model_choice): # input is audio waveform in [T, cha
|
|
| 35 |
audio = np.transpose(audio) # transpose to [channel, T] for librosa
|
| 36 |
audio = audio / MAX_WAV_VALUE # convert int16 to float range used by BigVGAN
|
| 37 |
|
| 38 |
-
h =
|
| 39 |
-
model =
|
| 40 |
|
| 41 |
if sr != h.sampling_rate: # convert audio to model's sampling rate
|
| 42 |
audio = librosa.resample(audio, orig_sr=sr, target_sr=h.sampling_rate)
|
|
@@ -254,8 +254,8 @@ DICT_MODEL_NAME_FILE_PAIRS = {
|
|
| 254 |
"bigvgan_v2_44khz_128band_512x": "g_03000000"
|
| 255 |
}
|
| 256 |
|
| 257 |
-
|
| 258 |
-
|
| 259 |
|
| 260 |
for model_name in LIST_MODEL_NAME:
|
| 261 |
model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
|
|
@@ -275,8 +275,8 @@ for model_name in LIST_MODEL_NAME:
|
|
| 275 |
generator.eval()
|
| 276 |
generator.remove_weight_norm()
|
| 277 |
|
| 278 |
-
|
| 279 |
-
|
| 280 |
|
| 281 |
######################## script for gradio UI ########################
|
| 282 |
|
|
@@ -285,29 +285,29 @@ iface = gr.Blocks(css=css)
|
|
| 285 |
with iface:
|
| 286 |
gr.HTML(
|
| 287 |
"""
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
</div>
|
| 301 |
-
<p style="margin-bottom: 10px; font-size: 94%">
|
| 302 |
-
<a href="https://arxiv.org/abs/2206.04658">[Paper]</a> <a href="https://github.com/NVIDIA/BigVGAN">[Code]</a> <a href="https://bigvgan-demo.github.io/">[Demo]</a> <a href="https://research.nvidia.com/labs/adlr/projects/bigvgan/">[Project page]</a>
|
| 303 |
-
</p>
|
| 304 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
"""
|
| 306 |
)
|
| 307 |
gr.HTML(
|
| 308 |
"""
|
| 309 |
<div>
|
| 310 |
-
<
|
| 311 |
<p>[Jul 2024] We release BigVGAN-v2 along with pretrained checkpoints. Below are the highlights:</p>
|
| 312 |
<ul>
|
| 313 |
<li>Custom CUDA kernel for inference: we provide a fused upsampling + activation kernel written in CUDA for accelerated inference speed. Our test shows 1.5 - 3x faster speed on a single A100 GPU.</li>
|
|
@@ -318,13 +318,21 @@ with iface:
|
|
| 318 |
</div>
|
| 319 |
"""
|
| 320 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
with gr.Group():
|
| 323 |
-
model_choice = gr.
|
| 324 |
label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
|
| 325 |
value="bigvgan_v2_24khz_100band_256x",
|
| 326 |
choices=[m for m in LIST_MODEL_NAME],
|
| 327 |
-
type="index",
|
| 328 |
interactive=True,
|
| 329 |
)
|
| 330 |
|
|
|
|
| 35 |
audio = np.transpose(audio) # transpose to [channel, T] for librosa
|
| 36 |
audio = audio / MAX_WAV_VALUE # convert int16 to float range used by BigVGAN
|
| 37 |
|
| 38 |
+
h = dict_config[model_choice]
|
| 39 |
+
model = dict_model[model_choice]
|
| 40 |
|
| 41 |
if sr != h.sampling_rate: # convert audio to model's sampling rate
|
| 42 |
audio = librosa.resample(audio, orig_sr=sr, target_sr=h.sampling_rate)
|
|
|
|
| 254 |
"bigvgan_v2_44khz_128band_512x": "g_03000000"
|
| 255 |
}
|
| 256 |
|
| 257 |
+
dict_model = {}
|
| 258 |
+
dict_config = {}
|
| 259 |
|
| 260 |
for model_name in LIST_MODEL_NAME:
|
| 261 |
model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
|
|
|
|
| 275 |
generator.eval()
|
| 276 |
generator.remove_weight_norm()
|
| 277 |
|
| 278 |
+
dict_model[model_name] = generator
|
| 279 |
+
dict_config[model_name] = h
|
| 280 |
|
| 281 |
######################## script for gradio UI ########################
|
| 282 |
|
|
|
|
| 285 |
with iface:
|
| 286 |
gr.HTML(
|
| 287 |
"""
|
| 288 |
+
<div style="text-align: center; max-width: 900px; margin: 0 auto;">
|
| 289 |
+
<div
|
| 290 |
+
style="
|
| 291 |
+
display: inline-flex;
|
| 292 |
+
align-items: center;
|
| 293 |
+
gap: 0.8rem;
|
| 294 |
+
font-size: 1.75rem;
|
| 295 |
+
"
|
| 296 |
+
>
|
| 297 |
+
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
|
| 298 |
+
BigVGAN: A Universal Neural Vocoder with Large-Scale Training
|
| 299 |
+
</h1>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
</div>
|
| 301 |
+
<p style="margin-bottom: 10px; font-size: 125%">
|
| 302 |
+
<a href="https://arxiv.org/abs/2206.04658">[Paper]</a> <a href="https://github.com/NVIDIA/BigVGAN">[Code]</a> <a href="https://bigvgan-demo.github.io/">[Demo]</a> <a href="https://research.nvidia.com/labs/adlr/projects/bigvgan/">[Project page]</a>
|
| 303 |
+
</p>
|
| 304 |
+
</div>
|
| 305 |
"""
|
| 306 |
)
|
| 307 |
gr.HTML(
|
| 308 |
"""
|
| 309 |
<div>
|
| 310 |
+
<h3>News</h3>
|
| 311 |
<p>[Jul 2024] We release BigVGAN-v2 along with pretrained checkpoints. Below are the highlights:</p>
|
| 312 |
<ul>
|
| 313 |
<li>Custom CUDA kernel for inference: we provide a fused upsampling + activation kernel written in CUDA for accelerated inference speed. Our test shows 1.5 - 3x faster speed on a single A100 GPU.</li>
|
|
|
|
| 318 |
</div>
|
| 319 |
"""
|
| 320 |
)
|
| 321 |
+
gr.HTML(
|
| 322 |
+
"""
|
| 323 |
+
<div>
|
| 324 |
+
<h3>Model Overview</h3>
|
| 325 |
+
BigVGAN is a neural vocoder model that generates audio waveforms using mel spectrogram as inputs.
|
| 326 |
+
<center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800" style="margin-top: 20px;"></center>
|
| 327 |
+
</div>
|
| 328 |
+
"""
|
| 329 |
+
)
|
| 330 |
|
| 331 |
with gr.Group():
|
| 332 |
+
model_choice = gr.Dropdown(
|
| 333 |
label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
|
| 334 |
value="bigvgan_v2_24khz_100band_256x",
|
| 335 |
choices=[m for m in LIST_MODEL_NAME],
|
|
|
|
| 336 |
interactive=True,
|
| 337 |
)
|
| 338 |
|