switch to gr.Audio for output display
app.py
CHANGED
@@ -10,7 +10,7 @@ from meldataset import mel_spectrogram, MAX_WAV_VALUE
 from models import BigVGAN as Generator
 import librosa
 import numpy as np
-from utils import plot_spectrogram
+from utils import plot_spectrogram
 import PIL
 
 if torch.cuda.is_available():
@@ -43,16 +43,21 @@ def inference_gradio(input, model_choice): # input is audio waveform in [T, cha
     if len(audio.shape) == 2: # stereo
         audio = librosa.to_mono(audio) # convert to mono if stereo
     audio = librosa.util.normalize(audio) * 0.95
-    output, spec_gen = inference_model(audio, h, model) # output is generated audio in ndarray
 
-
+    output, spec_gen = inference_model(
+        audio, h, model
+    ) # output is generated audio in ndarray, int16
 
-
-    output_image_gen = PIL.Image.frombytes('RGB',
-                                           spec_plot_gen.canvas.get_width_height(),
-                                           spec_plot_gen.canvas.tostring_rgb())
+    spec_plot_gen = plot_spectrogram(spec_gen)
 
-
+    output_audio = (h.sampling_rate, output) # tuple for gr.Audio output
+    output_image = PIL.Image.frombytes(
+        "RGB",
+        spec_plot_gen.canvas.get_width_height(),
+        spec_plot_gen.canvas.tostring_rgb(),
+    )
+
+    return output_audio, output_image
 
 
 @spaces.GPU(duration=120)
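For readers unfamiliar with the two conversions this hunk introduces, here is a minimal, self-contained sketch (not part of app.py): packing samples into the (sample_rate, int16 ndarray) tuple that gr.Audio accepts as output, and turning a matplotlib figure into a PIL image the same way the diff does via the Agg canvas. The sine wave and specgram call stand in for the model output and the repo's plot_spectrogram; note that canvas.tostring_rgb() is deprecated in recent matplotlib, where buffer_rgba() is the replacement.

    import numpy as np
    import matplotlib
    matplotlib.use("Agg")  # off-screen rendering, as in a Space
    import matplotlib.pyplot as plt
    from PIL import Image

    sampling_rate = 24000
    t = np.linspace(0, 1.0, sampling_rate, endpoint=False)
    wave = 0.5 * np.sin(2 * np.pi * 440.0 * t)      # float waveform in [-1, 1]
    wave_int16 = (wave * 32767).astype("int16")     # int16 samples work directly with gr.Audio
    output_audio = (sampling_rate, wave_int16)      # tuple form consumed by gr.Audio

    fig, ax = plt.subplots()
    ax.specgram(wave, Fs=sampling_rate)             # stand-in for plot_spectrogram(spec_gen)
    fig.canvas.draw()                               # fill the Agg buffer before reading it
    output_image = Image.frombytes(
        "RGB",
        fig.canvas.get_width_height(),
        fig.canvas.tostring_rgb(),                  # same call pattern as the hunk above
    )
    plt.close(fig)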
@@ -61,8 +66,8 @@ def inference_model(audio_input, h, model):
     model.to(device)
 
     def get_mel(x):
-
-
+        return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
+
     with torch.inference_mode():
         wav = torch.FloatTensor(audio_input)
         # compute mel spectrogram from the ground truth audio
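The new get_mel body passes the model's hyperparameters straight into the repo's mel_spectrogram. As a reading aid only, this hypothetical torchaudio equivalent shows what those h.* fields control; BigVGAN's own implementation differs in details (padding, mel normalization, log compression), and the numeric values below are placeholders in the ballpark of the 24 kHz, 100-band config rather than values read from config.json.

    import torch
    import torchaudio

    h_n_fft, h_num_mels, h_sampling_rate = 1024, 100, 24000
    h_hop_size, h_win_size, h_fmin, h_fmax = 256, 1024, 0, 12000

    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=h_sampling_rate,
        n_fft=h_n_fft,
        win_length=h_win_size,
        hop_length=h_hop_size,
        f_min=h_fmin,
        f_max=h_fmax,
        n_mels=h_num_mels,
    )
    wav = torch.zeros(1, h_sampling_rate)   # 1 second of silence, shape [B, T]
    mel = to_mel(wav)                        # shape [B, n_mels, T_frames]
    print(mel.shape)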
@@ -72,15 +77,16 @@ def inference_model(audio_input, h, model):
 
         audio_gen = y_g_hat.squeeze().cpu()
         spec_gen = get_mel(audio_gen.unsqueeze(0))
-        audio_gen = audio_gen
-        audio_gen = audio_gen
+        audio_gen = audio_gen.numpy() # [T], float [-1, 1]
+        audio_gen = (audio_gen * MAX_WAV_VALUE).astype("int16") # [T], int16
+        spec_gen = spec_gen.squeeze().numpy() # [C, T_frame]
 
         # unload to cpu
-        model.to(
+        model.to("cpu")
         # delete gpu tensor
         del spec_gt, y_g_hat
-
-    return audio_gen, spec_gen
+
+    return audio_gen, spec_gen
 
 
 css = """
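The scaling by MAX_WAV_VALUE above maps the generated float waveform onto the int16 range that gr.Audio plays back directly. A small standalone illustration, with an extra clipping step that the hunk itself does not perform but which guards against samples slightly outside [-1, 1]:

    import numpy as np

    MAX_WAV_VALUE = 32768.0  # same constant exported by meldataset.py

    def float_to_int16(audio: np.ndarray) -> np.ndarray:
        # clip to avoid int16 overflow at exactly +1.0, then scale and truncate
        audio = np.clip(audio, -1.0, 1.0 - 1.0 / MAX_WAV_VALUE)
        return (audio * MAX_WAV_VALUE).astype(np.int16)

    print(float_to_int16(np.array([0.0, 0.5, -1.0, 1.0])))  # [0 16384 -32768 32767]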
@@ -222,9 +228,9 @@ css = """
 
 ######################## script for loading the models ########################
 
-
+MODEL_PATH = "nvidia/BigVGAN"
 
-list_model_name = [
+LIST_MODEL_NAME = [
     "bigvgan_24khz_100band",
     "bigvgan_base_24khz_100band",
     "bigvgan_22khz_80band",
@@ -236,7 +242,7 @@ list_model_name = [
     "bigvgan_v2_44khz_128band_512x"
 ]
 
-model_files = {
+DICT_MODEL_NAME_FILE_PAIRS = {
     "bigvgan_24khz_100band": "g_05000000",
     "bigvgan_base_24khz_100band": "g_05000000",
     "bigvgan_22khz_80band": "g_05000000",
@@ -251,9 +257,9 @@ model_files = {
 list_model = []
 list_config = []
 
-for model_name in
-    model_file = hf_hub_download(
-    config_file = hf_hub_download(
+for model_name in LIST_MODEL_NAME:
+    model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
+    config_file = hf_hub_download(MODEL_PATH, f"{model_name}/config.json", use_auth_token=os.environ['TOKEN'])
 
     with open(config_file) as f:
         data = f.read()
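Each model is fetched from the nvidia/BigVGAN repository with huggingface_hub. A rough standalone sketch of one iteration of that loop, assuming the checkpoint layout the diff encodes and that the files are reachable with the given token; it uses the newer token= argument instead of the deprecated use_auth_token= seen in the hunk, and parses the config into a plain dict:

    import json
    import os
    from huggingface_hub import hf_hub_download

    MODEL_PATH = "nvidia/BigVGAN"
    model_name = "bigvgan_24khz_100band"            # one of the entries in LIST_MODEL_NAME

    ckpt_path = hf_hub_download(
        MODEL_PATH, f"{model_name}/g_05000000", token=os.environ.get("TOKEN")
    )
    config_path = hf_hub_download(
        MODEL_PATH, f"{model_name}/config.json", token=os.environ.get("TOKEN")
    )

    with open(config_path) as f:
        h = json.load(f)                            # hyperparameters, e.g. h["sampling_rate"]
    print(ckpt_path, h["sampling_rate"])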
@@ -314,24 +320,29 @@ with iface:
     )
 
     with gr.Group():
-        model_choice = gr.Radio(
-
-
-
-
-
-
-
+        model_choice = gr.Radio(
+            label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
+            value="bigvgan_v2_24khz_100band_256x",
+            choices=[m for m in LIST_MODEL_NAME],
+            type="index",
+            interactive=True,
+        )
+
+        audio_input = gr.Audio(
+            label="Input Audio", elem_id="input-audio", interactive=True
+        )
+
         button = gr.Button("Submit")
-
-
-
-
-        button.click(
-
-
-
-
+
+        output_audio = gr.Audio(label="Output Audio", elem_id="output-audio")
+        output_image = gr.Image(label="Output Mel Spectrogram", elem_id="output-image-gen")
+
+        button.click(
+            inference_gradio,
+            inputs=[audio_input, model_choice],
+            outputs=[output_audio, output_image],
+            concurrency_limit=10,
+        )
 
     gr.Examples(
         [
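The wiring added here is the standard Blocks pattern: the components are created inside the layout context and button.click binds the callback to them. A minimal self-contained sketch of the same shape (Gradio 4.x assumed), with a pass-through callback standing in for inference_gradio:

    import gradio as gr
    import numpy as np

    def passthrough(audio):
        sr, samples = audio                          # gr.Audio input arrives as (sample_rate, ndarray)
        if samples.ndim == 2:                        # stereo -> mono, mirroring inference_gradio
            samples = samples.mean(axis=1).astype(samples.dtype)
        fake_spec = (np.random.rand(100, 256) * 255).astype(np.uint8)  # placeholder instead of a mel plot
        return (sr, samples), fake_spec

    with gr.Blocks() as demo:
        audio_input = gr.Audio(label="Input Audio")
        button = gr.Button("Submit")
        output_audio = gr.Audio(label="Output Audio")
        output_image = gr.Image(label="Output Mel Spectrogram")
        button.click(passthrough, inputs=audio_input, outputs=[output_audio, output_image])

    if __name__ == "__main__":
        demo.launch()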
@@ -347,7 +358,7 @@ with iface:
         ],
         fn=inference_gradio,
         inputs=[audio_input, model_choice],
-        outputs=[
+        outputs=[output_audio, output_image]
     )
 
     gr.HTML(
@@ -355,12 +366,12 @@ with iface:
         <table border="1" cellspacing="0" cellpadding="5">
         <thead>
         <tr>
-        <th>
+        <th>Model Name</th>
         <th>Sampling Rate</th>
         <th>Mel band</th>
         <th>fmax</th>
         <th>Upsampling Ratio</th>
-        <th>
+        <th>Parameters</th>
         <th>Dataset</th>
         <th>Fine-Tuned</th>
         </tr>