switch to gr.Audio for output display
app.py
CHANGED
@@ -10,7 +10,7 @@ from meldataset import mel_spectrogram, MAX_WAV_VALUE
 from models import BigVGAN as Generator
 import librosa
 import numpy as np
-from utils import plot_spectrogram
+from utils import plot_spectrogram
 import PIL
 
 if torch.cuda.is_available():
@@ -43,16 +43,21 @@ def inference_gradio(input, model_choice): # input is audio waveform in [T, cha
     if len(audio.shape) == 2: # stereo
         audio = librosa.to_mono(audio) # convert to mono if stereo
     audio = librosa.util.normalize(audio) * 0.95
-    output, spec_gen = inference_model(audio, h, model) # output is generated audio in ndarray
 
-
+    output, spec_gen = inference_model(
+        audio, h, model
+    ) # output is generated audio in ndarray, int16
 
-
-    output_image_gen = PIL.Image.frombytes('RGB',
-                                           spec_plot_gen.canvas.get_width_height(),
-                                           spec_plot_gen.canvas.tostring_rgb())
+    spec_plot_gen = plot_spectrogram(spec_gen)
 
-
+    output_audio = (h.sampling_rate, output) # tuple for gr.Audio output
+    output_image = PIL.Image.frombytes(
+        "RGB",
+        spec_plot_gen.canvas.get_width_height(),
+        spec_plot_gen.canvas.tostring_rgb(),
+    )
+
+    return output_audio, output_image
 
 
 @spaces.GPU(duration=120)
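For readers unfamiliar with the two conversions this hunk introduces, here is a minimal, self-contained sketch (not part of app.py): packing samples into the (sample_rate, int16 ndarray) tuple that gr.Audio accepts as output, and turning a matplotlib figure into a PIL image the same way the diff does via the Agg canvas. The sine wave and specgram call stand in for the model output and the repo's plot_spectrogram; note that canvas.tostring_rgb() is deprecated in recent matplotlib, where buffer_rgba() is the replacement.

    import numpy as np
    import matplotlib
    matplotlib.use("Agg")  # off-screen rendering, as in a Space
    import matplotlib.pyplot as plt
    from PIL import Image

    sampling_rate = 24000
    t = np.linspace(0, 1.0, sampling_rate, endpoint=False)
    wave = 0.5 * np.sin(2 * np.pi * 440.0 * t)      # float waveform in [-1, 1]
    wave_int16 = (wave * 32767).astype("int16")     # int16 samples work directly with gr.Audio
    output_audio = (sampling_rate, wave_int16)      # tuple form consumed by gr.Audio

    fig, ax = plt.subplots()
    ax.specgram(wave, Fs=sampling_rate)             # stand-in for plot_spectrogram(spec_gen)
    fig.canvas.draw()                               # fill the Agg buffer before reading it
    output_image = Image.frombytes(
        "RGB",
        fig.canvas.get_width_height(),
        fig.canvas.tostring_rgb(),                  # same call pattern as the hunk above
    )
    plt.close(fig)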
@@ -61,8 +66,8 @@ def inference_model(audio_input, h, model):
     model.to(device)
 
     def get_mel(x):
-
-
+        return mel_spectrogram(x, h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax)
+
     with torch.inference_mode():
         wav = torch.FloatTensor(audio_input)
         # compute mel spectrogram from the ground truth audio
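The new get_mel body passes the model's hyperparameters straight into the repo's mel_spectrogram. As a reading aid only, this hypothetical torchaudio equivalent shows what those h.* fields control; BigVGAN's own implementation differs in details (padding, mel normalization, log compression), and the numeric values below are placeholders in the ballpark of the 24 kHz, 100-band config rather than values read from config.json.

    import torch
    import torchaudio

    h_n_fft, h_num_mels, h_sampling_rate = 1024, 100, 24000
    h_hop_size, h_win_size, h_fmin, h_fmax = 256, 1024, 0, 12000

    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=h_sampling_rate,
        n_fft=h_n_fft,
        win_length=h_win_size,
        hop_length=h_hop_size,
        f_min=h_fmin,
        f_max=h_fmax,
        n_mels=h_num_mels,
    )
    wav = torch.zeros(1, h_sampling_rate)   # 1 second of silence, shape [B, T]
    mel = to_mel(wav)                        # shape [B, n_mels, T_frames]
    print(mel.shape)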
@@ -72,15 +77,16 @@ def inference_model(audio_input, h, model):
 
         audio_gen = y_g_hat.squeeze().cpu()
         spec_gen = get_mel(audio_gen.unsqueeze(0))
-        audio_gen = audio_gen
-        audio_gen = audio_gen
+        audio_gen = audio_gen.numpy() # [T], float [-1, 1]
+        audio_gen = (audio_gen * MAX_WAV_VALUE).astype("int16") # [T], int16
+        spec_gen = spec_gen.squeeze().numpy() # [C, T_frame]
 
         # unload to cpu
-        model.to(
+        model.to("cpu")
         # delete gpu tensor
         del spec_gt, y_g_hat
-
-    return audio_gen, spec_gen
+
+    return audio_gen, spec_gen
 
 
 css = """
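The scaling by MAX_WAV_VALUE above maps the generated float waveform onto the int16 range that gr.Audio plays back directly. A small standalone illustration, with an extra clipping step that the hunk itself does not perform but which guards against samples slightly outside [-1, 1]:

    import numpy as np

    MAX_WAV_VALUE = 32768.0  # same constant exported by meldataset.py

    def float_to_int16(audio: np.ndarray) -> np.ndarray:
        # clip to avoid int16 overflow at exactly +1.0, then scale and truncate
        audio = np.clip(audio, -1.0, 1.0 - 1.0 / MAX_WAV_VALUE)
        return (audio * MAX_WAV_VALUE).astype(np.int16)

    print(float_to_int16(np.array([0.0, 0.5, -1.0, 1.0])))  # [0 16384 -32768 32767]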
@@ -222,9 +228,9 @@ css = """
 
 ######################## script for loading the models ########################
 
-
+MODEL_PATH = "nvidia/BigVGAN"
 
-list_model_name = [
+LIST_MODEL_NAME = [
     "bigvgan_24khz_100band",
     "bigvgan_base_24khz_100band",
     "bigvgan_22khz_80band",
@@ -236,7 +242,7 @@ list_model_name = [
     "bigvgan_v2_44khz_128band_512x"
 ]
 
-model_files = {
+DICT_MODEL_NAME_FILE_PAIRS = {
     "bigvgan_24khz_100band": "g_05000000",
     "bigvgan_base_24khz_100band": "g_05000000",
     "bigvgan_22khz_80band": "g_05000000",
@@ -251,9 +257,9 @@ model_files = {
 list_model = []
 list_config = []
 
-for model_name in
-    model_file = hf_hub_download(
-    config_file = hf_hub_download(
+for model_name in LIST_MODEL_NAME:
+    model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
+    config_file = hf_hub_download(MODEL_PATH, f"{model_name}/config.json", use_auth_token=os.environ['TOKEN'])
 
     with open(config_file) as f:
         data = f.read()
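Each model is fetched from the nvidia/BigVGAN repository with huggingface_hub. A rough standalone sketch of one iteration of that loop, assuming the checkpoint layout the diff encodes and that the files are reachable with the given token; it uses the newer token= argument instead of the deprecated use_auth_token= seen in the hunk, and parses the config into a plain dict:

    import json
    import os
    from huggingface_hub import hf_hub_download

    MODEL_PATH = "nvidia/BigVGAN"
    model_name = "bigvgan_24khz_100band"            # one of the entries in LIST_MODEL_NAME

    ckpt_path = hf_hub_download(
        MODEL_PATH, f"{model_name}/g_05000000", token=os.environ.get("TOKEN")
    )
    config_path = hf_hub_download(
        MODEL_PATH, f"{model_name}/config.json", token=os.environ.get("TOKEN")
    )

    with open(config_path) as f:
        h = json.load(f)                            # hyperparameters, e.g. h["sampling_rate"]
    print(ckpt_path, h["sampling_rate"])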
@@ -314,24 +320,29 @@ with iface:
     )
 
     with gr.Group():
-        model_choice = gr.Radio(
-
-
-
-
-
-
-
+        model_choice = gr.Radio(
+            label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
+            value="bigvgan_v2_24khz_100band_256x",
+            choices=[m for m in LIST_MODEL_NAME],
+            type="index",
+            interactive=True,
+        )
+
+        audio_input = gr.Audio(
+            label="Input Audio", elem_id="input-audio", interactive=True
+        )
+
         button = gr.Button("Submit")
-
-
-
-
-        button.click(
-
-
-
-
+
+        output_audio = gr.Audio(label="Output Audio", elem_id="output-audio")
+        output_image = gr.Image(label="Output Mel Spectrogram", elem_id="output-image-gen")
+
+        button.click(
+            inference_gradio,
+            inputs=[audio_input, model_choice],
+            outputs=[output_audio, output_image],
+            concurrency_limit=10,
+        )
 
     gr.Examples(
         [
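The wiring added here is the standard Blocks pattern: the components are created inside the layout context and button.click binds the callback to them. A minimal self-contained sketch of the same shape (Gradio 4.x assumed), with a pass-through callback standing in for inference_gradio:

    import gradio as gr
    import numpy as np

    def passthrough(audio):
        sr, samples = audio                          # gr.Audio input arrives as (sample_rate, ndarray)
        if samples.ndim == 2:                        # stereo -> mono, mirroring inference_gradio
            samples = samples.mean(axis=1).astype(samples.dtype)
        fake_spec = (np.random.rand(100, 256) * 255).astype(np.uint8)  # placeholder instead of a mel plot
        return (sr, samples), fake_spec

    with gr.Blocks() as demo:
        audio_input = gr.Audio(label="Input Audio")
        button = gr.Button("Submit")
        output_audio = gr.Audio(label="Output Audio")
        output_image = gr.Image(label="Output Mel Spectrogram")
        button.click(passthrough, inputs=audio_input, outputs=[output_audio, output_image])

    if __name__ == "__main__":
        demo.launch()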
@@ -347,7 +358,7 @@ with iface:
         ],
         fn=inference_gradio,
         inputs=[audio_input, model_choice],
-        outputs=[
+        outputs=[output_audio, output_image]
     )
 
     gr.HTML(
@@ -355,12 +366,12 @@ with iface:
         <table border="1" cellspacing="0" cellpadding="5">
         <thead>
         <tr>
-        <th>
+        <th>Model Name</th>
         <th>Sampling Rate</th>
         <th>Mel band</th>
         <th>fmax</th>
         <th>Upsampling Ratio</th>
-        <th>
+        <th>Parameters</th>
         <th>Dataset</th>
         <th>Fine-Tuned</th>
         </tr>