Spaces:

nvidia
/

BigVGAN

Running

App Files Files Community

L0SG committed on Jul 13, 2024

Commit

eac4c42

1 Parent(s): 3455431

update

Browse files

Files changed (1) hide show

app.py +33 -25

app.py CHANGED Viewed

@@ -35,8 +35,8 @@ def inference_gradio(input, model_choice):  # input is audio waveform in [T, cha
     audio = np.transpose(audio)  # transpose to [channel, T] for librosa
     audio = audio / MAX_WAV_VALUE  # convert int16 to float range used by BigVGAN
-    h = list_config[model_choice]
-    model = list_model[model_choice]
     if sr != h.sampling_rate:  # convert audio to model's sampling rate
         audio = librosa.resample(audio, orig_sr=sr, target_sr=h.sampling_rate)
@@ -254,8 +254,8 @@ DICT_MODEL_NAME_FILE_PAIRS = {
     "bigvgan_v2_44khz_128band_512x": "g_03000000"
 }
-list_model = []
-list_config = []
 for model_name in LIST_MODEL_NAME:
     model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
@@ -275,8 +275,8 @@ for model_name in LIST_MODEL_NAME:
     generator.eval()
     generator.remove_weight_norm()
-    list_model.append(generator)
-    list_config.append(h)
 ######################## script for gradio UI ########################
@@ -285,29 +285,29 @@ iface = gr.Blocks(css=css)
 with iface:
     gr.HTML(
         """
-            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-              <div
-                style="
-                  display: inline-flex;
-                  align-items: center;
-                  gap: 0.8rem;
-                  font-size: 1.75rem;
-                "
-              >
-                <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
-                  BigVGAN: A Universal Neural Vocoder with Large-Scale Training
-                </h1>
-              </div>
-              <p style="margin-bottom: 10px; font-size: 94%">
-                <a href="https://arxiv.org/abs/2206.04658">[Paper]</a>  <a href="https://github.com/NVIDIA/BigVGAN">[Code]</a>  <a href="https://bigvgan-demo.github.io/">[Demo]</a>  <a href="https://research.nvidia.com/labs/adlr/projects/bigvgan/">[Project page]</a>
-              </p>
             </div>
         """
     )
     gr.HTML(
         """
         <div>
-        <h2>News</h2>
         <p>[Jul 2024] We release BigVGAN-v2 along with pretrained checkpoints. Below are the highlights:</p>
         <ul>
             <li>Custom CUDA kernel for inference: we provide a fused upsampling + activation kernel written in CUDA for accelerated inference speed. Our test shows 1.5 - 3x faster speed on a single A100 GPU.</li>
@@ -318,13 +318,21 @@ with iface:
         </div>
         """
     )
     with gr.Group():
-        model_choice = gr.Radio(
             label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
             value="bigvgan_v2_24khz_100band_256x",
             choices=[m for m in LIST_MODEL_NAME],
-            type="index",
             interactive=True,
         )

     audio = np.transpose(audio)  # transpose to [channel, T] for librosa
     audio = audio / MAX_WAV_VALUE  # convert int16 to float range used by BigVGAN
+    h = dict_config[model_choice]
+    model = dict_model[model_choice]
     if sr != h.sampling_rate:  # convert audio to model's sampling rate
         audio = librosa.resample(audio, orig_sr=sr, target_sr=h.sampling_rate)
     "bigvgan_v2_44khz_128band_512x": "g_03000000"
 }
+dict_model = {}
+dict_config = {}
 for model_name in LIST_MODEL_NAME:
     model_file = hf_hub_download(MODEL_PATH, f"{model_name}/{DICT_MODEL_NAME_FILE_PAIRS[model_name]}", use_auth_token=os.environ['TOKEN'])
     generator.eval()
     generator.remove_weight_norm()
+    dict_model[model_name] = generator
+    dict_config[model_name] = h
 ######################## script for gradio UI ########################
 with iface:
     gr.HTML(
         """
+        <div style="text-align: center; max-width: 900px; margin: 0 auto;">
+            <div
+            style="
+                display: inline-flex;
+                align-items: center;
+                gap: 0.8rem;
+                font-size: 1.75rem;
+            "
+            >
+            <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
+                BigVGAN: A Universal Neural Vocoder with Large-Scale Training
+            </h1>
             </div>
+            <p style="margin-bottom: 10px; font-size: 125%">
+            <a href="https://arxiv.org/abs/2206.04658">[Paper]</a>  <a href="https://github.com/NVIDIA/BigVGAN">[Code]</a>  <a href="https://bigvgan-demo.github.io/">[Demo]</a>  <a href="https://research.nvidia.com/labs/adlr/projects/bigvgan/">[Project page]</a>
+            </p>
+        </div>
         """
     )
     gr.HTML(
         """
         <div>
+        <h3>News</h3>
         <p>[Jul 2024] We release BigVGAN-v2 along with pretrained checkpoints. Below are the highlights:</p>
         <ul>
             <li>Custom CUDA kernel for inference: we provide a fused upsampling + activation kernel written in CUDA for accelerated inference speed. Our test shows 1.5 - 3x faster speed on a single A100 GPU.</li>
         </div>
         """
     )
+    gr.HTML(
+        """
+        <div>
+        <h3>Model Overview</h3>
+        BigVGAN is a neural vocoder model that generates audio waveforms using mel spectrogram as inputs.
+        <center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800" style="margin-top: 20px;"></center>
+        </div>
+        """
+    )
     with gr.Group():
+        model_choice = gr.Dropdown(
             label="Select the model. Default: bigvgan_v2_24khz_100band_256x",
             value="bigvgan_v2_24khz_100band_256x",
             choices=[m for m in LIST_MODEL_NAME],
             interactive=True,
         )