Epsilon617 committed · Commit cb013a1 · Parent(s): 479afcd
reformatting the outputs as dataframe
Prediction_Head/__pycache__/MTGGenre_head.cpython-310.pyc
ADDED
Binary file (1.67 kB)

__pycache__/app.cpython-310.pyc
CHANGED
Binary files a/__pycache__/app.cpython-310.pyc and b/__pycache__/app.cpython-310.pyc differ
app.py
CHANGED
@@ -10,6 +10,9 @@ import logging
 
 import json
 import os
+import re
+
+import pandas as pd
 
 import importlib
 modeling_MERT = importlib.import_module("MERT-v1-95M.modeling_MERT")
@@ -36,8 +39,7 @@ inputs = [
 live_inputs = [
     gr.Audio(source="microphone",streaming=True, type="filepath"),
 ]
-
-# outputs = [gr.components.Textbox(), transcription_df]
+
 title = "One Model for All Music Understanding Tasks"
 description = "An example of using the [MERT-v1-95M](https://huggingface.co/m-a-p/MERT-v1-95M) model as backbone to conduct multiple music understanding tasks with the universal represenation."
 article = "The tasks include EMO, GS, MTGInstrument, MTGGenre, MTGTop50, MTGMood, NSynthI, NSynthP, VocalSetS, VocalSetT. \n\n More models can be referred at the [map organization page](https://huggingface.co/m-a-p)."
@@ -46,6 +48,17 @@ audio_examples = [
     # ["input/example-2.wav"],
 ]
 
+df_init = pd.DataFrame(columns=['Task', 'Top 1', 'Top 2', 'Top 3'])
+transcription_df = gr.DataFrame(value=df_init, label="Output Dataframe", row_count=(
+    0, "dynamic"), max_rows=30, wrap=True, overflow_row_behaviour='paginate')
+# outputs = [gr.components.Textbox()]
+outputs = [ transcription_df]
+
+df_init_live = pd.DataFrame(columns=['Task', 'Top 1', 'Top 2', 'Top 3'])
+transcription_df_live = gr.DataFrame(value=df_init_live, label="Output Dataframe", row_count=(
+    0, "dynamic"), max_rows=30, wrap=True, overflow_row_behaviour='paginate')
+outputs_live = [transcription_df_live]
+
 # Load the model and the corresponding preprocessor config
 # model = AutoModel.from_pretrained("m-a-p/MERT-v0-public", trust_remote_code=True)
 # processor = Wav2Vec2FeatureExtractor.from_pretrained("m-a-p/MERT-v0-public",trust_remote_code=True)
@@ -105,6 +118,7 @@ for task in TASKS:
 
 model.to(device)
 
+
 def model_infernce(inputs):
     waveform, sample_rate = torchaudio.load(inputs)
 
@@ -112,7 +126,7 @@ def model_infernce(inputs):
 
     # make sure the sample_rate aligned
     if resample_rate != sample_rate:
-        print(f'setting rate from {sample_rate} to {resample_rate}')
+        # print(f'setting rate from {sample_rate} to {resample_rate}')
         resampler = T.Resample(sample_rate, resample_rate)
         waveform = resampler(waveform)
 
@@ -129,13 +143,16 @@ def model_infernce(inputs):
     all_layer_hidden_states = all_layer_hidden_states.mean(dim=2)
 
     task_output_texts = ""
+    df = pd.DataFrame(columns=['Task', 'Top 1', 'Top 2', 'Top 3'])
+    df_objects = []
+
     for task in TASKS:
         num_class = len(ID2CLASS[task].keys())
         if MERT_BEST_LAYER_IDX[task] == 'all':
             logits = CLASSIFIERS[task](all_layer_hidden_states) # [1, 87]
         else:
             logits = CLASSIFIERS[task](all_layer_hidden_states[:, MERT_BEST_LAYER_IDX[task]])
-        print(f'task {task} logits:', logits.shape, 'num class:', num_class)
+        # print(f'task {task} logits:', logits.shape, 'num class:', num_class)
 
         sorted_idx = torch.argsort(logits, dim = -1, descending=True)[0] # batch =1
         sorted_prob,_ = torch.sort(nn.functional.softmax(logits[0], dim=-1), dim=-1, descending=True)
@@ -145,33 +162,40 @@ def model_infernce(inputs):
         top_n_show = 3 if num_class >= 3 else num_class
         task_output_texts = task_output_texts + f"TASK {task} output:\n" + "\n".join([str(ID2CLASS[task][str(sorted_idx[idx].item())])+f', probability: {sorted_prob[idx].item():.2%}' for idx in range(top_n_show)]) + '\n'
         task_output_texts = task_output_texts + '----------------------\n'
-
-
-
-
-
-
+
+        row_elements = [task]
+        for idx in range(top_n_show):
+            print(ID2CLASS[task])
+            # print('id', str(sorted_idx[idx].item()))
+            output_class_name = str(ID2CLASS[task][str(sorted_idx[idx].item())])
+            output_class_name = re.sub(r'^\w+---', '', output_class_name)
+            output_class_name = re.sub(r'^\w+\/\w+---', '', output_class_name)
+            # print('output name', output_class_name)
+            output_prob = f' {sorted_prob[idx].item():.2%}'
+            row_elements.append(output_class_name+output_prob)
+        # fill empty elment
+        for _ in range(4 - len(row_elements)):
+            row_elements.append(' ')
+        df_objects.append(row_elements)
+    df = pd.DataFrame(df_objects, columns=['Task', 'Top 1', 'Top 2', 'Top 3'])
+    return df
 
 def convert_audio(inputs, microphone):
     if (microphone is not None):
         inputs = microphone
-
-    text = model_infernce(inputs)
-
-    return text
-
+    df = model_infernce(inputs)
+    return df
+
 def live_convert_audio(microphone):
     if (microphone is not None):
         inputs = microphone
-
-
-
-    return text
+    df = model_infernce(inputs)
+    return df
 
 audio_chunked = gr.Interface(
     fn=convert_audio,
     inputs=inputs,
-    outputs=
+    outputs=outputs,
     allow_flagging="never",
     title=title,
     description=description,
@@ -182,7 +206,7 @@ audio_chunked = gr.Interface(
 live_audio_chunked = gr.Interface(
     fn=live_convert_audio,
     inputs=live_inputs,
-    outputs=
+    outputs=outputs_live,
     allow_flagging="never",
     title=title,
     description=description,
@@ -204,5 +228,5 @@ with demo:
             "Live Streaming Music"
         ]
     )
-demo.queue(concurrency_count=1, max_size=5)
+# demo.queue(concurrency_count=1, max_size=5)
 demo.launch(show_api=False)
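For reference, the pattern this commit introduces can be reduced to a small standalone sketch: the inference function returns a pandas DataFrame with one row per task, and gr.Interface renders it through a gr.DataFrame output component. The sketch below is illustrative only, not the Space's actual code; it assumes the Gradio 3.x-era API that the diff itself uses (gr.Audio(source=...), allow_flagging), and predict_stub is a hypothetical placeholder for the real model_infernce pipeline, not the MERT classifiers.

import gradio as gr
import pandas as pd

COLUMNS = ['Task', 'Top 1', 'Top 2', 'Top 3']

def predict_stub(audio_path):
    # Placeholder for model_infernce(): each row holds a task name plus its
    # top-3 "class probability" strings, mirroring the columns defined above.
    rows = [
        ["MTGGenre", "rock 42.00%", "pop 21.00%", "metal 10.00%"],
        ["NSynthI", "guitar 55.00%", "keyboard 20.00%", "bass 9.00%"],
    ]
    return pd.DataFrame(rows, columns=COLUMNS)

# Empty DataFrame used as the component's initial value, as in the commit.
df_init = pd.DataFrame(columns=COLUMNS)

sketch_demo = gr.Interface(
    fn=predict_stub,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.DataFrame(value=df_init, label="Output Dataframe"),
    allow_flagging="never",
)

if __name__ == "__main__":
    sketch_demo.launch(show_api=False)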