Spaces: Runtime error

Upload folder using huggingface_hub

- .gitattributes +2 -0
- .gitignore +1 -0
- Makefile +19 -0
- app.py +109 -58
- examples/audio_instruction.wav +3 -0
- examples/audio_understand.wav +3 -0
- examples/elon_musk.mp3 +3 -0
- examples/music_under.wav +3 -0
- examples/nuggets.mp3 +3 -0
- examples/nvidia_conference.mp3 +3 -0
- requirements.txt +5 -1
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
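The two new patterns route the example audio files through Git LFS, which is why the examples/*.wav and examples/*.mp3 entries added later in this commit are three-line LFS pointer files rather than raw audio.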
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__
Makefile ADDED
@@ -0,0 +1,19 @@
+.PHONY: style quality start clean
+
+
+style:
+	python -m black --line-length 119 .
+	python -m isort .
+	ruff check --fix .
+
+
+quality:
+	python -m black --check --line-length 119 .
+	python -m isort --check-only .
+	ruff check .
+
+start:
+	gradio app.py
+
+clean:
+	ps aux | grep "app" | grep -v "grep" | awk '{print $$2}' | xargs kill -9
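A note on the clean recipe: Make collapses the doubled "$$2" into a single "$2" before handing the line to the shell, so awk prints the PID column and the pipeline force-kills every running process whose command line matches "app". The .PHONY list above is corrected here to name the quality target that the file actually defines (the committed version listed a nonexistent format target).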
app.py CHANGED
@@ -1,64 +1,115 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.7,
-            step=0.1,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
+from transformers import AutoModelForCausalLM, AutoProcessor
+import librosa
+
+
+def split_audio(audio_arrays, chunk_limit=480000):
+    CHUNK_LIM = chunk_limit
+    audio_splits = []
+    # Split the loaded audio into 30 s chunks and extend the messages content
+    for i in range(
+        0,
+        len(audio_arrays),
+        CHUNK_LIM,
+    ):
+        audio_splits.append(audio_arrays[i : i + CHUNK_LIM])
+    return audio_splits
+
+
+# Placeholder for your actual LLM processing API call
+def process_audio(audio, text, chat_history):
+    conversation = [
+        {
+            "role": "user",
+            "content": [],
+        },
+    ]
+    audio = librosa.load(audio, sr=16000)[0]
+
+    if audio is not None:
+        splitted_audio = split_audio(audio)
+        for au in splitted_audio:
+            conversation[0]["content"].append(
+                {
+                    "type": "audio_url",
+                    "audio": "placeholder",
+                }
+            )
+        chat_history.append({"role": "user", "content": gr.Audio(value=(16000, audio))})
+
+    conversation[0]["content"].append(
+        {
+            "type": "text",
+            "text": text,
+        }
+    )
+
+    chat_history.append({"role": "user", "content": text})
+    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+    inputs = processor(text=prompt, audios=splitted_audio, sampling_rate=16000, return_tensors="pt", padding=True)
+    inputs = {k: v.to("cuda") for k, v in inputs.items()}
+    outputs = model.generate(**inputs, eos_token_id=151645, pad_token_id=151643, max_new_tokens=4096)
+
+    cont = outputs[:, inputs["input_ids"].shape[-1] :]
+
+    result = processor.batch_decode(cont, skip_special_tokens=True)[0]
+    chat_history.append(
+        {
+            "role": "assistant",
+            "content": result,
+        }
+    )
+
+    return chat_history
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## 🎙️ Aero-1-Audio")
+    gr.Markdown(
+        """
+        Aero-1-Audio is a compact audio model. With only 1.5B parameters and 50k hours of training data, it can perform a variety of tasks,
+        including ASR, basic audio understanding, audio instruction following, and scene analysis.
+
+        We provide several examples, such as:
+        - an NVIDIA conference talk and a show from Elon Musk, for long-form ASR
+        - simple audio instruction following
+        - audio understanding for weather and music
+
+        Note: the model may fail to follow your instructions in some cases, and its answers can often be wrong.
+        """
+    )
+
+    chatbot = gr.Chatbot(type="messages")
+
+    with gr.Row(variant="compact", equal_height=True):
+        audio_input = gr.Audio(label="Speak Here", type="filepath")
+        text_input = gr.Textbox(label="Text Input", placeholder="Type here", interactive=True)
+
+    with gr.Row():
+        chatbot_clear = gr.ClearButton([text_input, audio_input, chatbot], value="Clear")
+        chatbot_submit = gr.Button("Submit", variant="primary")
+        chatbot_submit.click(
+            process_audio,
+            inputs=[audio_input, text_input, chatbot],
+            outputs=[chatbot],
+        )
+
+    gr.Examples(
+        [
+            ["Please transcribe the audio for me", "./examples/elon_musk.mp3"],
+            ["Please transcribe the audio for me", "./examples/nvidia_conference.mp3"],
+            ["Please transcribe the audio for me", "./examples/nuggets.mp3"],
+            ["Please follow the instruction in the audio", "./examples/audio_instruction.wav"],
+            ["What is the primary instrument featured in the solo of this track?", "./examples/music_under.wav"],
+            ["What weather condition can be heard in the audio?", "./examples/audio_understand.wav"],
+        ],
+        inputs=[text_input, audio_input],
+        label="Examples",
+    )
 
 
 if __name__ == "__main__":
+    processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", device_map="cuda", torch_dtype="auto", attn_implementation="sdpa", trust_remote_code=True)
     demo.launch()
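A note on the chunking in split_audio above: the default chunk_limit of 480000 samples is exactly 30 seconds of 16 kHz audio (480000 / 16000 = 30), matching the rate librosa.load is given. A minimal standalone sketch of the same slicing arithmetic, with a synthetic numpy array standing in for a real recording (not part of the commit):

import numpy as np

SAMPLE_RATE = 16000   # app.py loads audio with librosa at sr=16000
CHUNK_LIMIT = 480000  # 480000 samples / 16000 Hz = 30 s per chunk

# 75 seconds of silence stands in for a real recording.
audio = np.zeros(75 * SAMPLE_RATE, dtype=np.float32)

# Same slicing logic as split_audio in app.py.
chunks = [audio[i : i + CHUNK_LIMIT] for i in range(0, len(audio), CHUNK_LIMIT)]
print([len(c) / SAMPLE_RATE for c in chunks])  # [30.0, 30.0, 15.0]

The final chunk is simply the remainder, so the chunks in a batch can have unequal lengths, which is presumably why the processor is called with padding=True.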
examples/audio_instruction.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f422585aebb2b59288267f8bd27313c36593a8c3a4686981c081edba9b323ed3
+size 322284
examples/audio_understand.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee10f548b4852f6cf082ecb8f8a652981487bfdc081ee6eb7e1e4a7a6c63a30f
+size 1455146
examples/elon_musk.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08642086ea9f6efa1aeb0593aacab8dd975bbd254c07863e92881b7c3aa464fa
+size 2686804
examples/music_under.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31ba05f979d0b66e08b6fd7eec87a6f4f1d90887111bf8de6ce8005450606d29
+size 3834990
examples/nuggets.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31453a7d842b64082a0a587e4222c9e6716e3f03560d0602db5ae042a0815381
+size 772564
examples/nvidia_conference.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a20df97d1fec6147accce706b61ba7e918850d75cc7044a4fa1ac72d67ad9b05
+size 14659846
requirements.txt CHANGED
@@ -1 +1,5 @@
-huggingface_hub
+huggingface_hub
+librosa
+transformers@git+https://github.com/huggingface/transformers@v4.51.3-Qwen2.5-Omni-preview
+torch
+accelerate
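With these requirements installed (transformers is pinned to the v4.51.3-Qwen2.5-Omni-preview git tag rather than a PyPI release, presumably because the checkpoint's remote code targets that preview API), the model can also be run without the Gradio UI. A minimal sketch that reuses exactly the calls from app.py above; the checkpoint name, token ids, and CUDA device all come from that file, so treat this as an illustration under those assumptions rather than a verified script:

# A sketch only: assumes a CUDA GPU and the same processor/model behavior as app.py.
import librosa
from transformers import AutoModelForCausalLM, AutoProcessor

processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "lmms-lab/Aero-1-Audio-1.5B", device_map="cuda", torch_dtype="auto", trust_remote_code=True
)

# Load one bundled example; long recordings would be split into 30 s chunks as in app.py.
audio, _ = librosa.load("examples/nuggets.mp3", sr=16000)
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio_url", "audio": "placeholder"},
            {"type": "text", "text": "Please transcribe the audio for me"},
        ],
    }
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=prompt, audios=[audio], sampling_rate=16000, return_tensors="pt", padding=True)
inputs = {k: v.to("cuda") for k, v in inputs.items()}
outputs = model.generate(**inputs, eos_token_id=151645, pad_token_id=151643, max_new_tokens=4096)
print(processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1] :], skip_special_tokens=True)[0])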