Spaces: Runtime error

Upload folder using huggingface_hub

- .gitattributes +2 -0
- .gitignore +1 -0
- Makefile +19 -0
- app.py +109 -58
- examples/audio_instruction.wav +3 -0
- examples/audio_understand.wav +3 -0
- examples/elon_musk.mp3 +3 -0
- examples/music_under.wav +3 -0
- examples/nuggets.mp3 +3 -0
- examples/nvidia_conference.mp3 +3 -0
- requirements.txt +5 -1
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
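The two new patterns route the example audio files through Git LFS, which is why the examples/*.wav and examples/*.mp3 entries added later in this commit are three-line LFS pointer files rather than raw audio.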
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__
Makefile ADDED
@@ -0,0 +1,19 @@
+.PHONY: style quality start clean
+
+
+style:
+	python -m black --line-length 119 .
+	python -m isort .
+	ruff check --fix .
+
+
+quality:
+	python -m black --check --line-length 119 .
+	python -m isort --check-only .
+	ruff check .
+
+start:
+	gradio app.py
+
+clean:
+	ps aux | grep "app" | grep -v "grep" | awk '{print $$2}' | xargs kill -9
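A note on the clean recipe: Make collapses the doubled "$$2" into a single "$2" before handing the line to the shell, so awk prints the PID column and the pipeline force-kills every running process whose command line matches "app". The .PHONY list above is corrected here to name the quality target that the file actually defines (the committed version listed a nonexistent format target).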
app.py CHANGED
@@ -1,64 +1,115 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.7,
-            step=0.1,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
+from transformers import AutoModelForCausalLM, AutoProcessor
+import librosa
+
+
+def split_audio(audio_arrays, chunk_limit=480000):
+    CHUNK_LIM = chunk_limit
+    audio_splits = []
+    # Split the loaded audio into 30 s chunks and extend the messages content
+    for i in range(
+        0,
+        len(audio_arrays),
+        CHUNK_LIM,
+    ):
+        audio_splits.append(audio_arrays[i : i + CHUNK_LIM])
+    return audio_splits
+
+
+# Placeholder for your actual LLM processing API call
+def process_audio(audio, text, chat_history):
+    conversation = [
+        {
+            "role": "user",
+            "content": [],
+        },
+    ]
+    audio = librosa.load(audio, sr=16000)[0]
+
+    if audio is not None:
+        splitted_audio = split_audio(audio)
+        for au in splitted_audio:
+            conversation[0]["content"].append(
+                {
+                    "type": "audio_url",
+                    "audio": "placeholder",
+                }
+            )
+        chat_history.append({"role": "user", "content": gr.Audio(value=(16000, audio))})
+
+    conversation[0]["content"].append(
+        {
+            "type": "text",
+            "text": text,
+        }
+    )
+
+    chat_history.append({"role": "user", "content": text})
+    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+    inputs = processor(text=prompt, audios=splitted_audio, sampling_rate=16000, return_tensors="pt", padding=True)
+    inputs = {k: v.to("cuda") for k, v in inputs.items()}
+    outputs = model.generate(**inputs, eos_token_id=151645, pad_token_id=151643, max_new_tokens=4096)
+
+    cont = outputs[:, inputs["input_ids"].shape[-1] :]
+
+    result = processor.batch_decode(cont, skip_special_tokens=True)[0]
+    chat_history.append(
+        {
+            "role": "assistant",
+            "content": result,
+        }
+    )
+
+    return chat_history
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## 🎙️ Aero-1-Audio")
+    gr.Markdown(
+        """
+        Aero-1-Audio is a compact audio model. With only 1.5B parameters and 50k hours of training data, it can perform a variety of tasks,
+        including ASR, basic audio understanding, audio instruction following, and scene analysis.
+
+        We provide several examples, such as:
+        - an NVIDIA conference talk and a show from Elon Musk, for long-form ASR
+        - simple audio instruction following
+        - audio understanding for weather and music
+
+        Note: the model may fail to follow your instructions in some cases, and its answers can often be wrong.
+        """
+    )
+
+    chatbot = gr.Chatbot(type="messages")
+
+    with gr.Row(variant="compact", equal_height=True):
+        audio_input = gr.Audio(label="Speak Here", type="filepath")
+        text_input = gr.Textbox(label="Text Input", placeholder="Type here", interactive=True)
+
+    with gr.Row():
+        chatbot_clear = gr.ClearButton([text_input, audio_input, chatbot], value="Clear")
+        chatbot_submit = gr.Button("Submit", variant="primary")
+        chatbot_submit.click(
+            process_audio,
+            inputs=[audio_input, text_input, chatbot],
+            outputs=[chatbot],
+        )
+
+    gr.Examples(
+        [
+            ["Please transcribe the audio for me", "./examples/elon_musk.mp3"],
+            ["Please transcribe the audio for me", "./examples/nvidia_conference.mp3"],
+            ["Please transcribe the audio for me", "./examples/nuggets.mp3"],
+            ["Please follow the instruction in the audio", "./examples/audio_instruction.wav"],
+            ["What is the primary instrument featured in the solo of this track?", "./examples/music_under.wav"],
+            ["What weather condition can be heard in the audio?", "./examples/audio_understand.wav"],
+        ],
+        inputs=[text_input, audio_input],
+        label="Examples",
+    )
 
 
 if __name__ == "__main__":
+    processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", device_map="cuda", torch_dtype="auto", attn_implementation="sdpa", trust_remote_code=True)
     demo.launch()
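A note on the chunking in split_audio above: the default chunk_limit of 480000 samples is exactly 30 seconds of 16 kHz audio (480000 / 16000 = 30), matching the rate librosa.load is given. A minimal standalone sketch of the same slicing arithmetic, with a synthetic numpy array standing in for a real recording (not part of the commit):

import numpy as np

SAMPLE_RATE = 16000   # app.py loads audio with librosa at sr=16000
CHUNK_LIMIT = 480000  # 480000 samples / 16000 Hz = 30 s per chunk

# 75 seconds of silence stands in for a real recording.
audio = np.zeros(75 * SAMPLE_RATE, dtype=np.float32)

# Same slicing logic as split_audio in app.py.
chunks = [audio[i : i + CHUNK_LIMIT] for i in range(0, len(audio), CHUNK_LIMIT)]
print([len(c) / SAMPLE_RATE for c in chunks])  # [30.0, 30.0, 15.0]

The final chunk is simply the remainder, so the chunks in a batch can have unequal lengths, which is presumably why the processor is called with padding=True.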
examples/audio_instruction.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f422585aebb2b59288267f8bd27313c36593a8c3a4686981c081edba9b323ed3
+size 322284
examples/audio_understand.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee10f548b4852f6cf082ecb8f8a652981487bfdc081ee6eb7e1e4a7a6c63a30f
+size 1455146
examples/elon_musk.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08642086ea9f6efa1aeb0593aacab8dd975bbd254c07863e92881b7c3aa464fa
+size 2686804
examples/music_under.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31ba05f979d0b66e08b6fd7eec87a6f4f1d90887111bf8de6ce8005450606d29
+size 3834990
examples/nuggets.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31453a7d842b64082a0a587e4222c9e6716e3f03560d0602db5ae042a0815381
+size 772564
examples/nvidia_conference.mp3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a20df97d1fec6147accce706b61ba7e918850d75cc7044a4fa1ac72d67ad9b05
+size 14659846
requirements.txt CHANGED
@@ -1 +1,5 @@
-huggingface_hub
+huggingface_hub
+librosa
+transformers@git+https://github.com/huggingface/transformers@v4.51.3-Qwen2.5-Omni-preview
+torch
+accelerate
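With these requirements installed (transformers is pinned to the v4.51.3-Qwen2.5-Omni-preview git tag rather than a PyPI release, presumably because the checkpoint's remote code targets that preview API), the model can also be run without the Gradio UI. A minimal sketch that reuses exactly the calls from app.py above; the checkpoint name, token ids, and CUDA device all come from that file, so treat this as an illustration under those assumptions rather than a verified script:

# A sketch only: assumes a CUDA GPU and the same processor/model behavior as app.py.
import librosa
from transformers import AutoModelForCausalLM, AutoProcessor

processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "lmms-lab/Aero-1-Audio-1.5B", device_map="cuda", torch_dtype="auto", trust_remote_code=True
)

# Load one bundled example; long recordings would be split into 30 s chunks as in app.py.
audio, _ = librosa.load("examples/nuggets.mp3", sr=16000)
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio_url", "audio": "placeholder"},
            {"type": "text", "text": "Please transcribe the audio for me"},
        ],
    }
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=prompt, audios=[audio], sampling_rate=16000, return_tensors="pt", padding=True)
inputs = {k: v.to("cuda") for k, v in inputs.items()}
outputs = model.generate(**inputs, eos_token_id=151645, pad_token_id=151643, max_new_tokens=4096)
print(processor.batch_decode(outputs[:, inputs["input_ids"].shape[-1] :], skip_special_tokens=True)[0])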