import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, TextIteratorStreamer
import librosa
from threading import Thread
import spaces


def split_audio(audio_arrays, chunk_limit=480000):
    # Split the loaded 16 kHz audio into chunks of at most `chunk_limit` samples
    # (480000 samples = 30 seconds) so long recordings can be processed piecewise.
    audio_splits = []
    for i in range(0, len(audio_arrays), chunk_limit):
        audio_splits.append(audio_arrays[i : i + chunk_limit])
    return audio_splits
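
# Example (assuming 16 kHz input, as used below): a 70-second clip has 70 * 16000 = 1,120,000
# samples, so split_audio returns chunks of 480000, 480000, and 160000 samples (30 s, 30 s, 10 s).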


def user(audio, text, chat_history):
    # Echo the user's audio and text into the chat history before generation starts
    if audio is not None:
        chat_history.append(gr.ChatMessage(role="user", content={"path": audio, "alt_text": "Audio"}))
    chat_history.append({"role": "user", "content": text})
    return "", chat_history


@spaces.GPU  # assumed ZeroGPU Space (hence the `spaces` import above); exposes CUDA inside this function
def process_audio(audio, text, chat_history):
    conversation = [
        {
            "role": "user",
            "content": [],
        },
    ]

    # Load the audio at 16 kHz, split it into 30-second chunks, and add one
    # audio placeholder per chunk to the conversation content.
    splitted_audio = None
    if audio is not None:
        audio_array = librosa.load(audio, sr=16000)[0]
        splitted_audio = split_audio(audio_array)
        for _ in splitted_audio:
            conversation[0]["content"].append(
                {
                    "type": "audio_url",
                    "audio": "placeholder",
                }
            )

    conversation[0]["content"].append(
        {
            "type": "text",
            "text": text,
        }
    )

    # Set up the streamer so generated tokens can be read incrementally
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=prompt, audios=splitted_audio, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Generation arguments: output length cap, streamer, and the model's EOS/PAD token ids
    generation_args = {
        "max_new_tokens": 4096,
        "streamer": streamer,
        "eos_token_id": 151645,
        "pad_token_id": 151643,
        **inputs,
    }

    # Run generation in a separate thread so partial output can be streamed to the UI
    chat_history.append({"role": "assistant", "content": ""})
    thread = Thread(
        target=model.generate,
        kwargs=generation_args,
    )
    thread.start()

    for new_text in streamer:
        chat_history[-1]["content"] += new_text
        yield chat_history
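
# Streaming note: `TextIteratorStreamer` yields decoded text pieces (not single characters) as
# `model.generate` produces tokens on the background thread, which is what lets the chatbot update live.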


with gr.Blocks() as demo:
    gr.Markdown("## 🎙️ Aero-1-Audio")
    gr.Markdown(
        """


Aero-1-Audio is a lightweight audio-language model with only 1.5 billion parameters, trained on 50,000 hours of high-quality audio data. Despite its compact size, it supports a wide range of tasks, including Automatic Speech Recognition (ASR), Basic Audio Understanding, Audio Instruction Following, and Scene Audio Analysis.

Notably, Aero-1-Audio excels at lossless ASR on ultra-long audio of up to 16 minutes without requiring audio segmentation.

[Github](https://github.com/EvolvingLMMs-Lab/Aero-1/blob/main/README.md) | [Playground](https://huggingface.co/spaces/lmms-lab/Aero-1-Audio-Demo) | [Model Checkpoints](https://huggingface.co/lmms-lab/Aero-1-Audio-1.5B) | [Evaluation Results](https://github.com/EvolvingLMMs-Lab/lmms-eval/pull/658) | [Cookbook](https://www.lmms-lab.com/posts/lmms-lab-docs/aero_audio/)

To explore its capabilities, upload your own audio or record your voice directly, or simply start with the examples below.

⚠️ Disclaimer: Aero-1-Audio is still under active development. Occasional inaccuracies may occur. We appreciate your understanding and welcome any feedback to help us make it better.
        """
    )
    chatbot = gr.Chatbot(type="messages")
    with gr.Row(variant="compact", equal_height=True):
        audio_input = gr.Audio(label="Speak Here", type="filepath")
        text_input = gr.Textbox(
            label="Text Input",
            placeholder="Please transcribe this audio for me",
            value="Please transcribe this audio for me",
            interactive=True,
        )
    with gr.Row():
        chatbot_clear = gr.ClearButton([text_input, audio_input, chatbot], value="Clear")
        chatbot_submit = gr.Button("Submit", variant="primary")

    # First echo the user's turn, then stream the model's reply
    chatbot_submit.click(
        user,
        inputs=[audio_input, text_input, chatbot],
        outputs=[text_input, chatbot],
        queue=False,
    ).then(
        process_audio,
        inputs=[audio_input, text_input, chatbot],
        outputs=[chatbot],
    )
    gr.Examples(
        [
            ["Please transcribe the audio for me", "./examples/elon_musk.mp3"],
            ["Please transcribe the audio for me", "./examples/nvidia_conference.mp3"],
            ["Please follow the instruction in the audio", "./examples/audio_instruction.wav"],
            ["What is the primary instrument featured in the solo of this track?", "./examples/music_under.wav"],
            ["What weather condition can be heard in the audio?", "./examples/audio_understand.wav"],
        ],
        inputs=[text_input, audio_input],
        label="Examples",
    )

if __name__ == "__main__":
    processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "lmms-lab/Aero-1-Audio-1.5B",
        device_map="cuda",
        torch_dtype="auto",
        attn_implementation="sdpa",
        trust_remote_code=True,
    )
    demo.launch()
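
# ---------------------------------------------------------------------------
# Minimal non-Gradio usage sketch (commented out, for reference only). It reuses
# the same processor/model calls as `process_audio` above; "sample.wav" and the
# 256-token cap are illustrative placeholders, not part of the demo.
#
#   processor = AutoProcessor.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", trust_remote_code=True)
#   model = AutoModelForCausalLM.from_pretrained("lmms-lab/Aero-1-Audio-1.5B", device_map="cuda", torch_dtype="auto", trust_remote_code=True)
#   audio_array = librosa.load("sample.wav", sr=16000)[0]
#   chunks = split_audio(audio_array)
#   conversation = [{"role": "user", "content": [{"type": "audio_url", "audio": "placeholder"} for _ in chunks]
#                    + [{"type": "text", "text": "Please transcribe this audio for me"}]}]
#   prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
#   inputs = processor(text=prompt, audios=chunks, sampling_rate=16000, return_tensors="pt", padding=True)
#   inputs = {k: v.to("cuda") for k, v in inputs.items()}
#   output_ids = model.generate(**inputs, max_new_tokens=256, eos_token_id=151645, pad_token_id=151643)
#   print(processor.tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
# ---------------------------------------------------------------------------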