Spaces: Runtime error

```python
import queue
from io import BytesIO

import av
import numpy as np
import pydub
import streamlit as st
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration

from models.nllb import nllb
from models.parakeet import parakeet_ctc_model
from models.es_fastconformer import stt_es_model
from stream_VAD import stream

RTC_CONFIGURATION = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)
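# A public STUN server is usually enough for NAT traversal; clients behind
# stricter firewalls may additionally need a TURN entry in "iceServers".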
# Load the models once per process (st.cache_resource survives Streamlit reruns)
@st.cache_resource
def load_models():
    model_nllb, tokenizer_nllb = nllb()
    return model_nllb, tokenizer_nllb, parakeet_ctc_model(), stt_es_model()

model_nllb, tokenizer_nllb, parakeet, stt_model = load_models()
def process_audio(audio_chunk: av.AudioFrame, language: str):
    # Convert the av.AudioFrame into a pydub.AudioSegment
    audio_segment = pydub.AudioSegment(
        data=audio_chunk.to_ndarray().tobytes(),
        sample_width=audio_chunk.format.bytes,  # bytes per sample, e.g. 2 for s16
        frame_rate=audio_chunk.sample_rate,
        channels=len(audio_chunk.layout.channels),
    )
    # Run the speech-to-text + translation pipeline for the selected language
    if language == "en":
        processed_audio = stream(parakeet, model_nllb, tokenizer_nllb, "english", "spanish", audio_segment)
    elif language == "es":
        processed_audio = stream(stt_model, model_nllb, tokenizer_nllb, "spanish", "english", audio_segment)
    else:
        # Unknown language: pass the input through unchanged
        return audio_chunk.sample_rate, audio_chunk.to_ndarray()

    # Convert the processed pydub.AudioSegment back to a numpy array
    processed_audio_np = np.array(processed_audio.get_array_of_samples())
    return processed_audio.frame_rate, processed_audio_np
def audio_callback(frame: av.AudioFrame, language: str):
    # Re-wrap the incoming frame as 16-bit mono; from_ndarray does not
    # carry the sample rate over, so set it explicitly.
    audio_data = frame.to_ndarray()
    audio_chunk = av.AudioFrame.from_ndarray(audio_data, format="s16", layout="mono")
    audio_chunk.sample_rate = frame.sample_rate
    return process_audio(audio_chunk, language)
st.title("Real-Time Audio Processing")
language = st.radio("Select Language", ["en", "es"], index=0)

webrtc_ctx = webrtc_streamer(
    key="audio",
    mode=WebRtcMode.SENDRECV,
    rtc_configuration=RTC_CONFIGURATION,
    media_stream_constraints={"audio": True, "video": False},
    audio_receiver_size=256,  # length of the incoming audio frame queue
    async_processing=True,
)
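# Note: an audio_receiver only pulls audio from the browser; returning the
# processed audio over the SENDRECV track would use streamlit_webrtc's
# audio_frame_callback parameter instead of the polling loop below.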
if "audio_buffer" not in st.session_state:
    st.session_state["audio_buffer"] = BytesIO()

if webrtc_ctx.audio_receiver:
    # AudioReceiver has no event API; frames are pulled with get_frames(),
    # which raises queue.Empty when nothing arrives within the timeout.
    try:
        audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
    except queue.Empty:
        audio_frames = []
    for frame in audio_frames:
        processed_audio_rate, processed_audio_np = audio_callback(frame, language)
        audio_segment = pydub.AudioSegment(
            data=processed_audio_np.tobytes(),
            sample_width=processed_audio_np.dtype.itemsize,
            frame_rate=processed_audio_rate,
            channels=1,
        )
        # Append raw PCM only: a per-chunk WAV export would prepend a header
        # each time, and st.audio would stop after the first chunk.
        st.session_state["audio_buffer"].write(audio_segment.raw_data)
        st.session_state["audio_params"] = (processed_audio_rate, audio_segment.sample_width)

# Wrap the accumulated PCM in a single WAV container for playback
if "audio_params" in st.session_state:
    rate, width = st.session_state["audio_params"]
    wav = pydub.AudioSegment(data=st.session_state["audio_buffer"].getvalue(),
                             sample_width=width, frame_rate=rate, channels=1)
    st.audio(wav.export(format="wav").read(), format="audio/wav")
```
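To separate a model-side runtime error from the WebRTC plumbing, the pipeline can be exercised offline with a plain WAV file. The sketch below is illustrative, not part of the Space: it assumes, as the call sites above imply, that `stream(stt_model, model_nllb, tokenizer_nllb, src, tgt, segment)` takes and returns a `pydub.AudioSegment`; `dummy_stream` and `sample.wav` are hypothetical stand-ins.

```python
import numpy as np
import pydub

def dummy_stream(stt_model, model_nllb, tokenizer_nllb, src, tgt, segment):
    # Hypothetical stand-in for stream_VAD.stream: echoes the input segment
    # so the audio plumbing can be tested without loading any model.
    return segment

# Load a local test file and force mono, matching the app's assumptions
segment = pydub.AudioSegment.from_wav("sample.wav").set_channels(1)
out = dummy_stream(None, None, None, "english", "spanish", segment)

# Same conversion the app performs before writing to the buffer
out_np = np.array(out.get_array_of_samples())
print(out.frame_rate, out_np.dtype, len(out_np))
```

If this script runs cleanly but the Space still crashes, the error is likely in the model loaders or in `stream` itself rather than in the Streamlit/WebRTC layer.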