import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from huggingface_hub import InferenceClient

# Load the ASR model and its processor
asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)

# Load the text-generation client (Hugging Face Inference API)
client = InferenceClient("unsloth/gemma-3-1b-it")
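# Note: some Inference API models require authentication. A hedged sketch,
# assuming a token is exported as the HF_TOKEN environment variable
# (would also need `import os` above):
# client = InferenceClient("unsloth/gemma-3-1b-it", token=os.environ.get("HF_TOKEN"))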

# Function: transcribe an audio file to text
def transcribe(audio_file):
    waveform, sample_rate = torchaudio.load(audio_file)
    # Mix multi-channel (e.g. stereo) audio down to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample to the 16 kHz rate Wav2Vec2 expects
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = asr_model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
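
# Quick standalone check (assumption: "sample.wav" is a local speech recording):
# print(transcribe("sample.wav"))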

# Function: generate a response to the transcription
def generate_text(prompt):
    response = client.text_generation(prompt, max_new_tokens=150, temperature=0.7)
    return response.strip()

# Gradio pipeline: transcribe the audio, then generate a reply
def asr_and_generate(audio):
    if not audio:
        return "No audio provided.", ""
    transcription = transcribe(audio)
    generated = generate_text(transcription)
    return transcription, generated
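
# A minimal sketch of exercising the pipeline without the UI
# (assumption: "sample.wav" is a local recording of speech):
# transcription, reply = asr_and_generate("sample.wav")
# print(transcription)
# print(reply)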

demo = gr.Interface(
    fn=asr_and_generate,
    inputs=gr.Audio(label="Upload or Record Audio", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="AI Response"),
    ],
    title="ASR to Text Generation",
    description="Upload or record audio. The ASR model transcribes the speech, and the fine-tuned text model generates a response to the transcription.",
)

demo.launch()
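# To expose a temporary public URL when running outside of Spaces, Gradio
# also supports:
# demo.launch(share=True)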