"""Gradio demo: transcribe uploaded speech with a wav2vec2 CTC model
fine-tuned for Odia ("anuragshas/wav2vec2-large-xlsr-53-odia")."""

import subprocess

import gradio as gr
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor


def read_file_and_process(wav_file):
    # Resample the upload to 16 kHz mono (the rate the model was trained on),
    # then convert the waveform into model-ready tensors.
    filename = wav_file.split('.')[0]
    filename_16k = filename + "16k.wav"
    resampler(wav_file, filename_16k)
    speech, _ = sf.read(filename_16k)
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    return inputs
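
# An alternative sketch for read_file_and_process (assumes the optional
# `librosa` package is installed): librosa resamples in memory, which
# avoids writing the temporary 16 kHz file to disk.
#
#   import librosa
#   speech, _ = librosa.load(wav_file, sr=16_000, mono=True)
#   inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)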
					
						
def resampler(input_file_path, output_file_path):
    # Convert to 16 kHz, mono, 16-bit audio with ffmpeg. The -y flag
    # overwrites a stale output file left over from an earlier run.
    command = (
        f"ffmpeg -hide_banner -loglevel panic -y -i {input_file_path} -ar 16000 "
        f"-ac 1 -bits_per_raw_sample 16 -vn {output_file_path}"
    )
    subprocess.call(command, shell=True)
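
# A more defensive sketch of the same call: passing the arguments as a list
# (without shell=True) avoids shell interpolation, so paths containing
# spaces or quotes are handled safely.
#
#   subprocess.call([
#       "ffmpeg", "-hide_banner", "-loglevel", "panic", "-y",
#       "-i", input_file_path, "-ar", "16000", "-ac", "1",
#       "-bits_per_raw_sample", "16", "-vn", output_file_path,
#   ])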
					
						
def parse_transcription(logits):
    # Greedy CTC decoding: take the most probable token at each frame and
    # let the processor collapse repeats and drop special tokens.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription
					
						
def parse(wav_file):
    # Gradio passes None when no audio has been provided.
    if not wav_file:
        return ""
    input_values = read_file_and_process(wav_file)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**input_values).logits
    return parse_transcription(logits)
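
# Quick local check (hypothetical; assumes a file named "sample.wav" sits
# next to this script, and runs only after the model and processor below
# have been loaded):
#
#   print(parse("sample.wav"))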
					
						
# Load the processor (feature extractor + tokenizer) and the fine-tuned model.
model_id = "anuragshas/wav2vec2-large-xlsr-53-odia"
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
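
# Optional speed-up (a sketch): move the model to a GPU when one is
# available, and move the processed inputs to the same device before the
# forward pass in parse().
#
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device)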
					
						
# UI: a file-upload audio input and a multiline text output.
# Note: `source=` on gr.Audio is the pre-4.x Gradio API (4.x uses `sources=`).
input_ = gr.Audio(source="upload", type="filepath")
txtbox = gr.Textbox(
    label="Output from the model will appear here:",
    lines=5,
)
					
						
# `show_tips` and `enable_queue` target older Gradio releases; newer
# versions configure queuing via `.queue()` instead.
gr.Interface(parse, inputs=[input_], outputs=txtbox,
             streaming=True, interactive=True,
             analytics_enabled=False, show_tips=False,
             enable_queue=True).launch(inline=False)