Spaces:

VanguardAI
/

MultiModal_OpenSource_AI

Paused

App Files Files Community

MultiModal_OpenSource_AI / app.py

VanguardAI

Update app.py

d5685b0 verified over 1 year ago

raw

history blame

2.75 kB

	import sounddevice as sd
	import scipy.io.wavfile as wavfile
	import numpy as np
	import gradio as gr
	from groq import Groq
	import tempfile
	import os

	class Recorder:
	    """Capture microphone audio through a toggleable ``sounddevice`` input stream.

	    Blocks delivered to :meth:`callback` are accumulated in ``self.frames``
	    while ``self.recording`` is true and can be written to a WAV file with
	    :meth:`save_audio`.
	    """

	    def __init__(self, sample_rate=44100, channels=2):
	        """Create an idle recorder.

	        Args:
	            sample_rate: Sampling rate passed to the input stream and
	                written into the WAV file.
	            channels: Number of input channels requested from the device.
	                Defaults to 2, the value that used to be hard-coded.
	        """
	        self.recording = False
	        self.frames = []
	        self.sample_rate = sample_rate
	        self.channels = channels
	        self.stream = None

	    def toggle_recording(self):
	        """Start recording when idle, stop when recording.

	        Returns:
	            The status label for the new state.

	        Raises:
	            Whatever ``sounddevice`` raises when the stream cannot be
	            opened, started, stopped or closed; the recorder is left in
	            the idle state in that case.
	        """
	        if not self.recording:
	            self.frames = []
	            stream = sd.InputStream(
	                callback=self.callback,
	                channels=self.channels,
	                samplerate=self.sample_rate,
	            )
	            # Raise the flag *before* starting the stream: the callback
	            # discards blocks while the flag is false, so setting it
	            # afterwards could drop the beginning of the recording.
	            self.recording = True
	            try:
	                stream.start()
	            except Exception:
	                self.recording = False
	                stream.close()
	                raise
	            self.stream = stream
	            return "Recording... Press to Stop"

	        # Detach the stream first so a failing stop()/close() cannot leave
	        # a stale handle behind, and always fall back to the idle state.
	        stream, self.stream = self.stream, None
	        try:
	            if stream is not None:
	                try:
	                    stream.stop()
	                finally:
	                    stream.close()
	        finally:
	            self.recording = False
	        return "Recording stopped. Press to Record"

	    def callback(self, indata, frames, time, status):
	        """Stream callback: keep a copy of each incoming block while recording.

	        ``frames``, ``time`` and ``status`` are part of the callback
	        signature expected by the stream but are not used here.
	        """
	        if self.recording:
	            # Store a copy rather than the array handed in by the stream.
	            self.frames.append(indata.copy())

	    def save_audio(self):
	        """Write the captured blocks to a temporary WAV file.

	        Returns:
	            The path of the created ``.wav`` file, or ``None`` when no
	            audio has been captured. The file is not deleted
	            automatically; the caller is responsible for removing it.
	        """
	        # Work on a snapshot so the list cannot grow underneath concatenate
	        # if the stream callback is still appending.
	        captured = list(self.frames)
	        if not captured:
	            return None

	        audio_data = np.concatenate(captured, axis=0)

	        # Only reserve the name inside the ``with``; write after the handle
	        # is closed, because re-opening a still-open NamedTemporaryFile by
	        # name is not portable (it fails on Windows).
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav_file:
	            wav_path = temp_wav_file.name
	        try:
	            wavfile.write(wav_path, self.sample_rate, audio_data)
	        except Exception:
	            # Do not leave an empty/partial temp file behind on failure.
	            os.remove(wav_path)
	            raise
	        return wav_path

	# Single module-level Recorder instance shared by record() and transcribe().
	recorder = Recorder()

	def record():
	    """Toggle the shared recorder and return the status label it reports."""
	    status_label = recorder.toggle_recording()
	    return status_label

	def transcribe():
	    """Transcribe the audio held by the shared recorder with Groq's Whisper model.

	    The recorder's frames are written to a temporary WAV file, uploaded to
	    the transcription endpoint, and the temporary file is always removed
	    afterwards, including when the request fails.

	    The API key is read from the ``GROQ_API_KEY`` environment variable;
	    credentials must never be embedded in the source file.

	    Returns:
	        The transcribed text, or a human-readable message when no audio
	        was recorded, the API key is not configured, or the response has
	        no ``text`` attribute.
	    """
	    audio_file = recorder.save_audio()
	    if not audio_file:
	        return "No audio recorded."

	    try:
	        api_key = os.environ.get("GROQ_API_KEY")
	        if not api_key:
	            return "GROQ_API_KEY is not set."

	        client = Groq(api_key=api_key)
	        with open(audio_file, "rb") as file:
	            transcription = client.audio.transcriptions.create(
	                file=(audio_file, file.read()),
	                model="whisper-large-v3",
	                prompt="Specify context or spelling",  # Optional
	                response_format="json",  # Optional
	                language="en",  # Optional
	                temperature=0.0,  # Optional
	            )
	    finally:
	        # Clean up the temporary file on every path out of the block.
	        os.remove(audio_file)

	    # Access the text attribute directly if available.
	    if hasattr(transcription, 'text'):
	        return transcription.text
	    return "Transcription text not found."

	# UI: one toggle button plus a textbox that shows the transcription.
	with gr.Blocks() as gradio_interface:
	    with gr.Column():
	        record_button = gr.Button("Press to Record")
	        transcription_output = gr.Textbox(label="Transcription")
	        # Chain the two handlers instead of registering two independent
	        # click listeners: independent listeners have no guaranteed order,
	        # so transcribe() could run before the recorder had been toggled.
	        # With .then(), transcribe() runs only after record() has finished
	        # and the button label has been updated.
	        record_button.click(fn=record, outputs=record_button).then(
	            fn=transcribe, outputs=transcription_output
	        )

	# Launch the Gradio app only when this file is executed as a script.
	if __name__ == "__main__":
	gradio_interface.launch()