Spaces:

helvekami
/

ShukaNote

Running

App Files Community

helvekami committed on Mar 6

Commit

8c679c2

1 Parent(s): 5f13772

Updated Gradio App

Browse files

Files changed (1) hide show

app.py +46 -63

app.py CHANGED Viewed

@@ -1,76 +1,59 @@
-import gradio as gr
 import transformers
 import librosa
 import torch
 import numpy as np
-# Load the Shuka model pipeline.
-pipe = transformers.pipeline(
-    model="sarvamai/shuka_v1",
-    trust_remote_code=True,
-    device=0 if torch.cuda.is_available() else -1,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None
-)
-def process_audio(audio):
-    """
-    Processes the input audio and returns a text response generated by the Shuka model.
-    """
-    if audio is None:
-        return "No audio provided. Please upload or record an audio file."
-    try:
-        # Gradio returns a tuple: (sample_rate, audio_data)
-        sample_rate, audio_data = audio
-    except Exception as e:
-        return f"Error processing audio input: {e}"
-    if audio_data is None or len(audio_data) == 0:
-        return "Audio data is empty. Please try again with a valid audio file."
-    # Force conversion of audio data to a floating-point numpy array.
-    audio_data = np.array(audio_data, dtype=np.float32)
-    # If the audio data is multi-dimensional, squeeze it to 1D.
-    if audio_data.ndim > 1:
-        audio_data = np.squeeze(audio_data)
-    # Resample to 16000 Hz if necessary.
-    if sample_rate != 16000:
-        try:
-            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-            sample_rate = 16000
-        except Exception as e:
-            return f"Error during resampling: {e}"
-    # Define conversation turns for the model.
-    turns = [
-        {'role': 'system', 'content': 'Respond naturally and informatively.'},
-        {'role': 'user', 'content': '<|audio|>'}
-    ]
     try:
-        result = pipe({'audio': audio_data, 'turns': turns, 'sampling_rate': sample_rate}, max_new_tokens=512)
     except Exception as e:
-        return f"Error during model processing: {e}"
-    # Extract the generated text response.
-    if isinstance(result, list) and len(result) > 0:
-        response = result[0].get('generated_text', '')
-    else:
-        response = str(result)
-    return response
-# Create the Gradio interface.
 iface = gr.Interface(
-    fn=process_audio,
-    inputs=gr.Audio(type="numpy"),  # Use file upload for audio input.
     outputs="text",
-    title="Sarvam AI Shuka Voice Demo",
-    description="Upload an audio file and get a response using Sarvam AI's Shuka model."
 )
 if __name__ == "__main__":
-    # Launch the app with share=True to create a public link.
-    iface.launch(share=True)

 import transformers
+import gradio as gr
 import librosa
 import torch
+import spaces
 import numpy as np
+@spaces.GPU(duration=60)
+def transcribe_and_respond(audio_file):
+    """Generate a Shuka model response for a recorded audio file.
+
+    Args:
+        audio_file: Path of the recording to process, or ``None`` when no
+            audio was supplied.
+
+    Returns:
+        The pipeline output on success; otherwise a human-readable message
+        explaining why no response was produced.
+    """
+    # Bail out before the expensive model load when there is nothing to process.
+    if audio_file is None:
+        return "No audio provided. Please record some audio and try again."
     try:
+        # Load the audio first so an unreadable file fails before the model is built.
+        audio, sr = librosa.load(audio_file, sr=16000)
+        # Ensure audio is a floating-point numpy array
+        audio = np.array(audio, dtype=np.float32)
+        # librosa lays multi-channel audio out as (channels, samples), so the
+        # channel axis to average over when down-mixing to mono is axis 0.
+        if audio.ndim > 1:
+            audio = np.mean(audio, axis=0)
+        # Debug: Print audio properties
+        print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
+        pipe = transformers.pipeline(
+            model='sarvamai/shuka_v1',
+            trust_remote_code=True,
+            device=0,
+            torch_dtype=torch.bfloat16
+        )
+        turns = [
+            {'role': 'system', 'content': 'Respond naturally and informatively.'},
+            {'role': 'user', 'content': '<|audio|>'}
+        ]
+        # Debug: Print initial turns
+        print(f"Initial turns: {turns}")
+        # Call the model with the audio and prompt
+        output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
+        # Debug: Print the final output from the model
+        print(f"Model output: {output}")
+        return output
     except Exception as e:
+        # Top-level UI boundary: surface the failure as text instead of crashing the app.
+        return f"Error: {e}"
+# Gradio UI wiring: microphone audio goes to transcribe_and_respond, and the
+# function's return value is shown in a text output.
 iface = gr.Interface(
+    fn=transcribe_and_respond,
+    # type="filepath": the callback receives the recording as a file path.
+    inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
+    title="Live Transcription and Response",
+    description="Speak into your microphone, and the model will respond naturally and informatively.",
+    # NOTE(review): live=True presumably re-runs the callback whenever the
+    # input changes rather than on a submit click — confirm against Gradio docs.
+    live=True
 )
+# Launch the app only when this file is executed as a script.
 if __name__ == "__main__":
+    iface.launch()