Spaces:

helvekami
/

ShukaNote

Running

App Files Community

helvekami committed on Mar 6

Commit

fbc6758

1 Parent(s): e2f65f6

Updated Gradio App

Browse files

Files changed (1) hide show

app.py +30 -16

app.py CHANGED Viewed

@@ -16,40 +16,54 @@ def process_audio(audio):
     Processes the input audio and returns a text response generated by the Shuka model.
     """
     if audio is None:
-        return "No audio provided."
-    # Gradio returns a tuple (sample_rate, numpy_array)
-    sample_rate, audio_data = audio
     # Resample to 16000 Hz if necessary
     if sample_rate != 16000:
-        audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
-        sample_rate = 16000
-    # Define conversation turns with a system prompt and a user prompt that signals audio input
     turns = [
         {'role': 'system', 'content': 'Respond naturally and informatively.'},
         {'role': 'user', 'content': '<|audio|>'}
     ]
-    # Run the pipeline with the audio input and conversation context
-    result = pipe({'audio': audio_data, 'turns': turns, 'sampling_rate': sample_rate}, max_new_tokens=512)
-    # Extract the generated text response
     if isinstance(result, list) and len(result) > 0:
         response = result[0].get('generated_text', '')
     else:
         response = str(result)
     return response
-# Create the Gradio interface without the 'source' parameter.
 iface = gr.Interface(
     fn=process_audio,
-    inputs=gr.Audio(type="numpy"),
     outputs="text",
     title="Sarvam AI Shuka Voice Demo",
-    description="Upload a voice note and get a response using Sarvam AI's Shuka model."
 )
 if __name__ == "__main__":
-    iface.launch()

     Processes the input audio and returns a text response generated by the Shuka model.
     """
     if audio is None:
+        return "No audio provided. Please upload or record an audio file."
+    try:
+        # Gradio returns a tuple: (sample_rate, numpy_array)
+        sample_rate, audio_data = audio
+    except Exception as e:
+        return f"Error processing audio input: {e}"
+    if audio_data is None or len(audio_data) == 0:
+        return "Audio data is empty. Please try again with a valid audio file."
     # Resample to 16000 Hz if necessary
     if sample_rate != 16000:
+        try:
+            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
+            sample_rate = 16000
+        except Exception as e:
+            return f"Error during resampling: {e}"
+    # Define conversation turns for the model
     turns = [
         {'role': 'system', 'content': 'Respond naturally and informatively.'},
         {'role': 'user', 'content': '<|audio|>'}
     ]
+    try:
+        result = pipe({'audio': audio_data, 'turns': turns, 'sampling_rate': sample_rate}, max_new_tokens=512)
+    except Exception as e:
+        return f"Error during model processing: {e}"
+    # Extract generated text
     if isinstance(result, list) and len(result) > 0:
         response = result[0].get('generated_text', '')
     else:
         response = str(result)
     return response
+# Create the Gradio interface.
+# To control whether users can record audio directly, note that Gradio 4+ replaced the Audio component's "source" parameter with "sources" (e.g. sources=["microphone", "upload"]).
 iface = gr.Interface(
     fn=process_audio,
+    inputs=gr.Audio(type="numpy"),  # using file upload input for audio
     outputs="text",
     title="Sarvam AI Shuka Voice Demo",
+    description="Upload an audio file and get a response using Sarvam AI's Shuka model."
 )
 if __name__ == "__main__":
+    # If port 7860 is in use, you can specify another port (here we use 7861)
+    iface.launch(server_port=7861)