Commit b46da1b · 1 parent: f65990c

change in main.py to explicitly tell OpenAI what the audio format is
main.py CHANGED
@@ -101,15 +101,18 @@ You are an expert AI assistant for a premier real estate developer.
 """
 
 
-# ---
-def transcribe_audio(audio_bytes: bytes) -> str:
+# --- FIXED: transcribe_audio accepts path + bytes ---
+def transcribe_audio(audio_path: str, audio_bytes: bytes) -> str:
     for attempt in range(3):
         try:
             audio_file = io.BytesIO(audio_bytes)
-
+            filename = os.path.basename(audio_path)  # e.g., "audio.wav"
+
+            logging.info(f"Transcribing audio: {filename} ({len(audio_bytes)} bytes)")
+
             transcript = client_openai.audio.transcriptions.create(
                 model="whisper-1",
-                file=audio_file
+                file=(filename, audio_file)  # ← Critical: gives format hint
             )
             text = transcript.text.strip()
 
@@ -122,13 +125,16 @@ def transcribe_audio(audio_bytes: bytes) -> str:
             )
             text = response.choices[0].message.content.strip()
 
+            logging.info(f"Transcribed: {text}")
             return text
+
         except Exception as e:
             logging.error(f"Transcription error (attempt {attempt+1}): {e}")
             if attempt == 2:
                 return ""
     return ""
 
+
 def generate_elevenlabs_sync(text: str, voice: str) -> bytes:
     for attempt in range(3):
         try:
@@ -214,23 +220,23 @@ async def test_text_query_endpoint(query: TextQuery):
     return {"response": response}
 
 
-# ---
+# --- FIXED: process_audio passes path + bytes ---
 async def process_audio(audio_path):
-    if not audio_path:
-        return None, "No audio"
+    if not audio_path or not os.path.exists(audio_path):
+        return None, "No valid audio file received."
 
     try:
-        #
+        # Read raw bytes
         with open(audio_path, "rb") as f:
             audio_bytes = f.read()
 
         if len(audio_bytes) == 0:
-            return None, "Empty audio."
+            return None, "Empty audio file."
 
-        # 1. Transcribe
-        user_text = await run_in_threadpool(transcribe_audio, audio_bytes)
+        # 1. Transcribe — pass path + bytes
+        user_text = await run_in_threadpool(transcribe_audio, audio_path, audio_bytes)
         if not user_text:
-            return None, "Couldn't understand. Try again."
+            return None, "Couldn't understand audio. Try again."
 
         logging.info(f"User: {user_text}")
 
@@ -271,16 +277,14 @@ with gr.Blocks(title="Real Estate AI") as demo:
 
     out_text = gr.Textbox(label="Conversation", lines=8)
 
+    # Only trigger on real file (not example text)
     inp.change(process_audio, inp, [out_audio, out_text])
 
-    gr.Examples(
-        examples=[
-            ...
-            ...
-            ...
-        ],
-        inputs=inp
-    )
+    # --- FIXED: Examples now use real audio files (optional) ---
+    # Remove text examples to avoid FileNotFoundError
+    # Or: Record real .wav files and upload to repo
+    # For now: disable examples
+    # gr.Examples(examples=[], inputs=inp)
 
 
 # --- MOUNT GRADIO ---
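Why the tuple matters: the OpenAI SDK sends the upload with whatever filename the file object carries, and a bare io.BytesIO has none, so Whisper has no extension from which to infer the container format and can reject the request as an invalid file. A minimal sketch of the failing and fixed calls, assuming OPENAI_API_KEY is set in the environment and "sample.wav" is a placeholder clip:

import io

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

with open("sample.wav", "rb") as f:
    audio_bytes = f.read()

buffer = io.BytesIO(audio_bytes)

# Bare buffer: no filename travels with the upload, so the API may reject
# it with an "invalid file format" style error.
# client.audio.transcriptions.create(model="whisper-1", file=buffer)

# Tuple form: the first element is only a filename hint, not a path that
# has to exist anywhere.
transcript = client.audio.transcriptions.create(
    model="whisper-1",
    file=("sample.wav", buffer),
)
print(transcript.text)

Setting buffer.name = "sample.wav" before the call should achieve the same thing, since the SDK picks the filename up from the file object's name attribute when one is present.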
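A quick way to exercise the new logging and the three-attempt retry loop locally, assuming transcribe_audio is importable from main.py and a short test.wav sits next to the script (both assumptions):

import logging

from main import transcribe_audio  # assumes main.py is on the path

logging.basicConfig(level=logging.INFO)

with open("test.wav", "rb") as f:
    audio_bytes = f.read()

# Both arguments are now required: the path supplies the filename hint,
# the bytes supply the payload.
text = transcribe_audio("test.wav", audio_bytes)
print(text or "empty result: all 3 attempts failed")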
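process_audio is async, but transcribe_audio makes a blocking HTTP call; running it inline would stall the event loop for every other request. run_in_threadpool (from starlette.concurrency, re-exported by FastAPI) hands the call to a worker thread and lets the coroutine await the result. A self-contained sketch of the pattern, with a sleep standing in for the OpenAI request:

import asyncio
import time

from starlette.concurrency import run_in_threadpool


def blocking_transcribe(audio_bytes: bytes) -> str:
    time.sleep(1)  # stands in for the blocking OpenAI request
    return f"transcribed {len(audio_bytes)} bytes"


async def handler() -> None:
    # The event loop stays free while the worker thread blocks.
    text = await run_in_threadpool(blocking_transcribe, b"\x00" * 1024)
    print(text)


asyncio.run(handler())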
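If examples are wanted later, the "record real .wav files and upload to repo" option from the comments would look roughly like the sketch below. The file paths, labels, and component settings are assumptions (type="filepath" matches process_audio receiving a path), and cache_examples=False keeps Gradio from running the handler over every example at startup:

import gradio as gr


async def process_audio(audio_path):  # stand-in for the real handler
    return None, f"got: {audio_path}"


with gr.Blocks(title="Real Estate AI") as demo:
    inp = gr.Audio(type="filepath", label="Your question")
    out_audio = gr.Audio(label="Response")
    out_text = gr.Textbox(label="Conversation", lines=8)

    inp.change(process_audio, inp, [out_audio, out_text])

    # Real committed files, so gr.Audio never receives plain example text:
    gr.Examples(
        examples=["examples/pricing_question.wav", "examples/tour_request.wav"],
        inputs=inp,
        cache_examples=False,
    )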