ReadRight

Sleeping

App Files Files Community

ParulPandey committed on Jun 6

Commit

254209b

verified ·

1 Parent(s): 8be64e3

Create app.py

Browse files

Files changed (1) hide show

app.py +385 -0

app.py ADDED Viewed

	@@ -0,0 +1,385 @@

+import gradio as gr
+import os
+import difflib
+from gradio_client import Client, file as gradio_file # Renamed to avoid conflict
+import time
+import google.generativeai as genai
+# --- Configuration & Clients ---
+def configure_gemini_api():
+    """Configure the Google Gemini client from the GOOGLE_API_KEY environment variable.
+
+    Hugging Face Spaces hands repository secrets to the app as ordinary
+    environment variables, so a single lookup covers both a Space and a local
+    run. Gradio has no ``gr.Secrets`` object, so there is nothing else to query.
+
+    Returns:
+        bool: True when a key was found and ``genai.configure`` accepted it;
+        False when the key is missing/blank or configuration raised.
+    """
+    # Strip so a key pasted with a trailing newline is not treated as different/invalid.
+    api_key = (os.environ.get("GOOGLE_API_KEY") or "").strip()
+    if not api_key:
+        print("WARN: GOOGLE_API_KEY not found in the environment (set it as a Space secret or export it locally). Story generation with Gemini will be disabled.")
+        return False
+    try:
+        genai.configure(api_key=api_key)
+    except Exception as e:  # Import-time boundary: report and disable stories rather than crash the app
+        print(f"Error configuring Gemini API: {e}")
+        return False
+    print("Google Gemini API configured successfully.")
+    return True
+GEMINI_API_CONFIGURED = configure_gemini_api()
+def _connect_space(space_id, label, capability):
+    """Open a gradio_client connection to a hosted Space.
+
+    Returns the Client on success. On any failure the error is printed and
+    None is returned, so the app can still start with that feature disabled.
+    """
+    try:
+        client = Client(space_id)
+    except Exception as e:
+        print(f"Fatal: Could not initialize {label} client: {e}. {capability} will not work.")
+        return None
+    print(f"{label} client initialized successfully.")
+    return client
+# Text-to-speech: ESPnet VITS (LJSpeech), used as an alternative to Bark.
+# Debugging tip: print(tts_client.view_api(all_endpoints=True)) lists the Space's
+# endpoints and parameters; view_api(return_format="dict") gives a structured dump.
+tts_client = _connect_space("espnet/kan-bayashi_ljspeech_vits", "ESPnet VITS TTS", "TTS")
+# Speech-to-text: Whisper (abidlabs/whisper-large-v2).
+# Debugging tip: print(whisper_stt_client.view_api(all_endpoints=True)).
+whisper_stt_client = _connect_space("abidlabs/whisper-large-v2", "Whisper STT", "STT")
+# --- Helper Functions ---
+def generate_story_with_gemini(name, grade, topic):
+    """Ask Gemini for a short read-aloud story and return it as plain text.
+
+    Args:
+        name: Student's name, inserted into the prompt.
+        grade: Grade level, used in the prompt to pitch vocabulary.
+        topic: What the story must be about.
+
+    Returns:
+        str: The stripped story on success; otherwise a child-friendly message
+        describing what went wrong (missing key, blocked prompt, empty
+        response, or an API error). Errors from the API call are caught and
+        turned into such a message rather than raised.
+    """
+    if not GEMINI_API_CONFIGURED:
+        return "Google Gemini API key not configured. Story generation is disabled. 🔑"
+    try:
+        model = genai.GenerativeModel(model_name="gemini-1.5-flash-latest") # Fast and capable
+        prompt = (
+            f"You are a super friendly and imaginative storyteller for kids. "
+            f"Please write an exciting and fun short story (around 100-120 words) for a student named {name} who is in Grade {grade}. "
+            f"The story must be about '{topic}'. "
+            f"Use simple words and sentences that a Grade {grade} student can easily read aloud and understand. "
+            f"Make the story engaging and positive. Jump right into the story without any introduction like 'Here is a story for you'."
+        )
+        safety_settings = [
+            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
+        ]
+        generation_config = genai.types.GenerationConfig(
+            candidate_count=1, max_output_tokens=300, temperature=0.75
+        )
+        response = model.generate_content(
+            prompt, generation_config=generation_config, safety_settings=safety_settings
+        )
+        # Check for a blocked prompt once, before touching response.text:
+        # a blocked response carries no story to read.
+        feedback = response.prompt_feedback
+        if feedback and feedback.block_reason:
+            return f"Oh dear! My story idea for '{topic}' was a bit too wild and got blocked (Reason: {feedback.block_reason}). Let's try a different topic! 😊"
+        if not (response.candidates and response.candidates[0].content.parts):
+            print(f"Gemini API response issue: {response}")
+            return f"Hmm, Gemini's story magic seems to be on a little break for '{topic}'. Maybe try another topic? 🤔"
+        story = response.text.strip()
+        if not story:
+            return f"Hmm, Gemini gave me a blank page for '{topic}'. Let's try a different topic or try again! ✨"
+        return story
+    except Exception as e:
+        print(f"Error generating story with Gemini: {e}")
+        # The haystack is lower-cased, so every needle must be lower-case too.
+        error_text = str(e).lower()
+        if "api_key_invalid" in error_text or "api key not valid" in error_text:
+            return "Oops! The Google Gemini API key seems to be having a problem. Please tell the grown-ups to check it! 🔑"
+        return f"Oh no! 😟 I had a little trouble dreaming up a story with Gemini. Error: {e}"
+def text_to_speech_vits(text_to_speak):
+    """Synthesize speech for ``text_to_speak`` via the hosted ESPnet VITS Space.
+
+    Args:
+        text_to_speak: Text to read aloud.
+
+    Returns:
+        str: Path of the generated audio file on success; otherwise a
+        child-friendly error message. Both are strings, so callers must tell
+        them apart (e.g. by checking that the path exists).
+    """
+    if not tts_client:
+        return "The VITS sound machine isn't working right now. 🛠️ Please tell the grown-ups!"
+    try:
+        # Parameters for espnet/kan-bayashi_ljspeech_vits.
+        # NOTE(review): order/meaning of these positional arguments and fn_index=0 are
+        # assumptions — verify with tts_client.view_api() if TTS fails.
+        job = tts_client.submit(
+            text_to_speak,  # text (str)
+            "EN",           # lang (str) - e.g., "EN" for English in this model
+            0,              # speaker_id (int | float) - usually 0 for LJSpeech default
+            0.667,          # noise_scale (float) - variance of Z
+            0.8,            # noise_scale_w (float) - variance of Z in duration
+            1.0,            # length_scale (float) - controls speed
+            fn_index=0      # ASSUMPTION: TTS is the first function (index 0).
+                            # If view_api() shows a different fn_index or an api_name like "/predict", use that.
+        )
+        # Network latency dominates here; give the remote job up to 90 seconds.
+        audio_filepath = job.result(timeout=90)
+        if isinstance(audio_filepath, str) and audio_filepath.endswith(('.wav', '.mp3', '.flac')):
+            return audio_filepath
+        # Anything else is unexpected; log it so the real output shape can be inspected.
+        print(f"Unexpected VITS TTS result format: {audio_filepath}")
+        if isinstance(audio_filepath, tuple) and len(audio_filepath) > 0 and isinstance(audio_filepath[0], str):
+            return audio_filepath[0] # Assume audio path is the first element if it's a tuple
+        return "Hmm, the sound from VITS came out a bit funny. 🤔"
+    except Exception as e:
+        print(f"Error with VITS TTS (espnet/kan-bayashi_ljspeech_vits): {e}")
+        # The haystack is lower-cased, so every needle must be lower-case too.
+        error_text = str(e).lower()
+        if "queue full" in error_text or "too much pending traffic" in error_text:
+            return "The VITS sound machine is busy! Please try again in a moment. 🕒"
+        # Give a more specific hint when the submit call was rejected for its arguments.
+        if "expected" in error_text and ("argument" in error_text or "parameter" in error_text):
+            return f"VITS TTS had a hiccup with parameters. (Details: {e}). Please check view_api() output."
+        return f"Oh dear, VITS couldn't make the sound. 🔇 Error: {e}"
+def speech_to_text_whisper_space(audio_filepath):
+    """Transcribe a recorded audio file with the hosted Whisper Space.
+
+    Args:
+        audio_filepath: Local path of the recording to transcribe.
+
+    Returns:
+        str: The transcription on success; otherwise a child-friendly error
+        message (client unavailable, no recording, unexpected result shape,
+        or a remote error).
+    """
+    if not whisper_stt_client:
+        return "The Whisper listening ears aren't working right now. 🛠️ Please tell the grown-ups!"
+    if not audio_filepath:
+        return "Oops! I didn't get any recording to listen to. 🎤"
+    try:
+        # NOTE(review): the (audio, task, language) argument order and api_name="/predict"
+        # are assumptions about abidlabs/whisper-large-v2 — confirm with view_api().
+        job = whisper_stt_client.submit(
+            gradio_file(audio_filepath), # Use gradio_client.file to handle the upload
+            "transcribe",                # task
+            "English",                   # language (can be None for auto-detect)
+            api_name="/predict"          # This is common for abidlabs/whisper spaces
+        )
+        result = job.result(timeout=120) # Wait up to 2 minutes
+        if isinstance(result, dict) and 'text' in result:
+            return result['text']
+        if isinstance(result, str): # Fallback if the Space returns the text directly
+            return result
+        print(f"Unexpected Whisper STT result format: {result}")
+        return "Hmm, I couldn't quite understand the words from Whisper. 🤔"
+    except Exception as e:
+        print(f"Error transcribing audio with Whisper Space: {e}")
+        # The haystack is lower-cased, so every needle must be lower-case too.
+        error_text = str(e).lower()
+        if "queue full" in error_text or "too much pending traffic" in error_text:
+            return "The Whisper listening ears are super busy! Please try again in a bit. 🕒"
+        return f"Oh no! Whisper had trouble hearing that. 🙉 Error: {e}"
+def clean_text_for_comparison(text):
+    """Normalize text into a list of lower-case words for word-by-word comparison.
+
+    ASCII punctuation is removed (apostrophes are kept so contractions stay
+    intact). Typographic punctuation is normalized as well, so a story written
+    with curly quotes or long dashes compares equal to a plain-ASCII transcript.
+
+    Args:
+        text: Text to normalize. Anything that is not a ``str`` yields ``[]``.
+
+    Returns:
+        list[str]: The words of ``text``, lower-cased and stripped of punctuation.
+    """
+    if not isinstance(text, str):
+        return []
+    punctuation_to_remove = "!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~" # Keeps apostrophes for contractions
+    table = str.maketrans('', '', punctuation_to_remove)
+    table.update(str.maketrans({
+        "\u2018": "'", "\u2019": "'",    # curly apostrophes -> plain apostrophe ("don’t" == "don't")
+        "\u201c": None, "\u201d": None,  # curly double quotes -> removed
+        "\u2013": " ", "\u2014": " ",    # en/em dash separates words, so it becomes a space
+        "\u2026": " ",                   # ellipsis character -> word break
+    }))
+    return text.lower().translate(table).split()
+def compare_texts_for_feedback(original_text, student_text):
+    """Diff the student's transcript against the story, word by word.
+
+    Both texts are normalized with ``clean_text_for_comparison`` and aligned
+    with ``difflib.SequenceMatcher``.
+
+    Returns:
+        tuple[str, str]: ``(feedback, highlighted_passage)``, both Markdown.
+        ``feedback`` lists substituted, skipped and added words (or a
+        congratulation when there are none); ``highlighted_passage`` is the
+        story with those differences marked inline. When the transcript has no
+        words, a "record again" message and an empty passage are returned.
+    """
+    expected = clean_text_for_comparison(original_text)
+    spoken = clean_text_for_comparison(student_text)
+    if not spoken:
+        return "It sounds like you didn't record anything, or maybe it was super quiet! 🤫 Try recording again nice and clear!", ""
+    notes = []      # one bullet per difference
+    marked_up = []  # pieces of the annotated passage, in reading order
+    opcodes = difflib.SequenceMatcher(None, expected, spoken, autojunk=False).get_opcodes()
+    for tag, i1, i2, j1, j2 in opcodes:
+        expected_chunk = expected[i1:i2]
+        spoken_chunk = spoken[j1:j2]
+        expected_phrase = " ".join(expected_chunk)
+        spoken_phrase = " ".join(spoken_chunk)
+        if tag == 'equal':
+            marked_up.append(expected_phrase)
+        elif tag == 'delete':  # words in the story that the student skipped
+            notes.append(f'- You missed: "**{expected_phrase}**"')
+            marked_up.append(f"~~{expected_phrase}~~ (*skipped*)")
+        elif tag == 'insert':  # words the student added
+            notes.append(f"- You added: \"*{spoken_phrase}*\" (which wasn't in the story)")
+            marked_up.append(f"(*added:* **{spoken_phrase}**)")
+        elif tag == 'replace':
+            if len(expected_chunk) != len(spoken_chunk):
+                # Different lengths: report the whole run as one substitution.
+                notes.append(f'- Instead of: "**{expected_phrase}**", you said: "*{spoken_phrase}*"')
+                marked_up.append(f"~~{expected_phrase}~~ **{spoken_phrase}**")
+            else:
+                # Same length: pair the words up so each swap is shown on its own.
+                for o_word, s_word in zip(expected_chunk, spoken_chunk):
+                    notes.append(f'- You said: "*{s_word}*" instead of: "**{o_word}**"')
+                    marked_up.append(f"~~{o_word}~~ **{s_word}**")
+    highlighted = " ".join(marked_up)
+    if notes:
+        summary = "Great try! Here are a few words to practice to make it even better:\n" + "\n".join(notes)
+        return summary, highlighted
+    return "🎉🥳 WOOHOO! Amazing reading! You got all the words spot on! 🥳🎉", highlighted
+# --- Gradio UI Functions ---
+def generate_story_and_audio_for_ui(name, grade, topic, progress=gr.Progress(track_tqdm=True)):
+    """Generate a story and its narration for the "Get My Gemini Story" button.
+
+    Args:
+        name: Student name from the textbox.
+        grade: Grade chosen in the dropdown.
+        topic: Story topic from the textbox.
+        progress: Gradio progress tracker (injected by Gradio).
+
+    Returns:
+        tuple: ``(passage_text, audio_path_or_None, recording_area_update, story_state)``
+        matching the four outputs wired to the button. The recording area is
+        shown only when both the story and its audio were produced.
+    """
+    if not (name or "").strip() or not grade or not (topic or "").strip():
+        return "Oops! Please tell me your name, grade, and a fun topic first! 😊", None, gr.update(visible=False), ""
+    progress(0.1, desc="📖 Asking Gemini to dream up a cool story for you...")
+    story_text = generate_story_with_gemini(name, grade, topic)
+    # generate_story_with_gemini reports failures as text. Match on how those messages
+    # *begin* rather than on words anywhere in the text: a perfectly good story may
+    # contain "Oh no!" or "Oops!" in the middle.
+    story_error_prefixes = (
+        "Google Gemini API key not configured",
+        "Oh dear! My story idea",
+        "Hmm, Gemini",
+        "Oops! The Google Gemini API key",
+        "Oh no! 😟 I had a little trouble",
+    )
+    if not story_text.strip() or story_text.startswith(story_error_prefixes):
+        # Show the message, keep the recording area hidden, and store no story so an
+        # error message can never be used as the passage to assess.
+        return story_text, None, gr.update(visible=False), ""
+    progress(0.5, desc="🎧 Warming up the VITS sound machine... (this should be quicker!)")
+    tts_result = text_to_speech_vits(story_text) # Use VITS TTS
+    # text_to_speech_vits returns either an audio path or an error message, both as str.
+    # Only an existing file is safe to hand to the Audio component.
+    if not (isinstance(tts_result, str) and os.path.isfile(tts_result)):
+        if tts_result:
+            gr.Warning(str(tts_result))  # surface the reason as a toast instead of a bogus audio path
+        return story_text, None, gr.update(visible=False), story_text # Keep recording hidden
+    progress(1.0, desc="✅ Story and sound are ready! Let's go!")
+    return (
+        story_text,
+        tts_result,
+        gr.update(visible=True), # Show recording_assessment_area
+        story_text               # Pass story_text to gr.State
+    )
+def assess_student_reading_ui(original_passage_state, student_audio_path, progress=gr.Progress(track_tqdm=True)):
+    """Transcribe the student's recording and compare it with the stored story.
+
+    Args:
+        original_passage_state: The story text held in ``gr.State``.
+        student_audio_path: File path of the microphone recording.
+        progress: Gradio progress tracker (injected by Gradio).
+
+    Returns:
+        tuple[str, str]: ``(feedback, highlighted_passage)`` for the two
+        Markdown outputs. On a missing recording, missing story, or a
+        speech-to-text failure the first element is the explanatory message
+        and the second is empty.
+    """
+    if not student_audio_path:
+        return "🎤 Whoops! Did you forget to record your awesome reading? Try again!", ""
+    if not original_passage_state: # Should not happen if UI flow is correct
+        return "Hmm, I lost the story! 😟 Please generate a new story first.", ""
+    progress(0.2, desc="👂 Whisper is listening carefully to your recording...")
+    transcribed_text = speech_to_text_whisper_space(student_audio_path)
+    # speech_to_text_whisper_space reports failures as text. Recognize them by how the
+    # messages begin, so every failure is caught and ordinary transcripts that merely
+    # contain similar words are not mistaken for errors.
+    stt_error_prefixes = (
+        "The Whisper listening ears aren't working",
+        "The Whisper listening ears are super busy",
+        "Oops! I didn't get any recording",
+        "Hmm, I couldn't quite understand the words",
+        "Oh no! Whisper had trouble hearing that",
+    )
+    if (transcribed_text or "").startswith(stt_error_prefixes):
+        return transcribed_text, "" # Show STT error
+    progress(0.7, desc="🧠 Thinking about the words...")
+    feedback, highlighted_passage = compare_texts_for_feedback(original_passage_state, transcribed_text)
+    progress(1.0, desc="⭐ Feedback is ready!")
+    return feedback, highlighted_passage
+# --- Gradio Interface ---
+# Custom stylesheet passed to gr.Blocks(css=...): playful fonts and colours for the
+# page, buttons, panels, inputs and the feedback areas. The #id selectors correspond
+# to the elem_id values given to components in the layout below.
+# NOTE(review): the .gr-button / .gr-panel / .gr-textbox class selectors are assumed to
+# match the class names emitted by the installed Gradio version — confirm in the browser.
+css = """
+body { font-family: 'Comic Sans MS', 'Chalkboard SE', 'Comic Neue', cursive; background-color: #F0F8FF; } /* AliceBlue background */
+.gr-button {
+    background-color: #FF69B4 !important; /* HotPink */
+    color: white !important;
+    border-radius: 20px !important;
+    font-weight: bold !important;
+    border: 2px solid #FF1493 !important; /* DeepPink border */
+    box-shadow: 0px 3px 5px rgba(0,0,0,0.2) !important;
+}
+.gr-button:hover { background-color: #FF1493 !important; } /* DeepPink on hover */
+.gr-panel {
+    border-radius: 15px !important;
+    box-shadow: 5px 5px 15px rgba(0,0,0,0.1) !important;
+    background-color: #FFFACD !important; /* LemonChiffon panel background */
+    border: 2px dashed #FFD700 !important; /* Gold dashed border */
+}
+label, .gr-checkbox-label { color: #4B0082 !important; font-weight: bold !important; } /* Indigo */
+.gr-textbox, .gr-dropdown { border-radius: 10px !important; border: 1px solid #DDA0DD !important; } /* Plum border for inputs */
+#student_audio_input audio { background-color: #E6E6FA; border-radius: 10px; } /* Lavender for audio player */
+#feedback_output, #highlighted_passage_output {
+    background-color: #FFFFE0; /* LightYellow */
+    padding: 15px;
+    border-radius: 10px;
+    border: 1px solid #FAFAD2; /* LightGoldenrodYellow */
+}
+"""
+# Using a theme that allows CSS to take more precedence
+# Layout overview: a header, a row with the student form (left) and the generated
+# story + narration (right), then a second row — hidden until a story exists —
+# with the recorder (left) and the reading feedback (right).
+with gr.Blocks(theme=gr.themes.Base(), css=css) as app: # theme=gr.themes.Soft() or gr.themes.Base()
+    # Page header.
+    gr.Markdown(
+        """
+        <div style="text-align: center; padding: 20px 0;">
+            <h1 style="color: #FF6347; font-size: 3em; text-shadow: 2px 2px #D3D3D3;">🌈🦄✨ AI Reading Buddy ✨🦄🌈</h1>
+            <p style="font-size: 1.3em; color: #483D8B;">Let's read a super fun story from Gemini and practice our words!</p>
+        </div>
+        """
+    )
+    # Holds the story between the "generate" and "assess" button clicks.
+    original_passage_state = gr.State("") # To store the generated story
+    with gr.Row():
+        # Left column: inputs that drive story generation.
+        with gr.Column(scale=1):
+            gr.Markdown("### <span style='color:#DB7093;'>✏️ Tell Me About You!</span>")
+            student_name_input = gr.Textbox(label="👑 Your Awesome Name:", placeholder="E.g., Princess Lily")
+            student_grade_input = gr.Dropdown(
+                label="🧑‍🎓 Your Grade:",
+                choices=[f"{i}" for i in range(1, 11)], # Grades 1 to 10
+                value="3" # Default value
+            )
+            topic_input = gr.Textbox(label="🚀 Story Topic Idea:", placeholder="E.g., brave little astronaut")
+            generate_button = gr.Button(value="🎈 Get My Gemini Story!")
+        # Right column: the generated story (read-only) and its narration.
+        with gr.Column(scale=2):
+            gr.Markdown("### <span style='color:#DB7093;'>📖 Your Special Story (from Gemini AI):</span>")
+            passage_output = gr.Textbox(label="Read this aloud:", lines=10, interactive=False)
+            gr.Markdown("### <span style='color:#DB7093;'>🔊 Listen to the Story:</span>")
+            audio_output = gr.Audio(label="Hear how it sounds (with VITS TTS)", type="filepath") # Label updated for VITS
+    gr.Markdown("<hr style='border:1px dashed #FFB6C1;'>") # LightPink dashed separator
+    # Recording + feedback row; made visible by generate_story_and_audio_for_ui's third output.
+    with gr.Row(visible=False) as recording_assessment_area: # Initially hidden
+        with gr.Column(scale=1):
+            gr.Markdown("### <span style='color:#32CD32;'>🤩 Your Turn to Shine! 🤩</span>")
+            student_audio_input = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Record yourself reading the story! Press the mic, then stop.", elem_id="student_audio_input")
+            assess_button = gr.Button(value="🧐 Check My Reading!", elem_id="assess_button")
+        with gr.Column(scale=2):
+            gr.Markdown("### <span style='color:#32CD32;'>💡 Word Detective Feedback:</span>")
+            feedback_output = gr.Markdown(value="Your amazing feedback will pop up here! ✨", elem_id="feedback_output")
+            highlighted_passage_output = gr.Markdown(value="See your reading journey here! 🗺️", elem_id="highlighted_passage_output")
+    # Event wiring. The order of `outputs` must match the order of values returned by `fn`.
+    generate_button.click(
+        fn=generate_story_and_audio_for_ui,
+        inputs=[student_name_input, student_grade_input, topic_input],
+        outputs=[
+            passage_output,
+            audio_output,
+            recording_assessment_area, # Directly control visibility of the row
+            original_passage_state
+        ]
+    )
+    assess_button.click(
+        fn=assess_student_reading_ui,
+        inputs=[original_passage_state, student_audio_input],
+        outputs=[feedback_output, highlighted_passage_output]
+    )
+    # Page footer with credits.
+    gr.Markdown(
+        """
+        ---
+        <div style="text-align: center; font-size: 0.9em; color: #555;">
+        Built with ❤️ for the Agentic Demo Track Hackathon! Tag: <code>agent-demo-track</code>
+        <br>Stories by Google Gemini, voices by ESPnet VITS @ HF, and listening by Whisper @ HF.
+        </div>
+        """
+    )
+# --- Launching the App ---
+if __name__ == "__main__":
+    # Each entry pairs a readiness flag/client with the lines to print when it is missing.
+    startup_checks = [
+        (GEMINI_API_CONFIGURED, [
+            "🚨 GOOGLE_API_KEY not configured for local testing or failed to initialize!",
+            "Please set it: export GOOGLE_API_KEY='your_key_here'",
+        ]),
+        (tts_client, [
+            "🚨 ESPnet VITS TTS client (espnet/kan-bayashi_ljspeech_vits) could not be initialized. TTS will not work.",
+        ]),
+        (whisper_stt_client, [
+            "🚨 Whisper STT client (abidlabs/whisper-large-v2) could not be initialized. STT will not work.",
+        ]),
+    ]
+    for ready, warning_lines in startup_checks:
+        if not ready:
+            for warning_line in warning_lines:
+                print(warning_line)
+    # Start the server regardless: missing pieces only disable their own feature.
+    app.launch(debug=True) # Set share=True for a temporary public link if running locally