from typing import Any, Dict, List

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

from src.agents.role_play.flow import role_play_agent
from src.config.llm import model
from src.utils.logger import logger


# Define the structured output format
class ResponseFormatter(BaseModel):
    """Structured output format for conversation evaluation."""

    score: int = Field(
        ..., description="Overall conversation score out of 100", ge=0, le=100
    )
    feedback: str = Field(..., description="Overall feedback summary")
    strengths: List[str] = Field(..., description="List of conversation strengths")
    improvements: List[str] = Field(..., description="List of areas for improvement")
    suggestions: List[str] = Field(
        ..., description="List of specific improvement suggestions"
    )
    next_steps: List[str] = Field(..., description="List of recommended next steps")
    words_used: List[str] = Field(
        ..., description="List of key words used from the scenario"
    )
    perfect_response: str = Field(
        ..., description="An example of a perfect response for this scenario"
    )
    impressive_words: List[str] = Field(
        ..., description="List of impressive or advanced words used by the learner"
    )


async def evaluate_conversation(
    session_id: str,
    learner_level: str = "beginner",
    scenario_title: str = "",
    scenario_description: str = "",
    key_vocabulary: str = "",
) -> Dict[str, Any]:
    """
    Evaluate a conversation based on the session ID and provide feedback.

    Args:
        session_id: The thread ID for the conversation
        learner_level: The English level of the learner
        scenario_title: Title of the conversation scenario
        scenario_description: Description of the conversation scenario
        key_vocabulary: Key vocabulary words from the scenario

    Returns:
        Dict containing evaluation results including score and feedback
    """
    logger.info(f"Evaluating conversation for session_id: {session_id}")

    # Load the conversation history from the roleplay agent's checkpoint.
    config = {"configurable": {"thread_id": session_id}}
    snapshot = await role_play_agent().aget_state(config)
    messages = snapshot.values.get("messages", [])

    if not messages:
        # Return an empty result with the same keys as a successful evaluation.
        return {
            "score": 0,
            "feedback": "No conversation found for this session.",
            "strengths": [],
            "improvements": [],
            "suggestions": [],
            "next_steps": [],
            "words_used": [],
            "perfect_response": "",
            "impressive_words": [],
        }

    # Create the prompt template
    evaluation_prompt_template = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """# CONVERSATION EVALUATOR - English Learning Assessment Specialist

You are **WISE Evaluator**, an expert English tutor who analyzes conversations between learners and AI roleplay partners. Your job is to provide comprehensive feedback that helps learners improve.

## Evaluation Context
- **Session ID**: {session_id}
- **Learner Level**: {learner_level}
- **Scenario**: {scenario_title} - {scenario_description}
- **Key Vocabulary**: {key_vocabulary}

## Your Evaluation Mission
1. **Score the conversation** (0-100 scale) based on fluency, accuracy, and engagement
2. **Identify strengths** - What did the learner do well?
3. **Pinpoint areas for improvement** - Where can they get better?
4. **Provide specific suggestions** - Concrete actions for improvement
5. **Recommend next steps** - What should they practice next?

## Scoring Criteria

### Fluency (30 points)
- **Flow**: How naturally does the conversation progress?
- **Response time**: Are there appropriate pauses or unnatural delays?
- **Turn-taking**: Good balance of speaking between learner and AI?

### Accuracy (30 points)
- **Grammar**: Correct sentence structures and verb forms
- **Vocabulary**: Appropriate word choices and usage
- **Pronunciation**: (If audio available) Clear pronunciation of words

### Engagement (20 points)
- **Relevance**: Staying on topic and scenario context
- **Interaction**: Active participation and questions
- **Creativity**: Bringing personal experiences or unique responses

### Vocabulary Usage (20 points)
- **Range**: Using diverse vocabulary from the scenario
- **Accuracy**: Correct usage of key vocabulary words
- **Complexity**: Appropriate challenge level for learner

## Response Format Requirements
You must provide your response in the following structured format:

### SCORE: [X/100]
Provide a single overall score out of 100.

### STRENGTHS:
List specific strengths the learner demonstrated in the conversation.

### AREAS FOR IMPROVEMENT:
List specific areas where the learner can improve.

### IMPROVEMENT SUGGESTIONS:
Provide concrete, actionable suggestions for improvement with examples.

### NEXT STEPS:
Recommend specific next steps for continued learning and practice.

## Important Guidelines:
- **Be encouraging**: Focus on growth, not just mistakes
- **Be specific**: Give concrete examples, not vague advice
- **Be appropriate**: Match feedback complexity to learner level
- **Be actionable**: Every suggestion should be something they can practice
- **Use markdown**: Structure feedback clearly with headers and bullet points

Remember: Your goal is to help learners feel motivated while giving them clear paths to improvement. Balance honest feedback with positive reinforcement.
""",
            ),
            ("placeholder", "{messages}"),
        ]
    )

    chain = evaluation_prompt_template | model.with_structured_output(ResponseFormatter)

    # Call the LLM with the formatted prompt
    structured_output: ResponseFormatter = await chain.ainvoke(
        {
            "session_id": session_id,
            "learner_level": learner_level,
            "scenario_title": scenario_title,
            "scenario_description": scenario_description,
            "key_vocabulary": key_vocabulary,
            "messages": messages,
        }
    )

    # Convert structured output to dictionary
    result = {
        "score": structured_output.score,
        "feedback": structured_output.feedback,
        "strengths": structured_output.strengths,
        "improvements": structured_output.improvements,
        "suggestions": structured_output.suggestions,
        "next_steps": structured_output.next_steps,
        "words_used": structured_output.words_used,
        "perfect_response": structured_output.perfect_response,
        "impressive_words": structured_output.impressive_words,
    }

    return result
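

# A minimal usage sketch, not part of the module's public API: it shows one
# way evaluate_conversation might be invoked from an asyncio entry point.
# The session id and scenario values below are hypothetical; in practice the
# session_id must match the thread_id of an existing roleplay conversation
# persisted by role_play_agent's checkpointer.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        result = await evaluate_conversation(
            session_id="demo-session-123",  # hypothetical thread id
            learner_level="beginner",
            scenario_title="Ordering Coffee",
            scenario_description="The learner orders a drink at a cafe.",
            key_vocabulary="latte, receipt, to-go",
        )
        # The returned dict always carries the same keys, even when no
        # conversation is found for the session.
        print(result["score"])
        print(result["feedback"])

    asyncio.run(_demo())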