##########################################
# Step 0: Essential imports
##########################################
import streamlit as st                    # Web interface
from transformers import (                # AI components
    pipeline,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    AutoModelForCausalLM,
    AutoTokenizer
)
from datasets import load_dataset         # Voice data
import torch                              # Tensor operations
import soundfile as sf                    # Audio processing

##########################################
# Initial configuration (MUST BE FIRST)
##########################################
st.set_page_config(                       # Streamlit requires this before any other st.* call
    page_title="Just Comment",
    page_icon="πŸ’¬",
    layout="centered"
)

##########################################
# Optimized model loader with caching
##########################################
@st.cache_resource(show_spinner=False)
def _load_components():
    """Load and cache all models with hardware optimization."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # float16 only pays off on GPU; CPU inference needs float32
    dtype = torch.float16 if device == "cuda" else torch.float32

    # Emotion classifier (fast)
    emotion_pipe = pipeline(
        "text-classification",
        model="Thea231/jhartmann_emotion_finetuning",
        device=device,
        truncation=True
    )

    # Text generator (optimized)
    text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
    text_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=dtype,
        device_map="auto"                 # Requires the `accelerate` package
    )

    # TTS system (accelerated)
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "microsoft/speecht5_tts",
        torch_dtype=dtype
    ).to(device)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(
        "microsoft/speecht5_hifigan",
        torch_dtype=dtype
    ).to(device)

    # Preloaded voice profile (a single x-vector from the CMU ARCTIC set)
    speaker_emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    ).unsqueeze(0).to(device, dtype=dtype)  # Must match the TTS model's dtype

    return {
        "emotion": emotion_pipe,
        "text_model": text_model,
        "text_tokenizer": text_tokenizer,
        "tts_processor": tts_processor,
        "tts_model": tts_model,
        "tts_vocoder": tts_vocoder,
        "speaker_emb": speaker_emb,
        "device": device
    }

##########################################
# User interface components
##########################################
def _show_interface():
    """Render the input interface and return the user's comment."""
    st.title("Just Comment")
    st.markdown("### I'm listening to you, my friend~")
    return st.text_area(                  # Input field
        "πŸ“ Enter your comment:",
        placeholder="Share your thoughts...",
        height=150,
        key="input"
    )

##########################################
# Core processing functions
##########################################
def _fast_emotion(text, analyzer):
    """Rapid emotion detection with an input length limit."""
    result = analyzer(text[:256], return_all_scores=True)[0]  # Truncate input to keep inference fast
    emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    return max(
        (e for e in result if e['label'].lower() in emotions),
        key=lambda x: x['score'],
        default={'label': 'neutral', 'score': 0}
    )


def _build_prompt(text, emotion):
    """Template-based prompt engineering for response generation."""
    return f"{emotion.capitalize()} detected: {text}\nRespond with a coherent and supportive response."
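
# Illustrative example only (hypothetical input, not part of the app's logic):
# for the comment "My order arrived broken" classified as anger, _build_prompt
# would return the two-line prompt:
#
#   Anger detected: My order arrived broken
#   Respond with a coherent and supportive response.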
def _generate_response(text, models):
    """Optimized text generation pipeline."""
    # Emotion detection
    emotion = _fast_emotion(text, models["emotion"])
    # Prompt construction
    prompt = _build_prompt(text, emotion["label"])
    # Generate text
    inputs = models["text_tokenizer"](
        prompt,
        return_tensors="pt",
        max_length=100,
        truncation=True
    ).to(models["device"])
    output = models["text_model"].generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=100,               # Balanced length for a response
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=models["text_tokenizer"].eos_token_id
    )
    # Decode only the newly generated tokens; decoding output[0] whole
    # would echo the prompt back into the reply
    new_tokens = output[0][inputs.input_ids.shape[-1]:]
    response = models["text_tokenizer"].decode(new_tokens, skip_special_tokens=True)
    return response.strip()[:200] or "Thank you for your feedback."


def _text_to_speech(text, models):
    """High-speed audio synthesis."""
    inputs = models["tts_processor"](text=text[:150], return_tensors="pt").to(models["device"])
    with torch.inference_mode():          # Accelerated inference
        spectrogram = models["tts_model"].generate_speech(inputs["input_ids"], models["speaker_emb"])
        audio = models["tts_vocoder"](spectrogram)
    # soundfile cannot write float16 arrays, so cast back to float32 first
    sf.write("output.wav", audio.to(torch.float32).cpu().numpy(), 16000)
    return "output.wav"

##########################################
# Main application flow
##########################################
def main():
    """Primary execution controller."""
    # Load components
    components = _load_components()
    # Show interface
    user_input = _show_interface()

    if user_input:
        # Text generation
        with st.spinner("πŸ” Analyzing..."):
            response = _generate_response(user_input, components)
        # Display result
        st.subheader("πŸ“„ Response")
        st.markdown(f"```\n{response}\n```")
        # Audio generation
        with st.spinner("πŸ”Š Synthesizing..."):
            audio_path = _text_to_speech(response, components)
        st.audio(audio_path, format="audio/wav")


if __name__ == "__main__":
    main()                                # Execute the main function
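
# A minimal way to launch the app, assuming this script is saved as app.py
# (the filename is an assumption) and the dependencies are installed:
# streamlit, transformers, datasets, torch, soundfile, plus accelerate for
# device_map="auto".
#
#   streamlit run app.py
#
# The first launch downloads the models from the Hugging Face Hub, so it is
# noticeably slower than later runs, which reuse the @st.cache_resource cache.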