##########################################
# Step 0: Essential imports
##########################################
import streamlit as st                    # Web interface
from transformers import (                # AI components
    pipeline,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    AutoModelForCausalLM,
    AutoTokenizer
)
from datasets import load_dataset         # Voice data
import torch                              # Tensor operations
import soundfile as sf                    # Audio processing

##########################################
# Initial configuration (MUST BE FIRST)
##########################################
st.set_page_config(                       # Streamlit requires this before any other st.* call
    page_title="Just Comment",
    page_icon="πŸ’¬",
    layout="centered"
)

##########################################
# Optimized model loader with caching
##########################################
@st.cache_resource(show_spinner=False)
def _load_components():
    """Load and cache all models with hardware optimization."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # float16 only pays off on GPU; CPU inference needs float32
    dtype = torch.float16 if device == "cuda" else torch.float32

    # Emotion classifier (fast)
    emotion_pipe = pipeline(
        "text-classification",
        model="Thea231/jhartmann_emotion_finetuning",
        device=device,
        truncation=True
    )

    # Text generator (optimized)
    text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
    text_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=dtype,
        device_map="auto"                 # Requires the `accelerate` package
    )

    # TTS system (accelerated)
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "microsoft/speecht5_tts",
        torch_dtype=dtype
    ).to(device)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(
        "microsoft/speecht5_hifigan",
        torch_dtype=dtype
    ).to(device)

    # Preloaded voice profile (a single x-vector from the CMU ARCTIC set)
    speaker_emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    ).unsqueeze(0).to(device, dtype=dtype)  # Must match the TTS model's dtype

    return {
        "emotion": emotion_pipe,
        "text_model": text_model,
        "text_tokenizer": text_tokenizer,
        "tts_processor": tts_processor,
        "tts_model": tts_model,
        "tts_vocoder": tts_vocoder,
        "speaker_emb": speaker_emb,
        "device": device
    }

##########################################
# User interface components
##########################################
def _show_interface():
    """Render the input interface and return the user's comment."""
    st.title("Just Comment")
    st.markdown("### I'm listening to you, my friend~")
    return st.text_area(                  # Input field
        "πŸ“ Enter your comment:",
        placeholder="Share your thoughts...",
        height=150,
        key="input"
    )

##########################################
# Core processing functions
##########################################
def _fast_emotion(text, analyzer):
    """Rapid emotion detection with an input length limit."""
    result = analyzer(text[:256], return_all_scores=True)[0]  # Truncate input to keep inference fast
    emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    return max(
        (e for e in result if e['label'].lower() in emotions),
        key=lambda x: x['score'],
        default={'label': 'neutral', 'score': 0}
    )


def _build_prompt(text, emotion):
    """Template-based prompt engineering for response generation."""
    return f"{emotion.capitalize()} detected: {text}\nRespond with a coherent and supportive response."
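
# Illustrative example only (hypothetical input, not part of the app's logic):
# for the comment "My order arrived broken" classified as anger, _build_prompt
# would return the two-line prompt:
#
#   Anger detected: My order arrived broken
#   Respond with a coherent and supportive response.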
def _generate_response(text, models):
    """Optimized text generation pipeline."""
    # Emotion detection
    emotion = _fast_emotion(text, models["emotion"])
    # Prompt construction
    prompt = _build_prompt(text, emotion["label"])
    # Generate text
    inputs = models["text_tokenizer"](
        prompt,
        return_tensors="pt",
        max_length=100,
        truncation=True
    ).to(models["device"])
    output = models["text_model"].generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=100,               # Balanced length for a response
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=models["text_tokenizer"].eos_token_id
    )
    # Decode only the newly generated tokens; decoding output[0] whole
    # would echo the prompt back into the reply
    new_tokens = output[0][inputs.input_ids.shape[-1]:]
    response = models["text_tokenizer"].decode(new_tokens, skip_special_tokens=True)
    return response.strip()[:200] or "Thank you for your feedback."


def _text_to_speech(text, models):
    """High-speed audio synthesis."""
    inputs = models["tts_processor"](text=text[:150], return_tensors="pt").to(models["device"])
    with torch.inference_mode():          # Accelerated inference
        spectrogram = models["tts_model"].generate_speech(inputs["input_ids"], models["speaker_emb"])
        audio = models["tts_vocoder"](spectrogram)
    # soundfile cannot write float16 arrays, so cast back to float32 first
    sf.write("output.wav", audio.to(torch.float32).cpu().numpy(), 16000)
    return "output.wav"

##########################################
# Main application flow
##########################################
def main():
    """Primary execution controller."""
    # Load components
    components = _load_components()
    # Show interface
    user_input = _show_interface()

    if user_input:
        # Text generation
        with st.spinner("πŸ” Analyzing..."):
            response = _generate_response(user_input, components)
        # Display result
        st.subheader("πŸ“„ Response")
        st.markdown(f"```\n{response}\n```")
        # Audio generation
        with st.spinner("πŸ”Š Synthesizing..."):
            audio_path = _text_to_speech(response, components)
        st.audio(audio_path, format="audio/wav")


if __name__ == "__main__":
    main()                                # Execute the main function
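
# A minimal way to launch the app, assuming this script is saved as app.py
# (the filename is an assumption) and the dependencies are installed:
# streamlit, transformers, datasets, torch, soundfile, plus accelerate for
# device_map="auto".
#
#   streamlit run app.py
#
# The first launch downloads the models from the Hugging Face Hub, so it is
# noticeably slower than later runs, which reuse the @st.cache_resource cache.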