Spaces:
Build error
Build error
| # ====================================== | |
| # Package Import | |
| # ====================================== | |
| import streamlit as st | |
| from PIL import Image | |
| import time | |
| from transformers import pipeline,AutoModelForCausalLM,AutoTokenizer | |
| from typing import Tuple | |
| from datasets import load_dataset | |
| import soundfile as sf | |
| import torch | |
| # ====================================== | |
| # Basic Initialization | |
| # ====================================== | |
# Streamlit re-executes this whole script on every widget interaction, so the
# heavyweight objects below are built inside ``st.cache_resource`` loaders and
# only constructed once per server process instead of on every rerun.
# ``show_spinner=False`` is passed so the loaders themselves draw nothing on
# the page (``st.set_page_config`` is only called further down this script).

# Global model configuration constants
_MODEL_NAME = "Qwen/Qwen3-1.7B"
_THINKING_TOKEN_ID = 151668  # Special token marking thinking/content separation


@st.cache_resource(show_spinner=False)
def _load_image_caption_pipeline():
    """Build the image-to-text pipeline (model pulled from the Hugging Face Hub)."""
    return pipeline(
        task="image-to-text",
        model="noamrot/FuseCap_Image_Captioning",
    )


@st.cache_resource(show_spinner=False)
def _load_story_model():
    """Load the tokenizer and causal language model named by ``_MODEL_NAME``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        _MODEL_NAME,
        torch_dtype="auto",
        device_map="auto",
    )
    return tokenizer, model


@st.cache_resource(show_spinner=False)
def _load_speech_components():
    """Load the text-to-speech pipeline and the default speaker embedding.

    Returns:
        A ``(speech_pipeline, embeddings_dataset, speaker_embedding)`` tuple,
        where ``speaker_embedding`` is the "xvector" of dataset row 7306 turned
        into a tensor with an extra leading dimension.
    """
    speech_pipeline = pipeline("text-to-speech", model="microsoft/speecht5_tts")
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors", split="validation"
    )
    speaker_embedding = torch.tensor(
        embeddings_dataset[7306]["xvector"]
    ).unsqueeze(0)
    return speech_pipeline, embeddings_dataset, speaker_embedding


# Module-level handles used by the functions below (names unchanged).
_image_caption_pipeline = _load_image_caption_pipeline()
_tokenizer, _model = _load_story_model()
(
    _SPEECH_PIPELINE,
    _EMBEDDINGS_DATASET,
    _DEFAULT_SPEAKER_EMBEDDING,
) = _load_speech_components()
| # ====================================== | |
| # Function settings | |
| # ====================================== | |
def generate_image_caption(input_image):
    """
    Describe an image in natural language using the pretrained captioning pipeline.

    Args:
        input_image (Union[PIL.Image.Image, str]): Image to caption, given either as
            a PIL Image object or as a filesystem path to an image file.

    Returns:
        str: The ``generated_text`` entry of the first result returned by the
        captioning pipeline.

    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # The pipeline returns a list of result dictionaries; only the first is read.
    return _image_caption_pipeline(input_image)[0]['generated_text']
def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Write a children's story from a system prompt and a user prompt.

    Args:
        system_prompt: Defines the assistant's role and writing constraints
        user_prompt: Describes the story scenario and specific elements to include

    Returns:
        Generated story text without any thinking process metadata

    Raises:
        RuntimeError: If any step (templating, tokenizing, generating or
            post-processing) raises; the original exception is chained as the cause

    Example:
        >>> story = generate_story_content(
        ...     "You are a helpful children's author...",
        ...     "Kids playing with dogs in a sunny meadow..."
        ... )
    """
    try:
        # Two-turn conversation: role instructions first, then the scenario.
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        # Render the conversation with the tokenizer's chat template,
        # with thinking mode switched off.
        prompt_text = _tokenizer.apply_chat_template(
            chat_messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        # Encode as a batch of one and move it to the model's device.
        encoded_prompt = _tokenizer([prompt_text], return_tensors="pt")
        encoded_prompt = encoded_prompt.to(_model.device)

        # Generate at most 1000 new tokens.
        output_sequences = _model.generate(**encoded_prompt, max_new_tokens=1000)

        # Strip the prompt and any thinking section from the raw output.
        return _process_generated_output(output_sequences, encoded_prompt.input_ids)
    except Exception as error:
        raise RuntimeError(f"Story generation failed: {error}") from error
def _process_generated_output(generated_sequences: list, input_ids: list) -> str:
    """
    Turn raw generation output into the final story text.

    Only the first sequence is used. The prompt tokens are cut off, everything
    up to and including the last thinking-separator token is skipped, and the
    remainder is decoded with special tokens removed.

    Args:
        generated_sequences: Raw output sequences from model generation. The first
            sequence must support slicing followed by ``.tolist()``.
        input_ids: Original input token IDs used for generation; only the length
            of the first entry is used

    Returns:
        Decoded content with leading and trailing newlines stripped
    """
    # Everything past the prompt length is newly generated.
    prompt_length = len(input_ids[0])
    completion_ids = generated_sequences[0][prompt_length:].tolist()

    # Skip any thinking section that precedes the final content.
    content_start = _find_thinking_separation(completion_ids)

    decoded_text = _tokenizer.decode(
        completion_ids[content_start:],
        skip_special_tokens=True,
    )
    return decoded_text.strip("\n")
def _find_thinking_separation(token_sequence: list) -> int:
    """
    Find where the final content starts in a list of generated token IDs.

    Args:
        token_sequence: List of generated token IDs

    Returns:
        The index just after the last occurrence of ``_THINKING_TOKEN_ID``,
        or 0 when that token does not occur at all
    """
    # Walk backwards so the first hit is the last separator in the sequence.
    for position in range(len(token_sequence) - 1, -1, -1):
        if token_sequence[position] == _THINKING_TOKEN_ID:
            return position + 1
    # No separator present: the whole sequence is final content.
    return 0
def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
    """
    Synthesize speech for a story and save it as an audio file.

    Args:
        story_text: Input story text to synthesize
        output_path: Path to save generated audio (default: 'output.wav')

    Returns:
        Path to generated audio file (the ``output_path`` that was passed in)

    Raises:
        ValueError: If ``story_text`` is not a string or contains only whitespace
        RuntimeError: If synthesis or writing the file fails; the original
            exception is chained as the cause

    Example:
        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
        'story_audio.wav'
    """
    # Reject non-strings and blank text before touching the speech pipeline.
    text_is_usable = isinstance(story_text, str) and bool(story_text.strip())
    if not text_is_usable:
        raise ValueError("Input story text must be a non-empty string")

    try:
        # Synthesize with the default speaker embedding.
        synthesized = _SPEECH_PIPELINE(
            story_text,
            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING},
        )
        # Write the waveform at the sampling rate reported by the pipeline.
        sf.write(
            output_path,
            synthesized["audio"],
            samplerate=synthesized["sampling_rate"],
        )
    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {error}") from error

    return output_path
| # ====================================== | |
| # Page Configuration & Custom Styling | |
| # ====================================== | |
# NOTE(review): the page icon literal looks mis-encoded (mojibake) in this
# copy of the file; it is kept as-is here -- confirm against the original.
st.set_page_config(
    page_title="Magic Story Generator",
    page_icon="๐ง",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Custom CSS styling for child-friendly interface: title, prompt buttons,
# story container and progress spinner.
_CUSTOM_CSS = """
<style>
/* Primary title styling */
.main-title {
color: #E91E63;
font-size: 2.8rem;
text-align: center;
padding: 20px;
text-shadow: 2px 2px #FFC107;
}
/* Prompt buttons styling */
.prompt-btn {
background: #4CAF50 !important;
border-radius: 15px !important;
padding: 15px 30px !important;
font-size: 1.1rem !important;
margin: 10px;
}
/* Story container styling */
.story-container {
background: #FFF3E0;
border-radius: 20px;
padding: 25px;
margin: 20px 0;
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
/* Progress spinner styling */
.stSpinner > div {
font-size: 1.2rem !important;
color: #9C27B0 !important;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
| # ====================================== | |
| # Main Application Interface | |
| # ====================================== | |
# Page heading, styled by the ``main-title`` CSS class.
st.markdown(
    '<p class="main-title">๐ง Welcome to Magic Story Maker!</p>',
    unsafe_allow_html=True,
)

# File upload section: widgets are placed through the container object.
upload_section = st.container()
upload_section.subheader("Step 1: Upload Your Picture")
uploaded_image = upload_section.file_uploader(
    "Choose an image...",
    type=["png", "jpg", "jpeg"],
    label_visibility="collapsed",
)
# Main processing flow: show the uploaded picture, let the user pick a story
# style, then caption the image, write the story and narrate it.
# NOTE(review): the emoji literals below look mis-encoded (mojibake) in this
# copy of the file; they are kept as-is -- confirm against the original.
if uploaded_image is not None:
    # Display uploaded image
    with st.spinner("โจ Magical image processing..."):
        image = Image.open(uploaded_image)
        st.image(image, caption="Your Magical Image", use_column_width=True)

    # Prompt selection section
    with st.container():
        st.subheader("Step 2: Choose Story Style")
        # (button label, tooltip, widget key, prompt id) -- one button per column
        style_buttons = [
            ("๐ Learning Story",
             "Generate educational story with life lessons",
             "edu_btn",
             "educational"),
            ("๐ Fantasy Adventure",
             "Create magical adventure story",
             "fantasy_btn",
             "adventure"),
            ("๐ป Animal Friends",
             "Make story about friendly animals",
             "animal_btn",
             "animal"),
        ]
        for column, (label, tooltip, widget_key, prompt_id) in zip(
            st.columns(len(style_buttons)), style_buttons
        ):
            with column:
                if st.button(label, help=tooltip, key=widget_key):
                    # Stored in session state so the choice is still
                    # available on later reruns of the script.
                    st.session_state.selected_prompt = prompt_id

    # Define prompt templates
    PROMPT_TEMPLATES = {
        "educational": {
            "system": "You are a children's educator. Create a simple 150-word story that teaches basic life skills or moral lessons.",
            "icon": "๐"
        },
        "adventure": {
            "system": "You are a fantasy writer. Create a 150-word magical adventure story suitable for children.",
            "icon": "๐ "
        },
        "animal": {
            "system": "You are an animal expert. Create a 150-word story about friendly animals learning together.",
            "icon": "๐ป"
        }
    }

    # ``selected_prompt`` only exists in session state once a style button
    # has been clicked; read it defensively so the first run after an upload
    # does not fail, and generate nothing until a known style is chosen.
    selected_prompt = st.session_state.get("selected_prompt")
    if selected_prompt in PROMPT_TEMPLATES:
        selected_template = PROMPT_TEMPLATES[selected_prompt]

        # Story generation section
        with st.spinner(f"{selected_template['icon']} Creating your story..."):
            # Generate image caption
            image_caption = generate_image_caption(image)
            # Generate story content; the caption serves as the user prompt
            story_text = generate_story_content(
                system_prompt=selected_template["system"],
                user_prompt=image_caption
            )

        # Display formatted story
        st.subheader("Step 3: Your Magical Story")
        st.markdown(f'<div class="story-container">{story_text}</div>',
                    unsafe_allow_html=True)

        # Audio generation section
        with st.spinner("๐ฎ Preparing story narration..."):
            audio_file = generate_audio_from_story(story_text, "story_audio.wav")
        st.subheader("๐ง Listen to Your Story")
        st.audio(audio_file)
# Help section: a divider, a heading and the usage steps in an info box.
_USAGE_STEPS = """
1. Upload any picture (animals, nature, or people work best!)
2. Choose your favorite story style
3. Wait for magic to happen!
4. Listen to your personalized story
"""
st.markdown("---")
st.subheader("๐ How to Use:")
st.info(_USAGE_STEPS)