# Assignment_V1 / app.py
# ======================================
# Package Import
# ======================================
import streamlit as st
from PIL import Image
import time
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from typing import Tuple
from datasets import load_dataset
import soundfile as sf
import torch
# ======================================
# Basic Initialization
# ======================================
# Initialize image captioning pipeline with pretrained model
# Model source: Hugging Face Model Hub
_image_caption_pipeline = pipeline(
task="image-to-text",
model="noamrot/FuseCap_Image_Captioning"
)
# Global model configuration constants
_MODEL_NAME = "Qwen/Qwen3-1.7B"
_THINKING_TOKEN_ID = 151668 # Token id of Qwen3's "</think>" marker separating thinking from final content
# Initialize model components once
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
_model = AutoModelForCausalLM.from_pretrained(
_MODEL_NAME,
torch_dtype="auto",
device_map="auto"
)
# Initialize TTS components once to avoid reloading
_SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
_EMBEDDINGS_DATASET = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
_DEFAULT_SPEAKER_EMBEDDING = torch.tensor(_EMBEDDINGS_DATASET[7306]["xvector"]).unsqueeze(0)
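# Note: Streamlit re-runs this whole script on every user interaction, so the
# module-level model loads above are repeated on each rerun. A minimal sketch of
# the usual mitigation, assuming st.cache_resource is available in the deployed
# Streamlit version (not wired into the code below):
#
#     @st.cache_resource
#     def _load_caption_pipeline():
#         return pipeline(task="image-to-text", model="noamrot/FuseCap_Image_Captioning")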
# ======================================
# Function settings
# ======================================
def generate_image_caption(input_image):
"""
Generate a textual description for an input image using a pretrained model.
Args:
input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
- A PIL Image object
- A string containing a filesystem path to an image file
Returns:
str: Generated caption text in natural language
Example:
>>> from PIL import Image
>>> img = Image.open("photo.jpg")
>>> caption = generate_image_caption(img)
>>> print(f"Caption: {caption}")
"""
# Process image through the captioning pipeline
inference_results = _image_caption_pipeline(input_image)
# Extract text from the first (and only) result dictionary
caption_text = inference_results[0]['generated_text']
return caption_text
def generate_story_content(system_prompt: str, user_prompt: str) -> str:
"""
Generates a children's story based on provided system and user prompts.
Args:
system_prompt: Defines the assistant's role and writing constraints
user_prompt: Describes the story scenario and specific elements to include
Returns:
Generated story text without any thinking process metadata
Raises:
RuntimeError: If text generation fails at any stage
Example:
>>> story = generate_story_content(
... "You are a helpful children's author...",
... "Kids playing with dogs in a sunny meadow..."
... )
"""
try:
# Prepare chat message structure
conversation_history = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
]
# Format input using model-specific template
formatted_input = _tokenizer.apply_chat_template(
conversation_history,
tokenize=False,
add_generation_prompt=True,
enable_thinking=False
)
# Tokenize and prepare model inputs
model_inputs = _tokenizer(
[formatted_input],
return_tensors="pt"
).to(_model.device)
# Generate text completion
generated_sequences = _model.generate(
**model_inputs,
max_new_tokens=1000
)
# Process and clean output
return _process_generated_output(
generated_sequences,
model_inputs.input_ids
)
except Exception as error:
raise RuntimeError(f"Story generation failed: {str(error)}") from error
def _process_generated_output(generated_sequences: list, input_ids: list) -> str:
"""
Processes raw model output to extract final content.
Args:
generated_sequences: Raw output sequences from model generation
input_ids: Original input token IDs used for generation
Returns:
Cleaned final content text
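Note:
Only tokens after the last "</think>" separator (id 151668) are decoded; when
the separator is absent (e.g. with enable_thinking=False), the whole output is kept.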
"""
# Extract new tokens excluding original prompt
new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()
# Find separation point between thinking and final content
separation_index = _find_thinking_separation(new_tokens)
# Decode and clean final content
return _tokenizer.decode(
new_tokens[separation_index:],
skip_special_tokens=True
).strip("\n")
def _find_thinking_separation(token_sequence: list) -> int:
"""
Locates the boundary between thinking process and final content.
Args:
token_sequence: List of generated token IDs
Returns:
Index position marking the start of final content
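Example:
>>> _find_thinking_separation([10, 20, 151668, 30, 40])
3
>>> _find_thinking_separation([10, 20, 30])
0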
"""
try:
# Search from end for separation token
reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
return len(token_sequence) - reverse_position
except ValueError:
return 0 # Return start if token not found
def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
"""
Convert text story to speech audio file using text-to-speech synthesis.
Args:
story_text: Input story text to synthesize
output_path: Path to save generated audio (default: 'output.wav')
Returns:
Path to generated audio file
Raises:
ValueError: For empty/invalid input text
RuntimeError: If audio generation fails
Example:
>>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
'story_audio.wav'
"""
# Validate input text
if not isinstance(story_text, str) or not story_text.strip():
raise ValueError("Input story text must be a non-empty string")
try:
# Generate speech with default speaker profile
speech_output = _SPEECH_PIPELINE(
story_text,
forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING}
)
# Save audio to WAV file (the SpeechT5 pipeline reports its own sampling rate, 16 kHz)
sf.write(
output_path,
speech_output["audio"],
samplerate=speech_output["sampling_rate"]
)
return output_path
except Exception as error:
raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error
# ======================================
# Page Configuration & Custom Styling
# ======================================
st.set_page_config(
page_title="Magic Story Generator",
page_icon="🧚",
layout="wide",
initial_sidebar_state="collapsed"
)
# Custom CSS styling for child-friendly interface
st.markdown("""
<style>
/* Primary title styling */
.main-title {
color: #E91E63;
font-size: 2.8rem;
text-align: center;
padding: 20px;
text-shadow: 2px 2px #FFC107;
}
/* Prompt buttons styling */
.prompt-btn {
background: #4CAF50 !important;
border-radius: 15px !important;
padding: 15px 30px !important;
font-size: 1.1rem !important;
margin: 10px;
}
/* Story container styling */
.story-container {
background: #FFF3E0;
border-radius: 20px;
padding: 25px;
margin: 20px 0;
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
/* Progress spinner styling */
.stSpinner > div {
font-size: 1.2rem !important;
color: #9C27B0 !important;
}
</style>
""", unsafe_allow_html=True)
# ======================================
# Main Application Interface
# ======================================
st.markdown('<p class="main-title">🧚 Welcome to Magic Story Maker!</p>', unsafe_allow_html=True)
# File upload section
with st.container():
st.subheader("Step 1: Upload Your Picture")
uploaded_image = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"], label_visibility="collapsed")
# Main processing flow
if uploaded_image is not None:
# Display uploaded image
with st.spinner("✨ Magical image processing..."):
image = Image.open(uploaded_image)
st.image(image, caption="Your Magical Image", use_column_width=True)
# Prompt selection section
with st.container():
st.subheader("Step 2: Choose Story Style")
# Create three columns for prompt buttons
col1, col2, col3 = st.columns(3)
with col1:
if st.button("📚 Learning Story",
help="Generate educational story with life lessons",
key="edu_btn"):
st.session_state.selected_prompt = "educational"
with col2:
if st.button("🌠 Fantasy Adventure",
help="Create magical adventure story",
key="fantasy_btn"):
st.session_state.selected_prompt = "adventure"
with col3:
if st.button("🐻 Animal Friends",
help="Make story about friendly animals",
key="animal_btn"):
st.session_state.selected_prompt = "animal"
# Define prompt templates
PROMPT_TEMPLATES = {
"educational": {
"system": "You are a children's educator. Create a simple 150-word story that teaches basic life skills or moral lessons.",
"icon": "๐Ÿ“š"
},
"adventure": {
"system": "You are a fantasy writer. Create a 150-word magical adventure story suitable for children.",
"icon": "๐ŸŒ "
},
"animal": {
"system": "You are an animal expert. Create a 150-word story about friendly animals learning together.",
"icon": "๐Ÿป"
}
}
# Story generation section (defaults to the Learning Story style if none was chosen yet)
if "selected_prompt" not in st.session_state:
    st.session_state.selected_prompt = "educational"
with st.spinner(f"{PROMPT_TEMPLATES[st.session_state.selected_prompt]['icon']} Creating your story..."):
# Generate image caption
image_caption = generate_image_caption(image)
# Generate story content
selected_template = PROMPT_TEMPLATES[st.session_state.selected_prompt]
story_text = generate_story_content(
system_prompt=selected_template["system"],
user_prompt=image_caption
)
# Display formatted story
st.subheader("Step 3: Your Magical Story")
st.markdown(f'<div class="story-container">{story_text}</div>',
unsafe_allow_html=True)
# Audio generation section
with st.spinner("🔮 Preparing story narration..."):
audio_file = generate_audio_from_story(story_text, "story_audio.wav")
st.subheader("🎧 Listen to Your Story")
st.audio(audio_file)
# Help section
st.markdown("---")
st.subheader("🌟 How to Use:")
st.info("""
1. Upload any picture (animals, nature, or people work best!)
2. Choose your favorite story style
3. Wait for magic to happen!
4. Listen to your personalized story
""")