Spaces:

ccclllwww
/

Assignment_V1

Build error

App Files Files Community

Assignment_V1 / app.py

ccclllwww

Update app.py

ac70fac verified 8 months ago

raw

history blame

11.4 kB

	# ======================================
	# Package Import
	# ======================================

	import html
	import time
	from typing import Tuple

	import soundfile as sf
	import streamlit as st
	import torch
	from datasets import load_dataset
	from PIL import Image
	from transformers import pipeline,AutoModelForCausalLM,AutoTokenizer

	# ======================================
	# Basic Initialization
	# ======================================

	# Global model configuration constants
	_MODEL_NAME = "Qwen/Qwen3-1.7B"
	_THINKING_TOKEN_ID = 151668  # Special token marking thinking/content separation

	# NOTE: Streamlit re-executes this script on every widget interaction, so
	# plain module-level loading would rebuild every model on each button click.
	# The loaders below are wrapped in st.cache_resource so the heavy objects are
	# created once per server process and reused across reruns and sessions.
	# show_spinner=False keeps the loaders from emitting any UI element before
	# st.set_page_config() is called further down.


	@st.cache_resource(show_spinner=False)
	def _load_image_caption_pipeline():
	    """Build the image captioning pipeline (model from the Hugging Face Hub)."""
	    return pipeline(
	        task="image-to-text",
	        model="noamrot/FuseCap_Image_Captioning",
	    )


	@st.cache_resource(show_spinner=False)
	def _load_story_model():
	    """Load the story-writing tokenizer and causal LM named by _MODEL_NAME.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
	    model = AutoModelForCausalLM.from_pretrained(
	        _MODEL_NAME,
	        torch_dtype="auto",
	        device_map="auto",
	    )
	    return tokenizer, model


	@st.cache_resource(show_spinner=False)
	def _load_speech_components():
	    """Load the text-to-speech pipeline and the default speaker embedding.

	    Returns:
	        A ``(speech_pipeline, embeddings_dataset, speaker_embedding)`` tuple,
	        where ``speaker_embedding`` is the x-vector at dataset index 7306
	        with a leading dimension added via ``unsqueeze(0)``.
	    """
	    speech_pipeline = pipeline("text-to-speech", model="microsoft/speecht5_tts")
	    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
	    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
	    return speech_pipeline, embeddings_dataset, speaker_embedding


	# Module-level handles used by the functions below (names unchanged).
	_image_caption_pipeline = _load_image_caption_pipeline()
	_tokenizer, _model = _load_story_model()
	_SPEECH_PIPELINE, _EMBEDDINGS_DATASET, _DEFAULT_SPEAKER_EMBEDDING = _load_speech_components()

	# ======================================
	# Function settings
	# ======================================

	def generate_image_caption(input_image):
	    """
	    Produce a natural-language caption for an image.

	    The image is handed unchanged to the module-level
	    ``_image_caption_pipeline``; the caption is read from the first entry
	    of the list that the pipeline returns.

	    Args:
	        input_image (Union[PIL.Image.Image, str]): Image to caption, either
	            a PIL Image object or a filesystem path to an image file.

	    Returns:
	        str: The ``generated_text`` value of the first pipeline result.

	    Example:
	        >>> from PIL import Image
	        >>> img = Image.open("photo.jpg")
	        >>> caption = generate_image_caption(img)
	        >>> print(f"Caption: {caption}")
	    """
	    # Only the first result dictionary is consulted.
	    return _image_caption_pipeline(input_image)[0]['generated_text']

	def generate_story_content(system_prompt: str, user_prompt: str) -> str:
	    """
	    Write a story from a system prompt and a user prompt.

	    The two prompts are wrapped as a system/user chat, rendered through the
	    tokenizer's chat template with thinking disabled, and completed by the
	    module-level model with at most 1000 new tokens. The raw output is then
	    cleaned by ``_process_generated_output``.

	    Args:
	        system_prompt: Defines the assistant's role and writing constraints.
	        user_prompt: Describes the story scenario and elements to include.

	    Returns:
	        The generated story text, without any thinking-process section.

	    Raises:
	        RuntimeError: If any step of templating, tokenization, generation or
	            post-processing raises; the original exception is chained.

	    Example:
	        >>> story = generate_story_content(
	        ...     "You are a helpful children's author...",
	        ...     "Kids playing with dogs in a sunny meadow..."
	        ... )
	    """
	    # Chat structure expected by the tokenizer's chat template.
	    messages = [
	        {"role": "system", "content": system_prompt},
	        {"role": "user", "content": user_prompt},
	    ]

	    try:
	        # Render the chat into a single prompt string (not yet tokenized).
	        prompt_text = _tokenizer.apply_chat_template(
	            messages,
	            tokenize=False,
	            add_generation_prompt=True,
	            enable_thinking=False,
	        )

	        # Tokenize as a batch of one and move tensors to the model's device.
	        encoded = _tokenizer([prompt_text], return_tensors="pt").to(_model.device)

	        # Generate the completion.
	        output_ids = _model.generate(**encoded, max_new_tokens=1000)

	        # Strip the prompt and any thinking section from the output.
	        return _process_generated_output(output_ids, encoded.input_ids)
	    except Exception as error:
	        raise RuntimeError(f"Story generation failed: {str(error)}") from error

	def _process_generated_output(generated_sequences: list, input_ids: list) -> str:
	    """
	    Turn raw generation output into the final story text.

	    Only the first sequence is used. The tokens belonging to the prompt are
	    dropped, anything up to and including the last thinking separator is
	    skipped, and the remainder is decoded.

	    Args:
	        generated_sequences: Raw output sequences from model generation. The
	            first element must support slicing and ``.tolist()``.
	        input_ids: Token IDs of the prompt that was fed to the model; the
	            length of the first element gives the prompt size.

	    Returns:
	        Decoded text with special tokens removed and leading/trailing
	        newlines stripped.
	    """
	    # The model output starts with the prompt; skip that many tokens.
	    prompt_length = len(input_ids[0])
	    completion_ids = generated_sequences[0][prompt_length:].tolist()

	    # Index of the first token after the thinking section (0 if none).
	    content_start = _find_thinking_separation(completion_ids)

	    decoded_text = _tokenizer.decode(
	        completion_ids[content_start:],
	        skip_special_tokens=True,
	    )
	    return decoded_text.strip("\n")

	def _find_thinking_separation(token_sequence: list) -> int:
	    """
	    Find where the final content starts in a generated token list.

	    Args:
	        token_sequence: List of generated token IDs.

	    Returns:
	        The index just after the last occurrence of ``_THINKING_TOKEN_ID``,
	        or 0 when the token does not occur at all.
	    """
	    # Walk backwards so the last separator wins.
	    for position in range(len(token_sequence) - 1, -1, -1):
	        if token_sequence[position] == _THINKING_TOKEN_ID:
	            return position + 1

	    # No separator: the whole sequence is final content.
	    return 0

	def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
	    """
	    Synthesize speech for a story and write it to an audio file.

	    Args:
	        story_text: Story text to synthesize.
	        output_path: Destination path of the audio file
	            (default: 'output.wav').

	    Returns:
	        ``output_path``, once the file has been written.

	    Raises:
	        ValueError: If ``story_text`` is not a string or is blank.
	        RuntimeError: If synthesis or writing the file raises; the original
	            exception is chained.

	    Example:
	        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
	        'story_audio.wav'
	    """
	    # Reject non-strings and whitespace-only text before touching the model.
	    is_usable_text = isinstance(story_text, str) and bool(story_text.strip())
	    if not is_usable_text:
	        raise ValueError("Input story text must be a non-empty string")

	    try:
	        # Synthesize with the module's default speaker embedding.
	        synthesized = _SPEECH_PIPELINE(
	            story_text,
	            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING},
	        )
	        waveform = synthesized["audio"]
	        sample_rate = synthesized["sampling_rate"]

	        # Persist the waveform at the sampling rate the pipeline reported.
	        sf.write(output_path, waveform, samplerate=sample_rate)
	    except Exception as error:
	        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error

	    return output_path


	# ======================================
	# Page Configuration & Custom Styling
	# ======================================
	# Page metadata and layout: wide page, sidebar collapsed on load.
	st.set_page_config(
	    initial_sidebar_state="collapsed",
	    layout="wide",
	    page_icon="🧚",
	    page_title="Magic Story Generator",
	)

	# Stylesheet for the child-friendly look: title, prompt buttons, story
	# container and spinner. Kept in a named constant so the injection call
	# below stays short.
	_CUSTOM_CSS = """
	<style>
	/* Primary title styling */
	.main-title {
	color: #E91E63;
	font-size: 2.8rem;
	text-align: center;
	padding: 20px;
	text-shadow: 2px 2px #FFC107;
	}

	/* Prompt buttons styling */
	.prompt-btn {
	background: #4CAF50 !important;
	border-radius: 15px !important;
	padding: 15px 30px !important;
	font-size: 1.1rem !important;
	margin: 10px;
	}

	/* Story container styling */
	.story-container {
	background: #FFF3E0;
	border-radius: 20px;
	padding: 25px;
	margin: 20px 0;
	box-shadow: 0 4px 8px rgba(0,0,0,0.1);
	}

	/* Progress spinner styling */
	.stSpinner > div {
	font-size: 1.2rem !important;
	color: #9C27B0 !important;
	}
	</style>
	"""

	# Raw HTML is required for the <style> tag to be emitted.
	st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

	# ======================================
	# Main Application Interface
	# ======================================
	st.markdown('<p class="main-title">🧚 Welcome to Magic Story Maker!</p>', unsafe_allow_html=True)

	# Prompt templates, keyed by the value the style buttons store in
	# st.session_state.selected_prompt.
	PROMPT_TEMPLATES = {
	    "educational": {
	        "system": "You are a children's educator. Create a simple 150-word story that teaches basic life skills or moral lessons.",
	        "icon": "📚"
	    },
	    "adventure": {
	        "system": "You are a fantasy writer. Create a 150-word magical adventure story suitable for children.",
	        "icon": "🌠"
	    },
	    "animal": {
	        "system": "You are an animal expert. Create a 150-word story about friendly animals learning together.",
	        "icon": "🐻"
	    }
	}

	# File upload section
	with st.container():
	    st.subheader("Step 1: Upload Your Picture")
	    uploaded_image = st.file_uploader(
	        "Choose an image...",
	        type=["png", "jpg", "jpeg"],
	        label_visibility="collapsed",
	    )

	# Main processing flow
	if uploaded_image is not None:
	    # Display uploaded image
	    with st.spinner("✨ Magical image processing..."):
	        image = Image.open(uploaded_image)
	        st.image(image, caption="Your Magical Image", use_column_width=True)

	    # Prompt selection section
	    with st.container():
	        st.subheader("Step 2: Choose Story Style")

	        # One column per story style; a click stores the style key in
	        # session state so it survives the rerun triggered by the click.
	        col1, col2, col3 = st.columns(3)
	        with col1:
	            if st.button("📚 Learning Story",
	                         help="Generate educational story with life lessons",
	                         key="edu_btn"):
	                st.session_state.selected_prompt = "educational"
	        with col2:
	            if st.button("🌠 Fantasy Adventure",
	                         help="Create magical adventure story",
	                         key="fantasy_btn"):
	                st.session_state.selected_prompt = "adventure"
	        with col3:
	            if st.button("🐻 Animal Friends",
	                         help="Make story about friendly animals",
	                         key="animal_btn"):
	                st.session_state.selected_prompt = "animal"

	    # Until a style button has been clicked there is no selected_prompt in
	    # session state; reading it as an attribute would raise. Look it up
	    # defensively and only generate once a known style is selected.
	    selected_template = PROMPT_TEMPLATES.get(st.session_state.get("selected_prompt"))

	    if selected_template is None:
	        st.info("👆 Choose a story style above to create your story.")
	    else:
	        # Story generation section
	        with st.spinner(f"{selected_template['icon']} Creating your story..."):
	            # Caption the image, then use the caption as the story scenario.
	            image_caption = generate_image_caption(image)
	            story_text = generate_story_content(
	                system_prompt=selected_template["system"],
	                user_prompt=image_caption
	            )

	        # Display formatted story. The text comes from a model and is
	        # rendered as raw HTML, so escape it first; newlines become <br> so
	        # blank lines cannot end the surrounding HTML block early.
	        st.subheader("Step 3: Your Magical Story")
	        safe_story_html = html.escape(story_text).replace("\n", "<br>")
	        st.markdown(f'<div class="story-container">{safe_story_html}</div>',
	                    unsafe_allow_html=True)

	        # Audio generation section (narrates the original, unescaped text)
	        with st.spinner("🔮 Preparing story narration..."):
	            audio_file = generate_audio_from_story(story_text, "story_audio.wav")
	        st.subheader("🎧 Listen to Your Story")
	        st.audio(audio_file)

	# Help section
	st.markdown("---")
	st.subheader("🌟 How to Use:")
	st.info("""
	1. Upload any picture (animals, nature, or people work best!)
	2. Choose your favorite story style
	3. Wait for magic to happen!
	4. Listen to your personalized story
	""")