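# Assumed dependencies (inferred from the imports below, not from the Space config):
#   streamlit, pillow, soundfile, numpy, torch, transformers, diffusers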
import streamlit as st
from PIL import Image
import io
import soundfile as sf
import numpy as np
import torch
from transformers import pipeline
from diffusers import StableAudioPipeline
# --- Configuration ---
# Determine the optimal device for model inference.
# Prioritize CUDA (NVIDIA GPUs), then MPS (Apple Silicon), and fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
# Use float16 for reduced memory and faster inference on compatible hardware (GPU/MPS);
# fall back to float32 on CPU for better stability.
TORCH_DTYPE = torch.float16 if DEVICE in ["cuda", "mps"] else torch.float32
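# Example: an NVIDIA GPU resolves to DEVICE="cuda" with torch.float16;
# a CPU-only machine resolves to DEVICE="cpu" with torch.float32.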
# --- Cached Model Loading Functions ---
@st.cache_resource
def load_blip_model():
    """
    Loads the BLIP image captioning model via the Hugging Face transformers pipeline.
    st.cache_resource caches the model so it is not reloaded on every Streamlit rerun.
    """
    try:
        captioner = pipeline(
            "image-to-text",
            model="Salesforce/blip-image-captioning-base",
            torch_dtype=TORCH_DTYPE,
            device=DEVICE,
        )
        return captioner
    except Exception as e:
        st.error(f"Failed to load BLIP model: {e}")
        return None
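# Illustrative output shape of the captioning pipeline (the wording varies per image):
#   captioner(image) -> [{'generated_text': 'a lake surrounded by mountains'}]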
@st.cache_resource
def load_stable_audio_model():
    """
    Loads the Stable Audio Open pipeline via Hugging Face diffusers.
    st.cache_resource caches the pipeline so it is not reloaded on every Streamlit rerun.
    """
    try:
        audio_pipeline = StableAudioPipeline.from_pretrained(
            "stabilityai/stable-audio-open-1.0",
            torch_dtype=TORCH_DTYPE,
        ).to(DEVICE)
        return audio_pipeline
    except Exception as e:
        st.error(f"Failed to load Stable Audio model: {e}")
        return None
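# NOTE (assumption, not in the original file): stable-audio-open-1.0 is a gated
# checkpoint on the Hugging Face Hub, so the first load downloads several GB and may
# require accepting the model license and authenticating (e.g. an HF_TOKEN secret).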
# --- Audio Conversion Utility ---
def convert_numpy_to_wav_bytes(audio_array: np.ndarray, sample_rate: int) -> bytes:
    """
    Converts a NumPy audio array to an in-memory WAV byte stream.
    This avoids writing temporary files to disk, which is efficient and
    suitable for ephemeral environments like Hugging Face Spaces.
    """
    byte_io = io.BytesIO()
    # Stable Audio Open's diffusers output is (channels, frames), but soundfile
    # expects (frames, channels) for multi-channel audio, so transpose 2D
    # (stereo) arrays to match soundfile's expectation.
    if audio_array.ndim == 2 and audio_array.shape[0] == 2:  # stereo, channel-first
        audio_array = audio_array.T  # transpose to (frames, channels)
    # Write the NumPy array to the in-memory BytesIO object as a WAV file.
    sf.write(byte_io, audio_array, sample_rate, format='WAV', subtype='FLOAT')
    # IMPORTANT: reset the stream position to the beginning before reading.
    byte_io.seek(0)
    return byte_io.read()
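# Usage example: one second of stereo silence at 44.1 kHz becomes a playable WAV blob.
#   silent_wav = convert_numpy_to_wav_bytes(np.zeros((2, 44100), dtype=np.float32), 44100)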
# --- Streamlit App Layout ---
st.set_page_config(layout="centered", page_title="Image-to-Soundscape Generator")
st.title("🏞️ Image-to-Soundscape Generator 🎶")
st.markdown("Upload a landscape image, and let AI transform it into a unique soundscape!")

# Initialize session state for persistence across reruns.
if "audio_bytes" not in st.session_state:
    st.session_state.audio_bytes = None
if "image_uploaded" not in st.session_state:
    st.session_state.image_uploaded = False
# --- UI Components ---
uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    st.session_state.image_uploaded = True
    image = Image.open(uploaded_file).convert("RGB")  # Ensure the image is in RGB format
    st.image(image, caption="Uploaded Image", use_container_width=True)  # use_container_width replaces the deprecated use_column_width
    # Button to trigger the generation pipeline
    if st.button("Generate Soundscape"):
        st.session_state.audio_bytes = None  # Clear previous audio
        with st.spinner("Generating soundscape... This may take a moment."):
            try:
                # 1. Load the BLIP model and generate a caption (hidden from the user)
                captioner = load_blip_model()
                if captioner is None:
                    st.error("Image captioning model could not be loaded. Please try again.")
                    st.session_state.image_uploaded = False  # Reset to allow re-upload
                    st.stop()
                # Generate a caption; the BLIP pipeline accepts a PIL Image object directly.
                caption_results = captioner(image)
                # Extract the generated text from the pipeline's output.
                generated_caption = caption_results[0]['generated_text']
                # Enhance the prompt to guide the audio model towards environmental sounds.
                soundscape_prompt = f"A soundscape of {generated_caption}"
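                # e.g. caption "a lake surrounded by mountains" yields the prompt
                # "A soundscape of a lake surrounded by mountains"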
                # 2. Load the Stable Audio model and generate audio
                audio_pipeline = load_stable_audio_model()
                if audio_pipeline is None:
                    st.error("Audio generation model could not be loaded. Please try again.")
                    st.session_state.image_uploaded = False  # Reset to allow re-upload
                    st.stop()
                # Generate audio with parameters tuned for speed:
                # - num_inference_steps: lower is faster, higher gives better quality
                # - audio_end_in_s: shorter clips generate faster (stable-audio-open-1.0 supports up to ~47 s)
                # - negative_prompt: steers the model away from low-quality output
                audio_output = audio_pipeline(
                    prompt=soundscape_prompt,
                    num_inference_steps=10,  # Tuned for faster generation
                    audio_end_in_s=5,  # 5-second clip for quick turnaround
                    negative_prompt="low quality, average quality, distorted",
                )
                # Extract the audio: .audios is a torch tensor of shape
                # (batch, channels, samples); take the first clip as a CPU NumPy array.
                audio_numpy_array = audio_output.audios[0].float().cpu().numpy()
                sample_rate = audio_pipeline.vae.sampling_rate
                # 3. Convert the NumPy array to WAV bytes and store in session state.
                st.session_state.audio_bytes = convert_numpy_to_wav_bytes(audio_numpy_array, sample_rate)
                st.success("Soundscape generated successfully!")
            except Exception as e:
                st.error(f"An error occurred during generation: {e}")
                st.session_state.audio_bytes = None  # Clear any partial audio
                st.session_state.image_uploaded = False  # Reset to allow re-upload
                st.exception(e)  # Display the full traceback for debugging
# Display the generated soundscape if it is available in session state.
if st.session_state.audio_bytes:
    st.subheader("Generated Soundscape:")
    st.audio(st.session_state.audio_bytes, format='audio/wav')
    st.markdown("You can download the audio using the controls above.")

# Reset button for a new image upload.
if st.session_state.image_uploaded and st.button("Upload New Image"):
    st.session_state.audio_bytes = None
    st.session_state.image_uploaded = False
    st.rerun()  # Rerun the app to clear the file uploader
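# To run locally (assuming this file is saved as app.py): streamlit run app.py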