# Hugging Face Space: Generate Sound Effects from Image or Text (runs on ZeroGPU)
import spaces  # Hugging Face ZeroGPU helper (provides the @spaces.GPU decorator)
import os
import tempfile

import gradio as gr
import numpy as np
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from scipy.io.wavfile import write
from transformers import pipeline
# Load environment variables (HF_TKN should hold a Hugging Face access token)
load_dotenv()
hf_token = os.getenv("HF_TKN")

# Device configuration: fp16 on GPU to halve memory, fp32 on CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
# Initialize models once at startup with automatic device detection
def load_models():
    global captioning_pipeline, pipe
    # Image captioning: ViT encoder + GPT-2 decoder
    captioning_pipeline = pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",
        device=0 if torch.cuda.is_available() else -1
    )
    # Text-to-audio latent diffusion model
    pipe = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        token=hf_token,  # `use_auth_token` is deprecated in recent diffusers
        torch_dtype=torch_dtype
    ).to(device)

load_models()
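
# ZeroGPU pattern: models load once at module scope, and the inference
# functions below are decorated with @spaces.GPU so a GPU is attached only
# for the duration of each call.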
@spaces.GPU  # acquire a ZeroGPU slot for captioning
def analyze_image(image_file):
    """Generate a caption for the uploaded image, with error handling."""
    try:
        results = captioning_pipeline(image_file)
        if results and isinstance(results, list):
            return results[0].get("generated_text", "").strip()
        return "Could not generate caption"
    except Exception as e:
        return f"Error: {str(e)}"
@spaces.GPU  # acquire a ZeroGPU slot for diffusion inference
def generate_audio(prompt):
    """Generate an audio waveform from a text prompt."""
    try:
        return pipe(
            prompt=prompt,
            num_inference_steps=50,
            guidance_scale=7.5
        ).audios[0]
    except Exception as e:
        print(f"Audio generation error: {str(e)}")
        return None
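
# The call above returns a batch of waveforms; .audios[0] is a 1-D float NumPy
# array. AudioLDM2 generates 16 kHz audio, matching the 16000 Hz rate that
# blend_audios() writes below.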
def blend_audios(audio_list):
    """Mix multiple mono audio arrays into a single normalized track."""
    try:
        valid_audios = [arr for arr in audio_list if arr is not None]
        if not valid_audios:
            return None
        # Pad every track with trailing silence to the longest length, then sum
        max_length = max(arr.shape[0] for arr in valid_audios)
        mixed = np.zeros(max_length)
        for arr in valid_audios:
            mixed += np.pad(arr, (0, max_length - arr.shape[0]))
        # Peak-normalize to avoid clipping; guard against an all-silent mix
        peak = np.max(np.abs(mixed))
        if peak > 0:
            mixed = mixed / peak
        fd, tmp_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # mkstemp returns an open descriptor; close it before writing
        write(tmp_path, 16000, mixed.astype(np.float32))
        return tmp_path
    except Exception as e:
        print(f"Blending error: {str(e)}")
        return None
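
# Illustrative example (hypothetical lengths): blending a 2 s track with a 3 s
# track pads the shorter one with trailing silence, sums the two, and
# peak-normalizes so the mix stays within [-1.0, 1.0].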
| css = """ | |
| #col-container { max-width: 800px; margin: 0 auto; } | |
| .toggle-row { margin: 1rem 0; } | |
| .prompt-box { margin-bottom: 0.5rem; } | |
| .danger { color: #ff4444; font-weight: bold; } | |
| """ | |
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Header section
        gr.HTML("""
            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
            <p style="text-align: center;">
                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
            </p>
        """)
        # Input mode toggle
        input_mode = gr.Radio(
            choices=["Image Input", "Text Input"],
            value="Image Input",
            label="Select Input Mode",
            elem_classes="toggle-row"
        )

        # Image input section
        with gr.Column(visible=True) as image_col:
            image_upload = gr.Image(type="filepath", label="Upload Image")
            generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
            caption_display = gr.Textbox(label="Generated Description", interactive=False)

        # Text input section. Gradio event handlers cannot reliably create
        # brand-new components, so prompts 3-5 are pre-created hidden and
        # revealed one at a time by the "Add Another Prompt" button.
        with gr.Column(visible=False) as text_col:
            with gr.Row():
                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
            extra_prompts = [
                gr.Textbox(label=f"Sound Prompt {i}", lines=2, visible=False,
                           placeholder="Enter sound description...")
                for i in range(3, 6)
            ]
            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
            gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")
        # Generation controls
        generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

        # Documentation section
        gr.Markdown("""
        ## 👥 How You Can Contribute
        We welcome contributions! Contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
        Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua
        """)

        # Visitor badge
        gr.HTML("""
        <div style="text-align: center;">
            <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
                <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
            </a>
        </div>
        """)
    # Input mode toggle handler: show one input column, hide the other
    input_mode.change(
        lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
        inputs=input_mode,
        outputs=[image_col, text_col],
        concurrency_limit=1
    )

    # Image description generation
    generate_desc_btn.click(
        analyze_image,
        inputs=image_upload,
        outputs=caption_display,
        concurrency_limit=2
    )
    # Dynamic prompt addition: reveal the next hidden textbox, up to 5 prompts
    def add_prompt(current_count):
        new_count = min(current_count + 1, 5)
        # Textbox i (labels 3..5) becomes visible once the counter reaches it
        updates = [gr.update(visible=(i + 3 <= new_count)) for i in range(len(extra_prompts))]
        return (new_count, *updates)

    prompt_count = gr.State(2)
    add_prompt_btn.click(
        add_prompt,
        inputs=prompt_count,
        outputs=[prompt_count] + extra_prompts,
        concurrency_limit=1
    )
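
    # Note: recent Gradio versions also offer @gr.render for truly dynamic
    # component creation; pre-created hidden textboxes are used here as the
    # simpler, version-tolerant approach.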
    # Sound generation handler
    def process_inputs(mode, image_file, caption, *prompts):
        try:
            if mode == "Image Input":
                if not image_file:
                    raise gr.Error("Please upload an image")
                # Reuse the displayed caption if present; otherwise re-caption
                caption = (caption or "").strip() or analyze_image(image_file)
                prompts = [caption]
            else:
                prompts = [p.strip() for p in prompts if p and p.strip()]
                if not prompts:
                    raise gr.Error("Please enter at least one valid prompt")

            # Generate one audio track per prompt, skipping failed generations
            audio_tracks = []
            for prompt in prompts:
                audio = generate_audio(prompt)
                if audio is not None:
                    audio_tracks.append(audio)

            # Blend the tracks into a single WAV file
            if not audio_tracks:
                return None
            return blend_audios(audio_tracks)
        except gr.Error:
            raise  # let user-facing errors through without re-wrapping
        except Exception as e:
            raise gr.Error(f"Processing error: {str(e)}")
    generate_sound_btn.click(
        process_inputs,
        inputs=[input_mode, image_upload, caption_display, prompt1, prompt2] + extra_prompts,
        outputs=audio_output,
        concurrency_limit=2
    )
if __name__ == "__main__":
    demo.launch(max_threads=4)
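
# A minimal requirements.txt sketch inferred from the imports above (package
# names only; version pins are left to the deployer as an assumption):
#
#   spaces
#   gradio
#   torch
#   diffusers
#   transformers
#   pillow          # needed by the transformers image-to-text pipeline
#   scipy
#   numpy
#   python-dotenv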