import torch
import gradio as gr
from transformers import ViTImageProcessor, ViTModel
from audiodiffusion import AudioDiffusionPipeline, ImageEncoder
from pedalboard import Pedalboard, NoiseGate, Compressor, LowShelfFilter, Gain, HighShelfFilter, Reverb
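
# Run on GPU when available. Two RNGs are created because the pipeline
# accepts separate `generator` and `step_generator` arguments (by their
# names, driving the initial noise and the per-step scheduler noise).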
device = "cuda" if torch.cuda.is_available() else "cpu"
generator1 = torch.Generator(device=device)
generator2 = torch.Generator(device=device)
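
# Conditional audio-diffusion pipeline plus a ViT feature extractor; the
# ImageEncoder wraps processor and model to turn an input image into a
# conditioning embedding.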
pipe = AudioDiffusionPipeline.from_pretrained('Woleek/clMusDiff').to(device)
processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
extractor = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
image_encoder = ImageEncoder(processor, extractor)
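
# Effect chain applied to the raw generated audio before it is returned.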
board = Pedalboard([
    NoiseGate(threshold_db=-60, ratio=10.0),                 # gate low-level residual noise
    Compressor(threshold_db=60, ratio=1.0),                  # ratio 1.0 leaves dynamics effectively untouched
    LowShelfFilter(cutoff_frequency_hz=220, gain_db=-10),    # tame low-end rumble
    HighShelfFilter(cutoff_frequency_hz=1200, gain_db=-10),  # soften harsh highs
    Gain(gain_db=40),                                        # make-up gain
    Reverb(room_size=0.5),                                   # add room ambience
])
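
# Encode a PIL image and add a sequence dimension so the embedding matches
# the shape the pipeline expects for its `encoding` argument.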
def _encode_image(image):
    return torch.unsqueeze(image_encoder.encode(image), dim=1).to(device)
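
# Run the conditioned diffusion process; with return_dict=False the pipeline
# returns the spectrogram images together with (sample_rate, audios).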
def _generate_spectrogram(condition, steps, eta):
    images, (sample_rate, audios) = pipe(
        batch_size=1,
        steps=steps,
        generator=generator1,
        step_generator=generator2,
        encoding=condition,
        eta=eta,
        return_dict=False,
    )
    return images[0], (sample_rate, audios[0])
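
# Clean up the generated waveform with the pedalboard effect chain above.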
def _denoise_audio(audio, sr):
    return board(audio, sr)
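
# End-to-end handler wired to the "Generate" button below.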
def run_generation(image, steps, eta):
    condition = _encode_image(image)
    spectrogram, (sr, audio) = _generate_spectrogram(condition, steps, eta)
    audio = _denoise_audio(audio, sr)
    return spectrogram, (sr, audio)
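
# Gradio UI: conditioning image and sampling controls on the left,
# generated spectrogram and resulting audio on the right.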
with gr.Blocks(title="Image-based soundtrack generation") as demo:
    gr.Markdown('''
    # Image-based soundtrack generation
    ''')
    with gr.Row():
        with gr.Column():
            image = gr.Image(
                type="pil",
                label="Conditioning image"
            )
            steps = gr.Slider(
                minimum=10,
                maximum=1000,
                step=10,
                value=50,
                label="Denoising steps"
            )
            eta = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=0.6,
                label="η"
            )
            gr.Markdown('''
            Eta (η) interpolates between a deterministic (η=0.0) and a fully stochastic (η=1.0) denoising schedule.
            ''')
            btn = gr.Button("Generate")
            clear = gr.ClearButton(image)
        with gr.Column():
            spectrogram = gr.Image(
                label="Generated Mel spectrogram"
            )
            audio = gr.Audio(
                label="Resulting audio"
            )
    btn.click(run_generation, inputs=[image, steps, eta], outputs=[spectrogram, audio])

demo.launch()