import argparse
import os
import random
import time
from datetime import datetime
from timeit import default_timer as timer

import gradio as gr
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import whisper
from ctransformers import AutoModelForCausalLM
from diffusers import DiffusionPipeline
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    MusicgenForConditionalGeneration,
    VitsModel,
    set_seed,
)

from model.bart import BartCaptionModel
from utils.audio_utils import load_audio, STR_CH_FIRST
def image_grid(imgs, rows, cols):
    """Paste rows*cols equally sized images into one grid image."""
    assert len(imgs) == rows * cols
    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))
    for i, img in enumerate(imgs):
        grid.paste(img, box=(i % cols * w, i // cols * h))
    return grid
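
# Example usage (see the image tab further down): image_grid(images, rows=1, cols=3)
# tiles three equally sized SDXL renders into one horizontal strip.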


def save_to_txt(text_to_save):
    """Persist the last music-generation prompt so the image tab can reuse it."""
    with open('prompt.txt', 'w', encoding='utf-8') as f:
        f.write(text_to_save)


def read_txt():
    """Read the stored prompt back as a list of lines."""
    with open('prompt.txt', encoding='utf-8') as f:
        lines = f.readlines()
    return lines


##### Chat with LLAMA ####

params = {
    "max_new_tokens": 512,
    "stop": ["<end>", "<|endoftext|>", "[", "<user>"],
    "temperature": 0.7,
    "top_p": 0.8,
    "stream": True,
    "batch_size": 8,
}
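# The params above configure text generation for the Polish LLaMA model: the stop
# sequences match the turn markers produced by parse_history() further down, and
# "stream": True lets bot() yield the answer token by token into the chatbot.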

# Speech-to-text (Whisper), a Polish instruct LLM (GGML via ctransformers),
# Polish TTS (VITS), SDXL for image generation and MusicGen for music generation.
whisper_model = whisper.load_model("medium").to("cuda")
print("Whisper Loaded!")

llm = AutoModelForCausalLM.from_pretrained("Aspik101/trurl-2-7b-pl-instruct_GGML", model_type="llama")
print("LLM Loaded!")

tts_model = VitsModel.from_pretrained("facebook/mms-tts-pol")
tts_model.to("cuda")
print("TTS Loaded!")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-pol")

pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",
                                         torch_dtype=torch.float16,
                                         use_safetensors=True,
                                         variant="fp16").to("cuda")
print("DiffusionPipeline Loaded!")

model_audio_gen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to("cuda")
processor_audio_gen = AutoProcessor.from_pretrained("facebook/musicgen-small")

with gr.Blocks() as chat_demo:
    chatbot = gr.Chatbot()
    audio_input = gr.Audio(source="microphone", type="filepath", show_label=False)
    submit_audio = gr.Button("Submit Audio")
    clear = gr.Button("Clear")
    audio_output = gr.Audio('temp_file.wav', label="Generated Audio (wav)", type='filepath', autoplay=False)

    def translate(audio):
        """Transcribe the recorded audio to Polish text with Whisper."""
        print("__Sending the recording to Whisper!")
        transcription = whisper_model.transcribe(audio, language="pl")
        return transcription["text"]

    def read_text(text):
        """Synthesize the assistant's last reply with VITS and write it to temp_file.wav."""
        print("Here is the text to read:", text[-1][-1])
        inputs = tokenizer(text[-1][-1], return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = tts_model(**inputs).waveform.squeeze().cpu().numpy()
        sf.write('temp_file.wav', output, tts_model.config.sampling_rate)
        return 'temp_file.wav'

    def user(audio_data, history):
        """Transcribe the user's recording and append it to the chat history."""
        if audio_data:
            user_message = translate(audio_data)
            print("USER!:")
            print("", history + [[user_message, None]])
            return history + [[user_message, None]]

    def parse_history(hist):
        """Flatten the chat history into a <user>/<assistant> turn transcript."""
        history_ = ""
        for q, a in hist:
            history_ += f"<user>: {q} \n"
            if a:
                history_ += f"<assistant>: {a} \n"
        return history_
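    # Example: parse_history([["Cześć", "Hej!"], ["Jak się masz?", None]]) returns
    # "<user>: Cześć \n<assistant>: Hej! \n<user>: Jak się masz? \n"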

    def bot(history):
        """Stream the LLM answer for the latest user turn into the chat history."""
        print(f"When: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')}")
        prompt = f"Jesteś AI asystentem. Odpowiadaj krótko i po polsku. {parse_history(history)}. <assistant>:"
        stream = llm(prompt, **params)
        history[-1][1] = ""
        answer_save = ""
        for character in stream:
            history[-1][1] += character
            answer_save += character
            time.sleep(0.005)
            yield history

    submit_audio.click(user, [audio_input, chatbot], [chatbot], queue=False).then(bot, chatbot, chatbot).then(read_text, chatbot, audio_output)
    clear.click(lambda: None, None, chatbot, queue=False)
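    # Event chain: submitting a recording transcribes it (user), streams the LLM
    # answer into the chatbot (bot), then reads the final reply aloud (read_text).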


##### Audio Gen ####
sampling_rate = model_audio_gen.audio_encoder.config.sampling_rate
frame_rate = model_audio_gen.audio_encoder.config.frame_rate
text_encoder = model_audio_gen.get_text_encoder()


def generate_audio(decade, genre, instrument, guidance_scale=8, audio_length_in_s=20, seed=0):
    """Generate a short MusicGen clip for the selected decade/genre/instrument."""
    prompt = " ".join([decade, genre, 'track with ', instrument])
    save_to_txt(prompt)
    inputs = processor_audio_gen(
        text=[prompt, "drums"],
        padding=True,
        return_tensors="pt",
    ).to(device)  # `device` is defined below, before the app is launched
    with torch.no_grad():
        encoder_outputs = text_encoder(**inputs)
    max_new_tokens = int(frame_rate * audio_length_in_s)
    set_seed(seed)
    audio_values = model_audio_gen.generate(inputs.input_ids[0][None, :],
                                            attention_mask=inputs.attention_mask,
                                            encoder_outputs=encoder_outputs,
                                            do_sample=True,
                                            guidance_scale=guidance_scale,
                                            max_new_tokens=max_new_tokens)
    sf.write('generated_audio.wav', audio_values.cpu().numpy()[0][0], 32_000)
    audio_values = (audio_values.cpu().numpy() * 32767).astype(np.int16)
    return (sampling_rate, audio_values[0, 0])  # 1-D int16 PCM for the Gradio audio output
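
# Note on generate_audio(): the processor encodes two texts, [prompt, "drums"], but
# only the first prompt's input_ids are passed to generate() together with the
# precomputed encoder_outputs. The assumption here (suggested by the commented-out
# "Negative prompt" input below) is that the second text serves as a negative prompt
# for MusicGen's classifier-free guidance.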


audio_gen = gr.Interface(
    fn=generate_audio,
    inputs=[
        # gr.Text(label="Negative prompt", value="drums"),
        gr.Radio(["50s", "60s", "70s", "80s", "90s"], label="decade", info=""),
        gr.Radio(["classic", "rock", "pop", "metal", "jazz", "synth"], label="genre", info=""),
        gr.Radio(["acoustic guitar", "electric guitar", "drums", "saxophone", "keyboard", "accordion", "fiddle"], label="instrument", info=""),
        gr.Slider(1.5, 10, value=8, step=0.5, label="Guidance scale"),
        gr.Slider(5, 30, value=20, step=5, label="Audio length in s"),
        # gr.Slider(0, 10, value=0, step=1, label="Seed"),
    ],
    outputs=[
        gr.Audio(label="Generated Music", type="numpy"),
    ],
    # examples=EXAMPLES,
)


#### Audio desc and Stable ###
if not os.path.isfile("transfer.pth"):
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav', 'folk.wav')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
    torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')

device = "cuda:0" if torch.cuda.is_available() else "cpu"
example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']

model = BartCaptionModel(max_length=128)
pretrained_object = torch.load('./transfer.pth', map_location='cpu')
state_dict = pretrained_object['state_dict']
model.load_state_dict(state_dict)
if torch.cuda.is_available():
    torch.cuda.set_device(device)
    model = model.cuda(device)
model.eval()


def get_audio(audio_path, duration=10, target_sr=16000):
    """Load audio, resample to 16 kHz mono and split it into 10-second chunks."""
    n_samples = int(duration * target_sr)
    audio, sr = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )
    if len(audio.shape) == 2:
        audio = audio.mean(axis=0)  # downmix to mono
    input_size = int(n_samples)
    if audio.shape[-1] < input_size:  # pad sequence
        pad = np.zeros(input_size)
        pad[: audio.shape[-1]] = audio
        audio = pad
    ceil = int(audio.shape[-1] // n_samples)
    audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
    return audio
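
# get_audio() returns a tensor of shape (n_chunks, 160000): the recording is
# resampled to 16 kHz, padded to at least 10 s, and cut into consecutive
# 10-second chunks; the captioning model produces one caption per chunk.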


def captioning(audio_path):
    # NOTE: this variant (caption an arbitrary file) is shadowed by the
    # parameter-less captioning() defined further down, which the Gradio tab uses.
    audio_tensor = get_audio(audio_path=audio_path)
    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)
    with torch.no_grad():
        output = model.generate(
            samples=audio_tensor,
            num_beams=5,
        )
    inference = ""
    number_of_chunks = range(audio_tensor.shape[0])
    for chunk, text in zip(number_of_chunks, output):
        time_stamp = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
        inference += f"{time_stamp}\n{text} \n \n"
    return inference


title = ""
description = ""
article = ""


def captioning():
    """Caption the last generated clip and render images from that caption with SDXL."""
    audio_path = 'generated_audio.wav'
    audio_tensor = get_audio(audio_path=audio_path)
    if torch.cuda.is_available():
        audio_tensor = audio_tensor.to(device)
    with torch.no_grad():
        output = model.generate(
            samples=audio_tensor,
            num_beams=5)
    inference = ""
    number_of_chunks = range(audio_tensor.shape[0])
    for chunk, text in zip(number_of_chunks, output):
        time_stamp = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
        inference += f"{time_stamp}\n{text} \n \n"
    prompt = read_txt()
    print(prompt[0])
    # Generate an image from the text
    # generated_images = pipe(prompt=prompt[0]*5 + inference + prompt[0]*5).images
    # image = generated_images[0]
    num_images = 3
    prompt = [prompt[0]*5 + inference + prompt[0]*5] * num_images
    images = pipe(prompt, height=768, width=768).images
    grid = image_grid(images, rows=1, cols=3)
    return inference, grid
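
# The image tab chains the two generators: the stored MusicGen prompt is repeated
# around the generated caption to build the SDXL prompt, three images are rendered,
# and the caption text plus a 1x3 grid are returned to the interface below.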


audio_desc = gr.Interface(fn=captioning,
                          inputs=None,
                          outputs=[
                              gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
                              gr.Image(label="Generated Image")  # added output for the image
                          ],
                          title=title,
                          description=description,
                          article=article,
                          cache_examples=False
                          )
with gr.Blocks() as music:  # wrap the videos in Blocks so they can be used as tabs
    gr.Video("muzyka_AI.mp4")
with gr.Blocks() as voice_cloning:
    gr.Video("voice_cloning_fraud.mp4")

##### Run All #######
demo_all = gr.TabbedInterface(
    [music, audio_gen, audio_desc, voice_cloning, chat_demo],
    ["1.Music", "2.Audio Generation", "3.Image Generation", "4.Voice Cloning", "5.Chat with LLama"],
)
demo_all.queue()
demo_all.launch()