import os
import sys
sys.path.append('./codeclm/tokenizer')
sys.path.append('./codeclm/tokenizer/Flow1dVAE')
sys.path.append('.')

import torch
import json
from omegaconf import OmegaConf

from codeclm.trainer.codec_song_pl import CodecLM_PL
from codeclm.models import CodecLM
from separator import Separator


class LeVoInference(torch.nn.Module):
    def __init__(self, cfg_path):
        super().__init__()

        torch.backends.cudnn.enabled = False
        # Custom resolvers referenced by the YAML config (e.g. ${eval:...}).
        OmegaConf.register_new_resolver("eval", lambda x: eval(x))
        OmegaConf.register_new_resolver("concat", lambda *x: [xxx for xx in x for xxx in xx])
        OmegaConf.register_new_resolver("get_fname", lambda: 'default')
        OmegaConf.register_new_resolver("load_yaml", lambda x: list(OmegaConf.load(x)))

        self.cfg = OmegaConf.load(cfg_path)
        self.cfg.mode = 'inference'
        self.max_duration = self.cfg.max_dur

        # Load the pretrained Lightning wrapper, then unpack its submodules.
        model_light = CodecLM_PL(self.cfg)
        model_light = model_light.eval().cuda()
        model_light.audiolm.cfg = self.cfg
        self.model_lm = model_light.audiolm
        self.model_audio_tokenizer = model_light.audio_tokenizer
        self.model_seperate_tokenizer = model_light.seperate_tokenizer

        self.model = CodecLM(
            name="tmp",
            lm=self.model_lm,
            audiotokenizer=self.model_audio_tokenizer,
            max_duration=self.max_duration,
            seperate_tokenizer=self.model_seperate_tokenizer,
        )
        # Source separator that splits prompt audio into vocal and BGM stems.
        self.separator = Separator()

        self.default_params = dict(
            cfg_coef=1.5,
            temperature=1.0,
            top_k=50,
            top_p=0.0,
            record_tokens=True,
            record_window=50,
            extend_stride=5,
            duration=self.max_duration,
        )
        self.model.set_generation_params(**self.default_params)

    def forward(self, lyric: str, description: str, prompt_audio_path: os.PathLike = None, params=dict()):
        # Per-call overrides fall back to the defaults set in __init__.
        params = {**self.default_params, **params}
        self.model.set_generation_params(**params)

        if prompt_audio_path is None:
            pmt_wav, vocal_wav, bgm_wav = None, None, None
        else:
            pmt_wav, vocal_wav, bgm_wav = self.separator.run(prompt_audio_path)

        generate_inp = {
            'lyrics': [lyric.replace("  ", " ")],  # collapse double spaces
            'descriptions': [description],
            'melody_wavs': pmt_wav,
            'vocal_wavs': vocal_wav,
            'bgm_wavs': bgm_wav,
        }

        with torch.autocast(device_type="cuda", dtype=torch.float16):
            tokens = self.model.generate(**generate_inp, return_tokens=True)

        # Truncate overly long token sequences before decoding.
        if tokens.shape[-1] > 3000:
            tokens = tokens[..., :3000]

        with torch.no_grad():
            wav_seperate = self.model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav)

        return wav_seperate[0]


def build_levo_inference():
    cfg_path = './conf/infer.yaml'
    return LeVoInference(cfg_path)
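
# A minimal usage sketch (an assumption, not part of the original file: it
# presumes the checkpoints referenced by ./conf/infer.yaml are available and
# a CUDA device is present; the lyric and description strings are placeholders):
#
#   import torchaudio
#   model = build_levo_inference()
#   wav = model(lyric="[verse] ...", description="pop, female vocal, upbeat")
#   torchaudio.save("song.flac", wav.cpu().float(), model.cfg.sample_rate)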


if __name__ == '__main__':
    import sys
    import os
    import json
    import torchaudio

    cfg_path = sys.argv[1]
    save_dir = sys.argv[2]
    input_jsonl = sys.argv[3]

    model = LeVoInference(cfg_path)
    os.makedirs(save_dir + "/audios", exist_ok=True)

    with open(input_jsonl, "r") as fp:
        lines = fp.readlines()

    # One JSONL record per generation job.
    for line in lines:
        item = json.loads(line)
        target_wav_name = f"{save_dir}/audios/{item['idx']}.flac"
        descriptions = item["descriptions"]
        lyric = item["gt_lyric"]
        prompt_audio_path = item['prompt_audio_path']

        wav = model(lyric, descriptions, prompt_audio_path)
        torchaudio.save(target_wav_name, wav.cpu().float(), model.cfg.sample_rate)
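
# A hypothetical input JSONL record for the CLI above; the field names match
# what the loop reads, while the values are illustrative only:
#
#   {"idx": "sample_00", "descriptions": "pop, piano, bright",
#    "gt_lyric": "[verse] ...", "prompt_audio_path": "prompts/ref.wav"}
#
# Invocation sketch (assuming this file is saved as levo_inference.py):
#   python levo_inference.py conf/infer.yaml outputs/ input.jsonl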