Spaces: Running on Zero
Update app.py
app.py CHANGED

@@ -10,6 +10,7 @@ import numpy as np
 import io
 import pydub
 import base64
+import spaces
 from muq import MuQMuLan
 from diffrhythm2.cfm import CFM
 from diffrhythm2.backbones.dit import DiT
@@ -47,7 +48,8 @@ class CNENTokenizer():
         return token
     def decode(self, token):
         return "|".join([self.id2phone[x-1] for x in token])
-
+
+@spaces.GPU
 def prepare_model(repo_id, device, dtype):
     diffrhythm2_ckpt_path = hf_hub_download(
         repo_id=repo_id,
@@ -121,6 +123,7 @@ def parse_lyrics(lyrics: str):
         lyrics_with_time.append(tokens)
     return lyrics_with_time
 
+@spaces.GPU
 def get_audio_prompt(model, audio_file, device, dtype):
     prompt_wav, sr = torchaudio.load(audio_file)
     prompt_wav = torchaudio.functional.resample(prompt_wav.to(device).to(dtype), sr, 24000)
@@ -132,11 +135,13 @@ def get_audio_prompt(model, audio_file, device, dtype):
         style_prompt_embed = model(wavs = prompt_wav)
     return style_prompt_embed.squeeze(0)
 
+@spaces.GPU
 def get_text_prompt(model, text, device, dtype):
     with torch.no_grad():
         style_prompt_embed = model(texts = [text])
     return style_prompt_embed.squeeze(0)
 
+@spaces.GPU
 def make_fake_stereo(audio, sampling_rate):
     left_channel = audio
     right_channel = audio.clone()
@@ -148,7 +153,8 @@ def make_fake_stereo(audio, sampling_rate):
     stereo_audio = torch.cat([left_channel, right_channel], dim=0)
 
     return stereo_audio
-
+
+@spaces.GPU
 def inference(
     model,
     decoder,
@@ -186,6 +192,7 @@ def inference(
     torchaudio.save(buffer, audio, decoder.h.sampling_rate, format=file_type)
     return buffer.getvalue()
 
+@spaces.GPU
 def inference_stream(
     model,
     decoder,
@@ -224,7 +231,7 @@ device='cuda'
 dtype=torch.float16
 diffrhythm2, mulan, lrc_tokenizer, decoder = prepare_model("ASLP-Lab/DiffRhythm2", device, dtype)
 
-
+
 @spaces.GPU
 def infer_music(
     lrc,