Update generate_audio.py

generate_audio.py  CHANGED  (+31 -12)
@@ -43,10 +43,10 @@ class TTSGenerator:
         """
 
         # Load Bark model and processor for Speaker 2
-
-
-
-
+        self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
+        self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(self.device)
+        self.bark_sampling_rate = 24000
+        self.voice_preset = "v2/en_speaker_6"
 
     @spaces.GPU
     def load_transcript(self):
@@ -82,12 +82,12 @@ class TTSGenerator:
         prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
         attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
 
-        #
+        # Pass all required arguments to generate() for reliable behavior
         generation = self.parler_model.generate(
             input_ids=input_ids,
-            attention_mask=attention_mask_input,
+            attention_mask=attention_mask_input,  # Set attention mask for input IDs
             prompt_input_ids=prompt_input_ids,
-            prompt_attention_mask=attention_mask_prompt
+            prompt_attention_mask=attention_mask_prompt  # Set prompt attention mask
         )
         audio_arr = generation.cpu().numpy().squeeze()
         return audio_arr, self.parler_model.config.sampling_rate
@@ -105,15 +105,34 @@ class TTSGenerator:
             int: Sampling rate.
         """
 
-        input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt").input_ids.to(self.device)
-        prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
-        generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-        return audio_arr, self.parler_model.config.sampling_rate
+        # input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt").input_ids.to(self.device)
+        # prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
+        # generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        # audio_arr = generation.cpu().numpy().squeeze()
+        # return audio_arr, self.parler_model.config.sampling_rate
+
         # inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
         # speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
         # audio_arr = speech_output[0].cpu().numpy()
         # return audio_arr, self.bark_sampling_rate
+        # Tokenize input text and obtain input IDs and attention mask
+        inputs = self.bark_processor(text, voice_preset=self.voice_preset, return_tensors="pt", padding=True).to(self.device)
+        input_ids = inputs.input_ids.to(self.device)
+        attention_mask = inputs.attention_mask.to(self.device)
+
+        # Generate speech output with both input IDs and attention mask
+        speech_output = self.bark_model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            temperature=0.9,
+            semantic_temperature=0.8
+        )
+        # Convert the generated audio to numpy array
+        audio_arr = speech_output[0].cpu().numpy()
+        return audio_arr, self.bark_sampling_rate
+
+        # Convert the generated audio to numpy array
+        audio_arr = speech_output[0].cpu().numpy()
 
     @staticmethod
     @spaces.GPU
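The substantive change is that Speaker 2 generation now uses Bark directly: the processor and model are loaded in __init__, and generate() is called with an explicit attention mask instead of the commented-out Parler path. A minimal standalone sketch of that flow, outside the TTSGenerator class, might look like the following; the device selection, the float32 fallback on CPU, the sample text, and the final WAV write are illustrative assumptions and not part of this commit.

import torch
from scipy.io import wavfile
from transformers import AutoProcessor, BarkModel

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Bark processor and model, mirroring the new __init__ block
bark_processor = AutoProcessor.from_pretrained("suno/bark")
bark_model = BarkModel.from_pretrained(
    "suno/bark",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,  # fp16 assumed only on GPU
).to(device)
bark_sampling_rate = 24000
voice_preset = "v2/en_speaker_6"

text = "Welcome back to the show!"

# Tokenize the text with the chosen voice preset and move tensors to the target device
inputs = bark_processor(text, voice_preset=voice_preset, return_tensors="pt").to(device)

# Generate speech, passing the attention mask explicitly as in generate_speaker2_audio
speech_output = bark_model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    temperature=0.9,
    semantic_temperature=0.8,
)

# Convert to a float32 numpy array and write a WAV file (the write step is an assumption)
audio_arr = speech_output[0].cpu().numpy().astype("float32")
wavfile.write("speaker2.wav", rate=bark_sampling_rate, data=audio_arr)

The point of passing attention_mask explicitly (here and in the Parler hunk above) is that generation no longer has to infer which positions are padding, which avoids the transformers warning about an unset attention mask and keeps results stable when inputs are padded.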