Spaces:

yasserrmd
/

NotebookLlama

Running

App Files Files Community

yasserrmd committed on Oct 30, 2024

Commit

71df28b

verified ·

1 Parent(s): 6b720a3

Create generate_audio.py

Browse files

Files changed (1) hide show

generate_audio.py +133 -0

generate_audio.py ADDED Viewed

	@@ -0,0 +1,133 @@

# generate_audio.py
import ast
import io
import os
import pickle

import numpy as np
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from pydub import AudioSegment
from scipy.io import wavfile
from tqdm import tqdm
from transformers import BarkModel, AutoProcessor, AutoTokenizer
class TTSGenerator:
    """
    Generate podcast-style audio from a transcript using ParlerTTS and Bark.

    Segments labelled "Speaker 1" are synthesized with ParlerTTS; every other
    segment is synthesized with Bark. The per-segment audio is concatenated
    in transcript order and exported as a single MP3 file.
    """

    def __init__(self, transcript_file_path, output_audio_path='./resources/_podcast.mp3'):
        """
        Load both TTS models and remember where to read and write data.

        Args:
            transcript_file_path (str): Path to the pickled rewritten transcript.
            output_audio_path (str): Destination of the exported MP3 file.
                Defaults to the location this class has always written to.
        """
        self.transcript_file_path = transcript_file_path
        self.output_audio_path = output_audio_path
        # Set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Load Parler model and tokenizer for Speaker 1
        self.parler_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(self.device)
        self.parler_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
        self.speaker1_description = """
        Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
        """
        # Load Bark model and processor for Speaker 2.
        # Half precision is only requested on CUDA: on CPU float16 kernels are
        # slow and, depending on the PyTorch version, partly unimplemented.
        bark_dtype = torch.float16 if self.device == "cuda" else torch.float32
        self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
        self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=bark_dtype).to(self.device)
        self.bark_sampling_rate = 24000
        self.voice_preset = "v2/en_speaker_6"

    def load_transcript(self):
        """
        Load the rewritten transcript from ``self.transcript_file_path``.

        The file is expected to be a pickle holding either the ``repr`` of a
        list of ``(speaker, text)`` tuples (a string, parsed with
        ``ast.literal_eval``) or the list itself.

        Returns:
            list: The transcript as a list of tuples (speaker, text).
        """
        # SECURITY: pickle.load can execute arbitrary code. Only point this at
        # transcript files produced by a trusted step of the pipeline.
        with open(self.transcript_file_path, 'rb') as f:
            payload = pickle.load(f)
        # literal_eval only accepts strings/AST nodes; an already-decoded
        # transcript is returned unchanged instead of raising.
        if isinstance(payload, str):
            return ast.literal_eval(payload)
        return payload

    def generate_speaker1_audio(self, text):
        """
        Generate audio for Speaker 1 using ParlerTTS.

        Args:
            text (str): Text to be synthesized for Speaker 1.

        Returns:
            np.array: Audio array.
            int: Sampling rate taken from the Parler model config.
        """
        # Parler conditions on the voice description (input_ids) and speaks
        # the prompt (prompt_input_ids).
        input_ids = self.parler_tokenizer(self.speaker1_description, return_tensors="pt").input_ids.to(self.device)
        prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
        generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
        audio_arr = generation.cpu().numpy().squeeze()
        return audio_arr, self.parler_model.config.sampling_rate

    def generate_speaker2_audio(self, text):
        """
        Generate audio for Speaker 2 using Bark.

        Args:
            text (str): Text to be synthesized for Speaker 2.

        Returns:
            np.array: Audio array (float32).
            int: Sampling rate (``self.bark_sampling_rate``).
        """
        inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
        speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
        # Bark may run in float16; widen to float32 so later scaling to PCM
        # does not lose precision.
        audio_arr = speech_output[0].cpu().float().numpy()
        return audio_arr, self.bark_sampling_rate

    @staticmethod
    def _to_int16_pcm(audio_arr):
        """
        Scale floating-point audio to 16-bit PCM samples.

        The input is treated as nominally in [-1.0, 1.0]. It is converted to
        float32 and clipped before scaling, because a plain cast would wrap
        out-of-range values around (and in float16, 1.0 * 32767 rounds up to
        32768, which overflows int16).

        Args:
            audio_arr (np.array): Floating-point audio samples.

        Returns:
            np.array: ``int16`` samples in [-32767, 32767].
        """
        samples = np.asarray(audio_arr, dtype=np.float32)
        # NaNs become silence; infinities are handled by the clip below.
        samples = np.nan_to_num(samples, nan=0.0)
        samples = np.clip(samples, -1.0, 1.0)
        return (samples * 32767).astype(np.int16)

    @staticmethod
    def numpy_to_audio_segment(audio_arr, sampling_rate):
        """
        Convert numpy array to AudioSegment.

        Args:
            audio_arr (np.array): Numpy array of audio data.
            sampling_rate (int): Sampling rate of the audio.

        Returns:
            AudioSegment: Converted audio segment.
        """
        audio_int16 = TTSGenerator._to_int16_pcm(audio_arr)
        # Round-trip through an in-memory WAV so pydub picks up the sample
        # rate and sample width from the header.
        byte_io = io.BytesIO()
        wavfile.write(byte_io, sampling_rate, audio_int16)
        byte_io.seek(0)
        return AudioSegment.from_wav(byte_io)

    def generate_audio(self):
        """
        Convert the transcript into audio and save it to a file.

        Returns:
            str: Path to the saved audio file.

        Raises:
            ValueError: If the transcript contains no segments.
        """
        transcript = self.load_transcript()
        # Fail early with a clear message instead of an AttributeError on
        # None after the loop.
        if not transcript:
            raise ValueError("Transcript is empty; no audio to generate.")

        final_audio = None
        for speaker, text in tqdm(transcript, desc="Generating podcast segments", unit="segment"):
            if speaker == "Speaker 1":
                audio_arr, rate = self.generate_speaker1_audio(text)
            else:  # Speaker 2 (any label other than "Speaker 1")
                audio_arr, rate = self.generate_speaker2_audio(text)
            # Convert to AudioSegment
            audio_segment = self.numpy_to_audio_segment(audio_arr, rate)
            # Add segment to final audio
            if final_audio is None:
                final_audio = audio_segment
            else:
                final_audio += audio_segment

        # Make sure the destination directory exists before exporting.
        output_dir = os.path.dirname(self.output_audio_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # Export final audio to MP3
        final_audio.export(self.output_audio_path, format="mp3", bitrate="192k", parameters=["-q:a", "0"])
        return self.output_audio_path