# NOTE: "Spaces: Sleeping" status lines removed — residue from the Hugging Face
# Spaces page this file was scraped from, not part of the source code.
# generate_transcript.py
import os
import pickle
import warnings

import torch
import transformers
from accelerate import Accelerator
from tqdm import tqdm

warnings.filterwarnings('ignore')
class TranscriptGenerator:
    """
    Generate a conversational podcast transcript from cleaned text.

    Reads a cleaned text file, prompts an instruction-tuned LLM to rewrite it
    as a two-speaker podcast dialogue, and pickles the resulting transcript
    string to ``./resources/data.pkl``.
    """

    def __init__(self, text_file_path, model_name="meta-llama/Llama-3.1-70B-Instruct"):
        """
        Initialize with the path to the cleaned text file and the model name.

        Args:
            text_file_path (str): Path to the file containing cleaned PDF text.
            model_name (str): Hugging Face model id of the language model to use.
        """
        self.text_file_path = text_file_path
        self.output_path = './resources/data.pkl'
        self.model_name = model_name
        # NOTE(review): Accelerator is instantiated but never used below —
        # device placement is handled by device_map="auto". Kept so the
        # attribute remains available to existing callers.
        self.accelerator = Accelerator()
        self.model = transformers.pipeline(
            "text-generation",
            model=self.model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto"
        )
        self.system_prompt = """
You are a world-class podcast writer, you have worked as a ghost writer for Joe Rogan, Lex Fridman, Ben Shapiro, Tim Ferris.
We are in an alternate universe where actually you have been writing every line they say and they just stream it into their brains.
Your job is to write word by word, even "umm, hmmm, right" interruptions by the second speaker based on the PDF upload.
Keep it extremely engaging, with realistic anecdotes, tangents, and interruptions.
Speaker 1: Leads and teaches. Speaker 2: Asks follow-up questions, gets excited or confused.
ALWAYS START YOUR RESPONSE DIRECTLY WITH SPEAKER 1:
STRICTLY THE DIALOGUES.
"""

    def load_text(self):
        """
        Read the cleaned text file, trying several common encodings.

        Returns:
            str | None: Content of the file, or None if the file is missing
            or cannot be decoded with any of the attempted encodings.
        """
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        for encoding in encodings:
            try:
                with open(self.text_file_path, 'r', encoding=encoding) as file:
                    content = file.read()
                print(f"Successfully read file using {encoding} encoding.")
                return content
            except FileNotFoundError:
                # A missing file cannot be fixed by trying other encodings;
                # report it once and bail out immediately.
                print(f"Error: File '{self.text_file_path}' not found.")
                return None
            except UnicodeDecodeError:
                # Try the next encoding. (latin-1 accepts any byte sequence,
                # so in practice this fallthrough is rarely reached.)
                continue
        print(f"Error: Could not decode file '{self.text_file_path}' with any common encoding.")
        return None

    def generate_transcript(self):
        """
        Generate a podcast-style transcript and save it as a pickled file.

        Returns:
            str | None: Path to the pickle file holding the transcript string,
            or None if the input text could not be loaded.
        """
        input_text = self.load_text()
        if input_text is None:
            return None
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": input_text}
        ]
        output = self.model(
            messages,
            max_new_tokens=8126,
            temperature=1
        )
        generated = output[0]["generated_text"]
        # With chat-style input, the text-generation pipeline returns the full
        # conversation (prompt messages + the new assistant message) under
        # "generated_text". Keep only the assistant's reply text; fall back to
        # the raw value for pipelines that return a plain string.
        if isinstance(generated, list):
            transcript = generated[-1]["content"]
        else:
            transcript = generated
        # Ensure the target directory exists before writing the pickle.
        os.makedirs(os.path.dirname(self.output_path), exist_ok=True)
        with open(self.output_path, 'wb') as f:
            pickle.dump(transcript, f)
        return self.output_path