import numpy as np
import gradio as gr
import torch
from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer, AutoFeatureExtractor
from parler_tts import ParlerTTSForConditionalGeneration
from PyPDF2 import PdfReader
import re
import textwrap
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize models and tokenizers
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
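# Note: the feature extractor below comes from the mini-v1 checkpoint while the
# model above is large-v1; only the sampling rate is read from it, and the two
# checkpoints appear to share the same rate, so the mismatch is harmless here.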
feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-mini-v1")
SAMPLE_RATE = feature_extractor.sampling_rate
SEED = 42
# Helper function to extract text from a PDF
def pdf_to_text(pdf_file):
    with open(pdf_file, 'rb') as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() or ""
    return text
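
# Illustrative usage (hypothetical file name): pdf_to_text("paper.pdf") returns the
# concatenated text of all pages; pages with no extractable text (e.g. scanned
# images) contribute an empty string instead of raising.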
# Helper function to split text into sentences using regex
def split_text_into_sentences(text):
    sentence_endings = re.compile(r'[.!?]')
    sentences = sentence_endings.split(text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]
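
# Illustrative sketch (assumed input): the regex split drops the punctuation, so
#   split_text_into_sentences("Hello there! How are you? Fine.")
# returns ["Hello there", "How are you", "Fine"]; empty pieces are filtered out.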
# Translation function
def translate(source_text, source_lang, target_lang, batch_size=16):
    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
    # Chunk by characters (512 per chunk); the tokenizer's truncation at
    # max_length=512 tokens guards against any chunk that still overflows.
    text_chunks = textwrap.wrap(source_text, 512)
    translated_text = ""
    for i in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[i:i + batch_size]
        input_ids = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
        output_ids = model.generate(input_ids, max_new_tokens=512)
        for output in output_ids:
            output_text = tokenizer.decode(output, skip_special_tokens=True)
            translated_text += output_text + " "
    return translated_text
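
# Illustrative sketch (assumed language pair): translate("Hello world.", "en", "tr")
# resolves to the "Helsinki-NLP/opus-mt-en-tr" checkpoint; any source/target pair
# works as long as a matching opus-mt model exists on the Hub.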
# Function to combine audio arrays
def combine_audio_arrays(audio_list):
    combined_audio = np.concatenate(audio_list, axis=0)
    return combined_audio
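
# Illustrative sketch: combine_audio_arrays([np.zeros(3), np.ones(2)]) returns a
# single 1-D array of length 5; the chunks are assumed to share one sampling rate.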
# Function to generate audio for a single sentence
def generate_single_wav_from_text(sentence, description):
    torch.manual_seed(SEED)
    inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt = tts_tokenizer(sentence, return_tensors="pt").to(device)
    generation = tts_model.generate(
        input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask,
        prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
    )
    audio_arr = generation.cpu().numpy().squeeze()
    return SAMPLE_RATE, audio_arr
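
# Optional saving sketch (not part of the app flow; the file name and prompts are
# hypothetical):
#   import soundfile as sf
#   rate, audio = generate_single_wav_from_text("Hello.", "A calm, clear voice.")
#   sf.write("sentence.wav", audio, rate)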
# Gradio Interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_mode = gr.Radio(choices=["Upload PDF", "Type Text"], label="Input Mode", value="Type Text")
            pdf_input = gr.File(label="Upload PDF", file_types=['pdf'], visible=False)
            text_input = gr.Textbox(label="Type your text here", visible=True, placeholder="Enter text here if not uploading a PDF...")
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
            description = gr.Textbox(label="Voice Description", lines=2,
                                     value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")
            markdown_output = gr.Markdown()

    def handle_input(input_mode, pdf_input, text_input):
        if input_mode == "Upload PDF":
            return pdf_to_text(pdf_input.name)
        else:
            return text_input

    def run_pipeline(input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description):
        text = handle_input(input_mode, pdf_input, text_input)
        if translate_checkbox:
            text = translate(text, source_lang, target_lang)
        sentences = split_text_into_sentences(text)
        all_audio = []
        all_text = ""
        # Stream progressively: after each sentence, yield all audio generated
        # so far together with the accumulated transcript.
        for sentence in sentences:
            sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
            all_audio.append(audio_arr)
            combined_audio = combine_audio_arrays(all_audio)
            all_text += f"**Sentence**: {sentence}\n\n"
            yield (sample_rate, combined_audio), all_text

    input_mode.change(
        fn=lambda choice: [gr.update(visible=choice == "Upload PDF"), gr.update(visible=choice == "Type Text")],
        inputs=input_mode,
        outputs=[pdf_input, text_input]
    )
    run_button.click(run_pipeline, inputs=[input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description], outputs=[audio_output, markdown_output])

demo.launch(share=True)