Spaces:

emirhanbilgic
/

read-my-pdf-outloud

Running

App Files Files Community

emirhanbilgic committed on Aug 11, 2024

Commit

6ec69c0

verified ·

1 Parent(s): 224badc

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -58

app.py CHANGED Viewed

@@ -1,14 +1,12 @@
-import spaces
 import gradio as gr
 import torch
-from transformers import MarianTokenizer, MarianMTModel
 from parler_tts import ParlerTTSForConditionalGeneration
-from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 from PyPDF2 import PdfReader
 import re
 import textwrap
 import soundfile as sf
-import numpy as np
 # Device configuration
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -21,13 +19,12 @@ SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
 # Helper function to extract text from a PDF
-def pdf_to_text(pdf_path):
-    with open(pdf_path, 'rb') as file:
         pdf_reader = PdfReader(file)
         text = ""
-        for page_num in range(len(pdf_reader.pages)):
-            page = pdf_reader.pages[page_num]
-            text += page.extract_text()
     return text
 # Helper function to split text into sentences using regex
@@ -37,10 +34,8 @@ def split_text_into_sentences(text):
     return [sentence.strip() for sentence in sentences if sentence.strip()]
 # Translation function
-@spaces.GPU(duration=120)
 def translate(source_text, source_lang, target_lang, batch_size=16):
     model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
     tokenizer = MarianTokenizer.from_pretrained(model_name)
     model = MarianMTModel.from_pretrained(model_name).to(device)
@@ -58,19 +53,16 @@ def translate(source_text, source_lang, target_lang, batch_size=16):
     return translated_text
-# Function to preprocess the text (normalization, punctuation)
-def preprocess(text):
-    text = text.replace("-", " ")
-    if text[-1] not in ".!?":
-        text += "."
-    return text
 # Function to generate audio for a single sentence
-@spaces.GPU(duration=120)
 def generate_single_wav_from_text(sentence, description):
-    set_seed(SEED)
     inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
-    prompt = tts_tokenizer(preprocess(sentence), return_tensors="pt").to(device)
     generation = tts_model.generate(
         input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask,
@@ -83,7 +75,9 @@ def generate_single_wav_from_text(sentence, description):
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
-            pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
             translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
             source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
             target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
@@ -94,55 +88,34 @@ with gr.Blocks() as demo:
             audio_output = gr.Audio(label="Generated Audio")
             markdown_output = gr.Markdown()
-    # Helper function to combine audio arrays
-    def combine_audio_arrays(audio_list):
-        combined_audio = np.concatenate(audio_list, axis=0)
-        return combined_audio
-    # Adjust the handle_process function to accumulate and combine audio
-    def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
-        # Extract and process text from PDF
-        print("Extracting text from PDF...")
-        text = pdf_to_text(pdf_input.name)
-        print(f"Extracted text: {text[:100]}...")  # Display the first 100 characters for a quick preview
-        # Perform translation if enabled
         if translate_checkbox:
-            print("Translating text...")
             text = translate(text, source_lang, target_lang)
-            print(f"Translated text: {text[:100]}...")  # Display the first 100 characters for a quick preview
         sentences = split_text_into_sentences(text)
         all_audio = []
         all_text = ""
         for sentence in sentences:
-            print(f"Processing sentence: {sentence[:50]}...")  # Display the first 50 characters for a quick preview
             sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
             all_audio.append(audio_arr)
             combined_audio = combine_audio_arrays(all_audio)
             all_text += f"**Sentence**: {sentence}\n\n"
-            # Yield the accumulated results
-            yield sample_rate, combined_audio, all_text
-        print("Processing complete.")
-    # Update the Gradio interface pipeline function to handle combined audio
-    def run_pipeline(pdf_input, translate_checkbox, source_lang, target_lang, description):
-        # Stream outputs to Gradio interface
-        for sample_rate, combined_audio, markdown_text in handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
-            yield (sample_rate, combined_audio), markdown_text
-    def handle_translation_toggle(translate_checkbox):
-        if translate_checkbox:
-            return gr.update(visible=True), gr.update(visible=True)
-        else:
-            return gr.update(visible=False), gr.update(visible=False)
-    translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
-    source_lang.change(fn=lambda lang: gr.update(choices={"en": ["de", "fr", "tr"], "tr": ["en"], "de": ["en", "fr"], "fr": ["en", "de"]}.get(lang, [])), inputs=source_lang, outputs=target_lang)
-    run_button.click(run_pipeline, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[audio_output, markdown_output])
-demo.queue()
-demo.launch(share=True)

+import numpy as np
 import gradio as gr
 import torch
+from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer, AutoFeatureExtractor
 from parler_tts import ParlerTTSForConditionalGeneration
 from PyPDF2 import PdfReader
 import re
 import textwrap
 import soundfile as sf
 # Device configuration
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 SEED = 42
 # Helper function to extract text from a PDF
def pdf_to_text(pdf_file):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_file: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text of all pages, concatenated in page order with no
        separator. A page whose extraction yields a falsy value (for example
        ``None``) contributes an empty string.
    """
    with open(pdf_file, 'rb') as file:
        pdf_reader = PdfReader(file)
        # Extraction has to finish while the file handle is still open.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts)
 # Helper function to split text into sentences using regex
     return [sentence.strip() for sentence in sentences if sentence.strip()]
 # Translation function
 def translate(source_text, source_lang, target_lang, batch_size=16):
     model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
     tokenizer = MarianTokenizer.from_pretrained(model_name)
     model = MarianMTModel.from_pretrained(model_name).to(device)
     return translated_text
# Function to combine audio arrays
def combine_audio_arrays(audio_list):
    """Join per-sentence audio arrays into one array along axis 0.

    Args:
        audio_list: Sequence of NumPy arrays to concatenate in order.

    Returns:
        One array holding all chunks back to back. An empty ``audio_list``
        yields an empty 1-D array instead of raising, because
        ``np.concatenate`` rejects an empty sequence.
    """
    if len(audio_list) == 0:
        # NOTE(review): float32 is assumed for the empty result -- confirm it
        # matches the dtype produced by the TTS model.
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(audio_list, axis=0)
 # Function to generate audio for a single sentence
 def generate_single_wav_from_text(sentence, description):
+    torch.manual_seed(SEED)
     inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
+    prompt = tts_tokenizer(sentence, return_tensors="pt").to(device)
     generation = tts_model.generate(
         input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask,
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
+            input_mode = gr.Radio(choices=["Upload PDF", "Type Text"], label="Input Mode", value="Type Text")
+            pdf_input = gr.File(label="Upload PDF", file_types=['pdf'], visible=False)
+            text_input = gr.Textbox(label="Type your text here", visible=True, placeholder="Enter text here if not uploading a PDF...")
             translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
             source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
             target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
             audio_output = gr.Audio(label="Generated Audio")
             markdown_output = gr.Markdown()
+    def handle_input(input_mode, pdf_input, text_input):
+        if input_mode == "Upload PDF":
+            return pdf_to_text(pdf_input.name)
+        else:
+            return text_input
+    def run_pipeline(input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description):
+        text = handle_input(input_mode, pdf_input, text_input)
         if translate_checkbox:
             text = translate(text, source_lang, target_lang)
         sentences = split_text_into_sentences(text)
         all_audio = []
         all_text = ""
         for sentence in sentences:
             sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
             all_audio.append(audio_arr)
             combined_audio = combine_audio_arrays(all_audio)
             all_text += f"**Sentence**: {sentence}\n\n"
+            yield (sample_rate, combined_audio), all_text
+    input_mode.change(
+        fn=lambda choice: [gr.update(visible=choice == "Upload PDF"), gr.update(visible=choice == "Type Text")],
+        inputs=input_mode,
+        outputs=[pdf_input, text_input]
+    )
+    run_button.click(run_pipeline, inputs=[input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description], outputs=[audio_output, markdown_output])
+demo.launch(share=True)