# SpeechAbstractor — Hugging Face Spaces app (PDF abstract summarizer + text-to-speech).
# (Removed scraped page residue: "Spaces:" / "Runtime error" banner text.)
| import PyPDF2 | |
| import pdfplumber | |
| from pdfminer.high_level import extract_pages, extract_text | |
| from pdfminer.layout import LTTextContainer, LTChar, LTRect, LTFigure | |
| import re | |
| import torch | |
| import transformers | |
| from transformers import pipeline | |
| from datasets import load_dataset | |
| import soundfile as sf | |
| from IPython.display import Audio | |
| from datasets import load_dataset | |
| import sentencepiece as spm | |
| import os | |
| import tempfile | |
| import gradio as gr | |
| description = """**SpeechAbstractor**\n | |
| This app enables users to upload academic articles in PDF format, specifically focusing on abstracts. | |
| It efficiently summarizes the abstract and provides an audio playback of the summarized content. | |
| Below are some example PDFs for you to experiment with. Feel free to explore the functionality of SpeechAbstractor!""" | |
| examples = [ | |
| ["Article_7.pdf"],["Article_11.pdf"] | |
| ] | |
| #reporting the functions created for the part 1 | |
| def text_extraction(element): | |
| line_text = element.get_text() | |
| line_formats = [] | |
| for text_line in element: | |
| if isinstance(text_line, LTTextContainer): | |
| for character in text_line: | |
| if isinstance(character, LTChar): | |
| line_formats.append(character.fontname) | |
| line_formats.append(character.size) | |
| format_per_line = list(set(line_formats)) | |
| return (line_text, format_per_line) | |
| def read_pdf(pdf_pathy): | |
| pdfFileObj = open(pdf_pathy, 'rb') | |
| pdfReaded = PyPDF2.PdfReader(pdfFileObj) | |
| text_per_pagy = {} | |
| for pagenum, page in enumerate(extract_pages(pdf_pathy)): | |
| print("Elaborating Page_" +str(pagenum)) | |
| pageObj = pdfReaded.pages[pagenum] | |
| page_text = [] | |
| line_format = [] | |
| page_content = [] | |
| pdf = pdfplumber.open(pdf_pathy) | |
| page_elements = [(element.y1, element) for element in page._objs] | |
| page_elements.sort(key=lambda a: a[0], reverse=True) | |
| for i,component in enumerate(page_elements): | |
| pos= component[0] | |
| element = component[1] | |
| if isinstance(element, LTTextContainer): | |
| (line_text, format_per_line) = text_extraction(element) | |
| page_text.append(line_text) | |
| line_format.append(format_per_line) | |
| page_content.append(line_text) | |
| dctkey = 'Page_'+str(pagenum) | |
| text_per_pagy[dctkey]= [page_text, line_format, page_content] | |
| pdfFileObj.close() | |
| return text_per_pagy | |
| def clean_text(text): | |
| # remove extra spaces | |
| text = re.sub(r'\s+', ' ', text) | |
| return text.strip() | |
| def extract_abstract(text_per_pagy): | |
| abstract_text = "" | |
| for page_num, page_text in text_per_pagy.items(): | |
| if page_text: | |
| page_text = page_text.replace("- ", "") | |
| start_index = page_text.find("Abstract") | |
| if start_index != -1: | |
| start_index += len("Abstract") + 1 | |
| end_markers = ["Introduction", "Summary", "Overview", "Background", "Contents"] | |
| end_index = -1 | |
| for marker in end_markers: | |
| temp_index = page_text.find(marker, start_index) | |
| if temp_index != -1: | |
| end_index = temp_index | |
| break | |
| if end_index == -1: | |
| end_index = len(page_text) | |
| abstract = page_text[start_index:end_index].strip() | |
| abstract_text += " " + abstract | |
| break | |
| return abstract_text | |
| #let's define a main function that gets the uploaded file (pdf) to do the job | |
| def main_function(uploaded_filepath): | |
| #put a control to see if there is a file uploaded | |
| if uploaded_filepath is None: | |
| return "No file loaded", None | |
| #read and process the file according to read_pdf | |
| text_per_pagy = read_pdf(uploaded_filepath) | |
| #cleaning the text and getting the abstract using the 2 other functions | |
| for key, value in text_per_pagy.items(): | |
| cleaned_text = clean_text(' '.join(value[0])) | |
| text_per_pagy[key] = cleaned_text | |
| abstract_text = extract_abstract(text_per_pagy) | |
| #abstract the summary with my pipeline and model, deciding the length | |
| summarizer = pipeline("summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify") | |
| summary = summarizer(abstract_text, max_length=65, do_sample=False)[0]['summary_text'] | |
| #generating the audio from the text, with my pipeline and model | |
| synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts") | |
| embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") | |
| speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0) | |
| speech = synthesiser(summary, forward_params={"speaker_embeddings": speaker_embedding}) | |
| #saving the audio in a temporary file | |
| audio_file_path = "summary.wav" | |
| sf.write(audio_file_path, speech["audio"], samplerate=speech["sampling_rate"]) | |
| #the function returns the 2 pieces we need | |
| return summary, audio_file_path | |
| #let's communicate with gradio what it has to put in | |
| iface = gr.Interface( | |
| fn=main_function, | |
| inputs=gr.File(type="filepath"), | |
| outputs=[gr.Textbox(label="Summary Text"), gr.Audio(label="Summary Audio", type="filepath")], | |
| description=description, | |
| examples=examples | |
| ) | |
| #launching the app | |
| if __name__ == "__main__": | |
| iface.launch() | |