Spaces:

stiv14
/

pdf-multilanguage-qa-role

Sleeping

App Files Files Community

stivenDR14 committed on Mar 10

Commit

4779173

1 Parent(s): 4fafabd

chroma_db in the state of Gradio

Browse files

Files changed (2) hide show

app.py +16 -15
pdf_processor.py +21 -24

app.py CHANGED Viewed

@@ -57,17 +57,17 @@ class PDFProcessorUI:
         else:
             return gr.update(visible=False), gr.update(visible=False)
-    def process_pdf(self, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
-        return self.processor.process_pdf(pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx)
-    def qa_interface(self, message, history, ai_model, type_model, api_key, project_id_watsonx):
-        return self.processor.get_qa_response(message, history, ai_model, type_model, api_key, project_id_watsonx)
-    def summarize_interface(self, ai_model, type_model, api_key, project_id_watsonx):
-        return self.processor.get_summary(ai_model, type_model, api_key, project_id_watsonx)
-    def specialist_opinion(self, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
-        return self.processor.get_specialist_opinion(ai_model, type_model, api_key, project_id_watsonx, specialist_prompt)
     def upload_file(files):
         file_paths = [file.name for file in files]
@@ -75,6 +75,7 @@ class PDFProcessorUI:
     def create_ui(self):
         with gr.Blocks() as demo:
             title = gr.Markdown(TRANSLATIONS[self.current_language]["title"])
             with gr.Row():
@@ -164,8 +165,8 @@ class PDFProcessorUI:
                         label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
                         lines=10
                     )
-                specialist_output = gr.Textbox(label=TRANSLATIONS[self.current_language]["specialist_output"], lines=20)
                 specialist_btn = gr.Button(TRANSLATIONS[self.current_language]["specialist_btn"])
             language_dropdown.change(
@@ -210,31 +211,31 @@ class PDFProcessorUI:
             chat_placeholder.submit(
                 fn=self.qa_interface,
-                inputs=[chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
                 outputs=[chatbot]
             )
             process_btn.click(
                 fn=self.process_pdf,
-                inputs=[pdf_file, chunk_size, chunk_overlap, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
-                outputs=[process_output]
             )
             summarize_btn.click(
                 fn=self.summarize_interface,
-                inputs=[ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
                 outputs=[summary_output]
             )
             specialist_btn.click(
                 fn=self.specialist_opinion,
-                inputs=[ai_model_dropdown, type_model, api_key_input, project_id_watsonx, specialist_placeholder],
                 outputs=[specialist_output]
             )
             chat_btn.click(
                 fn=self.qa_interface,
-                inputs=[chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
                 outputs=[chatbot]
             )

         else:
             return gr.update(visible=False), gr.update(visible=False)
+    def process_pdf(self, vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
+        return self.processor.process_pdf(vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx)
+    def qa_interface(self, vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx):
+        return self.processor.get_qa_response(vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx)
+    def summarize_interface(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx):
+        return self.processor.get_summary(vectorstore, ai_model, type_model, api_key, project_id_watsonx)
+    def specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
+        return self.processor.get_specialist_opinion(vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt)
     def upload_file(files):
         file_paths = [file.name for file in files]
     def create_ui(self):
         with gr.Blocks() as demo:
+            vectorstore = gr.State()
             title = gr.Markdown(TRANSLATIONS[self.current_language]["title"])
             with gr.Row():
                         label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
                         lines=10
                     )
                 specialist_btn = gr.Button(TRANSLATIONS[self.current_language]["specialist_btn"])
+                specialist_output = gr.Textbox(label=TRANSLATIONS[self.current_language]["specialist_output"], lines=20)
             language_dropdown.change(
             chat_placeholder.submit(
                 fn=self.qa_interface,
+                inputs=[vectorstore, chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
                 outputs=[chatbot]
             )
             process_btn.click(
                 fn=self.process_pdf,
+                inputs=[vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
+                outputs=[process_output, vectorstore]
             )
             summarize_btn.click(
                 fn=self.summarize_interface,
+                inputs=[vectorstore, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
                 outputs=[summary_output]
             )
             specialist_btn.click(
                 fn=self.specialist_opinion,
+                inputs=[vectorstore, ai_model_dropdown, type_model, api_key_input, project_id_watsonx, specialist_placeholder],
                 outputs=[specialist_output]
             )
             chat_btn.click(
                 fn=self.qa_interface,
+                inputs=[vectorstore, chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
                 outputs=[chatbot]
             )

pdf_processor.py CHANGED Viewed

@@ -95,7 +95,6 @@ def authenticate_watsonx(api_key):
 class PDFProcessor:
     def __init__(self):
-        self.vectorstore = None
         self.language = "English"
     def set_language(self, language):
@@ -145,7 +144,7 @@ class PDFProcessor:
         return current_llm, embeding_model
     @spaces.GPU
-    def process_pdf(self, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
         defined_chunk_size = 1000
         defined_chunk_overlap = 150
         if (ai_model == "Open AI / GPT-4o-mini" and (api_key == "")) : #or (ai_model == "IBM Granite3.1 dense / Ollama local" and type_model == "Api Key" and (api_key == "" or project_id_watsonx == "")
@@ -178,13 +177,13 @@ class PDFProcessor:
                 _, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
                 #delete all documents from the vectorstore
-                if self.vectorstore:
-                    self.vectorstore.delete_collection()
                 chromadb.api.client.SharedSystemClient.clear_system_cache()
                 new_client = chromadb.EphemeralClient()
-                self.vectorstore = Chroma.from_documents(
                     documents=texts,
                     embedding=embeddings,
                     client=new_client,
@@ -192,19 +191,19 @@ class PDFProcessor:
                     #persist_directory="./chroma_db"
                 )
-                return TRANSLATIONS[self.language]["pdf_processed"] #+ f" ---- Chunks: {len(self.vectorstore.get()["documents"])}"
         else:
-            return TRANSLATIONS[self.language]["load_pdf_first"]
     @spaces.GPU
-    def get_qa_response(self, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
-        if not self.vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
-        retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
         qa_chain = RetrievalQA.from_chain_type(
             llm=current_llm,
@@ -222,13 +221,14 @@ class PDFProcessor:
         return result["result"] + "\n\nSources: " + page_labels_text
     @spaces.GPU
-    def summarizer_by_k_top_n(self, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
-        if not self.vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
         # Get all documents from the vectorstore
-        retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
         documents = retriever.invoke('Summary of the document and key points')
         if just_get_documments:
@@ -239,7 +239,7 @@ class PDFProcessor:
         return final_summary
         # Get the top k documents by score
-    def get_summary(self, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
         final_summary_prompt = PromptTemplate(
             input_variables=["texts", "language"],
@@ -255,11 +255,11 @@ class PDFProcessor:
             """
         )
-        return self.summarizer_by_k_top_n(ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
     @spaces.GPU
-    def get_specialist_opinion(self, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
         questions_prompt = PromptTemplate(
             input_variables=["text", "specialist_prompt", "language"],
             template="""
@@ -303,22 +303,19 @@ class PDFProcessor:
             Answer:
             """
         )
-        if not self.vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
-        summary_text = self.get_summary(ai_model, type_model, api_key, project_id_watsonx, True, 10)
         questions_chain = questions_prompt | current_llm
         questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})
         print(questions)
-        #clean the questions variable, delete all the text before the json and after the json
-        questions = questions.split("{")[1]
-        questions = questions.split("}")[0]
-        questions = questions.strip()
-        print(questions)
         questions = json.loads(questions)
         print(questions)
@@ -328,7 +325,7 @@ class PDFProcessor:
         else:
             questions["aspects"] = questions["aspects"]
-        aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])
         return aspects_text

 class PDFProcessor:
     def __init__(self):
         self.language = "English"
     def set_language(self, language):
         return current_llm, embeding_model
     @spaces.GPU
+    def process_pdf(self, vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
         defined_chunk_size = 1000
         defined_chunk_overlap = 150
         if (ai_model == "Open AI / GPT-4o-mini" and (api_key == "")) : #or (ai_model == "IBM Granite3.1 dense / Ollama local" and type_model == "Api Key" and (api_key == "" or project_id_watsonx == "")
                 _, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
                 #delete all documents from the vectorstore
+                if vectorstore:
+                    vectorstore.delete_collection()
                 chromadb.api.client.SharedSystemClient.clear_system_cache()
                 new_client = chromadb.EphemeralClient()
+                vectorstore = Chroma.from_documents(
                     documents=texts,
                     embedding=embeddings,
                     client=new_client,
                     #persist_directory="./chroma_db"
                 )
+                return TRANSLATIONS[self.language]["pdf_processed"], vectorstore #+ f" ---- Chunks: {len(vectorstore.get()["documents"])}"
         else:
+            return TRANSLATIONS[self.language]["load_pdf_first"], None
     @spaces.GPU
+    def get_qa_response(self, vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
+        if not vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
+        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
         qa_chain = RetrievalQA.from_chain_type(
             llm=current_llm,
         return result["result"] + "\n\nSources: " + page_labels_text
     @spaces.GPU
+    def summarizer_by_k_top_n(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
+        print("Summarizer by k top n in language: ", self.language)
+        if not vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
         # Get all documents from the vectorstore
+        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
         documents = retriever.invoke('Summary of the document and key points')
         if just_get_documments:
         return final_summary
         # Get the top k documents by score
+    def get_summary(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
         final_summary_prompt = PromptTemplate(
             input_variables=["texts", "language"],
             """
         )
+        return self.summarizer_by_k_top_n(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
     @spaces.GPU
+    def get_specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
         questions_prompt = PromptTemplate(
             input_variables=["text", "specialist_prompt", "language"],
             template="""
             Answer:
             """
         )
+        if not vectorstore:
             return TRANSLATIONS[self.language]["load_pdf_first"]
+        print(ai_model)
+        print(type_model)
         current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
+        summary_text = self.get_summary(vectorstore, ai_model, type_model, api_key, project_id_watsonx, True, 10)
         questions_chain = questions_prompt | current_llm
         questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})
         print(questions)
         questions = json.loads(questions)
         print(questions)
         else:
             questions["aspects"] = questions["aspects"]
+        aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(vectorstore, aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])
         return aspects_text