Spaces:
Sleeping
Sleeping
stivenDR14
committed on
Commit
·
4779173
1
Parent(s):
4fafabd
chroma_db in the state of Gradio
Browse files
- app.py +16 -15
- pdf_processor.py +21 -24
app.py
CHANGED
|
@@ -57,17 +57,17 @@ class PDFProcessorUI:
|
|
| 57 |
else:
|
| 58 |
return gr.update(visible=False), gr.update(visible=False)
|
| 59 |
|
| 60 |
-
def process_pdf(self, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
|
| 61 |
-
return self.processor.process_pdf(pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx)
|
| 62 |
|
| 63 |
-
def qa_interface(self, message, history, ai_model, type_model, api_key, project_id_watsonx):
|
| 64 |
-
return self.processor.get_qa_response(message, history, ai_model, type_model, api_key, project_id_watsonx)
|
| 65 |
|
| 66 |
-
def summarize_interface(self, ai_model, type_model, api_key, project_id_watsonx):
|
| 67 |
-
return self.processor.get_summary(ai_model, type_model, api_key, project_id_watsonx)
|
| 68 |
|
| 69 |
-
def specialist_opinion(self, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
|
| 70 |
-
return self.processor.get_specialist_opinion(ai_model, type_model, api_key, project_id_watsonx, specialist_prompt)
|
| 71 |
|
| 72 |
def upload_file(files):
|
| 73 |
file_paths = [file.name for file in files]
|
|
@@ -75,6 +75,7 @@ class PDFProcessorUI:
|
|
| 75 |
|
| 76 |
def create_ui(self):
|
| 77 |
with gr.Blocks() as demo:
|
|
|
|
| 78 |
title = gr.Markdown(TRANSLATIONS[self.current_language]["title"])
|
| 79 |
|
| 80 |
with gr.Row():
|
|
@@ -164,8 +165,8 @@ class PDFProcessorUI:
|
|
| 164 |
label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
|
| 165 |
lines=10
|
| 166 |
)
|
| 167 |
-
specialist_output = gr.Textbox(label=TRANSLATIONS[self.current_language]["specialist_output"], lines=20)
|
| 168 |
specialist_btn = gr.Button(TRANSLATIONS[self.current_language]["specialist_btn"])
|
|
|
|
| 169 |
|
| 170 |
|
| 171 |
language_dropdown.change(
|
|
@@ -210,31 +211,31 @@ class PDFProcessorUI:
|
|
| 210 |
|
| 211 |
chat_placeholder.submit(
|
| 212 |
fn=self.qa_interface,
|
| 213 |
-
inputs=[chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
| 214 |
outputs=[chatbot]
|
| 215 |
)
|
| 216 |
|
| 217 |
process_btn.click(
|
| 218 |
fn=self.process_pdf,
|
| 219 |
-
inputs=[pdf_file, chunk_size, chunk_overlap, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
| 220 |
-
outputs=[process_output]
|
| 221 |
)
|
| 222 |
|
| 223 |
summarize_btn.click(
|
| 224 |
fn=self.summarize_interface,
|
| 225 |
-
inputs=[ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
| 226 |
outputs=[summary_output]
|
| 227 |
)
|
| 228 |
|
| 229 |
specialist_btn.click(
|
| 230 |
fn=self.specialist_opinion,
|
| 231 |
-
inputs=[ai_model_dropdown, type_model, api_key_input, project_id_watsonx, specialist_placeholder],
|
| 232 |
outputs=[specialist_output]
|
| 233 |
)
|
| 234 |
|
| 235 |
chat_btn.click(
|
| 236 |
fn=self.qa_interface,
|
| 237 |
-
inputs=[chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
| 238 |
outputs=[chatbot]
|
| 239 |
)
|
| 240 |
|
|
|
|
| 57 |
else:
|
| 58 |
return gr.update(visible=False), gr.update(visible=False)
|
| 59 |
|
| 60 |
+
def process_pdf(self, vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
|
| 61 |
+
return self.processor.process_pdf(vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx)
|
| 62 |
|
| 63 |
+
def qa_interface(self, vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx):
|
| 64 |
+
return self.processor.get_qa_response(vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx)
|
| 65 |
|
| 66 |
+
def summarize_interface(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx):
|
| 67 |
+
return self.processor.get_summary(vectorstore, ai_model, type_model, api_key, project_id_watsonx)
|
| 68 |
|
| 69 |
+
def specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
|
| 70 |
+
return self.processor.get_specialist_opinion(vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt)
|
| 71 |
|
| 72 |
def upload_file(files):
|
| 73 |
file_paths = [file.name for file in files]
|
|
|
|
| 75 |
|
| 76 |
def create_ui(self):
|
| 77 |
with gr.Blocks() as demo:
|
| 78 |
+
vectorstore = gr.State()
|
| 79 |
title = gr.Markdown(TRANSLATIONS[self.current_language]["title"])
|
| 80 |
|
| 81 |
with gr.Row():
|
|
|
|
| 165 |
label=TRANSLATIONS[self.current_language]["mini_analysis_title"],
|
| 166 |
lines=10
|
| 167 |
)
|
|
|
|
| 168 |
specialist_btn = gr.Button(TRANSLATIONS[self.current_language]["specialist_btn"])
|
| 169 |
+
specialist_output = gr.Textbox(label=TRANSLATIONS[self.current_language]["specialist_output"], lines=20)
|
| 170 |
|
| 171 |
|
| 172 |
language_dropdown.change(
|
|
|
|
| 211 |
|
| 212 |
chat_placeholder.submit(
|
| 213 |
fn=self.qa_interface,
|
| 214 |
+
inputs=[vectorstore, chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
| 215 |
outputs=[chatbot]
|
| 216 |
)
|
| 217 |
|
| 218 |
process_btn.click(
|
| 219 |
fn=self.process_pdf,
|
| 220 |
+
inputs=[vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
| 221 |
+
outputs=[process_output, vectorstore]
|
| 222 |
)
|
| 223 |
|
| 224 |
summarize_btn.click(
|
| 225 |
fn=self.summarize_interface,
|
| 226 |
+
inputs=[vectorstore, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
| 227 |
outputs=[summary_output]
|
| 228 |
)
|
| 229 |
|
| 230 |
specialist_btn.click(
|
| 231 |
fn=self.specialist_opinion,
|
| 232 |
+
inputs=[vectorstore, ai_model_dropdown, type_model, api_key_input, project_id_watsonx, specialist_placeholder],
|
| 233 |
outputs=[specialist_output]
|
| 234 |
)
|
| 235 |
|
| 236 |
chat_btn.click(
|
| 237 |
fn=self.qa_interface,
|
| 238 |
+
inputs=[vectorstore, chat_placeholder, chatbot, ai_model_dropdown, type_model, api_key_input, project_id_watsonx],
|
| 239 |
outputs=[chatbot]
|
| 240 |
)
|
| 241 |
|
pdf_processor.py
CHANGED
|
@@ -95,7 +95,6 @@ def authenticate_watsonx(api_key):
|
|
| 95 |
|
| 96 |
class PDFProcessor:
|
| 97 |
def __init__(self):
|
| 98 |
-
self.vectorstore = None
|
| 99 |
self.language = "English"
|
| 100 |
|
| 101 |
def set_language(self, language):
|
|
@@ -145,7 +144,7 @@ class PDFProcessor:
|
|
| 145 |
return current_llm, embeding_model
|
| 146 |
|
| 147 |
@spaces.GPU
|
| 148 |
-
def process_pdf(self, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
|
| 149 |
defined_chunk_size = 1000
|
| 150 |
defined_chunk_overlap = 150
|
| 151 |
if (ai_model == "Open AI / GPT-4o-mini" and (api_key == "")) : #or (ai_model == "IBM Granite3.1 dense / Ollama local" and type_model == "Api Key" and (api_key == "" or project_id_watsonx == "")
|
|
@@ -178,13 +177,13 @@ class PDFProcessor:
|
|
| 178 |
_, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
| 179 |
|
| 180 |
#delete all documents from the vectorstore
|
| 181 |
-
if self.vectorstore:
|
| 182 |
-
|
| 183 |
|
| 184 |
chromadb.api.client.SharedSystemClient.clear_system_cache()
|
| 185 |
new_client = chromadb.EphemeralClient()
|
| 186 |
|
| 187 |
-
|
| 188 |
documents=texts,
|
| 189 |
embedding=embeddings,
|
| 190 |
client=new_client,
|
|
@@ -192,19 +191,19 @@ class PDFProcessor:
|
|
| 192 |
#persist_directory="./chroma_db"
|
| 193 |
)
|
| 194 |
|
| 195 |
-
return TRANSLATIONS[self.language]["pdf_processed"] #+ f" ---- Chunks: {len(self.vectorstore.get()["documents"])}"
|
| 196 |
|
| 197 |
else:
|
| 198 |
-
return TRANSLATIONS[self.language]["load_pdf_first"]
|
| 199 |
|
| 200 |
@spaces.GPU
|
| 201 |
-
def get_qa_response(self, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
|
| 202 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
| 203 |
|
| 204 |
-
if not self.vectorstore:
|
| 205 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
| 206 |
|
| 207 |
-
retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
|
| 208 |
|
| 209 |
qa_chain = RetrievalQA.from_chain_type(
|
| 210 |
llm=current_llm,
|
|
@@ -222,13 +221,14 @@ class PDFProcessor:
|
|
| 222 |
return result["result"] + "\n\nSources: " + page_labels_text
|
| 223 |
|
| 224 |
@spaces.GPU
|
| 225 |
-
def summarizer_by_k_top_n(self, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
|
| 226 |
-
|
|
|
|
| 227 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
| 228 |
|
| 229 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
| 230 |
# Get all documents from the vectorstore
|
| 231 |
-
retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
|
| 232 |
documents = retriever.invoke('Summary of the document and key points')
|
| 233 |
|
| 234 |
if just_get_documments:
|
|
@@ -239,7 +239,7 @@ class PDFProcessor:
|
|
| 239 |
return final_summary
|
| 240 |
|
| 241 |
# Get the top k documents by score
|
| 242 |
-
def get_summary(self, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
|
| 243 |
|
| 244 |
final_summary_prompt = PromptTemplate(
|
| 245 |
input_variables=["texts", "language"],
|
|
@@ -255,11 +255,11 @@ class PDFProcessor:
|
|
| 255 |
"""
|
| 256 |
)
|
| 257 |
|
| 258 |
-
return self.summarizer_by_k_top_n(ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
|
| 259 |
|
| 260 |
|
| 261 |
@spaces.GPU
|
| 262 |
-
def get_specialist_opinion(self, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
|
| 263 |
questions_prompt = PromptTemplate(
|
| 264 |
input_variables=["text", "specialist_prompt", "language"],
|
| 265 |
template="""
|
|
@@ -303,22 +303,19 @@ class PDFProcessor:
|
|
| 303 |
Answer:
|
| 304 |
"""
|
| 305 |
)
|
| 306 |
-
if not self.vectorstore:
|
| 307 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
| 308 |
|
|
|
|
|
|
|
| 309 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
| 310 |
|
| 311 |
-
summary_text = self.get_summary(ai_model, type_model, api_key, project_id_watsonx, True, 10)
|
| 312 |
questions_chain = questions_prompt | current_llm
|
| 313 |
questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})
|
| 314 |
|
| 315 |
print(questions)
|
| 316 |
|
| 317 |
-
#clean the questions variable, delete all the text before the json and after the json
|
| 318 |
-
questions = questions.split("{")[1]
|
| 319 |
-
questions = questions.split("}")[0]
|
| 320 |
-
questions = questions.strip()
|
| 321 |
-
print(questions)
|
| 322 |
questions = json.loads(questions)
|
| 323 |
|
| 324 |
print(questions)
|
|
@@ -328,7 +325,7 @@ class PDFProcessor:
|
|
| 328 |
else:
|
| 329 |
questions["aspects"] = questions["aspects"]
|
| 330 |
|
| 331 |
-
aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])
|
| 332 |
|
| 333 |
return aspects_text
|
| 334 |
|
|
|
|
| 95 |
|
| 96 |
class PDFProcessor:
|
| 97 |
def __init__(self):
|
|
|
|
| 98 |
self.language = "English"
|
| 99 |
|
| 100 |
def set_language(self, language):
|
|
|
|
| 144 |
return current_llm, embeding_model
|
| 145 |
|
| 146 |
@spaces.GPU
|
| 147 |
+
def process_pdf(self, vectorstore, pdf_file, chunk_size, chunk_overlap, ai_model, type_model, api_key, project_id_watsonx):
|
| 148 |
defined_chunk_size = 1000
|
| 149 |
defined_chunk_overlap = 150
|
| 150 |
if (ai_model == "Open AI / GPT-4o-mini" and (api_key == "")) : #or (ai_model == "IBM Granite3.1 dense / Ollama local" and type_model == "Api Key" and (api_key == "" or project_id_watsonx == "")
|
|
|
|
| 177 |
_, embeddings = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
| 178 |
|
| 179 |
#delete all documents from the vectorstore
|
| 180 |
+
if vectorstore:
|
| 181 |
+
vectorstore.delete_collection()
|
| 182 |
|
| 183 |
chromadb.api.client.SharedSystemClient.clear_system_cache()
|
| 184 |
new_client = chromadb.EphemeralClient()
|
| 185 |
|
| 186 |
+
vectorstore = Chroma.from_documents(
|
| 187 |
documents=texts,
|
| 188 |
embedding=embeddings,
|
| 189 |
client=new_client,
|
|
|
|
| 191 |
#persist_directory="./chroma_db"
|
| 192 |
)
|
| 193 |
|
| 194 |
+
return TRANSLATIONS[self.language]["pdf_processed"], vectorstore #+ f" ---- Chunks: {len(vectorstore.get()["documents"])}"
|
| 195 |
|
| 196 |
else:
|
| 197 |
+
return TRANSLATIONS[self.language]["load_pdf_first"], None
|
| 198 |
|
| 199 |
@spaces.GPU
|
| 200 |
+
def get_qa_response(self, vectorstore, message, history, ai_model, type_model, api_key, project_id_watsonx, k=4):
|
| 201 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
| 202 |
|
| 203 |
+
if not vectorstore:
|
| 204 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
| 205 |
|
| 206 |
+
retriever = vectorstore.as_retriever(search_kwargs={"k": k})
|
| 207 |
|
| 208 |
qa_chain = RetrievalQA.from_chain_type(
|
| 209 |
llm=current_llm,
|
|
|
|
| 221 |
return result["result"] + "\n\nSources: " + page_labels_text
|
| 222 |
|
| 223 |
@spaces.GPU
|
| 224 |
+
def summarizer_by_k_top_n(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, summary_prompt, just_get_documments=False):
|
| 225 |
+
print("Summarizer by k top n in language: ", self.language)
|
| 226 |
+
if not vectorstore:
|
| 227 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
| 228 |
|
| 229 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
| 230 |
# Get all documents from the vectorstore
|
| 231 |
+
retriever = vectorstore.as_retriever(search_kwargs={"k": k})
|
| 232 |
documents = retriever.invoke('Summary of the document and key points')
|
| 233 |
|
| 234 |
if just_get_documments:
|
|
|
|
| 239 |
return final_summary
|
| 240 |
|
| 241 |
# Get the top k documents by score
|
| 242 |
+
def get_summary(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, just_get_documments=False, k=10):
|
| 243 |
|
| 244 |
final_summary_prompt = PromptTemplate(
|
| 245 |
input_variables=["texts", "language"],
|
|
|
|
| 255 |
"""
|
| 256 |
)
|
| 257 |
|
| 258 |
+
return self.summarizer_by_k_top_n(vectorstore, ai_model, type_model, api_key, project_id_watsonx, k, final_summary_prompt, just_get_documments)
|
| 259 |
|
| 260 |
|
| 261 |
@spaces.GPU
|
| 262 |
+
def get_specialist_opinion(self, vectorstore, ai_model, type_model, api_key, project_id_watsonx, specialist_prompt):
|
| 263 |
questions_prompt = PromptTemplate(
|
| 264 |
input_variables=["text", "specialist_prompt", "language"],
|
| 265 |
template="""
|
|
|
|
| 303 |
Answer:
|
| 304 |
"""
|
| 305 |
)
|
| 306 |
+
if not vectorstore:
|
| 307 |
return TRANSLATIONS[self.language]["load_pdf_first"]
|
| 308 |
|
| 309 |
+
print(ai_model)
|
| 310 |
+
print(type_model)
|
| 311 |
current_llm, _ = self.set_llm(ai_model, type_model, api_key, project_id_watsonx)
|
| 312 |
|
| 313 |
+
summary_text = self.get_summary(vectorstore, ai_model, type_model, api_key, project_id_watsonx, True, 10)
|
| 314 |
questions_chain = questions_prompt | current_llm
|
| 315 |
questions = questions_chain.invoke({"text": summary_text, "specialist_prompt": specialist_prompt, "language": self.language})
|
| 316 |
|
| 317 |
print(questions)
|
| 318 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
questions = json.loads(questions)
|
| 320 |
|
| 321 |
print(questions)
|
|
|
|
| 325 |
else:
|
| 326 |
questions["aspects"] = questions["aspects"]
|
| 327 |
|
| 328 |
+
aspects_text = "\n".join([f"* {aspect}: {self.get_qa_response(vectorstore, aspect, [], ai_model, type_model, api_key, project_id_watsonx, 2)}" for aspect in questions["aspects"]])
|
| 329 |
|
| 330 |
return aspects_text
|
| 331 |
|