Nikit_PDF_Chatbot

Running

App Files Community

Sean-Case committed on Oct 24, 2023

Commit

ee77123

1 Parent(s): a462256

Improved prompting for csv/excel files

Browse files

Files changed (3) hide show

app.py +9 -9
chatfuncs/chatfuncs.py +12 -6
chatfuncs/ingest.py +25 -10

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import os
 os.system("pip install gradio==3.42.0")
 from typing import TypeVar
-from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
 from langchain.vectorstores import FAISS
 import gradio as gr
@@ -29,17 +29,17 @@ import chatfuncs.ingest as ing
 embeddings_name = "BAAI/bge-base-en-v1.5"
-def load_embeddings(embeddings_name = "thenlper/gte-base"):
-    if embeddings_name == "hkunlp/instructor-large":
-        embeddings_func = HuggingFaceInstructEmbeddings(model_name=embeddings_name,
-        embed_instruction="Represent the paragraph for retrieval: ",
-        query_instruction="Represent the question for retrieving supporting documents: "
-        )
-    else:
-        embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)
     global embeddings

 os.system("pip install gradio==3.42.0")
 from typing import TypeVar
+from langchain.embeddings import HuggingFaceEmbeddings#, HuggingFaceInstructEmbeddings
 from langchain.vectorstores import FAISS
 import gradio as gr
 embeddings_name = "BAAI/bge-base-en-v1.5"
+def load_embeddings(embeddings_name = "BAAI/bge-base-en-v1.5"):
+    #if embeddings_name == "hkunlp/instructor-large":
+    #    embeddings_func = HuggingFaceInstructEmbeddings(model_name=embeddings_name,
+    #    embed_instruction="Represent the paragraph for retrieval: ",
+    #    query_instruction="Represent the question for retrieving supporting documents: "
+    #    )
+    #else:
+    embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)
     global embeddings

chatfuncs/chatfuncs.py CHANGED Viewed

@@ -308,6 +308,10 @@ QUESTION: {question}
     return INSTRUCTION_PROMPT, CONTENT_PROMPT
 def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content_prompt, extracted_memory, vectorstore, embeddings): # ,
         question =  inputs["question"]
@@ -317,7 +321,7 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
         new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
-        docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 10, out_passages = 2,
                                                                           vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
                                                                           #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
@@ -333,12 +337,14 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
         # Build up sources content to add to user display
-        doc_df['meta_clean'] = [f"<b>{'  '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
-        doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content'].astype(str)
-        #modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
-        modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
         docs_content_string = '<br><br>'.join(modified_page_content)
         sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace("  "," ")#.strip()

     return INSTRUCTION_PROMPT, CONTENT_PROMPT
+def write_out_metadata_as_string(metadata_in):
+    metadata_string = [f"{'  '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
+    return metadata_string
 def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content_prompt, extracted_memory, vectorstore, embeddings): # ,
         question =  inputs["question"]
         new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
+        docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 25, out_passages = 2,
                                                                           vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
                                                                           #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
         # Build up sources content to add to user display
+        doc_df['meta_clean'] = write_out_metadata_as_string(doc_df["metadata"]) # [f"<b>{'  '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
+        # Remove meta text from the page content if it already exists there
+        doc_df['page_content_no_meta'] = doc_df.apply(lambda row: row['page_content'].replace(row['meta_clean'] + ". ", ""), axis=1)
+        doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content_no_meta'].astype(str)
+        #modified_page_content = [f" Document {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
+        modified_page_content = [f" Document {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
         docs_content_string = '<br><br>'.join(modified_page_content)
         sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace("  "," ")#.strip()

chatfuncs/ingest.py CHANGED Viewed

@@ -25,7 +25,7 @@ import pandas as pd
 import dateutil.parser
 from typing import TypeVar, List
-from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
 from langchain.vectorstores.faiss import FAISS
 from langchain.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -462,6 +462,14 @@ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
     return documents
 def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
     """Converts a DataFrame's content to a list of Documents with metadata."""
@@ -479,6 +487,10 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
             if col != text_column:
                 metadata[col] = value
         # If chunk_size is provided, split the text into chunks
         if chunk_size:
             # Assuming you have a text splitter function similar to the PDF handling
@@ -487,14 +499,17 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
                 # Other arguments as required by the splitter
             )
             sections = text_splitter.split_text(doc_content)
             # For each section, create a Document object
             for i, section in enumerate(sections):
                 doc = Document(page_content=section,
                                metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
                 doc_sections.append(doc)
         else:
             # If no chunk_size is provided, create a single Document object for the row
             doc = Document(page_content=doc_content, metadata=metadata)
             doc_sections.append(doc)
@@ -559,16 +574,16 @@ def docs_elements_from_csv_save(docs_path="documents.csv"):
 # ## Create embeddings and save faiss vector store to the path specified in `save_to`
-def load_embeddings(model_name = "thenlper/gte-base"):
-    if model_name == "hkunlp/instructor-large":
-        embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
-        embed_instruction="Represent the paragraph for retrieval: ",
-        query_instruction="Represent the question for retrieving supporting documents: "
-        )
-    else:
-        embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
     global embeddings
@@ -576,7 +591,7 @@ def load_embeddings(model_name = "thenlper/gte-base"):
     return embeddings_func
-def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "thenlper/gte-base"):
     load_embeddings(model_name=model_name)

 import dateutil.parser
 from typing import TypeVar, List
+from langchain.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
 from langchain.vectorstores.faiss import FAISS
 from langchain.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
     return documents
+def write_out_metadata_as_string(metadata_in):
+    # If metadata_in is a single dictionary, wrap it in a list
+    if isinstance(metadata_in, dict):
+        metadata_in = [metadata_in]
+    metadata_string = [f"{'  '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in] # ['metadata']
+    return metadata_string
 def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
     """Converts a DataFrame's content to a list of Documents with metadata."""
             if col != text_column:
                 metadata[col] = value
+        metadata_string = write_out_metadata_as_string(metadata)[0]
         # If chunk_size is provided, split the text into chunks
         if chunk_size:
             # Assuming you have a text splitter function similar to the PDF handling
                 # Other arguments as required by the splitter
             )
             sections = text_splitter.split_text(doc_content)
             # For each section, create a Document object
             for i, section in enumerate(sections):
+                section = '. '.join([metadata_string, section])
                 doc = Document(page_content=section,
                                metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
                 doc_sections.append(doc)
         else:
             # If no chunk_size is provided, create a single Document object for the row
+            doc_content = '. '.join([metadata_string, doc_content])
             doc = Document(page_content=doc_content, metadata=metadata)
             doc_sections.append(doc)
 # ## Create embeddings and save faiss vector store to the path specified in `save_to`
+def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
+    #if model_name == "hkunlp/instructor-large":
+    #    embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
+    #    embed_instruction="Represent the paragraph for retrieval: ",
+    #    query_instruction="Represent the question for retrieving supporting documents: "
+    #    )
+    #else:
+    embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
     global embeddings
     return embeddings_func
+def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "BAAI/bge-base-en-v1.5"):
     load_embeddings(model_name=model_name)