Spaces:
Running
Running
Sean-Case
committed on
Commit
·
ee77123
1
Parent(s):
a462256
Improved prompting for csv/excel files
Browse files
- app.py +9 -9
- chatfuncs/chatfuncs.py +12 -6
- chatfuncs/ingest.py +25 -10
app.py
CHANGED
|
@@ -8,7 +8,7 @@ import os
|
|
| 8 |
os.system("pip install gradio==3.42.0")
|
| 9 |
|
| 10 |
from typing import TypeVar
|
| 11 |
-
from langchain.embeddings import HuggingFaceEmbeddings
|
| 12 |
from langchain.vectorstores import FAISS
|
| 13 |
import gradio as gr
|
| 14 |
|
|
@@ -29,17 +29,17 @@ import chatfuncs.ingest as ing
|
|
| 29 |
|
| 30 |
embeddings_name = "BAAI/bge-base-en-v1.5"
|
| 31 |
|
| 32 |
-
def load_embeddings(embeddings_name = "
|
| 33 |
|
| 34 |
|
| 35 |
-
if embeddings_name == "hkunlp/instructor-large":
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
|
| 41 |
-
else:
|
| 42 |
-
|
| 43 |
|
| 44 |
global embeddings
|
| 45 |
|
|
|
|
| 8 |
os.system("pip install gradio==3.42.0")
|
| 9 |
|
| 10 |
from typing import TypeVar
|
| 11 |
+
from langchain.embeddings import HuggingFaceEmbeddings#, HuggingFaceInstructEmbeddings
|
| 12 |
from langchain.vectorstores import FAISS
|
| 13 |
import gradio as gr
|
| 14 |
|
|
|
|
| 29 |
|
| 30 |
embeddings_name = "BAAI/bge-base-en-v1.5"
|
| 31 |
|
| 32 |
+
def load_embeddings(embeddings_name = "BAAI/bge-base-en-v1.5"):
|
| 33 |
|
| 34 |
|
| 35 |
+
#if embeddings_name == "hkunlp/instructor-large":
|
| 36 |
+
# embeddings_func = HuggingFaceInstructEmbeddings(model_name=embeddings_name,
|
| 37 |
+
# embed_instruction="Represent the paragraph for retrieval: ",
|
| 38 |
+
# query_instruction="Represent the question for retrieving supporting documents: "
|
| 39 |
+
# )
|
| 40 |
|
| 41 |
+
#else:
|
| 42 |
+
embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)
|
| 43 |
|
| 44 |
global embeddings
|
| 45 |
|
chatfuncs/chatfuncs.py
CHANGED
|
@@ -308,6 +308,10 @@ QUESTION: {question}
|
|
| 308 |
|
| 309 |
return INSTRUCTION_PROMPT, CONTENT_PROMPT
|
| 310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content_prompt, extracted_memory, vectorstore, embeddings): # ,
|
| 312 |
|
| 313 |
question = inputs["question"]
|
|
@@ -317,7 +321,7 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
|
|
| 317 |
new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
|
| 318 |
|
| 319 |
|
| 320 |
-
docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val =
|
| 321 |
vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
|
| 322 |
#vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
|
| 323 |
|
|
@@ -333,12 +337,14 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
|
|
| 333 |
|
| 334 |
|
| 335 |
# Build up sources content to add to user display
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
#modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
|
| 341 |
-
modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
|
| 342 |
docs_content_string = '<br><br>'.join(modified_page_content)
|
| 343 |
|
| 344 |
sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()
|
|
|
|
| 308 |
|
| 309 |
return INSTRUCTION_PROMPT, CONTENT_PROMPT
|
| 310 |
|
| 311 |
+
def write_out_metadata_as_string(metadata_in):
    """Render metadata dictionaries as flat ``key: value`` strings.

    Args:
        metadata_in: Either a single metadata dict or an iterable of
            metadata dicts.

    Returns:
        A list with one string per dict. Each string holds the dict's
        entries as space-separated ``key: value`` pairs, with the
        ``page_section`` key left out.
    """
    # A lone dict would otherwise be iterated key by key and then fail on
    # ``.items()``; wrap it so it is handled as a single record.
    if isinstance(metadata_in, dict):
        metadata_in = [metadata_in]

    return [
        " ".join(f"{k}: {v}" for k, v in d.items() if k != "page_section")
        for d in metadata_in
    ]
|
| 314 |
+
|
| 315 |
def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content_prompt, extracted_memory, vectorstore, embeddings): # ,
|
| 316 |
|
| 317 |
question = inputs["question"]
|
|
|
|
| 321 |
new_question_kworded = adapt_q_from_chat_history(question, chat_history, extracted_memory) # new_question_keywords,
|
| 322 |
|
| 323 |
|
| 324 |
+
docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 25, out_passages = 2,
|
| 325 |
vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
|
| 326 |
#vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
|
| 327 |
|
|
|
|
| 337 |
|
| 338 |
|
| 339 |
# Build up sources content to add to user display
|
| 340 |
+
doc_df['meta_clean'] = write_out_metadata_as_string(doc_df["metadata"]) # [f"<b>{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
|
| 341 |
+
|
| 342 |
+
# Remove meta text from the page content if it already exists there
|
| 343 |
+
doc_df['page_content_no_meta'] = doc_df.apply(lambda row: row['page_content'].replace(row['meta_clean'] + ". ", ""), axis=1)
|
| 344 |
+
doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content_no_meta'].astype(str)
|
| 345 |
|
| 346 |
+
#modified_page_content = [f" Document {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
|
| 347 |
+
modified_page_content = [f" Document {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
|
|
|
|
|
|
|
|
|
|
| 348 |
docs_content_string = '<br><br>'.join(modified_page_content)
|
| 349 |
|
| 350 |
sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()
|
chatfuncs/ingest.py
CHANGED
|
@@ -25,7 +25,7 @@ import pandas as pd
|
|
| 25 |
import dateutil.parser
|
| 26 |
from typing import TypeVar, List
|
| 27 |
|
| 28 |
-
from langchain.embeddings import HuggingFaceInstructEmbeddings,
|
| 29 |
from langchain.vectorstores.faiss import FAISS
|
| 30 |
from langchain.vectorstores import Chroma
|
| 31 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
@@ -462,6 +462,14 @@ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
|
|
| 462 |
|
| 463 |
return documents
|
| 464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
|
| 466 |
"""Converts a DataFrame's content to a list of Documents with metadata."""
|
| 467 |
|
|
@@ -479,6 +487,10 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
|
|
| 479 |
if col != text_column:
|
| 480 |
metadata[col] = value
|
| 481 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
# If chunk_size is provided, split the text into chunks
|
| 483 |
if chunk_size:
|
| 484 |
# Assuming you have a text splitter function similar to the PDF handling
|
|
@@ -487,14 +499,17 @@ def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Docu
|
|
| 487 |
# Other arguments as required by the splitter
|
| 488 |
)
|
| 489 |
sections = text_splitter.split_text(doc_content)
|
|
|
|
| 490 |
|
| 491 |
# For each section, create a Document object
|
| 492 |
for i, section in enumerate(sections):
|
|
|
|
| 493 |
doc = Document(page_content=section,
|
| 494 |
metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
|
| 495 |
doc_sections.append(doc)
|
| 496 |
else:
|
| 497 |
# If no chunk_size is provided, create a single Document object for the row
|
|
|
|
| 498 |
doc = Document(page_content=doc_content, metadata=metadata)
|
| 499 |
doc_sections.append(doc)
|
| 500 |
|
|
@@ -559,16 +574,16 @@ def docs_elements_from_csv_save(docs_path="documents.csv"):
|
|
| 559 |
|
| 560 |
# ## Create embeddings and save faiss vector store to the path specified in `save_to`
|
| 561 |
|
| 562 |
-
def load_embeddings(model_name = "
|
| 563 |
|
| 564 |
-
if model_name == "hkunlp/instructor-large":
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
|
| 570 |
-
else:
|
| 571 |
-
|
| 572 |
|
| 573 |
global embeddings
|
| 574 |
|
|
@@ -576,7 +591,7 @@ def load_embeddings(model_name = "thenlper/gte-base"):
|
|
| 576 |
|
| 577 |
return embeddings_func
|
| 578 |
|
| 579 |
-
def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "
|
| 580 |
|
| 581 |
load_embeddings(model_name=model_name)
|
| 582 |
|
|
|
|
| 25 |
import dateutil.parser
|
| 26 |
from typing import TypeVar, List
|
| 27 |
|
| 28 |
+
from langchain.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
|
| 29 |
from langchain.vectorstores.faiss import FAISS
|
| 30 |
from langchain.vectorstores import Chroma
|
| 31 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 462 |
|
| 463 |
return documents
|
| 464 |
|
| 465 |
+
def write_out_metadata_as_string(metadata_in, exclude_keys=("page_section",)):
    """Render metadata dictionaries as flat ``key: value`` strings.

    Args:
        metadata_in: Either a single metadata dict or an iterable of
            metadata dicts.
        exclude_keys: Collection of keys to leave out of the rendered
            string. Defaults to ``("page_section",)``, which reproduces
            the previous hard-coded behaviour.

    Returns:
        A list with one string per dict. Each string holds the dict's
        remaining entries as space-separated ``key: value`` pairs.
    """
    # If metadata_in is a single dictionary, wrap it in a list so that
    # callers always get a list back (and can index ``[0]`` for one row).
    if isinstance(metadata_in, dict):
        metadata_in = [metadata_in]

    return [
        " ".join(f"{k}: {v}" for k, v in d.items() if k not in exclude_keys)
        for d in metadata_in
    ]
|
| 472 |
+
|
| 473 |
def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
|
| 474 |
"""Converts a DataFrame's content to a list of Documents with metadata."""
|
| 475 |
|
|
|
|
| 487 |
if col != text_column:
|
| 488 |
metadata[col] = value
|
| 489 |
|
| 490 |
+
metadata_string = write_out_metadata_as_string(metadata)[0]
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
|
| 494 |
# If chunk_size is provided, split the text into chunks
|
| 495 |
if chunk_size:
|
| 496 |
# Assuming you have a text splitter function similar to the PDF handling
|
|
|
|
| 499 |
# Other arguments as required by the splitter
|
| 500 |
)
|
| 501 |
sections = text_splitter.split_text(doc_content)
|
| 502 |
+
|
| 503 |
|
| 504 |
# For each section, create a Document object
|
| 505 |
for i, section in enumerate(sections):
|
| 506 |
+
section = '. '.join([metadata_string, section])
|
| 507 |
doc = Document(page_content=section,
|
| 508 |
metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
|
| 509 |
doc_sections.append(doc)
|
| 510 |
else:
|
| 511 |
# If no chunk_size is provided, create a single Document object for the row
|
| 512 |
+
doc_content = '. '.join([metadata_string, doc_content])
|
| 513 |
doc = Document(page_content=doc_content, metadata=metadata)
|
| 514 |
doc_sections.append(doc)
|
| 515 |
|
|
|
|
| 574 |
|
| 575 |
# ## Create embeddings and save faiss vector store to the path specified in `save_to`
|
| 576 |
|
| 577 |
+
def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
|
| 578 |
|
| 579 |
+
#if model_name == "hkunlp/instructor-large":
|
| 580 |
+
# embeddings_func = HuggingFaceInstructEmbeddings(model_name=model_name,
|
| 581 |
+
# embed_instruction="Represent the paragraph for retrieval: ",
|
| 582 |
+
# query_instruction="Represent the question for retrieving supporting documents: "
|
| 583 |
+
# )
|
| 584 |
|
| 585 |
+
#else:
|
| 586 |
+
embeddings_func = HuggingFaceEmbeddings(model_name=model_name)
|
| 587 |
|
| 588 |
global embeddings
|
| 589 |
|
|
|
|
| 591 |
|
| 592 |
return embeddings_func
|
| 593 |
|
| 594 |
+
def embed_faiss_save_to_zip(docs_out, save_to="faiss_lambeth_census_embedding", model_name = "BAAI/bge-base-en-v1.5"):
|
| 595 |
|
| 596 |
load_embeddings(model_name=model_name)
|
| 597 |
|