Commit 718e159
Parent(s): 6c9d07b

deepnote update
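This commit folds the former AskRequestEx model into AskRequest (k now defaults to 20; reload_collection is a new optional field and id_column becomes optional), threads an explicit collection_id through the vector-store helpers in faq.py and adds delete_vectordb_current_collection, and rewrites the document-to-DataFrame helpers in util.py with type hints.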
app.py
CHANGED
```diff
@@ -5,6 +5,7 @@ import util as util
 import uvicorn
 import gradio as gr
 from typing import List, Optional
+from fastapi.responses import JSONResponse
 
 app = FastAPI()
 
@@ -13,15 +14,9 @@ class AskRequest(BaseModel):
     question: str
     sheet_url: str
     page_content_column: str
-    k: int
-
-
-class AskRequestEx(BaseModel):
-    question: str
-    sheet_url: str
-    page_content_column: str
-    k: int
-    id_column: str
+    k: int = 20
+    reload_collection: Optional[bool] = None
+    id_column: Optional[str] = None
     synonyms: Optional[List[List[str]]] = None
 
 
@@ -33,15 +28,17 @@ async def ask_api(request: AskRequest):
 
 
 @app.post("/api/v2/ask")
-async def ask_api(request: AskRequestEx):
-    …
+async def ask_api(request: AskRequest):
+    if request.id_column is not None:
+        util.SPLIT_PAGE_BREAKS = True
     if request.synonyms is not None:
         util.SYNONYMS = request.synonyms
     vectordb = faq.load_vectordb(request.sheet_url, request.page_content_column)
     documents = faq.similarity_search(vectordb, request.question, k=request.k)
     df_doc = util.transform_documents_to_dataframe(documents)
-    …
-    …
+    if request.id_column is not None:
+        df_doc = util.remove_duplicates_by_column(df_doc, request.id_column)
+    return JSONResponse(util.dataframe_to_dict(df_doc))
 
 
 @app.delete("/api/v1/")
@@ -52,8 +49,9 @@ async def delete_vectordb_api():
 def ask(sheet_url: str, page_content_column: str, k: int, question: str):
     util.SPLIT_PAGE_BREAKS = False
     vectordb = faq.load_vectordb(sheet_url, page_content_column)
-    …
-    …
+    documents = faq.similarity_search(vectordb, question, k=k)
+    df_doc = util.transform_documents_to_dataframe(documents)
+    return util.dataframe_to_dict(df_doc)
 
 
 def delete_vectordb():
@@ -63,7 +61,7 @@ def delete_vectordb():
 with gr.Blocks() as block:
     sheet_url = gr.Textbox(label="Google Sheet URL")
     page_content_column = gr.Textbox(label="Question Column")
-    k = gr.Slider(…)
+    k = gr.Slider(1, 30, step=1, label="K")
     question = gr.Textbox(label="Question")
     ask_button = gr.Button("Ask")
     answer_output = gr.JSON(label="Answer")
```
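For reference, a minimal sketch of calling the merged v2 endpoint after this change; the host, port, sheet URL, and column names below are placeholders, not part of the commit:

```python
import requests

# Hypothetical request against a local uvicorn instance of this app.
payload = {
    "question": "How do I reset my password?",
    "sheet_url": "https://docs.google.com/spreadsheets/d/<sheet-id>/edit#gid=0",
    "page_content_column": "Question",   # placeholder column name
    "k": 10,                             # optional now; defaults to 20
    "id_column": "ID",                   # optional; enables page-break splitting and dedup
    "synonyms": [["reset", "recover"]],  # optional; sets util.SYNONYMS
}

response = requests.post("http://localhost:8000/api/v2/ask", json=payload)
print(response.json())  # list of {metadata..., document_score, page_content} records
```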
faq.py
CHANGED
```diff
@@ -32,7 +32,7 @@ def define_embedding_function(model_name: str) -> HuggingFaceEmbeddings:
 
 
 def get_vectordb(
-    …
+    collection_id: str,
     embedding_function: Embeddings,
     documents: List[Document] = None,
     vectordb_type: str = VECTORDB_TYPE,
@@ -44,31 +44,32 @@
             vectordb = AwaDB(
                 embedding=embedding_function, log_and_data_dir=VECTORDB_FOLDER
             )
-            if not vectordb.load_local(table_name=…):
-                raise Exception("…
+            if not vectordb.load_local(table_name=collection_id):
+                raise Exception("collection_id may not exists")
         else:
             vectordb = AwaDB.from_documents(
                 documents=documents,
                 embedding=embedding_function,
-                table_name=…,
+                table_name=collection_id,
                 log_and_data_dir=VECTORDB_FOLDER,
             )
     if vectordb_type is VECTORDB_TYPES.Chroma:
         if documents is None:
             vectordb = Chroma(
-                collection_name=…,
+                collection_name=collection_id,
                 embedding_function=embedding_function,
                 persist_directory=VECTORDB_FOLDER,
             )
             if not vectordb.get()["ids"]:
-                raise Exception("…
+                raise Exception("collection_id may not exists")
         else:
             vectordb = Chroma.from_documents(
                 documents=documents,
                 embedding=embedding_function,
-                collection_name=…,
+                collection_name=collection_id,
                 persist_directory=VECTORDB_FOLDER,
             )
+        vectordb.persist()
     return vectordb
 
 
@@ -80,33 +81,33 @@ def similarity_search(
 
 
 def load_vectordb_id(
-    …
+    collection_id: str,
     page_content_column: str,
     embedding_function_name: str = EMBEDDING_MODEL,
 ) -> VectorStore:
     embedding_function = define_embedding_function(embedding_function_name)
     vectordb = None
     try:
-        vectordb = get_vectordb(…)
+        vectordb = get_vectordb(collection_id=collection_id, embedding_function=embedding_function)
     except Exception as e:
         print(e)
-        vectordb = create_vectordb_id(…)
+        vectordb = create_vectordb_id(collection_id, page_content_column, embedding_function)
 
     return vectordb
 
 
 def create_vectordb_id(
-    …
+    collection_id: str,
     page_content_column: str,
     embedding_function: HuggingFaceEmbeddings = None,
 ) -> VectorStore:
     if embedding_function is None:
         embedding_function = define_embedding_function(EMBEDDING_MODEL)
 
-    df = util.read_df(util.xlsx_url(…), page_content_column)
+    df = util.read_df(util.xlsx_url(collection_id), page_content_column)
     documents = create_documents(df, page_content_column)
     vectordb = get_vectordb(
-        …
+        collection_id=collection_id, embedding_function=embedding_function, documents=documents
     )
     return vectordb
 
@@ -115,5 +116,10 @@ def load_vectordb(sheet_url: str, page_content_column: str) -> VectorStore:
     return load_vectordb_id(util.get_id(sheet_url), page_content_column)
 
 
-def delete_vectordb():
+def delete_vectordb() -> None:
     shutil.rmtree(VECTORDB_FOLDER, ignore_errors=True)
+
+
+def delete_vectordb_current_collection(vectordb: VectorStore) -> None:
+    vectordb.delete_collection()
+    vectordb.persist()
```
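A hypothetical usage sketch of the load-or-create flow these hunks introduce (the collection id and column name are placeholders): get_vectordb now raises when the named collection is absent, and load_vectordb_id catches that and rebuilds the collection from the sheet via create_vectordb_id.

```python
from faq import (
    load_vectordb_id,
    delete_vectordb_current_collection,
    delete_vectordb,
)

# Loads the persisted collection if it exists; otherwise builds it
# from the exported sheet (create_vectordb_id) and persists it.
vectordb = load_vectordb_id(
    collection_id="<sheet-id>",      # placeholder; app.py passes util.get_id(sheet_url)
    page_content_column="Question",  # placeholder column name
)

delete_vectordb_current_collection(vectordb)  # drop just this collection
delete_vectordb()                             # or wipe VECTORDB_FOLDER entirely
```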
util.py
CHANGED
```diff
@@ -1,4 +1,5 @@
 import pandas as pd
+from langchain.docstore.document import Document
 
 SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
 SHEET_URL_Y = "/edit#gid="
@@ -27,7 +28,7 @@ def read_df(xlsx_url: str, page_content_column: str) -> pd.DataFrame:
     return df
 
 
-def split_page_breaks(df, column_name):
+def split_page_breaks(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
     split_values = df[column_name].str.split("\n")
 
     new_df = pd.DataFrame({column_name: split_values.explode()})
@@ -46,37 +47,35 @@ def split_page_breaks(df, column_name):
     return new_df
 
 
-def transform_documents_to_dataframe(documents):
-    …
-    …
-        metadata_keys.update(doc.metadata.keys())
+def transform_documents_to_dataframe(documents: Document) -> pd.DataFrame:
+    keys = []
+    values = {"document_score": [], "page_content": []}
 
-    …
-    for doc, _ in documents:
+    for doc, score in documents:
         for key, value in doc.metadata.items():
-            …
+            if key not in keys:
+                keys.append(key)
+                values[key] = []
+            values[key].append(value)
+        values["document_score"].append(score)
+        values["page_content"].append(doc.page_content)
 
-    …
+    return pd.DataFrame(values)
 
-    df = pd.DataFrame(metadata_values)
 
-    …
-    …
-    …
-def remove_duplicates_by_column(df, column):
-    df.drop_duplicates(subset=column, inplace=True)
-    df.reset_index(drop=True, inplace=True)
+def remove_duplicates_by_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+    df.drop_duplicates(subset=column_name, inplace=True, ignore_index=True)
 
     return df
 
 
-def dataframe_to_dict(df):
+def dataframe_to_dict(df: pd.DataFrame) -> dict:
     df_records = df.to_dict(orient="records")
 
     return df_records
 
 
-def duplicate_rows_with_synonyms(df, column, synonyms):
+def duplicate_rows_with_synonyms(df: pd.DataFrame, column: str, synonyms: list[list[str]]) -> pd.DataFrame:
     new_rows = []
     for index, row in df.iterrows():
         new_rows.append(row)
```
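To illustrate the rewritten helpers, a small self-contained sketch with made-up documents and scores: transform_documents_to_dataframe now consumes (Document, score) pairs and widens the frame with whatever metadata keys it encounters, and remove_duplicates_by_column dedupes in a single drop_duplicates call.

```python
from langchain.docstore.document import Document
import util

# Two hits pointing at the same FAQ row (same ID, different scores).
docs = [
    (Document(page_content="Reset it under Settings.", metadata={"ID": 1}), 0.12),
    (Document(page_content="Reset it under Settings.", metadata={"ID": 1}), 0.34),
]

df = util.transform_documents_to_dataframe(docs)  # one row per hit
df = util.remove_duplicates_by_column(df, "ID")   # keeps only the first hit
print(util.dataframe_to_dict(df))
# [{'document_score': 0.12, 'page_content': 'Reset it under Settings.', 'ID': 1}]
```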