Spaces:
Sleeping
Sleeping
Commit
·
8c2f0ba
1
Parent(s):
96db48f
deepnote update
Browse files
faq.py
CHANGED
|
@@ -8,22 +8,22 @@ from langchain.embeddings.base import Embeddings
|
|
| 8 |
from langchain.vectorstores.base import VectorStore
|
| 9 |
import os
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
|
| 17 |
|
| 18 |
def faq_id(sheet_url: str) -> str:
|
| 19 |
-
x = sheet_url.find(
|
| 20 |
-
y = sheet_url.find(
|
| 21 |
-
return sheet_url[x + len(
|
| 22 |
|
| 23 |
|
| 24 |
def xlsx_url(faq_id: str) -> str:
|
| 25 |
y = faq_id.rfind("-")
|
| 26 |
-
return
|
| 27 |
|
| 28 |
|
| 29 |
def read_df(xlsx_url: str) -> pd.DataFrame:
|
|
@@ -39,21 +39,16 @@ def embedding_function(model_name: str) -> HuggingFaceEmbeddings:
|
|
| 39 |
return HuggingFaceEmbeddings(
|
| 40 |
model_name=model_name,
|
| 41 |
encode_kwargs={"normalize_embeddings": True},
|
| 42 |
-
cache_folder=
|
| 43 |
)
|
| 44 |
|
| 45 |
|
| 46 |
def vectordb(
|
| 47 |
-
faq_id: str,
|
| 48 |
-
embedding_function: Embeddings,
|
| 49 |
-
documents: List[Document] = None
|
| 50 |
) -> VectorStore:
|
| 51 |
vectordb = None
|
| 52 |
if documents is None:
|
| 53 |
-
vectordb = AwaDB(
|
| 54 |
-
embedding=embedding_function,
|
| 55 |
-
log_and_data_dir=dir_vectordb
|
| 56 |
-
)
|
| 57 |
success = vectordb.load_local(table_name=faq_id)
|
| 58 |
if not success:
|
| 59 |
raise Exception("faq_id may not exists")
|
|
@@ -62,11 +57,13 @@ def vectordb(
|
|
| 62 |
documents=documents,
|
| 63 |
embedding=embedding_function,
|
| 64 |
table_name=faq_id,
|
| 65 |
-
log_and_data_dir=
|
| 66 |
)
|
| 67 |
return vectordb
|
| 68 |
|
| 69 |
|
| 70 |
-
def similarity_search(
|
|
|
|
|
|
|
| 71 |
os.environ["TOKENIZERS_PARALLELISM"] = "true"
|
| 72 |
-
return vectordb.similarity_search_with_relevance_scores(query=query, k=k)
|
|
|
|
| 8 |
from langchain.vectorstores.base import VectorStore
|
| 9 |
import os
|
| 10 |
|
| 11 |
+
# Prefix that faq_id() strips from a sheet URL and xlsx_url() puts back.
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
|
| 12 |
+
# Marker faq_id() uses to split the sheet URL into the part before it and the part after it.
SHEET_URL_Y = "/edit#gid="
|
| 13 |
+
# Marker xlsx_url() inserts between the two halves of a faq_id when rebuilding a URL.
SHEET_URL_Y_EXPORT = "/export?gid="
|
| 14 |
+
# Passed as cache_folder to HuggingFaceEmbeddings in embedding_function().
CACHE_FOLDER = ".embedding-model"
|
| 15 |
+
# Passed as log_and_data_dir to AwaDB in vectordb().
VECTORDB_FOLDER = ".vectordb"
|
| 16 |
|
| 17 |
|
| 18 |
def faq_id(sheet_url: str) -> str:
    """Derive a FAQ identifier from a Google Sheets edit URL.

    The identifier is the text found between ``SHEET_URL_X`` and
    ``SHEET_URL_Y`` in ``sheet_url``, followed by ``"-"`` and everything
    that comes after ``SHEET_URL_Y``.

    Args:
        sheet_url: URL containing ``SHEET_URL_X`` followed later by
            ``SHEET_URL_Y``.

    Returns:
        The identifier ``"<part before marker>-<part after marker>"``.

    Raises:
        ValueError: If ``sheet_url`` lacks ``SHEET_URL_X`` or has no
            ``SHEET_URL_Y`` after it.
    """
    x = sheet_url.find(SHEET_URL_X)
    if x == -1:
        # str.find signals "not found" with -1; slicing with it would
        # silently return a meaningless identifier.
        raise ValueError(f"sheet_url does not contain {SHEET_URL_X!r}: {sheet_url!r}")

    start = x + len(SHEET_URL_X)
    # Look for the marker only after the prefix so the two slices cannot overlap.
    y = sheet_url.find(SHEET_URL_Y, start)
    if y == -1:
        raise ValueError(f"sheet_url does not contain {SHEET_URL_Y!r}: {sheet_url!r}")

    return sheet_url[start:y] + "-" + sheet_url[y + len(SHEET_URL_Y) :]
|
| 22 |
|
| 23 |
|
| 24 |
def xlsx_url(faq_id: str) -> str:
    """Rebuild an export URL from an identifier produced by ``faq_id``.

    The identifier is split at its last ``"-"``; the left part is placed
    after ``SHEET_URL_X`` and the right part after ``SHEET_URL_Y_EXPORT``.

    Args:
        faq_id: Identifier of the form ``"<left>-<right>"``.

    Returns:
        ``SHEET_URL_X + <left> + SHEET_URL_Y_EXPORT + <right>``.

    Raises:
        ValueError: If ``faq_id`` contains no ``"-"``.
    """
    # rpartition splits at the LAST hyphen, so hyphens inside the left part
    # are preserved, exactly like the rfind-based slicing it replaces.
    left, sep, right = faq_id.rpartition("-")
    if not sep:
        # Without a separator the old slicing produced a malformed URL.
        raise ValueError(f"faq_id does not contain '-': {faq_id!r}")
    return SHEET_URL_X + left + SHEET_URL_Y_EXPORT + right
|
| 27 |
|
| 28 |
|
| 29 |
def read_df(xlsx_url: str) -> pd.DataFrame:
|
|
|
|
| 39 |
return HuggingFaceEmbeddings(
|
| 40 |
model_name=model_name,
|
| 41 |
encode_kwargs={"normalize_embeddings": True},
|
| 42 |
+
cache_folder=CACHE_FOLDER,
|
| 43 |
)
|
| 44 |
|
| 45 |
|
| 46 |
def vectordb(
|
| 47 |
+
faq_id: str, embedding_function: Embeddings, documents: List[Document] = None
|
|
|
|
|
|
|
| 48 |
) -> VectorStore:
|
| 49 |
vectordb = None
|
| 50 |
if documents is None:
|
| 51 |
+
vectordb = AwaDB(embedding=embedding_function, log_and_data_dir=VECTORDB_FOLDER)
|
|
|
|
|
|
|
|
|
|
| 52 |
success = vectordb.load_local(table_name=faq_id)
|
| 53 |
if not success:
|
| 54 |
raise Exception("faq_id may not exists")
|
|
|
|
| 57 |
documents=documents,
|
| 58 |
embedding=embedding_function,
|
| 59 |
table_name=faq_id,
|
| 60 |
+
log_and_data_dir=VECTORDB_FOLDER,
|
| 61 |
)
|
| 62 |
return vectordb
|
| 63 |
|
| 64 |
|
| 65 |
+
def similarity_search(vectordb: VectorStore, query: str, k: int) -> List[Tuple[Document, float]]:
    """Query ``vectordb`` for the ``k`` best matches to ``query``.

    Sets the ``TOKENIZERS_PARALLELISM`` environment variable to ``"true"``
    for the current process before running the search.

    Args:
        vectordb: Store to search.
        query: Text to search for.
        k: Number of results requested from the store.

    Returns:
        The result of ``vectordb.similarity_search_with_relevance_scores``,
        annotated as a list of ``(Document, float)`` pairs.
    """
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    scored_documents = vectordb.similarity_search_with_relevance_scores(
        query=query,
        k=k,
    )
    return scored_documents
|