mvectors
semantic_search/all_search_execute.py  CHANGED

@@ -217,7 +217,6 @@ def handler(input_,session_id):
     if(st.session_state.input_mvector_rerank):
         query_vector = cb.vectorise(query,False)
         vector_field = "description_vector"
-        print("-------------COLBERT-----1-------------------------------------------------")
         vector_payload = {"knn": {}}
         vector_payload["knn"][vector_field]= {
             "vector":query_vector,
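The only change in all_search_execute.py is dropping a stray debug print; the surrounding lines build a k-NN query body around the sentence-level vector returned by cb.vectorise(query,False). As a rough illustration of where that payload ends up, the sketch below completes a typical OpenSearch k-NN clause; the k/size values, index name, and opensearch-py client call are assumptions for illustration, not code taken from this Space.

# Illustrative sketch only: a typical OpenSearch k-NN query body built around a
# query vector. The field name matches the diff; k, size, index name, and the
# client call are assumed.
def build_knn_payload(query_vector, vector_field="description_vector", k=10):
    vector_payload = {"knn": {}}
    vector_payload["knn"][vector_field] = {"vector": query_vector, "k": k}
    return {"size": k, "query": vector_payload}

# Example usage with opensearch-py (connection details omitted):
# from opensearchpy import OpenSearch
# client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}])
# hits = client.search(index="products", body=build_knn_payload(query_vector))["hits"]["hits"]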
utilities/mvectors.py  CHANGED

@@ -8,7 +8,7 @@ import json
 
 runtime = boto3.client('sagemaker-runtime',aws_access_key_id=st.secrets['user_access_key'],
                        aws_secret_access_key=st.secrets['user_secret_key'],region_name='us-east-1')
-# Load
+# Load Tokenizer from HuggingFace Hub
 tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
 endpoint_name = 'huggingface-pytorch-inference-2025-05-21-16-31-07-967'
 
@@ -20,7 +20,6 @@ def mean_pooling(token_embeddings, attention_mask):
 
 
 def vectorise(sentence,token_level_vectors):
-    print("-------------colbert ---- 2-----------")
     encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
     # Get input IDs (token IDs)
     input_ids = encoded_input['input_ids'][0]
@@ -48,7 +47,6 @@ def vectorise(sentence,token_level_vectors):
     return sentence_embeddings[0].tolist()
 
 def search(hits):
-    print("-------------COLBERT------------4------------------------------------------")
     tokens,token_vectors = vectorise(st.session_state.input_text,True)
     final_docs = []
     for ind,j in enumerate(hits):
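These first hunks only remove debug prints and flesh out the comment above the tokenizer. For reference, vectorise(sentence, token_level_vectors) tokenises the input with the all-MiniLM-L6-v2 tokenizer and, when token_level_vectors is False, returns a single mean-pooled sentence embedding via the mean_pooling helper named in the hunk header. Below is a minimal, self-contained sketch of that recipe; running the model locally is an assumption, whereas this Space sends the tokenised input to the SageMaker endpoint configured above.

# Hedged sketch of the sentence-level path: mean pooling over token embeddings,
# following the standard sentence-transformers recipe for all-MiniLM-L6-v2.
# The local forward pass here is an assumption; the Space calls a SageMaker
# endpoint instead.
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def mean_pooling(token_embeddings, attention_mask):
    # Average the token embeddings, ignoring padding positions.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

def sentence_vector(sentence):
    encoded = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        token_embeddings = model(**encoded).last_hidden_state
    return mean_pooling(token_embeddings, encoded['attention_mask'])[0].tolist()

The remaining hunks in this file clean up the token-level scoring loop in search(hits):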
@@ -64,7 +62,6 @@ def search(hits):
             doc["_source"]["gender_affinity"] = j["_source"]["gender_affinity"]
         else:
             doc["_source"]["gender_affinity"] = ""
-        #print(j["_source"]["title"])
         source_doc_token_keys = list(j["_source"].keys())
         with_s = [x for x in source_doc_token_keys if x.startswith("description-token-")]
         add_score = 0
@@ -79,26 +76,22 @@
             for m in with_s:
                 m_arr = m.split("-")
                 if(m_arr[-1]!='[SEP]' and m_arr[-1]!='[CLS]'):
-                    #print("document token: "+m_arr[3])
                     doc_token_vector = np.array(j["_source"][m])
                     score = np.dot(query_token_vector,doc_token_vector)
                     scores.append({"doc_token":m_arr[3],"score":score})
-
-
+
             newlist = sorted(scores, key=lambda d: d['score'], reverse=True)
             max_score = newlist[0]['score']
             add_score+=max_score
             max_score_dict_list.append(newlist[0])
-
+
         max_score_dict_list_sorted = sorted(max_score_dict_list, key=lambda d: d['score'], reverse=True)
         print(max_score_dict_list_sorted)
-
+
         doc["total_score"] = add_score
         doc['max_score_dict_list_sorted'] = max_score_dict_list_sorted
         final_docs.append(doc)
     final_docs_sorted = sorted(final_docs, key=lambda d: d['total_score'], reverse=True)
-    print("-------------COLBERT-----final--------")
-    print(final_docs_sorted)
     return final_docs_sorted
 
 
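Taken together, search(hits) is a ColBERT-style MaxSim rerank: each query token vector from vectorise(..., True) is dotted against every stored description-token-* vector of a hit ([CLS] and [SEP] excluded), the best match per query token is kept, and the sum of those maxima becomes the document's total_score used for the final sort. A compact sketch of that scoring, with illustrative names and plain lists of vectors standing in for the OpenSearch _source fields:

# Hedged sketch of the MaxSim scoring performed by search(hits). Names and the
# docs structure are illustrative; the real code pulls token vectors out of each
# hit's _source and skips [CLS]/[SEP] tokens.
import numpy as np

def maxsim_score(query_token_vectors, doc_token_vectors):
    # For each query token, keep its best dot-product match among the document's
    # token vectors, then sum those per-token maxima.
    q = np.asarray(query_token_vectors)   # shape: (num_query_tokens, dim)
    d = np.asarray(doc_token_vectors)     # shape: (num_doc_tokens, dim)
    return float(np.max(q @ d.T, axis=1).sum())

def rerank(query_token_vectors, docs):
    # docs: iterable of (doc_id, doc_token_vectors); highest MaxSim score first.
    scored = [(doc_id, maxsim_score(query_token_vectors, vecs)) for doc_id, vecs in docs]
    return sorted(scored, key=lambda x: x[1], reverse=True)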