Spaces:
Sleeping
Sleeping
Commit
·
08d728e
1
Parent(s):
9019d6b
We have changed the embeddings' model that was being used with one
that was trained with texts in Spanish. We have also gone back to
using float numbers for the embeddings in order to recover the
original level of precision.
Browse files
- main_service.py +5 -4
- utils_model.py +14 -3
main_service.py
CHANGED
|
@@ -7,17 +7,18 @@ class GradioAppManager():
|
|
| 7 |
model_factory_obj = ModelFactory()
|
| 8 |
self.model = model_factory_obj.create_model(model_type)
|
| 9 |
|
| 10 |
-
def __retrieve_embeddings__(self, input_queries_df):
|
| 11 |
queries_list = input_queries_df.values
|
| 12 |
-
queries_embeddings_list = self.model.retrieve_embeddings_from_texts_list(queries_list)
|
| 13 |
|
| 14 |
return queries_embeddings_list
|
| 15 |
|
| 16 |
def build(self):
|
| 17 |
gr_input_dataframe = gr.Dataframe(headers=['queries'], datatype=['str'], row_count=2, col_count=(1, 'fixed'))
|
|
|
|
| 18 |
|
| 19 |
app = gr.Interface(fn=self.__retrieve_embeddings__,
|
| 20 |
-
inputs=[gr_input_dataframe],
|
| 21 |
outputs="dataframe")
|
| 22 |
|
| 23 |
return app
|
|
@@ -25,6 +26,6 @@ class GradioAppManager():
|
|
| 25 |
def run(self, app):
|
| 26 |
app.launch(server_name='0.0.0.0')
|
| 27 |
|
| 28 |
-
gradio_app_manager_obj = GradioAppManager('
|
| 29 |
app = gradio_app_manager_obj.build()
|
| 30 |
gradio_app_manager_obj.run(app)
|
|
|
|
| 7 |
model_factory_obj = ModelFactory()
|
| 8 |
self.model = model_factory_obj.create_model(model_type)
|
| 9 |
|
| 10 |
+
def __retrieve_embeddings__(self, input_queries_df, limitnoffeatures):
|
| 11 |
queries_list = input_queries_df.values
|
| 12 |
+
queries_embeddings_list = self.model.retrieve_embeddings_from_texts_list(queries_list, limitnoffeatures=limitnoffeatures)
|
| 13 |
|
| 14 |
return queries_embeddings_list
|
| 15 |
|
| 16 |
def build(self):
|
| 17 |
gr_input_dataframe = gr.Dataframe(headers=['queries'], datatype=['str'], row_count=2, col_count=(1, 'fixed'))
|
| 18 |
+
gr_number_limitnoffeatures = gr.Number(50)
|
| 19 |
|
| 20 |
app = gr.Interface(fn=self.__retrieve_embeddings__,
|
| 21 |
+
inputs=[gr_input_dataframe, gr_number_limitnoffeatures],
|
| 22 |
outputs="dataframe")
|
| 23 |
|
| 24 |
return app
|
|
|
|
| 26 |
def run(self, app):
|
| 27 |
app.launch(server_name='0.0.0.0')
|
| 28 |
|
| 29 |
+
gradio_app_manager_obj = GradioAppManager('multilingual-e5-large-ft-sts-spanish-matryoshka-768-64-5e')
|
| 30 |
app = gradio_app_manager_obj.build()
|
| 31 |
gradio_app_manager_obj.run(app)
|
utils_model.py
CHANGED
|
@@ -16,6 +16,9 @@ class ModelFactory():
|
|
| 16 |
if (model_type=='sentence_similarity_spanish'):
|
| 17 |
model = SentenceSimilaritySpanishModel()
|
| 18 |
|
|
|
|
|
|
|
|
|
|
| 19 |
return model
|
| 20 |
|
| 21 |
class BaseModel():
|
|
@@ -25,12 +28,11 @@ class BaseModel():
|
|
| 25 |
|
| 26 |
def retrieve_embeddings_from_single_input_text(self, input_text):
|
| 27 |
embeddings = self.model.encode(input_text, batch_size=32)
|
| 28 |
-
embeddings
|
| 29 |
-
embeddings = embeddings.astype(np.uint8).astype(str).tolist()
|
| 30 |
|
| 31 |
return embeddings
|
| 32 |
|
| 33 |
-
def retrieve_embeddings_from_texts_list(self, input_texts_list):
|
| 34 |
all_embeddings_list = []
|
| 35 |
for current_input_text_aux in input_texts_list:
|
| 36 |
embeddings = self.retrieve_embeddings_from_single_input_text(current_input_text_aux)
|
|
@@ -41,6 +43,10 @@ class BaseModel():
|
|
| 41 |
columns_list = ['text'] + [f'feature_{idx}' for idx in range(0, nof_features)]
|
| 42 |
queries_embeddings_df.columns = columns_list
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
return queries_embeddings_df
|
| 45 |
|
| 46 |
class MiniLM_L6_v2_Model(BaseModel):
|
|
@@ -52,3 +58,8 @@ class SentenceSimilaritySpanishModel(BaseModel):
|
|
| 52 |
|
| 53 |
def __init__(self):
|
| 54 |
self.model = SentenceTransformer('hiiamsid/sentence_similarity_spanish_es')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
if (model_type=='sentence_similarity_spanish'):
|
| 17 |
model = SentenceSimilaritySpanishModel()
|
| 18 |
|
| 19 |
+
if (model_type=='multilingual-e5-large-ft-sts-spanish-matryoshka-768-64-5e'):
|
| 20 |
+
model = Multilingual_E5_Large_Ft_Sts_Spanish_Matryoshka()
|
| 21 |
+
|
| 22 |
return model
|
| 23 |
|
| 24 |
class BaseModel():
|
|
|
|
| 28 |
|
| 29 |
def retrieve_embeddings_from_single_input_text(self, input_text):
|
| 30 |
embeddings = self.model.encode(input_text, batch_size=32)
|
| 31 |
+
embeddings = embeddings.astype(np.float16).astype(str).tolist()
|
|
|
|
| 32 |
|
| 33 |
return embeddings
|
| 34 |
|
| 35 |
+
def retrieve_embeddings_from_texts_list(self, input_texts_list, limitnoffeatures=-1):
|
| 36 |
all_embeddings_list = []
|
| 37 |
for current_input_text_aux in input_texts_list:
|
| 38 |
embeddings = self.retrieve_embeddings_from_single_input_text(current_input_text_aux)
|
|
|
|
| 43 |
columns_list = ['text'] + [f'feature_{idx}' for idx in range(0, nof_features)]
|
| 44 |
queries_embeddings_df.columns = columns_list
|
| 45 |
|
| 46 |
+
if (limitnoffeatures>-1):
|
| 47 |
+
columns_to_choose = queries_embeddings_df.columns[0:limitnoffeatures+1]
|
| 48 |
+
queries_embeddings_df = queries_embeddings_df[columns_to_choose]
|
| 49 |
+
|
| 50 |
return queries_embeddings_df
|
| 51 |
|
| 52 |
class MiniLM_L6_v2_Model(BaseModel):
|
|
|
|
| 58 |
|
| 59 |
def __init__(self):
|
| 60 |
self.model = SentenceTransformer('hiiamsid/sentence_similarity_spanish_es')
|
| 61 |
+
|
| 62 |
+
class Multilingual_E5_Large_Ft_Sts_Spanish_Matryoshka(BaseModel):
|
| 63 |
+
|
| 64 |
+
def __init__(self):
|
| 65 |
+
self.model = SentenceTransformer('mrm8488/multilingual-e5-large-ft-sts-spanish-matryoshka-768-64-5e')
|