DanielIglesias97 committed on
Commit
08d728e
·
1 Parent(s): 9019d6b

We have replaced the embedding model that was being used with one

Browse files

that was trained on Spanish texts. We have also gone back to
using floating-point numbers for the embeddings in order to recover the
original level of precision.

Files changed (2) hide show
  1. main_service.py +5 -4
  2. utils_model.py +14 -3
main_service.py CHANGED
@@ -7,17 +7,18 @@ class GradioAppManager():
7
  model_factory_obj = ModelFactory()
8
  self.model = model_factory_obj.create_model(model_type)
9
 
10
- def __retrieve_embeddings__(self, input_queries_df):
11
  queries_list = input_queries_df.values
12
- queries_embeddings_list = self.model.retrieve_embeddings_from_texts_list(queries_list)
13
 
14
  return queries_embeddings_list
15
 
16
  def build(self):
17
  gr_input_dataframe = gr.Dataframe(headers=['queries'], datatype=['str'], row_count=2, col_count=(1, 'fixed'))
 
18
 
19
  app = gr.Interface(fn=self.__retrieve_embeddings__,
20
- inputs=[gr_input_dataframe],
21
  outputs="dataframe")
22
 
23
  return app
@@ -25,6 +26,6 @@ class GradioAppManager():
25
  def run(self, app):
26
  app.launch(server_name='0.0.0.0')
27
 
28
- gradio_app_manager_obj = GradioAppManager('sentence_similarity_spanish')
29
  app = gradio_app_manager_obj.build()
30
  gradio_app_manager_obj.run(app)
 
7
  model_factory_obj = ModelFactory()
8
  self.model = model_factory_obj.create_model(model_type)
9
 
10
+ def __retrieve_embeddings__(self, input_queries_df, limitnoffeatures):
11
  queries_list = input_queries_df.values
12
+ queries_embeddings_list = self.model.retrieve_embeddings_from_texts_list(queries_list, limitnoffeatures=limitnoffeatures)
13
 
14
  return queries_embeddings_list
15
 
16
  def build(self):
17
  gr_input_dataframe = gr.Dataframe(headers=['queries'], datatype=['str'], row_count=2, col_count=(1, 'fixed'))
18
+ gr_number_limitnoffeatures = gr.Number(50)
19
 
20
  app = gr.Interface(fn=self.__retrieve_embeddings__,
21
+ inputs=[gr_input_dataframe, gr_number_limitnoffeatures],
22
  outputs="dataframe")
23
 
24
  return app
 
26
  def run(self, app):
27
  app.launch(server_name='0.0.0.0')
28
 
29
+ gradio_app_manager_obj = GradioAppManager('multilingual-e5-large-ft-sts-spanish-matryoshka-768-64-5e')
30
  app = gradio_app_manager_obj.build()
31
  gradio_app_manager_obj.run(app)
utils_model.py CHANGED
@@ -16,6 +16,9 @@ class ModelFactory():
16
  if (model_type=='sentence_similarity_spanish'):
17
  model = SentenceSimilaritySpanishModel()
18
 
 
 
 
19
  return model
20
 
21
  class BaseModel():
@@ -25,12 +28,11 @@ class BaseModel():
25
 
26
  def retrieve_embeddings_from_single_input_text(self, input_text):
27
  embeddings = self.model.encode(input_text, batch_size=32)
28
- embeddings *= 255
29
- embeddings = embeddings.astype(np.uint8).astype(str).tolist()
30
 
31
  return embeddings
32
 
33
- def retrieve_embeddings_from_texts_list(self, input_texts_list):
34
  all_embeddings_list = []
35
  for current_input_text_aux in input_texts_list:
36
  embeddings = self.retrieve_embeddings_from_single_input_text(current_input_text_aux)
@@ -41,6 +43,10 @@ class BaseModel():
41
  columns_list = ['text'] + [f'feature_{idx}' for idx in range(0, nof_features)]
42
  queries_embeddings_df.columns = columns_list
43
 
 
 
 
 
44
  return queries_embeddings_df
45
 
46
  class MiniLM_L6_v2_Model(BaseModel):
@@ -52,3 +58,8 @@ class SentenceSimilaritySpanishModel(BaseModel):
52
 
53
  def __init__(self):
54
  self.model = SentenceTransformer('hiiamsid/sentence_similarity_spanish_es')
 
 
 
 
 
 
16
  if (model_type=='sentence_similarity_spanish'):
17
  model = SentenceSimilaritySpanishModel()
18
 
19
+ if (model_type=='multilingual-e5-large-ft-sts-spanish-matryoshka-768-64-5e'):
20
+ model = Multilingual_E5_Large_Ft_Sts_Spanish_Matryoshka()
21
+
22
  return model
23
 
24
  class BaseModel():
 
28
 
29
  def retrieve_embeddings_from_single_input_text(self, input_text):
30
  embeddings = self.model.encode(input_text, batch_size=32)
31
+ embeddings = embeddings.astype(np.float16).astype(str).tolist()
 
32
 
33
  return embeddings
34
 
35
+ def retrieve_embeddings_from_texts_list(self, input_texts_list, limitnoffeatures=-1):
36
  all_embeddings_list = []
37
  for current_input_text_aux in input_texts_list:
38
  embeddings = self.retrieve_embeddings_from_single_input_text(current_input_text_aux)
 
43
  columns_list = ['text'] + [f'feature_{idx}' for idx in range(0, nof_features)]
44
  queries_embeddings_df.columns = columns_list
45
 
46
+ if (limitnoffeatures>-1):
47
+ columns_to_choose = queries_embeddings_df.columns[0:limitnoffeatures+1]
48
+ queries_embeddings_df = queries_embeddings_df[columns_to_choose]
49
+
50
  return queries_embeddings_df
51
 
52
  class MiniLM_L6_v2_Model(BaseModel):
 
58
 
59
  def __init__(self):
60
  self.model = SentenceTransformer('hiiamsid/sentence_similarity_spanish_es')
61
+
62
+ class Multilingual_E5_Large_Ft_Sts_Spanish_Matryoshka(BaseModel):
63
+
64
+ def __init__(self):
65
+ self.model = SentenceTransformer('mrm8488/multilingual-e5-large-ft-sts-spanish-matryoshka-768-64-5e')