search by company name
Browse files- app.py +4 -0
- json/semantic_search_params.json +2 -0
- src/semantic_search.py +20 -6
app.py
CHANGED
|
@@ -115,7 +115,9 @@ y_cdf, _ = dh_app.configura_distr_prob(shape, loc, scale, max_dist, precision_cd
|
|
| 115 |
# Parámetros de la búsqueda VSS:
|
| 116 |
k = semantic_search_params["k"]
|
| 117 |
brevity_penalty = semantic_search_params["brevity_penalty"]
|
|
|
|
| 118 |
reward_for_literal = semantic_search_params["reward_for_literal"]
|
|
|
|
| 119 |
partial_match_factor = semantic_search_params["partial_match_factor"]
|
| 120 |
print(f"VSS params: k={k}, brevity_penalty={brevity_penalty}, reward_for_literal={reward_for_literal}, partial_match_factor={partial_match_factor}")
|
| 121 |
|
|
@@ -187,7 +189,9 @@ def search_theme(theme: str, page: int, *filtros_values) -> Tuple[pd.DataFrame,
|
|
| 187 |
query=query,
|
| 188 |
k=k,
|
| 189 |
brevity_penalty=brevity_penalty,
|
|
|
|
| 190 |
reward_for_literal=reward_for_literal,
|
|
|
|
| 191 |
partial_match_factor=partial_match_factor,
|
| 192 |
table_name="vector_table",
|
| 193 |
embedding_column="embeddings"
|
|
|
|
| 115 |
# Parámetros de la búsqueda VSS:
|
| 116 |
k = semantic_search_params["k"]
|
| 117 |
brevity_penalty = semantic_search_params["brevity_penalty"]
|
| 118 |
+
min_length = semantic_search_params["min_length"]
|
| 119 |
reward_for_literal = semantic_search_params["reward_for_literal"]
|
| 120 |
+
first_term_reward = semantic_search_params["first_term_reward"]
|
| 121 |
partial_match_factor = semantic_search_params["partial_match_factor"]
|
| 122 |
print(f"VSS params: k={k}, brevity_penalty={brevity_penalty}, reward_for_literal={reward_for_literal}, partial_match_factor={partial_match_factor}")
|
| 123 |
|
|
|
|
| 189 |
query=query,
|
| 190 |
k=k,
|
| 191 |
brevity_penalty=brevity_penalty,
|
| 192 |
+
min_length = min_length,
|
| 193 |
reward_for_literal=reward_for_literal,
|
| 194 |
+
first_term_reward=first_term_reward,
|
| 195 |
partial_match_factor=partial_match_factor,
|
| 196 |
table_name="vector_table",
|
| 197 |
embedding_column="embeddings"
|
json/semantic_search_params.json
CHANGED
|
@@ -2,7 +2,9 @@
|
|
| 2 |
"semantic_search_params": {
|
| 3 |
"k": 2000,
|
| 4 |
"brevity_penalty": 0.1,
|
|
|
|
| 5 |
"reward_for_literal": 0.03,
|
|
|
|
| 6 |
"partial_match_factor": 0.8
|
| 7 |
}
|
| 8 |
}
|
|
|
|
| 2 |
"semantic_search_params": {
|
| 3 |
"k": 2000,
|
| 4 |
"brevity_penalty": 0.1,
|
| 5 |
+
"min_length": 131,
|
| 6 |
"reward_for_literal": 0.03,
|
| 7 |
+
"first_term_reward": 20,
|
| 8 |
"partial_match_factor": 0.8
|
| 9 |
}
|
| 10 |
}
|
src/semantic_search.py
CHANGED
|
@@ -9,7 +9,9 @@ def duckdb_vss_local(
|
|
| 9 |
query: str,
|
| 10 |
k: int = 1000,
|
| 11 |
brevity_penalty: float = 0.0,
|
|
|
|
| 12 |
reward_for_literal: float = 0.0,
|
|
|
|
| 13 |
partial_match_factor: float = 0.5,
|
| 14 |
table_name: str = "maestro_vector_table",
|
| 15 |
embedding_column: str = "vec",
|
|
@@ -33,10 +35,10 @@ def duckdb_vss_local(
|
|
| 33 |
# Utilizar los parámetros "debug" para mostrar columnas intermedias:
|
| 34 |
if brevity_penalty > 0:
|
| 35 |
result = penalize_short_summaries(result, factor = brevity_penalty, distance_column = 'distance',
|
| 36 |
-
summary_column = 'longBusinessSummary', debug = False)
|
| 37 |
if reward_for_literal > 0:
|
| 38 |
result = reward_literals(result, query, factor = reward_for_literal,
|
| 39 |
-
partial_match_factor= partial_match_factor, distance_column = 'distance',
|
| 40 |
summary_column = 'longBusinessSummary', debug = False)
|
| 41 |
|
| 42 |
return result
|
|
@@ -46,7 +48,8 @@ def penalize_short_summaries(
|
|
| 46 |
factor: float = 0.1,
|
| 47 |
distance_column: str = 'distance',
|
| 48 |
summary_column: str = 'longBusinessSummary',
|
| 49 |
-
debug: bool = True
|
|
|
|
| 50 |
) -> pd.DataFrame:
|
| 51 |
|
| 52 |
result_df = df.copy()
|
|
@@ -59,10 +62,14 @@ def penalize_short_summaries(
|
|
| 59 |
result_df['percent_shorter'] = result_df['summary_length'].apply(
|
| 60 |
lambda x: max(0, (avg_length - x) / avg_length)
|
| 61 |
)
|
|
|
|
| 62 |
result_df['orig_distance'] = result_df[distance_column]
|
| 63 |
-
|
|
|
|
|
|
|
| 64 |
result_df[distance_column] = result_df.apply(
|
| 65 |
-
lambda row:
|
|
|
|
| 66 |
axis=1
|
| 67 |
)
|
| 68 |
|
|
@@ -77,6 +84,7 @@ def reward_literals(
|
|
| 77 |
query: str,
|
| 78 |
factor: float = 0.1,
|
| 79 |
partial_match_factor: float = 0.5,
|
|
|
|
| 80 |
distance_column: str = 'distance',
|
| 81 |
summary_column: str = 'longBusinessSummary',
|
| 82 |
debug: bool = True
|
|
@@ -89,6 +97,12 @@ def reward_literals(
|
|
| 89 |
if pd.isna(summary):
|
| 90 |
return 0
|
| 91 |
summary_lower = str(summary).lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
# Cuenta coincidencias exactas (palabras completas)
|
| 94 |
exact_pattern = r'\b' + re.escape(query_lower) + r'\b'
|
|
@@ -108,7 +122,7 @@ def reward_literals(
|
|
| 108 |
partial_count = partial_count - exact_count
|
| 109 |
|
| 110 |
# Penalizamos las coincidencias parciales:
|
| 111 |
-
return exact_count + (partial_count * partial_match_factor)
|
| 112 |
|
| 113 |
result_df['term_occurrences'] = result_df[summary_column].apply(count_phrase_occurrences)
|
| 114 |
result_df['orig_distance'] = result_df[distance_column]
|
|
|
|
| 9 |
query: str,
|
| 10 |
k: int = 1000,
|
| 11 |
brevity_penalty: float = 0.0,
|
| 12 |
+
min_length: int = 131,
|
| 13 |
reward_for_literal: float = 0.0,
|
| 14 |
+
first_term_reward: float = 20.0,
|
| 15 |
partial_match_factor: float = 0.5,
|
| 16 |
table_name: str = "maestro_vector_table",
|
| 17 |
embedding_column: str = "vec",
|
|
|
|
| 35 |
# Utilizar los parámetros "debug" para mostrar columnas intermedias:
|
| 36 |
if brevity_penalty > 0:
|
| 37 |
result = penalize_short_summaries(result, factor = brevity_penalty, distance_column = 'distance',
|
| 38 |
+
summary_column = 'longBusinessSummary', min_length=min_length, debug = False)
|
| 39 |
if reward_for_literal > 0:
|
| 40 |
result = reward_literals(result, query, factor = reward_for_literal,
|
| 41 |
+
partial_match_factor= partial_match_factor, first_term_reward=first_term_reward, distance_column = 'distance',
|
| 42 |
summary_column = 'longBusinessSummary', debug = False)
|
| 43 |
|
| 44 |
return result
|
|
|
|
| 48 |
factor: float = 0.1,
|
| 49 |
distance_column: str = 'distance',
|
| 50 |
summary_column: str = 'longBusinessSummary',
|
| 51 |
+
debug: bool = True,
|
| 52 |
+
min_length: int = 131
|
| 53 |
) -> pd.DataFrame:
|
| 54 |
|
| 55 |
result_df = df.copy()
|
|
|
|
| 62 |
result_df['percent_shorter'] = result_df['summary_length'].apply(
|
| 63 |
lambda x: max(0, (avg_length - x) / avg_length)
|
| 64 |
)
|
| 65 |
+
|
| 66 |
result_df['orig_distance'] = result_df[distance_column]
|
| 67 |
+
|
| 68 |
+
# Asignar distancia máxima para resúmenes más cortos que min_length
|
| 69 |
+
# y aplicar penalización proporcional para el resto
|
| 70 |
result_df[distance_column] = result_df.apply(
|
| 71 |
+
lambda row: max_dist if row['summary_length'] < min_length else
|
| 72 |
+
min(max_dist, row[distance_column] + (row['percent_shorter'] * factor)),
|
| 73 |
axis=1
|
| 74 |
)
|
| 75 |
|
|
|
|
| 84 |
query: str,
|
| 85 |
factor: float = 0.1,
|
| 86 |
partial_match_factor: float = 0.5,
|
| 87 |
+
first_term_reward: float = 20.0,
|
| 88 |
distance_column: str = 'distance',
|
| 89 |
summary_column: str = 'longBusinessSummary',
|
| 90 |
debug: bool = True
|
|
|
|
| 97 |
if pd.isna(summary):
|
| 98 |
return 0
|
| 99 |
summary_lower = str(summary).lower()
|
| 100 |
+
# Extraemos la primera palabra del resumen y la limpiamos de caracteres especiales
|
| 101 |
+
# Por ejemplo: "Grifols, S.A. operates as a plasma therapeutic company..." -> Extrae "Grifols", no "Grifols,"
|
| 102 |
+
first_word = summary_lower.split()[0] if summary_lower.strip() and len(summary_lower.split()) > 0 else ""
|
| 103 |
+
first_term = re.sub(r'[^\w\s]', '', first_word.lower())
|
| 104 |
+
# Comprobamos si la primera palabra coincide con la consulta (típicamente el nombre de la empresa)
|
| 105 |
+
_first_term_reward = first_term_reward if first_term == query_lower else 0
|
| 106 |
|
| 107 |
# Cuenta coincidencias exactas (palabras completas)
|
| 108 |
exact_pattern = r'\b' + re.escape(query_lower) + r'\b'
|
|
|
|
| 122 |
partial_count = partial_count - exact_count
|
| 123 |
|
| 124 |
# Penalizamos las coincidencias parciales:
|
| 125 |
+
return _first_term_reward + exact_count + (partial_count * partial_match_factor)
|
| 126 |
|
| 127 |
result_df['term_occurrences'] = result_df[summary_column].apply(count_phrase_occurrences)
|
| 128 |
result_df['orig_distance'] = result_df[distance_column]
|