Commit bcadbe0 · 1 parent: 2e5b810 · Add seqlen

app.py CHANGED
@@ -288,6 +288,59 @@ EXTERNAL_MODEL_TO_DIM = {
     "unsup-simcse-bert-base-uncased": 768,
 }
 
+
+EXTERNAL_MODEL_TO_SEQLEN = {
+    "xlm-roberta-large": 514,
+    "use-cmlm-multilingual": 512,
+    "gottbert-base": 512,
+    "cross-en-de-roberta-sentence-transformer": 514,
+    "gbert-base": 512,
+    "gbert-large": 512,
+    "gelectra-base": 512,
+    "gelectra-large": 512,
+    "gottbert-base": 512,
+
+    "LASER2": "N/A",
+    "LaBSE": 512,
+    "all-MiniLM-L12-v2": 512,
+    "all-MiniLM-L6-v2": 512,
+    "all-mpnet-base-v2": 514,
+    "allenai-specter": 512,
+    "bert-base-uncased": 512,
+    "contriever-base-msmarco": 512,
+    "glove.6B.300d": "N/A",
+    "gtr-t5-base": 512,
+    "gtr-t5-large": 512,
+    "gtr-t5-xl": 512,
+    "gtr-t5-xxl": 512,
+    "komninos": "N/A",
+    "msmarco-bert-co-condensor": 512,
+    "paraphrase-multilingual-MiniLM-L12-v2": 512,
+    "paraphrase-multilingual-mpnet-base-v2": 514,
+    "sentence-t5-base": 512,
+    "sentence-t5-large": 512,
+    "sentence-t5-xl": 512,
+    "sentence-t5-xxl": 512,
+    "sup-simcse-bert-base-uncased": 512,
+
+    "text-embedding-ada-002": 8191,
+
+    "text-similarity-ada-001": 2046,
+    "text-similarity-babbage-001": 2046,
+    "text-similarity-curie-001": 2046,
+    "text-similarity-davinci-001": 2046,
+
+    "text-search-ada-doc-001": 2046,
+    "text-search-ada-query-001": 2046,
+    "text-search-ada-001": 2046,
+    "text-search-babbage-001": 2046,
+    "text-search-curie-001": 2046,
+    "text-search-davinci-001": 2046,
+
+    "unsup-simcse-bert-base-uncased": 512,
+}
+
+
 MODELS_TO_SKIP = {
     "baseplate/instructor-large-1", # Duplicate
     "radames/e5-large", # Duplicate
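For illustration — how this lookup table is meant to be read (a minimal sketch, assuming the EXTERNAL_MODEL_TO_SEQLEN dict from the hunk above is in scope; "some-unlisted-model" is a hypothetical name):

# Transformer models map to an integer maximum sequence length.
assert EXTERNAL_MODEL_TO_SEQLEN.get("all-MiniLM-L6-v2", "") == 512
# Non-transformer embeddings (LASER2, GloVe, komninos) carry the string "N/A".
assert EXTERNAL_MODEL_TO_SEQLEN.get("glove.6B.300d", "") == "N/A"
# Models missing from the table fall back to "" — the same default
# get_mteb_data uses below when filling the "Sequence Length" column.
assert EXTERNAL_MODEL_TO_SEQLEN.get("some-unlisted-model", "") == ""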
@@ -341,26 +394,22 @@ for model in EXTERNAL_MODELS:
         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
         EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
 
-def get_dim(model):
+def get_dim_seq(model):
     filenames = [sib.rfilename for sib in model.siblings]
-    dim = ""
+    dim, seq = "", ""
     if "1_Pooling/config.json" in filenames:
         st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
         dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
     elif "2_Pooling/config.json" in filenames:
         st_config_path = hf_hub_download(model.modelId, filename="2_Pooling/config.json")
         dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
-    elif "config.json" in filenames:
+    if "config.json" in filenames:
         config_path = hf_hub_download(model.modelId, filename="config.json")
         config = json.load(open(config_path))
-        if "hidden_dim" in config:
-            dim = config["hidden_dim"]
-        elif "hidden_size" in config:
-            dim = config["hidden_size"]
-        elif "d_model" in config:
-            dim = config["d_model"]
-    return dim
-
+        if not dim:
+            dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
+        seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
+    return dim, seq
 
 def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
     api = HfApi()
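For illustration — how the nested config.get chain in get_dim_seq resolves: each .get supplies the next lookup as its default, so the first key present wins, in the order hidden_dim → hidden_size → d_model for the dimension and n_positions → max_position_embeddings → n_ctx → seq_length for the sequence length. A minimal sketch with hypothetical config dicts:

# Hypothetical config.json contents for two model families.
bert_like = {"hidden_size": 768, "max_position_embeddings": 512}
t5_like = {"d_model": 1024, "n_positions": 512}

def dim_from(config):
    # First present key wins; the inner .get calls are evaluated eagerly,
    # which is harmless here since they are plain dict lookups.
    return config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))

def seq_from(config):
    return config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))

assert (dim_from(bert_like), seq_from(bert_like)) == (768, 512)
assert (dim_from(t5_like), seq_from(t5_like)) == (1024, 512)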
@@ -381,6 +430,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         if len(res) > 1:
             if add_emb_dim:
                 res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
+                res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
             df_list.append(res)
 
     for model in models:
@@ -414,7 +464,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_
         # Model & at least one result
         if len(out) > 1:
             if add_emb_dim:
-                out["Embedding Dimensions"] = get_dim(model)
+                out["Embedding Dimensions"], out["Sequence Length"] = get_dim_seq(model)
             df_list.append(out)
     df = pd.DataFrame(df_list)
     # Put 'Model' column first
@@ -472,7 +522,7 @@ def get_mteb_average():
     DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
     DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
 
-    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
+    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
 
     return DATA_OVERALL
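For illustration — the double-bracket selection above both filters and reorders columns, which is how "Sequence Length" ends up as the fourth leaderboard column. A minimal sketch with made-up values:

import pandas as pd

df = pd.DataFrame({
    "Model": ["m1", "m2"],
    "Rank": [1, 2],
    "Embedding Dimensions": [768, 1024],
    "Sequence Length": [512, 512],
})
# Indexing with a list of names returns a copy with exactly those
# columns, in exactly that order.
df = df[["Rank", "Model", "Embedding Dimensions", "Sequence Length"]]
assert list(df.columns) == ["Rank", "Model", "Embedding Dimensions", "Sequence Length"]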