# Spaces: Running
# Repository identifiers and the display name of the leaderboard.
# NOTE(review): REPO_ID / RESULTS_REPO look like Hugging Face repo ids —
# confirm against the consuming application.
config:
  REPO_ID: "mteb/leaderboard"
  RESULTS_REPO: mteb/results
  LEADERBOARD_NAME: "MTEB Leaderboard"
# Task types. Every entry carries the same four fields:
#   icon               - emoji shown for the task type
#   metric             - identifier of the main score
#   metric_description - human-readable (Markdown) name of the metric
#   task_description   - one-sentence description of the task type
# The keys of this mapping are the task-type names used under
# `boards.<board>.tasks`.
tasks:
  BitextMining:
    icon: "🎌"
    metric: f1
    metric_description: "[F1](https://huggingface.co/spaces/evaluate-metric/f1)"
    task_description: "Bitext mining is the task of finding parallel sentences in two languages."
  Classification:
    icon: "❤️"
    metric: accuracy
    metric_description: "[Accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy)"
    task_description: "Classification is the task of assigning a label to a text."
  Clustering:
    icon: "✨"
    metric: v_measure
    metric_description: "Validity Measure (V-measure)"
    task_description: "Clustering is the task of grouping similar documents together."
  PairClassification:
    icon: "🎭"
    metric: max_ap
    metric_description: "Average Precision (AP) based on the model's similarity metric (usually cosine)"
    task_description: "Pair classification is the task of determining whether two texts are similar."
  Reranking:
    icon: "🥈"
    metric: map
    metric_description: "Mean Average Precision (MAP)"
    task_description: "Reranking is the task of reordering a list of documents to improve relevance."
  Retrieval:
    icon: "🔎"
    metric: ndcg_at_10
    metric_description: "Normalized Discounted Cumulative Gain @ 10 (nDCG@10)"
    task_description: "Retrieval is the task of finding relevant documents for a query."
  STS:
    icon: "☘️"
    metric: cosine_spearman
    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
    task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
  Summarization:
    icon: "📜"
    metric: cosine_spearman
    metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
    task_description: "Summarization is the task of generating a summary of a text."
  MultilabelClassification:
    icon: "🏷️"
    metric: accuracy
    metric_description: "Accuracy"
    task_description: "Multilabel classification is the task of assigning multiple labels to a text."
  InstructionRetrieval:
    icon: "🔎📋"
    # Quoted: the identifier contains a hyphen and is meant as a plain string.
    metric: "p-MRR"
    metric_description: "paired mean reciprocal rank (p-MRR)"
    task_description: "Retrieval w/Instructions is the task of finding relevant documents for a query that has detailed instructions."
# Leaderboard tabs ("boards"). Every board carries:
#   title, language_long - display strings
#   has_overall          - boolean flag
#   acronym, icon        - optional strings (null when absent)
#   special_icons        - optional per-task-type icon overrides (null when absent)
#   credits              - optional Markdown attribution (null when absent)
#   tasks                - task-type name -> list of dataset names; the
#                          task-type names are keys of the top-level `tasks`
# Some boards additionally set `metric`, `split` and/or `desc`.
# NOTE(review): list order is kept exactly as authored in case the consumer
# uses it for column order — confirm before sorting.
boards:
  en:
    title: English
    language_long: "English"
    has_overall: true
    acronym: null
    icon: null
    special_icons: null
    credits: null
    tasks:
      Classification:
        - AmazonCounterfactualClassification (en)
        - AmazonPolarityClassification
        - AmazonReviewsClassification (en)
        - Banking77Classification
        - EmotionClassification
        - ImdbClassification
        - MassiveIntentClassification (en)
        - MassiveScenarioClassification (en)
        - MTOPDomainClassification (en)
        - MTOPIntentClassification (en)
        - ToxicConversationsClassification
        - TweetSentimentExtractionClassification
      Clustering:
        - ArxivClusteringP2P
        - ArxivClusteringS2S
        - BiorxivClusteringP2P
        - BiorxivClusteringS2S
        - MedrxivClusteringP2P
        - MedrxivClusteringS2S
        - RedditClustering
        - RedditClusteringP2P
        - StackExchangeClustering
        - StackExchangeClusteringP2P
        - TwentyNewsgroupsClustering
      PairClassification:
        - SprintDuplicateQuestions
        - TwitterSemEval2015
        - TwitterURLCorpus
      Reranking:
        - AskUbuntuDupQuestions
        - MindSmallReranking
        - SciDocsRR
        - StackOverflowDupQuestions
      Retrieval:
        - ArguAna
        - ClimateFEVER
        - CQADupstackRetrieval
        - DBPedia
        - FEVER
        - FiQA2018
        - HotpotQA
        - MSMARCO
        - NFCorpus
        - NQ
        - QuoraRetrieval
        - SCIDOCS
        - SciFact
        - Touche2020
        - TRECCOVID
      STS:
        - BIOSSES
        - SICK-R
        - STS12
        - STS13
        - STS14
        - STS15
        - STS16
        - STS17 (en-en)
        - STS22 (en)
        - STSBenchmark
      Summarization:
        - SummEval
  en-x:
    title: "English-X"
    language_long: "117 (Pairs of: English & other language)"
    has_overall: false
    acronym: null
    icon: null
    special_icons: null
    credits: null
    tasks:
      BitextMining:
        - BUCC (de-en)
        - BUCC (fr-en)
        - BUCC (ru-en)
        - BUCC (zh-en)
        - Tatoeba (afr-eng)
        - Tatoeba (amh-eng)
        - Tatoeba (ang-eng)
        - Tatoeba (ara-eng)
        - Tatoeba (arq-eng)
        - Tatoeba (arz-eng)
        - Tatoeba (ast-eng)
        - Tatoeba (awa-eng)
        - Tatoeba (aze-eng)
        - Tatoeba (bel-eng)
        - Tatoeba (ben-eng)
        - Tatoeba (ber-eng)
        - Tatoeba (bos-eng)
        - Tatoeba (bre-eng)
        - Tatoeba (bul-eng)
        - Tatoeba (cat-eng)
        - Tatoeba (cbk-eng)
        - Tatoeba (ceb-eng)
        - Tatoeba (ces-eng)
        - Tatoeba (cha-eng)
        - Tatoeba (cmn-eng)
        - Tatoeba (cor-eng)
        - Tatoeba (csb-eng)
        - Tatoeba (cym-eng)
        - Tatoeba (dan-eng)
        - Tatoeba (deu-eng)
        - Tatoeba (dsb-eng)
        - Tatoeba (dtp-eng)
        - Tatoeba (ell-eng)
        - Tatoeba (epo-eng)
        - Tatoeba (est-eng)
        - Tatoeba (eus-eng)
        - Tatoeba (fao-eng)
        - Tatoeba (fin-eng)
        - Tatoeba (fra-eng)
        - Tatoeba (fry-eng)
        - Tatoeba (gla-eng)
        - Tatoeba (gle-eng)
        - Tatoeba (glg-eng)
        - Tatoeba (gsw-eng)
        - Tatoeba (heb-eng)
        - Tatoeba (hin-eng)
        - Tatoeba (hrv-eng)
        - Tatoeba (hsb-eng)
        - Tatoeba (hun-eng)
        - Tatoeba (hye-eng)
        - Tatoeba (ido-eng)
        - Tatoeba (ile-eng)
        - Tatoeba (ina-eng)
        - Tatoeba (ind-eng)
        - Tatoeba (isl-eng)
        - Tatoeba (ita-eng)
        - Tatoeba (jav-eng)
        - Tatoeba (jpn-eng)
        - Tatoeba (kab-eng)
        - Tatoeba (kat-eng)
        - Tatoeba (kaz-eng)
        - Tatoeba (khm-eng)
        - Tatoeba (kor-eng)
        - Tatoeba (kur-eng)
        - Tatoeba (kzj-eng)
        - Tatoeba (lat-eng)
        - Tatoeba (lfn-eng)
        - Tatoeba (lit-eng)
        - Tatoeba (lvs-eng)
        - Tatoeba (mal-eng)
        - Tatoeba (mar-eng)
        - Tatoeba (max-eng)
        - Tatoeba (mhr-eng)
        - Tatoeba (mkd-eng)
        - Tatoeba (mon-eng)
        - Tatoeba (nds-eng)
        - Tatoeba (nld-eng)
        - Tatoeba (nno-eng)
        - Tatoeba (nob-eng)
        - Tatoeba (nov-eng)
        - Tatoeba (oci-eng)
        - Tatoeba (orv-eng)
        - Tatoeba (pam-eng)
        - Tatoeba (pes-eng)
        - Tatoeba (pms-eng)
        - Tatoeba (pol-eng)
        - Tatoeba (por-eng)
        - Tatoeba (ron-eng)
        - Tatoeba (rus-eng)
        - Tatoeba (slk-eng)
        - Tatoeba (slv-eng)
        - Tatoeba (spa-eng)
        - Tatoeba (sqi-eng)
        - Tatoeba (srp-eng)
        - Tatoeba (swe-eng)
        - Tatoeba (swg-eng)
        - Tatoeba (swh-eng)
        - Tatoeba (tam-eng)
        - Tatoeba (tat-eng)
        - Tatoeba (tel-eng)
        - Tatoeba (tgl-eng)
        - Tatoeba (tha-eng)
        - Tatoeba (tuk-eng)
        - Tatoeba (tur-eng)
        - Tatoeba (tzl-eng)
        - Tatoeba (uig-eng)
        - Tatoeba (ukr-eng)
        - Tatoeba (urd-eng)
        - Tatoeba (uzb-eng)
        - Tatoeba (vie-eng)
        - Tatoeba (war-eng)
        - Tatoeba (wuu-eng)
        - Tatoeba (xho-eng)
        - Tatoeba (yid-eng)
        - Tatoeba (yue-eng)
        - Tatoeba (zsm-eng)
  zh:
    title: Chinese
    language_long: Chinese
    has_overall: true
    acronym: C-MTEB
    icon: "🇨🇳"
    special_icons:
      Classification: "🧡"
    credits: "[FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding)"
    tasks:
      Classification:
        - AmazonReviewsClassification (zh)
        - IFlyTek
        - JDReview
        - MassiveIntentClassification (zh-CN)
        - MassiveScenarioClassification (zh-CN)
        - MultilingualSentiment
        - OnlineShopping
        - TNews
        - Waimai
      Clustering:
        - CLSClusteringP2P
        - CLSClusteringS2S
        - ThuNewsClusteringP2P
        - ThuNewsClusteringS2S
      PairClassification:
        - Cmnli
        - Ocnli
      Reranking:
        - CMedQAv1
        - CMedQAv2
        - MMarcoReranking
        - T2Reranking
      Retrieval:
        - CmedqaRetrieval
        - CovidRetrieval
        - DuRetrieval
        - EcomRetrieval
        - MedicalRetrieval
        - MMarcoRetrieval
        - T2Retrieval
        - VideoRetrieval
      STS:
        - AFQMC
        - ATEC
        - BQ
        - LCQMC
        - PAWSX
        - QBQTC
        - STS22 (zh)
        - STSB
  da:
    title: Danish
    language_long: Danish
    has_overall: false
    acronym: null
    icon: "🇩🇰"
    special_icons:
      Classification: "🤍"
    credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
    tasks:
      BitextMining:
        - BornholmBitextMining
      Classification:
        - AngryTweetsClassification
        - DanishPoliticalCommentsClassification
        - DKHateClassification
        - LccSentimentClassification
        - MassiveIntentClassification (da)
        - MassiveScenarioClassification (da)
        - NordicLangClassification
        - ScalaDaClassification
  fr:
    title: French
    language_long: "French"
    has_overall: true
    acronym: "F-MTEB"
    icon: "🇫🇷"
    special_icons:
      Classification: "💙"
    credits: "[Lyon-NLP](https://github.com/Lyon-NLP): [Gabriel Sequeira](https://github.com/GabrielSequeira), [Imene Kerboua](https://github.com/imenelydiaker), [Wissam Siblini](https://github.com/wissam-sib), [Mathieu Ciancone](https://github.com/MathieuCiancone), [Marion Schaeffer](https://github.com/schmarion)"
    tasks:
      Classification:
        - AmazonReviewsClassification (fr)
        - MasakhaNEWSClassification (fra)
        - MassiveIntentClassification (fr)
        - MassiveScenarioClassification (fr)
        - MTOPDomainClassification (fr)
        - MTOPIntentClassification (fr)
      Clustering:
        - AlloProfClusteringP2P
        - AlloProfClusteringS2S
        - HALClusteringS2S
        - MLSUMClusteringP2P (fr)
        - MLSUMClusteringS2S (fr)
        - MasakhaNEWSClusteringP2P (fra)
        - MasakhaNEWSClusteringS2S (fra)
      PairClassification:
        - OpusparcusPC (fr)
        - PawsXPairClassification (fr)
      Reranking:
        - AlloprofReranking
        - SyntecReranking
      Retrieval:
        - AlloprofRetrieval
        - BSARDRetrieval
        - MintakaRetrieval (fr)
        - SyntecRetrieval
        - XPQARetrieval (fr)
      STS:
        - STS22 (fr)
        - STSBenchmarkMultilingualSTS (fr)
        - SICKFr
      Summarization:
        - SummEvalFr
  # Key must stay quoted: YAML 1.1 loaders read a bare `no` as boolean false.
  'no':
    title: Norwegian
    language_long: "Norwegian Bokmål"
    has_overall: false
    acronym: null
    icon: "🇳🇴"
    special_icons:
      Classification: "💙"
    credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
    tasks:
      Classification:
        - NoRecClassification
        - NordicLangClassification
        - NorwegianParliament
        - MassiveIntentClassification (nb)
        - MassiveScenarioClassification (nb)
        - ScalaNbClassification
  instructions:
    title: English
    language_long: "English"
    has_overall: false
    acronym: null
    icon: null
    # Explicit null so this board exposes the same keys as every other board.
    special_icons: null
    credits: "[Orion Weller, FollowIR](https://arxiv.org/abs/2403.15246)"
    tasks:
      InstructionRetrieval:
        - Robust04InstructionRetrieval
        - News21InstructionRetrieval
        - Core17InstructionRetrieval
  de:
    title: German
    language_long: "German"
    has_overall: false
    acronym: null
    icon: "🇩🇪"
    special_icons: null
    credits: "[Silvan](https://github.com/slvnwhrl), [Sam Heymann](https://github.com/sam-hey)"
    tasks:
      Clustering:
        - BlurbsClusteringP2P
        - BlurbsClusteringS2S
        - TenKGnadClusteringP2P
        - TenKGnadClusteringS2S
      Retrieval:
        - GermanQuAD-Retrieval
        - GermanDPR
        - XMarket (de)
        - GerDaLIR
      STS:
        - GermanSTSBenchmark
        - STS22 (de-en)
      PairClassification:
        - FalseFriendsGermanEnglish
        - PawsXPairClassification (de)
      Reranking:
        - MIRACLReranking (de)
      Classification:
        - AmazonCounterfactualClassification (de)
        - AmazonReviewsClassification (de)
        - MTOPDomainClassification (de)
        - MTOPIntentClassification (de)
        - MassiveIntentClassification (de)
        - MassiveScenarioClassification (de)
  pl:
    title: Polish
    language_long: Polish
    has_overall: true
    acronym: null
    icon: "🇵🇱"
    special_icons:
      Classification: "🤍"
    credits: "[Rafał Poświata](https://github.com/rafalposwiata)"
    tasks:
      Classification:
        - AllegroReviews
        - CBD
        - MassiveIntentClassification (pl)
        - MassiveScenarioClassification (pl)
        - PAC
        - PolEmo2.0-IN
        - PolEmo2.0-OUT
      Clustering:
        - 8TagsClustering
      PairClassification:
        - CDSC-E
        - PPC
        - PSC
        - SICK-E-PL
      Retrieval:
        - ArguAna-PL
        - DBPedia-PL
        - FiQA-PL
        - HotpotQA-PL
        - MSMARCO-PL
        - NFCorpus-PL
        - NQ-PL
        - Quora-PL
        - SCIDOCS-PL
        - SciFact-PL
        - TRECCOVID-PL
      STS:
        - CDSC-R
        - SICK-R-PL
        - STS22 (pl)
  ru:
    title: Russian
    language_long: "Russian"
    has_overall: true
    acronym: null
    icon: "🇷🇺"
    special_icons: null
    credits: "[Roman Solomatin](https://github.com/Samoed) and SaluteDevices: [Alena Fenogenova](https://github.com/Alenush), [Aleksandr Abramov](https://github.com/Ab1992ao), [Artem Snegirev](https://github.com/artemsnegirev), [Anna Maksimova](https://github.com/anpalmak2003), [Maria Tikhonova](https://github.com/MariyaTikhonova)"
    tasks:
      Classification:
        - GeoreviewClassification
        - HeadlineClassification
        - InappropriatenessClassification
        - KinopoiskClassification
        - RuReviewsClassification
        - RuSciBenchGRNTIClassification
        - RuSciBenchOECDClassification
        - MassiveIntentClassification (ru)
        - MassiveScenarioClassification (ru)
      Clustering:
        - GeoreviewClusteringP2P
        - RuSciBenchGRNTIClusteringP2P
        - RuSciBenchOECDClusteringP2P
      PairClassification:
        - TERRa
      Reranking:
        - RuBQReranking
        - MIRACLReranking (ru)
      Retrieval:
        - RiaNewsRetrieval
        - RuBQRetrieval
        - MIRACLRetrieval (ru)
      STS:
        - RUParaPhraserSTS
        - RuSTSBenchmarkSTS
        - STS22 (ru)
      MultilabelClassification:
        - CEDRClassification
        - SensitiveTopicsClassification
  se:
    title: Swedish
    language_long: Swedish
    has_overall: false
    acronym: null
    icon: "🇸🇪"
    special_icons:
      Classification: "💛"
    credits: "[Kenneth Enevoldsen](https://github.com/KennethEnevoldsen), [scandinavian-embedding-benchmark](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/)"
    tasks:
      # Swedish datasets; this list previously duplicated the Norwegian
      # (`no`) board's tasks.
      # NOTE(review): verify the names against the upstream task registry.
      Classification:
        - DalajClassification
        - MassiveIntentClassification (sv)
        - MassiveScenarioClassification (sv)
        - NordicLangClassification
        - ScalaSvClassification
        - SweRecClassification
  other-cls:
    title: "Other Languages"
    language_long: "47 (Only languages not included in the other tabs)"
    has_overall: false
    acronym: null
    icon: null
    special_icons:
      Classification: "💜💚💙"
    credits: null
    tasks:
      Classification:
        - AmazonCounterfactualClassification (de)
        - AmazonCounterfactualClassification (ja)
        - AmazonReviewsClassification (de)
        - AmazonReviewsClassification (es)
        - AmazonReviewsClassification (fr)
        - AmazonReviewsClassification (ja)
        - AmazonReviewsClassification (zh)
        - MTOPDomainClassification (de)
        - MTOPDomainClassification (es)
        - MTOPDomainClassification (fr)
        - MTOPDomainClassification (hi)
        - MTOPDomainClassification (th)
        - MTOPIntentClassification (de)
        - MTOPIntentClassification (es)
        - MTOPIntentClassification (fr)
        - MTOPIntentClassification (hi)
        - MTOPIntentClassification (th)
        - MassiveIntentClassification (af)
        - MassiveIntentClassification (am)
        - MassiveIntentClassification (ar)
        - MassiveIntentClassification (az)
        - MassiveIntentClassification (bn)
        - MassiveIntentClassification (cy)
        - MassiveIntentClassification (de)
        - MassiveIntentClassification (el)
        - MassiveIntentClassification (es)
        - MassiveIntentClassification (fa)
        - MassiveIntentClassification (fi)
        - MassiveIntentClassification (fr)
        - MassiveIntentClassification (he)
        - MassiveIntentClassification (hi)
        - MassiveIntentClassification (hu)
        - MassiveIntentClassification (hy)
        - MassiveIntentClassification (id)
        - MassiveIntentClassification (is)
        - MassiveIntentClassification (it)
        - MassiveIntentClassification (ja)
        - MassiveIntentClassification (jv)
        - MassiveIntentClassification (ka)
        - MassiveIntentClassification (km)
        - MassiveIntentClassification (kn)
        - MassiveIntentClassification (ko)
        - MassiveIntentClassification (lv)
        - MassiveIntentClassification (ml)
        - MassiveIntentClassification (mn)
        - MassiveIntentClassification (ms)
        - MassiveIntentClassification (my)
        - MassiveIntentClassification (nl)
        - MassiveIntentClassification (pt)
        - MassiveIntentClassification (ro)
        - MassiveIntentClassification (ru)
        - MassiveIntentClassification (sl)
        - MassiveIntentClassification (sq)
        - MassiveIntentClassification (sw)
        - MassiveIntentClassification (ta)
        - MassiveIntentClassification (te)
        - MassiveIntentClassification (th)
        - MassiveIntentClassification (tl)
        - MassiveIntentClassification (tr)
        - MassiveIntentClassification (ur)
        - MassiveIntentClassification (vi)
        - MassiveIntentClassification (zh-TW)
        - MassiveScenarioClassification (af)
        - MassiveScenarioClassification (am)
        - MassiveScenarioClassification (ar)
        - MassiveScenarioClassification (az)
        - MassiveScenarioClassification (bn)
        - MassiveScenarioClassification (cy)
        - MassiveScenarioClassification (de)
        - MassiveScenarioClassification (el)
        - MassiveScenarioClassification (es)
        - MassiveScenarioClassification (fa)
        - MassiveScenarioClassification (fi)
        - MassiveScenarioClassification (fr)
        - MassiveScenarioClassification (he)
        - MassiveScenarioClassification (hi)
        - MassiveScenarioClassification (hu)
        - MassiveScenarioClassification (hy)
        - MassiveScenarioClassification (id)
        - MassiveScenarioClassification (is)
        - MassiveScenarioClassification (it)
        - MassiveScenarioClassification (ja)
        - MassiveScenarioClassification (jv)
        - MassiveScenarioClassification (ka)
        - MassiveScenarioClassification (km)
        - MassiveScenarioClassification (kn)
        - MassiveScenarioClassification (ko)
        - MassiveScenarioClassification (lv)
        - MassiveScenarioClassification (ml)
        - MassiveScenarioClassification (mn)
        - MassiveScenarioClassification (ms)
        - MassiveScenarioClassification (my)
        - MassiveScenarioClassification (nl)
        - MassiveScenarioClassification (pt)
        - MassiveScenarioClassification (ro)
        - MassiveScenarioClassification (ru)
        - MassiveScenarioClassification (sl)
        - MassiveScenarioClassification (sq)
        - MassiveScenarioClassification (sw)
        - MassiveScenarioClassification (ta)
        - MassiveScenarioClassification (te)
        - MassiveScenarioClassification (th)
        - MassiveScenarioClassification (tl)
        - MassiveScenarioClassification (tr)
        - MassiveScenarioClassification (ur)
        - MassiveScenarioClassification (vi)
        - MassiveScenarioClassification (zh-TW)
  other-sts:
    title: Other
    language_long: "Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish (Only language combos not included in the other tabs)"
    has_overall: false
    acronym: null
    icon: null
    special_icons: null
    credits: null
    tasks:
      STS:
        - STS17 (ar-ar)
        - STS17 (en-ar)
        - STS17 (en-de)
        - STS17 (en-tr)
        - STS17 (es-en)
        - STS17 (es-es)
        - STS17 (fr-en)
        - STS17 (it-en)
        - STS17 (ko-ko)
        - STS17 (nl-en)
        - STS22 (ar)
        - STS22 (de)
        - STS22 (de-en)
        - STS22 (de-fr)
        - STS22 (de-pl)
        - STS22 (es)
        - STS22 (es-en)
        - STS22 (es-it)
        - STS22 (fr)
        - STS22 (fr-pl)
        - STS22 (it)
        - STS22 (pl)
        - STS22 (pl-en)
        - STS22 (ru)
        - STS22 (tr)
        - STS22 (zh-en)
        - STSBenchmark
  law:
    title: Law
    language_long: "English, German, Chinese"
    has_overall: false
    acronym: null
    icon: "⚖️"
    special_icons: null
    credits: "[Voyage AI](https://www.voyageai.com/)"
    tasks:
      Retrieval:
        - AILACasedocs
        - AILAStatutes
        - GerDaLIRSmall
        - LeCaRDv2
        - LegalBenchConsumerContractsQA
        - LegalBenchCorporateLobbying
        - LegalQuAD
        - LegalSummarization
  longembed:
    title: LongEmbed
    language_long: "English"
    has_overall: false
    acronym: null
    icon: "📚"
    special_icons: null
    credits: "[LongEmbed (Dawei Zhu et al.)](https://arxiv.org/abs/2404.12096v2)"
    # Quoted defensively: free text containing `@`, `&` and parentheses.
    metric: "nDCG@10 (for NarrativeQA, QMSum, SummScreenFD, WikimQA) & nDCG@1 (for passkey and needle)"
    tasks:
      Retrieval:
        - LEMBNarrativeQARetrieval
        - LEMBNeedleRetrieval
        - LEMBPasskeyRetrieval
        - LEMBQMSumRetrieval
        - LEMBSummScreenFDRetrieval
        - LEMBWikimQARetrieval
  rar-b:
    title: RAR-b
    language_long: "English"
    has_overall: false
    acronym: null
    icon: "📚"
    special_icons: null
    credits: "[RAR-b (Chenghao Xiao et al.)](https://arxiv.org/abs/2404.06347/)"
    metric: "nDCG@10"
    tasks:
      Retrieval:
        - ARCChallenge
        - AlphaNLI
        - HellaSwag
        - PIQA
        - Quail
        - RARbCode
        - RARbMath
        - SIQA
        - SpartQA
        - TempReasonL1
        - TempReasonL2Fact
        - TempReasonL2Pure
        - TempReasonL3Fact
        - TempReasonL3Pure
        - WinoGrande
  bright:
    title: BRIGHT
    language_long: "English"
    has_overall: false
    acronym: null
    icon: "🌟"
    special_icons: null
    credits: "[BRIGHT (Hongjin Su, Howard Yen, Mengzhou Xia et al.)](https://brightbenchmark.github.io/)"
    metric: "nDCG@10"
    split: standard
    desc: "This tab only allows submissions with the original queries; not results from LLM rewritten queries or using reranking."
    tasks:
      Retrieval:
        - BrightRetrieval (biology)
        - BrightRetrieval (earth_science)
        - BrightRetrieval (economics)
        - BrightRetrieval (psychology)
        - BrightRetrieval (robotics)
        - BrightRetrieval (stackoverflow)
        - BrightRetrieval (sustainable_living)
        - BrightRetrieval (pony)
        - BrightRetrieval (leetcode)
        - BrightRetrieval (aops)
        - BrightRetrieval (theoremqa_theorems)
        - BrightRetrieval (theoremqa_questions)
  bright_long:
    title: BRIGHT Long
    language_long: "English"
    has_overall: false
    acronym: null
    icon: "🌟"
    special_icons: null
    credits: "[BRIGHT (Hongjin Su, Howard Yen, Mengzhou Xia et al.)](https://brightbenchmark.github.io/)"
    metric: "Recall@1"
    split: long
    desc: "This tab is for the long document setting of BRIGHT."
    tasks:
      Retrieval:
        - BrightRetrieval (biology)
        - BrightRetrieval (earth_science)
        - BrightRetrieval (economics)
        - BrightRetrieval (psychology)
        - BrightRetrieval (robotics)
        - BrightRetrieval (stackoverflow)
        - BrightRetrieval (sustainable_living)
        - BrightRetrieval (pony)
  coir:
    title: CoIR
    language_long: "Code"
    has_overall: false
    acronym: null
    icon: "💻"
    special_icons: null
    credits: "[Samoed](https://github.com/Samoed) and [monikernemo](https://github.com/monikernemo) and [CoIR (Xiangyang Li, Kuicai Dong, Yi Quan Lee et al.)](https://arxiv.org/abs/2407.02883)"
    metric: "nDCG@10"
    tasks:
      Retrieval:
        - AppsRetrieval
        - CodeFeedbackMT
        - CodeFeedbackST
        - CodeSearchNetCCRetrieval (python)
        - CodeSearchNetCCRetrieval (javascript)
        - CodeSearchNetCCRetrieval (go)
        - CodeSearchNetCCRetrieval (ruby)
        - CodeSearchNetCCRetrieval (java)
        - CodeSearchNetCCRetrieval (php)
        - CodeSearchNetRetrieval (python)
        - CodeSearchNetRetrieval (javascript)
        - CodeSearchNetRetrieval (go)
        - CodeSearchNetRetrieval (ruby)
        - CodeSearchNetRetrieval (java)
        - CodeSearchNetRetrieval (php)
        - CodeTransOceanContest
        - CodeTransOceanDL
        - CosQA
        - StackOverflowQA
        - SyntheticText2SQL