Spaces:
Running
Running
Commit
·
743f616
1
Parent(s):
dcde789
apply suggestions
Browse files
app.py
CHANGED
|
@@ -13,7 +13,6 @@ suggested_datasets = [
|
|
| 13 |
"librispeech_asr",
|
| 14 |
"mozilla-foundation/common_voice_8_0",
|
| 15 |
"mozilla-foundation/common_voice_7_0",
|
| 16 |
-
"common_voice",
|
| 17 |
"speech-recognition-community-v2/eval_data",
|
| 18 |
]
|
| 19 |
|
|
@@ -101,38 +100,32 @@ def get_data():
|
|
| 101 |
return pd.DataFrame.from_records(data)
|
| 102 |
|
| 103 |
|
| 104 |
-
def
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
|
| 116 |
|
| 117 |
@st.cache(ttl=600)
|
| 118 |
-
def
|
| 119 |
-
lang_name = lang2name[lang] if lang in lang2name else ""
|
| 120 |
-
num_models = len(lang_df["model_id"].unique())
|
| 121 |
-
unique_datasets = sorted(lang_df["dataset"].unique())
|
| 122 |
-
num_datasets = len(unique_datasets)
|
| 123 |
msg = f"""
|
| 124 |
-
For the `{lang}` ({lang_name}) language, there are currently `{num_models}` models
|
| 125 |
-
trained on `{num_datasets}` datasets available for `automatic-speech-recognition`.
|
| 126 |
-
|
| 127 |
The models have been trained and/or evaluated on the following datasets:
|
| 128 |
"""
|
| 129 |
-
for dataset_id in
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
| 131 |
msg += """
|
| 132 |
Choose the dataset that is most relevant to your task and select it from the dropdown below.
|
| 133 |
"""
|
| 134 |
-
msg += suggest_datasets(unique_datasets)
|
| 135 |
-
msg += "Please click on the model's name to be redirected to its model card which includes documentation and examples on how to use it."
|
| 136 |
|
| 137 |
msg = "\n".join([line.strip() for line in msg.split("\n")])
|
| 138 |
return msg
|
|
@@ -140,7 +133,6 @@ def generate_note(lang, lang_df):
|
|
| 140 |
|
| 141 |
dataframe = get_data()
|
| 142 |
dataframe = dataframe.fillna("")
|
| 143 |
-
dataframe["model_id"] = dataframe["model_id"].apply(make_clickable)
|
| 144 |
|
| 145 |
_, col_center = st.columns([3, 6])
|
| 146 |
with col_center:
|
|
@@ -148,26 +140,40 @@ with col_center:
|
|
| 148 |
st.markdown("# Speech Recognition Models Leaderboard")
|
| 149 |
|
| 150 |
st.markdown(
|
| 151 |
-
"This is a leaderboard over all speech recognition models and datasets
|
| 152 |
-
"Please select a language you want to find a model for from the dropdown
|
| 153 |
)
|
| 154 |
|
| 155 |
-
lang = st.selectbox(
|
| 156 |
"Language",
|
| 157 |
sorted(dataframe["lang"].unique()),
|
|
|
|
| 158 |
index=0,
|
| 159 |
)
|
| 160 |
lang_df = dataframe[dataframe.lang == lang]
|
| 161 |
|
| 162 |
-
|
| 163 |
-
st.markdown(msg)
|
| 164 |
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
"Dataset",
|
| 167 |
-
|
| 168 |
index=0,
|
| 169 |
)
|
| 170 |
dataset_df = lang_df[lang_df.dataset == dataset]
|
|
|
|
|
|
|
| 171 |
if lang in cer_langs:
|
| 172 |
dataset_df = dataset_df[["model_id", "cer"]]
|
| 173 |
dataset_df.sort_values("cer", inplace=True)
|
|
@@ -183,7 +189,20 @@ dataset_df.rename(
|
|
| 183 |
inplace=True,
|
| 184 |
)
|
| 185 |
|
| 186 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
if lang in cer_langs:
|
| 189 |
st.markdown(
|
|
|
|
| 13 |
"librispeech_asr",
|
| 14 |
"mozilla-foundation/common_voice_8_0",
|
| 15 |
"mozilla-foundation/common_voice_7_0",
|
|
|
|
| 16 |
"speech-recognition-community-v2/eval_data",
|
| 17 |
]
|
| 18 |
|
|
|
|
| 100 |
return pd.DataFrame.from_records(data)
|
| 101 |
|
| 102 |
|
| 103 |
+
def sort_datasets(datasets):
|
| 104 |
+
# 1. sort by name
|
| 105 |
+
datasets = sorted(datasets)
|
| 106 |
+
# 2. bring the suggested datasets to the top and append the rest
|
| 107 |
+
datasets = sorted(
|
| 108 |
+
datasets,
|
| 109 |
+
key=lambda dataset_id: suggested_datasets.index(dataset_id)
|
| 110 |
+
if dataset_id in suggested_datasets
|
| 111 |
+
else len(suggested_datasets),
|
| 112 |
+
)
|
| 113 |
+
return datasets
|
| 114 |
|
| 115 |
|
| 116 |
@st.cache(ttl=600)
|
| 117 |
+
def generate_dataset_info(datasets):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
msg = f"""
|
|
|
|
|
|
|
|
|
|
| 119 |
The models have been trained and/or evaluated on the following datasets:
|
| 120 |
"""
|
| 121 |
+
for dataset_id in datasets:
|
| 122 |
+
if dataset_id in suggested_datasets:
|
| 123 |
+
msg += f"* [{dataset_id}](https://hf.co/datasets/{dataset_id}) *(recommended)*\n"
|
| 124 |
+
else:
|
| 125 |
+
msg += f"* [{dataset_id}](https://hf.co/datasets/{dataset_id})\n"
|
| 126 |
msg += """
|
| 127 |
Choose the dataset that is most relevant to your task and select it from the dropdown below.
|
| 128 |
"""
|
|
|
|
|
|
|
| 129 |
|
| 130 |
msg = "\n".join([line.strip() for line in msg.split("\n")])
|
| 131 |
return msg
|
|
|
|
| 133 |
|
| 134 |
dataframe = get_data()
|
| 135 |
dataframe = dataframe.fillna("")
|
|
|
|
| 136 |
|
| 137 |
_, col_center = st.columns([3, 6])
|
| 138 |
with col_center:
|
|
|
|
| 140 |
st.markdown("# Speech Recognition Models Leaderboard")
|
| 141 |
|
| 142 |
st.markdown(
|
| 143 |
+
"This is a leaderboard over all speech recognition models and datasets.\n\n"
|
| 144 |
+
"⬅ Please select a language you want to find a model for from the dropdown on the left."
|
| 145 |
)
|
| 146 |
|
| 147 |
+
lang = st.sidebar.selectbox(
|
| 148 |
"Language",
|
| 149 |
sorted(dataframe["lang"].unique()),
|
| 150 |
+
format_func=lambda key: lang2name.get(key, key),
|
| 151 |
index=0,
|
| 152 |
)
|
| 153 |
lang_df = dataframe[dataframe.lang == lang]
|
| 154 |
|
| 155 |
+
sorted_datasets = sort_datasets(lang_df["dataset"].unique())
|
|
|
|
| 156 |
|
| 157 |
+
text = generate_dataset_info(sorted_datasets)
|
| 158 |
+
st.sidebar.markdown(text)
|
| 159 |
+
|
| 160 |
+
lang_name = lang2name[lang] if lang in lang2name else ""
|
| 161 |
+
num_models = len(lang_df["model_id"].unique())
|
| 162 |
+
num_datasets = len(lang_df["dataset"].unique())
|
| 163 |
+
text = f"""
|
| 164 |
+
For the `{lang}` ({lang_name}) language, there are currently `{num_models}` model(s)
|
| 165 |
+
trained on `{num_datasets}` dataset(s) available for `automatic-speech-recognition`.
|
| 166 |
+
"""
|
| 167 |
+
st.markdown(text)
|
| 168 |
+
|
| 169 |
+
dataset = st.sidebar.selectbox(
|
| 170 |
"Dataset",
|
| 171 |
+
sorted_datasets,
|
| 172 |
index=0,
|
| 173 |
)
|
| 174 |
dataset_df = lang_df[lang_df.dataset == dataset]
|
| 175 |
+
|
| 176 |
+
# sort by WER or CER depending on the language
|
| 177 |
if lang in cer_langs:
|
| 178 |
dataset_df = dataset_df[["model_id", "cer"]]
|
| 179 |
dataset_df.sort_values("cer", inplace=True)
|
|
|
|
| 189 |
inplace=True,
|
| 190 |
)
|
| 191 |
|
| 192 |
+
st.markdown(
|
| 193 |
+
"Please click on the model's name to be redirected to its model card which includes documentation and examples on how to use it."
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
# display the model ranks
|
| 197 |
+
dataset_df = dataset_df.reset_index(drop=True)
|
| 198 |
+
dataset_df.index += 1
|
| 199 |
+
|
| 200 |
+
# turn the model ids into clickable links
|
| 201 |
+
dataset_df["model_id"] = dataset_df["model_id"].apply(make_clickable)
|
| 202 |
+
|
| 203 |
+
table_html = dataset_df.to_html(escape=False)
|
| 204 |
+
table_html = table_html.replace("<th>", '<th align="left">') # left-align the headers
|
| 205 |
+
st.write(table_html, unsafe_allow_html=True)
|
| 206 |
|
| 207 |
if lang in cer_langs:
|
| 208 |
st.markdown(
|