Spaces:
Runtime error
Runtime error
Parse metadata
Browse files
app.py
CHANGED
|
@@ -8,7 +8,8 @@ from datasets import get_dataset_config_names
|
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
from huggingface_hub import list_datasets
|
| 10 |
|
| 11 |
-
from utils import get_compatible_models, get_metadata, http_get,
|
|
|
|
| 12 |
|
| 13 |
if Path(".env").is_file():
|
| 14 |
load_dotenv(".env")
|
|
@@ -29,6 +30,9 @@ TASK_TO_ID = {
|
|
| 29 |
"summarization": 8,
|
| 30 |
}
|
| 31 |
|
|
|
|
|
|
|
|
|
|
| 32 |
###########
|
| 33 |
### APP ###
|
| 34 |
###########
|
|
@@ -61,7 +65,11 @@ if metadata is None:
|
|
| 61 |
|
| 62 |
with st.expander("Advanced configuration"):
|
| 63 |
## Select task
|
| 64 |
-
selected_task = st.selectbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
### Select config
|
| 66 |
configs = get_dataset_config_names(selected_dataset)
|
| 67 |
selected_config = st.selectbox("Select a config", configs)
|
|
@@ -75,29 +83,25 @@ with st.expander("Advanced configuration"):
|
|
| 75 |
if split["config"] == selected_config:
|
| 76 |
split_names.append(split["split"])
|
| 77 |
|
| 78 |
-
selected_split = st.selectbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
##
|
| 81 |
rows_resp = http_get(
|
| 82 |
path="/rows",
|
| 83 |
domain="https://datasets-preview.huggingface.tech",
|
| 84 |
params={"dataset": selected_dataset, "config": selected_config, "split": selected_split},
|
| 85 |
).json()
|
| 86 |
col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
|
| 87 |
-
# splits = metadata[0]["splits"]
|
| 88 |
-
# split_names = list(splits.values())
|
| 89 |
-
# eval_split = splits.get("eval_split", split_names[0])
|
| 90 |
-
|
| 91 |
-
# selected_split = st.selectbox("Select a split", split_names, index=split_names.index(eval_split))
|
| 92 |
-
|
| 93 |
-
# TODO: add a function to handle the mapping task <--> column mapping
|
| 94 |
-
# col_mapping = metadata[0]["col_mapping"]
|
| 95 |
-
# col_names = list(col_mapping.keys())
|
| 96 |
|
| 97 |
st.markdown("**Map your data columns**")
|
| 98 |
col1, col2 = st.columns(2)
|
| 99 |
|
| 100 |
# TODO: find a better way to layout these items
|
|
|
|
| 101 |
col_mapping = {}
|
| 102 |
if selected_task in ["binary_classification", "multi_class_classification"]:
|
| 103 |
with col1:
|
|
@@ -108,9 +112,15 @@ with st.expander("Advanced configuration"):
|
|
| 108 |
st.text("")
|
| 109 |
st.markdown("`target` column")
|
| 110 |
with col2:
|
| 111 |
-
text_col = st.selectbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
target_col = st.selectbox(
|
| 113 |
-
"This column should contain the labels you want to assign to the text",
|
|
|
|
|
|
|
| 114 |
)
|
| 115 |
col_mapping[text_col] = "text"
|
| 116 |
col_mapping[target_col] = "target"
|
|
@@ -127,9 +137,12 @@ with st.expander("Advanced configuration"):
|
|
| 127 |
tokens_col = st.selectbox(
|
| 128 |
"This column should contain the parts of the text (as an array of tokens) you want to assign labels to",
|
| 129 |
col_names,
|
|
|
|
| 130 |
)
|
| 131 |
tags_col = st.selectbox(
|
| 132 |
-
"This column should contain the labels to associate to each part of the text",
|
|
|
|
|
|
|
| 133 |
)
|
| 134 |
col_mapping[tokens_col] = "tokens"
|
| 135 |
col_mapping[tags_col] = "tags"
|
|
@@ -143,9 +156,15 @@ with st.expander("Advanced configuration"):
|
|
| 143 |
st.text("")
|
| 144 |
st.markdown("`target` column")
|
| 145 |
with col2:
|
| 146 |
-
text_col = st.selectbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
target_col = st.selectbox(
|
| 148 |
-
"This column should contain an example translation of the source text",
|
|
|
|
|
|
|
| 149 |
)
|
| 150 |
col_mapping[text_col] = "source"
|
| 151 |
col_mapping[target_col] = "target"
|
|
@@ -159,8 +178,16 @@ with st.expander("Advanced configuration"):
|
|
| 159 |
st.text("")
|
| 160 |
st.markdown("`target` column")
|
| 161 |
with col2:
|
| 162 |
-
text_col = st.selectbox(
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
col_mapping[text_col] = "text"
|
| 165 |
col_mapping[target_col] = "target"
|
| 166 |
|
|
@@ -183,16 +210,29 @@ with st.expander("Advanced configuration"):
|
|
| 183 |
st.text("")
|
| 184 |
st.markdown("`answers.answer_start` column")
|
| 185 |
with col2:
|
| 186 |
-
context_col = st.selectbox(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
question_col = st.selectbox(
|
| 188 |
-
"This column should contain the question to be answered, given the context",
|
|
|
|
|
|
|
| 189 |
)
|
| 190 |
answers_text_col = st.selectbox(
|
| 191 |
-
"This column should contain example answers to the question, extracted from the context",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
)
|
| 193 |
answers_start_col = st.selectbox(
|
| 194 |
"This column should contain the indices in the context of the first character of each answers.text",
|
| 195 |
col_names,
|
|
|
|
|
|
|
|
|
|
| 196 |
)
|
| 197 |
col_mapping[context_col] = "context"
|
| 198 |
col_mapping[question_col] = "question"
|
|
@@ -203,9 +243,8 @@ with st.form(key="form"):
|
|
| 203 |
|
| 204 |
compatible_models = get_compatible_models(selected_task, selected_dataset)
|
| 205 |
|
| 206 |
-
selected_models = st.multiselect(
|
| 207 |
-
|
| 208 |
-
)
|
| 209 |
submit_button = st.form_submit_button("Make submission")
|
| 210 |
|
| 211 |
if submit_button:
|
|
|
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
from huggingface_hub import list_datasets
|
| 10 |
|
| 11 |
+
from utils import (get_compatible_models, get_key, get_metadata, http_get,
|
| 12 |
+
http_post)
|
| 13 |
|
| 14 |
if Path(".env").is_file():
|
| 15 |
load_dotenv(".env")
|
|
|
|
| 30 |
"summarization": 8,
|
| 31 |
}
|
| 32 |
|
| 33 |
+
supported_tasks = list(TASK_TO_ID.keys())
|
| 34 |
+
|
| 35 |
+
|
| 36 |
###########
|
| 37 |
### APP ###
|
| 38 |
###########
|
|
|
|
| 65 |
|
| 66 |
with st.expander("Advanced configuration"):
|
| 67 |
## Select task
|
| 68 |
+
selected_task = st.selectbox(
|
| 69 |
+
"Select a task",
|
| 70 |
+
supported_tasks,
|
| 71 |
+
index=supported_tasks.index(metadata[0]["task_id"]) if metadata is not None else 0,
|
| 72 |
+
)
|
| 73 |
### Select config
|
| 74 |
configs = get_dataset_config_names(selected_dataset)
|
| 75 |
selected_config = st.selectbox("Select a config", configs)
|
|
|
|
| 83 |
if split["config"] == selected_config:
|
| 84 |
split_names.append(split["split"])
|
| 85 |
|
| 86 |
+
selected_split = st.selectbox(
|
| 87 |
+
"Select a split",
|
| 88 |
+
split_names,
|
| 89 |
+
index=split_names.index(metadata[0]["splits"]["eval_split"]) if metadata is not None else 0,
|
| 90 |
+
)
|
| 91 |
|
| 92 |
+
## Select columns
|
| 93 |
rows_resp = http_get(
|
| 94 |
path="/rows",
|
| 95 |
domain="https://datasets-preview.huggingface.tech",
|
| 96 |
params={"dataset": selected_dataset, "config": selected_config, "split": selected_split},
|
| 97 |
).json()
|
| 98 |
col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
st.markdown("**Map your data columns**")
|
| 101 |
col1, col2 = st.columns(2)
|
| 102 |
|
| 103 |
# TODO: find a better way to layout these items
|
| 104 |
+
# TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
|
| 105 |
col_mapping = {}
|
| 106 |
if selected_task in ["binary_classification", "multi_class_classification"]:
|
| 107 |
with col1:
|
|
|
|
| 112 |
st.text("")
|
| 113 |
st.markdown("`target` column")
|
| 114 |
with col2:
|
| 115 |
+
text_col = st.selectbox(
|
| 116 |
+
"This column should contain the text you want to classify",
|
| 117 |
+
col_names,
|
| 118 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "text")) if metadata is not None else 0,
|
| 119 |
+
)
|
| 120 |
target_col = st.selectbox(
|
| 121 |
+
"This column should contain the labels you want to assign to the text",
|
| 122 |
+
col_names,
|
| 123 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
|
| 124 |
)
|
| 125 |
col_mapping[text_col] = "text"
|
| 126 |
col_mapping[target_col] = "target"
|
|
|
|
| 137 |
tokens_col = st.selectbox(
|
| 138 |
"This column should contain the parts of the text (as an array of tokens) you want to assign labels to",
|
| 139 |
col_names,
|
| 140 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "tokens")) if metadata is not None else 0,
|
| 141 |
)
|
| 142 |
tags_col = st.selectbox(
|
| 143 |
+
"This column should contain the labels to associate to each part of the text",
|
| 144 |
+
col_names,
|
| 145 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "tags")) if metadata is not None else 0,
|
| 146 |
)
|
| 147 |
col_mapping[tokens_col] = "tokens"
|
| 148 |
col_mapping[tags_col] = "tags"
|
|
|
|
| 156 |
st.text("")
|
| 157 |
st.markdown("`target` column")
|
| 158 |
with col2:
|
| 159 |
+
text_col = st.selectbox(
|
| 160 |
+
"This column should contain the text you want to translate",
|
| 161 |
+
col_names,
|
| 162 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "source")) if metadata is not None else 0,
|
| 163 |
+
)
|
| 164 |
target_col = st.selectbox(
|
| 165 |
+
"This column should contain an example translation of the source text",
|
| 166 |
+
col_names,
|
| 167 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
|
| 168 |
)
|
| 169 |
col_mapping[text_col] = "source"
|
| 170 |
col_mapping[target_col] = "target"
|
|
|
|
| 178 |
st.text("")
|
| 179 |
st.markdown("`target` column")
|
| 180 |
with col2:
|
| 181 |
+
text_col = st.selectbox(
|
| 182 |
+
"This column should contain the text you want to summarize",
|
| 183 |
+
col_names,
|
| 184 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "text")) if metadata is not None else 0,
|
| 185 |
+
)
|
| 186 |
+
target_col = st.selectbox(
|
| 187 |
+
"This column should contain an example summarization of the text",
|
| 188 |
+
col_names,
|
| 189 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "target")) if metadata is not None else 0,
|
| 190 |
+
)
|
| 191 |
col_mapping[text_col] = "text"
|
| 192 |
col_mapping[target_col] = "target"
|
| 193 |
|
|
|
|
| 210 |
st.text("")
|
| 211 |
st.markdown("`answers.answer_start` column")
|
| 212 |
with col2:
|
| 213 |
+
context_col = st.selectbox(
|
| 214 |
+
"This column should contain the question's context",
|
| 215 |
+
col_names,
|
| 216 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "context")) if metadata is not None else 0,
|
| 217 |
+
)
|
| 218 |
question_col = st.selectbox(
|
| 219 |
+
"This column should contain the question to be answered, given the context",
|
| 220 |
+
col_names,
|
| 221 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "question")) if metadata is not None else 0,
|
| 222 |
)
|
| 223 |
answers_text_col = st.selectbox(
|
| 224 |
+
"This column should contain example answers to the question, extracted from the context",
|
| 225 |
+
col_names,
|
| 226 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "answers.text"))
|
| 227 |
+
if metadata is not None
|
| 228 |
+
else 0,
|
| 229 |
)
|
| 230 |
answers_start_col = st.selectbox(
|
| 231 |
"This column should contain the indices in the context of the first character of each answers.text",
|
| 232 |
col_names,
|
| 233 |
+
index=col_names.index(get_key(metadata[0]["col_mapping"], "answers.answer_start"))
|
| 234 |
+
if metadata is not None
|
| 235 |
+
else 0,
|
| 236 |
)
|
| 237 |
col_mapping[context_col] = "context"
|
| 238 |
col_mapping[question_col] = "question"
|
|
|
|
| 243 |
|
| 244 |
compatible_models = get_compatible_models(selected_task, selected_dataset)
|
| 245 |
|
| 246 |
+
selected_models = st.multiselect("Select the models you wish to evaluate", compatible_models)
|
| 247 |
+
print("Selected models:", selected_models)
|
|
|
|
| 248 |
submit_button = st.form_submit_button("Make submission")
|
| 249 |
|
| 250 |
if submit_button:
|
utils.py
CHANGED
|
@@ -48,10 +48,9 @@ def http_get(path: str, domain: str, token: str = None, params: dict = None) ->
|
|
| 48 |
|
| 49 |
|
| 50 |
def get_metadata(dataset_name: str) -> Union[Dict, None]:
|
| 51 |
-
|
| 52 |
-
data
|
| 53 |
-
|
| 54 |
-
return data[0].cardData["train-eval-index"]
|
| 55 |
else:
|
| 56 |
return None
|
| 57 |
|
|
@@ -63,3 +62,11 @@ def get_compatible_models(task, dataset_name):
|
|
| 63 |
)
|
| 64 |
compatible_models = api.list_models(filter=filt)
|
| 65 |
return [model.modelId for model in compatible_models]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
def get_metadata(dataset_name: str) -> Union[Dict, None]:
    """Fetch the ``train-eval-index`` card metadata for a Hub dataset.

    Args:
        dataset_name: Canonical dataset identifier on the Hugging Face Hub
            (e.g. ``"glue"``).

    Returns:
        The parsed ``train-eval-index`` section of the dataset card, or
        ``None`` when the dataset has no card data or no such section.
    """
    data = requests.get(f"https://huggingface.co/api/datasets/{dataset_name}").json()
    # Use .get() so a response with NO "cardData" key at all doesn't raise
    # KeyError (the original only guarded against an explicit null value).
    # `in card_data` replaces the redundant `.keys()` membership test.
    card_data = data.get("cardData")
    if card_data is not None and "train-eval-index" in card_data:
        return card_data["train-eval-index"]
    return None
|
| 56 |
|
|
|
|
| 62 |
)
|
| 63 |
compatible_models = api.list_models(filter=filt)
|
| 64 |
return [model.modelId for model in compatible_models]
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def get_key(col_mapping, val):
    """Reverse lookup: return the first key in *col_mapping* mapped to *val*.

    Falls back to the literal string "key doesn't exist" when no entry
    matches, preserved for backward compatibility with existing callers.
    """
    matching_keys = (key for key, mapped in col_mapping.items() if mapped == val)
    return next(matching_keys, "key doesn't exist")
|