migrate to fsspec
app.py CHANGED
```diff
@@ -9,12 +9,8 @@ import numpy as np
 from datetime import datetime
 
 import gradio as gr
-import huggingface_hub
 import pandas as pd
-
-from huggingface_hub.file_download import repo_folder_name
-from huggingface_hub.hf_api import RepoFile
-from huggingface_hub.utils import EntryNotFoundError
+from datatrove.io import DataFolder
 
 FALLBACK_TOKEN_NAME = "HF_TOKEN"
 
```
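This import swap is the whole point of the migration: `DataFolder` from datatrove is a thin wrapper over fsspec, so one code path can read local, S3, or Hugging Face storage depending only on the URI scheme. A minimal sketch of the idea (folder and bucket names invented for illustration):

```python
from datatrove.io import DataFolder

# The URI scheme picks the fsspec backend; extra keyword arguments (like the
# token= used in the diff below) are forwarded to that filesystem.
local = DataFolder("./results")                   # local folder
remote = DataFolder("s3://example-bucket/evals")  # hypothetical S3 bucket

# Identical listing code against either backend; paths come back relative
# to the DataFolder root.
runs = local.list_files("details", recursive=False, include_directories=True)
```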
```diff
@@ -41,20 +37,20 @@ def get_run_name_seed(run_name):
     run_name, seed = run_name.split("-seed-")
     return run_name, int(seed)
 
-def fetch_repo_structure(
+def fetch_repo_structure(results_uri, oauth_token: gr.OAuthToken | None = None):
     token = os.environ.get(FALLBACK_TOKEN_NAME)
     if oauth_token:
         token = oauth_token.token
 
-
-
-
+    data_folder = DataFolder(results_uri, token=token)
+    runs = [f.removeprefix("details/") for f in data_folder.list_files("details", recursive=False, include_directories=True) if f != "details"]
+
     if not runs:
         return {}, gr.update(choices=[], value=None)
 
     def process_run(run):
-        run_files =
-        return run,
+        run_files = [f.removeprefix(f"details/{run}/") for f in data_folder.list_files(f"details/{run}", recursive=False, include_directories=True) if f != f"details/{run}"]
+        return run, run_files
 
     with ThreadPoolExecutor() as executor:
         results = list(executor.map(process_run, runs))
```
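`fetch_repo_structure` assumes a `details/<run>/<checkpoint>/...` layout and derives run names by stripping the listing prefix. A toy illustration of that prefix handling (run names invented, matching the `-seed-` and language patterns used elsewhere in the app):

```python
# list_files("details", ...) yields paths relative to the DataFolder root,
# so the run name is everything after the "details/" prefix.
entries = ["details/exp-1-en-seed-6", "details/exp-2-fr-seed-6"]  # hypothetical
runs = [e.removeprefix("details/") for e in entries if e != "details"]
assert runs == ["exp-1-en-seed-6", "exp-2-fr-seed-6"]
```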
```diff
@@ -86,14 +82,16 @@ def select_runs_by_language(runs, current_selected, language):
         return select_runs_by_regex(runs, current_selected, f".*-{language}-.*")
     return current_selected
 
-def fetch_available_tasks(repo_name, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
+def fetch_available_tasks(results_uri, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
     token = os.environ.get(FALLBACK_TOKEN_NAME)
 
+    data_folder = DataFolder(results_uri, token=token)
     all_tasks = defaultdict(lambda: defaultdict(dict))
+
     for run in runs_to_fetch:
         try:
-            files =
-            parquet_files = [f.
+            files = data_folder.list_files(f"details/{run}/{checkpoint}", recursive=False)
+            parquet_files = [f.split("/")[-1] for f in files if f.endswith('.parquet')]
 
             for full_filename in parquet_files:
                 task_name, date_str = full_filename.replace('.parquet', '').rsplit('_', 1)
```
```diff
@@ -101,8 +99,10 @@ def fetch_available_tasks(repo_name, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
 
                 if run not in all_tasks[task_name] or date > all_tasks[task_name][run]['date']:
                     all_tasks[task_name][run] = {'filename': full_filename, 'date': date}
-        except EntryNotFoundError:
+        except FileNotFoundError:
             print(f"Checkpoint not found for run: {run}")
+
+    print(all_tasks)
 
     available_tasks = {
         task: {run: info['filename'] for run, info in runs.items()}
```
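Note the exception swap: fsspec-backed filesystems surface missing paths as the builtin `FileNotFoundError` regardless of backend, which replaces the `huggingface_hub`-specific `EntryNotFoundError` removed from the imports. A self-contained sketch of the resulting pattern (run and checkpoint names invented):

```python
from datatrove.io import DataFolder

# With fsspec, one except clause covers local, S3, and HF storage alike.
data_folder = DataFolder("./results")
try:
    files = data_folder.list_files("details/some-run/step-1000", recursive=False)
except FileNotFoundError:
    files = []  # the checkpoint directory does not exist for this run
```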
```diff
@@ -112,17 +112,17 @@ def fetch_available_tasks(repo_name, runs_to_fetch, checkpoint) -> dict[str, dict[str, str]]:
 
     return available_tasks
 
-def fetch_run_results(repo_name, runs_to_fetch, checkpoint,
+def fetch_run_results(results_uri, runs_to_fetch, checkpoint,
                       oauth_token: gr.OAuthToken | None = None, progress=gr.Progress()):
 
-    task_runs_dict = fetch_available_tasks(repo_name, runs_to_fetch, checkpoint)
+    task_runs_dict = fetch_available_tasks(results_uri, runs_to_fetch, checkpoint)
     task_names = list(task_runs_dict.keys())
     return gr.update(choices=task_names, value=task_names[0] if task_names else None), task_runs_dict
 
 
 def render_table(df, selected_runs, metric_names):
     if df is None or not selected_runs or not metric_names:
-        return None
+        return None, "0"
     kept_metrics = [f"metric_{metric_name}_{run_name}" for run_name in selected_runs for metric_name in metric_names]
     other_metrics = [col for col in df.columns if col.startswith(f"metric_") and col not in kept_metrics]
     df = df.drop(columns=other_metrics)
```
```diff
@@ -130,8 +130,9 @@ def render_table(df, selected_runs, metric_names):
     df = shorten_column_names(df, selected_runs, metric_names)
 
     # Sample 100
+    n_samples = len(df)
     df = df.sample(n=min(100, len(df)), random_state=42)
-    return df
+    return df, n_samples
 
 def get_column_widths(df):
     column_widths = []
```
```diff
@@ -170,19 +171,25 @@ def shorten_column_names(df, run_names: list[str], metric_names: list[str]):
     return df
 
 
-def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files, progress=gr.Progress()):
+def load_task_data(results_uri, runs_to_fetch, checkpoint, task_name, tasks_files, progress=gr.Progress()):
     token = os.environ.get(FALLBACK_TOKEN_NAME)
     if not runs_to_fetch or not task_name:
         return None, None, None
 
+
+    print(runs_to_fetch)
+
+    data_folder = DataFolder(f"filecache::{results_uri}", token=token, cache_storage="./results-cache")
+    print(tasks_files)
+
     def fetch_run_file(run_to_fetch):
         file_path = f"details/{run_to_fetch}/{checkpoint}/{tasks_files[task_name][run_to_fetch]}"
         try:
-
-
+            with data_folder.open(file_path, "rb") as f:
+                df = pd.read_parquet(f)
             return df, run_to_fetch
-        except EntryNotFoundError:
-            print(f"File not found: {
+        except FileNotFoundError:
+            print(f"File not found: {tasks_files[task_name][run_to_fetch]}")
             return None, run_to_fetch
 
     with ThreadPoolExecutor() as pool:
```
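`load_task_data` also adopts fsspec's `filecache::` protocol chaining: parquet files are downloaded once into `./results-cache` and re-read from local disk on subsequent opens. The diff passes the cache options straight through `DataFolder`; a standalone sketch of the same mechanism with plain fsspec (URI invented):

```python
import fsspec

# "filecache::" chains a local file cache in front of the remote filesystem;
# per-protocol options are passed as dicts keyed by protocol name.
with fsspec.open(
    "filecache::s3://example-bucket/evals/details/run/step/task.parquet",
    filecache={"cache_storage": "./results-cache"},
) as f:
    payload = f.read()  # a second open of the same path reads the cached copy
```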
```diff
@@ -245,7 +252,7 @@ def load_task_data(repo_name, runs_to_fetch, checkpoint, task_name, tasks_files, progress=gr.Progress()):
     # Join all prepared DataFrames
     for df, run_name in zip(dfs, run_names):
         prepared_df = prepare_df(df, run_name, task_type)
-        combined_df = combined_df.join(prepared_df, how='outer'
+        combined_df = combined_df.join(prepared_df, how='outer')
 
 
     available_metrics = list(set("_".join(col.split('_')[1:-1]) for col in combined_df.columns if col.startswith("metric_")))
```
```diff
@@ -259,7 +266,7 @@ with gr.Blocks() as demo:
     results_df_full = gr.State(None)
     tasks_files = gr.State({})
     login_button = gr.LoginButton(visible=False)
-
+    results_uri = gr.Textbox(label="Results URI", value="s3://fineweb-multilingual-v1/evals/test/", visible=True)
     with gr.Column():
         gr.Markdown("# FineWeb experiments results explorer")
         with gr.Row():
```
```diff
@@ -277,11 +284,14 @@ with gr.Blocks() as demo:
             task_name = gr.Dropdown(choices=[], interactive=True, label="Task name")
             metric_names = gr.Dropdown(choices=[], interactive=True, multiselect=True, label="Metric")
     results_df = gr.Dataframe(interactive=False, wrap=True)
+    with gr.Row():
+        with gr.Column():
+            num_samples = gr.Text(interactive=False, label="# Samples")
 
     # Run selection
     gr.on(
-        triggers=[
-        fn=fetch_repo_structure, inputs=[
+        triggers=[results_uri.change],
+        fn=fetch_repo_structure, inputs=[results_uri], outputs=[runs_checkpoints, selected_runs],
     )
     gr.on(
         triggers=[select_by_regex_button.click],
```
```diff
@@ -306,37 +316,37 @@ with gr.Blocks() as demo:
     gr.on(
         triggers=[fetch_res.click],
         fn=fetch_run_results,
-        inputs=[
+        inputs=[results_uri, selected_runs, checkpoint],
         outputs=[task_name, tasks_files]
     ).then(
         fn=load_task_data,
-        inputs=[
+        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files],
         outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,
         inputs=[results_df_full, selected_runs, metric_names],
-        outputs=[results_df]
+        outputs=[results_df, num_samples]
     )
 
     # Update results when task name or metric changes
     gr.on(
         triggers=[task_name.input],
         fn=load_task_data,
-        inputs=[
+        inputs=[results_uri, selected_runs, checkpoint, task_name, tasks_files],
         outputs=[results_df_full, metric_names]
     ).then(
         fn=render_table,
         inputs=[results_df_full, selected_runs, metric_names],
-        outputs=[results_df]
+        outputs=[results_df, num_samples]
     )
 
     gr.on(
         triggers=[metric_names.input],
         fn=render_table,
         inputs=[results_df_full, selected_runs, metric_names],
-        outputs=[results_df]
+        outputs=[results_df, num_samples]
     )
 
-    demo.load(fn=fetch_repo_structure, inputs=[
+    demo.load(fn=fetch_repo_structure, inputs=[results_uri], outputs=[runs_checkpoints, selected_runs])
 
 demo.launch()
```
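For readers new to Gradio's event API: `gr.on(...)` registers a handler for one or more triggers and returns a dependency whose `.then(...)` steps run sequentially after the previous handler finishes, which is how the fetch → load → render pipeline above is chained. A toy example (component names invented):

```python
import gradio as gr

with gr.Blocks() as toy:
    text = gr.Textbox(label="input")
    upper = gr.Textbox(label="upper")
    shout = gr.Textbox(label="shout")
    gr.on(
        triggers=[text.change],   # fires on every edit of the textbox
        fn=lambda v: v.upper(),
        inputs=[text],
        outputs=[upper],
    ).then(                       # runs only after the first handler completes
        fn=lambda v: f"{v}!",
        inputs=[upper],
        outputs=[shout],
    )
```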