import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    LLM_BENCHMARKS_TEXT,
    INTRODUCTION_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space():
    API.restart_space(repo_id=REPO_ID)

### Space initialisation
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset",
        tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset",
        tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()

# LIBERO Leaderboard Data
LIBERO_DATA = [
    ['HuggingFaceVLA/smolvla_libero', "HuggingFace", "450M", 0.90, 1.0, 1.0, "--", 0.6, 0.87,
     "✅ Checkpoint Available", '📄 SmolVLA Paper', "smolvla_spatial.mp4"],
    ['lerobot/pi0', "Physical Intelligence", "3.3B", 0.90, 0.86, 0.95, "--", 0.73, 0.86,
     "Reported Score Only", '📄 Pi0 Paper', "pi0.mp4"],
]

LIBERO_COLUMNS = [
    "Model", "Organization", "Model Size", "Spatial", "Object", "Goal",
    "90", "Long", "Average", "Available", "Paper", "Video"
]

# Columns to display in the table (excluding the Video and Organization columns)
LIBERO_DISPLAY_COLUMNS = [
    "Model", "Model Size", "Spatial", "Object", "Goal",
    "90", "Long", "Average", "Available", "Paper"
]

LIBERO_DF = pd.DataFrame(LIBERO_DATA, columns=LIBERO_COLUMNS)


def get_libero_leaderboard():
    return LIBERO_DF


def get_video_by_model_and_task(model_name, task_name):
    """Get the video file path for a given model and task."""
    # Task-specific videos for each model (only SmolVLA has videos available)
    model_task_videos = {
        "SmolVLA": {
            "Spatial": "smolvla_spatial.mp4",
            "Object": "smolvla_object.mp4",
            "Goal": "smolvla_goal.mp4",
            "90": "smolvla_90.mp4",
            "Long": "smolvla_long.mp4"
        }
        # Pi0 videos not available yet
    }

    # Get the video for the specific model and task
    if model_name in model_task_videos and task_name in model_task_videos[model_name]:
        video_file = model_task_videos[model_name][task_name]
        print(f"Selected model: {model_name}, Task: {task_name}, Video file: {video_file}")
        return video_file
    else:
        print(f"No video available for {model_name} - {task_name}")
        return None


LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    print([c.type for c in fields(AutoEvalColumn)])
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


custom_css_extended = custom_css + """
/* More specific selectors to override Gradio defaults */
.gradio-container #libero-leaderboard th,
#libero-leaderboard thead th,
#libero-leaderboard th {
    font-size: 10px !important;
    font-weight: bold !important;
    padding: 6px 8px !important;
}

.gradio-container #libero-leaderboard td,
#libero-leaderboard tbody td,
#libero-leaderboard td {
    font-size: 12px !important;
    padding: 6px 8px !important;
}

#libero-leaderboard th:first-child,
#libero-leaderboard td:first-child {
    min-width: 300px !important;
    max-width: 400px !important;
    width: 350px !important;
}

#libero-leaderboard a {
    color: #0066cc !important;
    text-decoration: none !important;
}

#libero-leaderboard a:hover {
    text-decoration: underline !important;
}
"""

demo = gr.Blocks(css=custom_css_extended)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏆 LIBERO Leaderboard", elem_id="libero-benchmark-tab-table", id=0):
            # Header with image (currently disabled)
            '''
            with gr.Row():
                gr.Markdown(
                    """

                    🏆 LIBERO Leaderboard

                    Benchmarking Vision-Language-Action (VLA) Policies in Simulation

                    Made with ❤️ by HuggingFace VLA

                    (LIBERO banner image)
""" ) ''' # Full-width Leaderboard Section with gr.Group(): gr.Markdown("### 🏅 Current Leaderboard") # Controls and video section in same row with gr.Row(): with gr.Column(scale=1): gr.Markdown("### 🔍 Search & Controls") search_box = gr.Textbox( label="Search models", placeholder="Type model name to search...", interactive=True ) # Define columns that are always shown (not selectable) MANDATORY_COLUMNS = ["Model", "Model Size", "Paper"] # Define columns that can be toggled SELECTABLE_COLUMNS = [col for col in LIBERO_DISPLAY_COLUMNS if col not in MANDATORY_COLUMNS] column_selector = gr.CheckboxGroup( choices=SELECTABLE_COLUMNS, value=SELECTABLE_COLUMNS, label="Select optional columns to display", interactive=True ) gr.Markdown("**Always shown:** Model, Model Size, Paper") with gr.Column(scale=1): gr.Markdown("### 🎥 Model Video Demo") gr.Markdown("Click on any model row in the table below to see its demo video") video_display = gr.Video( label="Demo video will appear here when you click on a model", height=300, autoplay=False, show_label=True, interactive=True, value=None ) # Create a simple dataframe instead of complex Leaderboard to avoid issues libero_leaderboard = gr.Dataframe( value=get_libero_leaderboard()[LIBERO_DISPLAY_COLUMNS], headers=LIBERO_DISPLAY_COLUMNS, interactive=False, wrap=True, datatype=["html", "str", "number", "number", "number", "str", "number", "number", "str", "html"], elem_id="libero-leaderboard", ) # Helper text gr.Markdown( """ **💡 Tips**: - Use the search box to find specific models - **Click on SmolVLA scores** (Spatial, Object, Goal, 90, Long) to see task-specific demo videos above - **Click on model names** to go directly to HuggingFace repositories - 🎬 **Videos available**: SmolVLA task demos | **Pi0 videos**: Coming soon! """, elem_classes="markdown-text" ) # Function to get datatype for a column def get_column_datatype(column_name): """Return the appropriate datatype for each column""" if column_name in ["Model", "Paper"]: return "html" # Contains HTML links elif column_name in ["Spatial", "Object", "Goal", "Long", "Average"]: return "number" elif column_name == "90": return "str" # Can contain "--" else: return "str" # Default for Model Size, Available, etc. 
            # Filter and update the table. Rather than dropping columns (which would
            # desynchronise the datatype array), hidden columns stay in place with
            # their contents blanked out.
            def update_table(search_term, selected_columns):
                df = get_libero_leaderboard()

                # Filter by search term
                if search_term:
                    mask = df['Model'].str.contains(search_term, case=False, na=False)
                    df = df[mask]

                # Keep every display column so the datatype array stays stable
                result_df = df[LIBERO_DISPLAY_COLUMNS].copy()

                # Always include mandatory columns + selected optional columns
                MANDATORY_COLUMNS = ["Model", "Model Size", "Paper"]
                SELECTABLE_COLUMNS = [col for col in LIBERO_DISPLAY_COLUMNS if col not in MANDATORY_COLUMNS]

                # Hide unselected optional columns by replacing their content with empty strings
                if selected_columns is not None:
                    for col in SELECTABLE_COLUMNS:
                        if col not in selected_columns:
                            result_df[col] = ""  # Hide the column content but keep the structure

                return result_df

            # Handle row selection and display the matching video
            def show_video(evt: gr.SelectData):
                try:
                    print(f"Leaderboard click event: {evt}")
                    if hasattr(evt, 'index') and evt.index is not None:
                        if isinstance(evt.index, (list, tuple)) and len(evt.index) >= 2:
                            row_idx = evt.index[0]
                            col_idx = evt.index[1]
                        else:
                            row_idx = evt.index
                            col_idx = 0

                        print(f"Selected row: {row_idx}, column: {col_idx}")

                        # Map column indices to task names (based on LIBERO_DISPLAY_COLUMNS):
                        # Model, Model Size, Spatial, Object, Goal, 90, Long, Average, Available, Paper
                        task_mapping = {
                            2: "Spatial",
                            3: "Object",
                            4: "Goal",
                            5: "90",
                            6: "Long"
                        }

                        # Only show a video when clicking on score columns (columns 2-6 are the LIBERO scores)
                        if col_idx in task_mapping and row_idx < len(LIBERO_DATA):
                            # Extract the model name from the Model cell
                            model_html = LIBERO_DATA[row_idx][0]
                            if "smolvla" in model_html.lower():
                                model_name = "SmolVLA"
                            elif "pi0" in model_html.lower():
                                model_name = "Pi0"
                            else:
                                model_name = "SmolVLA"  # default

                            task_name = task_mapping[col_idx]
                            print(f"Model selected: {model_name}, Task: {task_name}")
                            video_path = get_video_by_model_and_task(model_name, task_name)
                            print(f"Video path returned: {video_path}")

                            if video_path:
                                return video_path
                            else:
                                # Return None to clear the video display and log a message instead
                                print(f"Videos coming soon for {model_name}!")
                                return None

                    print("Click on a score column (Spatial, Object, Goal, 90, Long) to see a task-specific video")
                    return None
                except Exception as e:
                    print(f"Error in show_video: {e}")
                    return None

            # Connect the controls to table updates
            search_box.change(update_table, inputs=[search_box, column_selector], outputs=libero_leaderboard)
            column_selector.change(update_table, inputs=[search_box, column_selector], outputs=libero_leaderboard)

            # Connect the leaderboard selection to the video display
            libero_leaderboard.select(show_video, outputs=video_display)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
            # About LIBERO
            gr.Markdown(
                """
                ### 📖 About LIBERO

                LIBERO is a **benchmark suite** for evaluating **Vision-Language-Action (VLA)** models across
                a variety of robotics tasks. It provides a standardized setup so researchers and developers can
                compare models fairly.

                ### 🔗 Key Resources

                📄 **LIBERO Paper**: [LIBERO: Benchmarking Knowledge Transfer for Lifelong Robot Learning](https://arxiv.org/abs/2306.03310)

                💻 **Original LIBERO Repository**: [Lifelong-Robot-Learning/LIBERO](https://github.com/Lifelong-Robot-Learning/LIBERO)

                **Evaluation Metrics**
                - 📊 Each task suite column shows the **success rate** for that suite (0.0 - 1.0)
                - 📏 **Model Size**: Parameter count (e.g., 1B, 3B)
                - 📈 **Average**: Mean score across all evaluated LIBERO task suites
                - ✅ **Available**: Whether model checkpoints are available or the scores are paper-only
                - 📄 **Paper**: Link to the research paper
                - 🎥 **Video**: Click on a model's score cells to see a demo video, if available
                """
            )
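            # Worked example (illustrative, not the official scoring code) of how the
            # Average column relates to the per-suite success rates listed above.
            gr.Markdown(
                """
                For instance, the **Average** column is consistent with taking the mean of a row's available
                suite scores and skipping `--` entries (an informal check, not the official scoring code):

                ```python
                # SmolVLA row: Spatial, Object, Goal, Long ("90" is "--" and skipped)
                scores = [0.90, 1.0, 1.0, 0.6]
                average = sum(scores) / len(scores)  # 0.875, shown as 0.87
                ```
                """,
                elem_classes="markdown-text"
            )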
            # LIBERO task suites description
            gr.Markdown(
                """
                ### 📋 LIBERO Task Suites

                LIBERO includes five task suites, each with a different focus:

                - 🧭 **LIBERO-Spatial** (`libero_spatial`) – tasks that require reasoning about spatial relations
                - 🎯 **LIBERO-Object** (`libero_object`) – tasks centered on manipulating different objects
                - 🏁 **LIBERO-Goal** (`libero_goal`) – goal-conditioned tasks where the robot must adapt to changing targets
                - ⚡ **LIBERO-90** (`libero_90`) – 90 short-horizon tasks from the LIBERO-100 collection
                - 🔄 **LIBERO-Long** (`libero_10`) – 10 long-horizon tasks from the LIBERO-100 collection
                """
            )

        with gr.TabItem("🚀 How To Contribute!", elem_id="llm-benchmark-tab-table", id=2):
            # How to contribute section
            gr.Markdown(
                """
                # 🚀 How to Contribute to the LIBERO Leaderboard

                To add your model to the LIBERO leaderboard, we suggest checking the docs for using LIBERO with
                [LeRobot](https://huggingface.co/docs/lerobot/libero). As a quick overview, here are the steps:

                **1. Train** on the LIBERO dataset:
                👉 [HuggingFaceVLA/libero](https://huggingface.co/datasets/HuggingFaceVLA/libero) *(LeRobot-compatible preprocessed dataset)*
                📝 *Official dataset: [physical-intelligence/libero](https://huggingface.co/datasets/physical-intelligence/libero)*

                **2. Evaluate** using `lerobot` with the following script:

                ```bash
                #!/bin/bash

                # Storage / caches
                RAID=/raid/jade
                export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
                export HF_HOME=$RAID/.cache/huggingface
                export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
                export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
                export WANDB_CACHE_DIR=$RAID/.cache/wandb
                export TMPDIR=$RAID/.cache/tmp
                mkdir -p $TMPDIR

                export WANDB_MODE=offline
                export TOKENIZERS_PARALLELISM=false
                export MUJOCO_GL=egl
                export CUDA_VISIBLE_DEVICES=2

                # Configuration
                POLICY_PATH="/raid/jade/models/smolvla_pipe"
                TASK=libero_spatial
                ENV_TYPE="libero"
                BATCH_SIZE=1
                N_EPISODES=1
                N_ACTION_STEPS=10

                # Run evaluation
                python src/lerobot/scripts/eval.py \\
                    --policy.path="$POLICY_PATH" \\
                    --env.type="$ENV_TYPE" \\
                    --eval.batch_size="$BATCH_SIZE" \\
                    --eval.n_episodes="$N_EPISODES" \\
                    --env.task=$TASK \\
                    --env.max_parallel_tasks=10 \\
                    --policy.n_action_steps=$N_ACTION_STEPS
                ```

                **3. Submit your results** by opening a GitHub issue. We'll add your model + video to the leaderboard!

                ### 📋 Dataset Information

                When training on LIBERO tasks, make sure your dataset parquet and metadata keys follow the
                LeRobot convention. The environment expects:

                - `observation.state` → 8-dim agent state
                - `observation.images.image` → main camera (agentview_image)
                - `observation.images.image2` → wrist camera (robot0_eye_in_hand_image)

                ⚠️ **Important**: Cleaning the dataset upfront is more efficient than remapping keys inside the
                code. To avoid potential mismatches and key errors, we provide a preprocessed LIBERO dataset that
                is fully compatible with the current LeRobot codebase and requires no additional manipulation.
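                As a quick sanity check before training, you can confirm that a dataset exposes the expected
                keys. This is an illustrative sketch: `LeRobotDataset` is LeRobot's dataset class, but its exact
                import path can differ between LeRobot versions.

                ```python
                # Illustrative only: verify the observation keys the LIBERO environment expects.
                from lerobot.common.datasets.lerobot_dataset import LeRobotDataset  # import path may vary by version

                dataset = LeRobotDataset("HuggingFaceVLA/libero")
                sample = dataset[0]  # a dict of tensors keyed by feature name
                for key in ("observation.state", "observation.images.image", "observation.images.image2"):
                    assert key in sample, "missing key: " + key
                print(sample["observation.state"].shape)  # expected: 8-dim agent state
                ```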
                **Installation** (after following the [LeRobot installation](https://huggingface.co/docs/lerobot/en/installation)):

                ```bash
                pip install -e ".[libero]"
                export MUJOCO_GL=egl  # for headless servers (HPC, cloud)
                ```

                ---
                """,
                elem_classes="markdown-text"
            )

    # Citation accordion (currently disabled)
    '''
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
    '''

# Restart the Space every 30 minutes to pick up new evaluation results
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()