import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
LLM_BENCHMARKS_TEXT,
INTRODUCTION_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
def restart_space():
API.restart_space(repo_id=REPO_ID)
### Space initialisation
try:
print(EVAL_REQUESTS_PATH)
snapshot_download(
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
try:
print(EVAL_RESULTS_PATH)
snapshot_download(
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
# LIBERO Leaderboard Data
LIBERO_DATA = [
['<a href="https://huggingface.co/HuggingFaceVLA/smolvla_libero" target="_blank">HuggingFaceVLA/smolvla_libero</a>', "HuggingFace", "450M", 0.90, 1.0, 1.0, "--", 0.6, 0.87, "βœ… Checkpoint Available", '<a href="https://huggingface.co/papers/2506.01844" target="_blank">πŸ“„ SmolVLA Paper</a>', "smolvla_spatial.mp4"],
['<a href="https://huggingface.co/lerobot/pi0" target="_blank">lerobot/pi0</a>', "Physical Intelligence", "3.3B", 0.90, 0.86, 0.95, "--", 0.73, 0.86, "Reported Score Only", '<a href="https://huggingface.co/papers/2410.24164" target="_blank">πŸ“„ Pi0 Paper</a>', "pi0.mp4"],
]
LIBERO_COLUMNS = [
"Model",
"Organization",
"Model Size",
"Spatial",
"Object",
"Goal",
"90",
"Long",
"Average",
"Available",
"Paper",
"Video"
]
# Columns to display in the table (excluding Video and Organization columns)
LIBERO_DISPLAY_COLUMNS = [
"Model",
"Model Size",
"Spatial",
"Object",
"Goal",
"90",
"Long",
"Average",
"Available",
"Paper"
]
LIBERO_DF = pd.DataFrame(LIBERO_DATA, columns=LIBERO_COLUMNS)
def get_libero_leaderboard():
return LIBERO_DF
def get_video_by_model_and_task(model_name, task_name):
"""Get video file path for a given model and task"""
# Task-specific videos for each model (only SmolVLA has videos available)
model_task_videos = {
"SmolVLA": {
"Spatial": "smolvla_spatial.mp4",
"Object": "smolvla_object.mp4",
"Goal": "smolvla_goal.mp4",
"90": "smolvla_90.mp4",
"Long": "smolvla_long.mp4"
}
# Pi0 videos not available yet
}
# Get the video for the specific model and task
if model_name in model_task_videos and task_name in model_task_videos[model_name]:
video_file = model_task_videos[model_name][task_name]
print(f"Selected model: {model_name}, Task: {task_name}, Video file: {video_file}")
return video_file
else:
print(f"No video available for {model_name} - {task_name}")
return None
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
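# NOTE: LEADERBOARD_DF, the queue dataframes, and init_leaderboard below are kept from
# the original leaderboard scaffold; the UI below renders its own static LIBERO table.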
def init_leaderboard(dataframe):
if dataframe is None or dataframe.empty:
raise ValueError("Leaderboard DataFrame is empty or None.")
print([c.type for c in fields(AutoEvalColumn)])
return Leaderboard(
value=dataframe,
datatype=[c.type for c in fields(AutoEvalColumn)],
select_columns=SelectColumns(
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
label="Select Columns to Display:",
),
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
filter_columns=[
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
ColumnFilter(
AutoEvalColumn.params.name,
type="slider",
min=0.01,
max=150,
label="Select the number of parameters (B)",
),
ColumnFilter(
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
),
],
bool_checkboxgroup_label="Hide models",
interactive=False,
)
custom_css_extended = custom_css + """
/* More specific selectors to override Gradio defaults */
.gradio-container #libero-leaderboard th,
#libero-leaderboard thead th,
#libero-leaderboard th {
font-size: 10px !important;
font-weight: bold !important;
padding: 6px 8px !important;
}
.gradio-container #libero-leaderboard td,
#libero-leaderboard tbody td,
#libero-leaderboard td {
font-size: 12px !important;
padding: 6px 8px !important;
}
#libero-leaderboard th:first-child,
#libero-leaderboard td:first-child {
min-width: 300px !important;
max-width: 400px !important;
width: 350px !important;
}
#libero-leaderboard a {
color: #0066cc !important;
text-decoration: none !important;
}
#libero-leaderboard a:hover {
text-decoration: underline !important;
}
"""
demo = gr.Blocks(css=custom_css_extended)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("πŸ† LIBERO Leaderboard", elem_id="libero-benchmark-tab-table", id=0):
# Header with image
'''
with gr.Row():
gr.Markdown(
"""
<div align="center">
<h1>πŸ† LIBERO Leaderboard</h1>
<h3>Benchmarking <b>Vision-Language-Action (VLA)</b> Policies in Simulation</h3>
<p style="font-size:16px;">Made with ❀️ by <b>HuggingFace VLA</b></p>
<img src="https://libero-project.github.io/assets/images/libero_banner.png"
alt="LIBERO Banner" style="max-width: 80%; border-radius: 12px; margin-top: 20px;">
</div>
"""
)
'''
# Full-width Leaderboard Section
with gr.Group():
gr.Markdown("### πŸ… Current Leaderboard")
# Controls and video section in same row
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### πŸ” Search & Controls")
search_box = gr.Textbox(
label="Search models",
placeholder="Type model name to search...",
interactive=True
)
# Define columns that are always shown (not selectable)
MANDATORY_COLUMNS = ["Model", "Model Size", "Paper"]
# Define columns that can be toggled
SELECTABLE_COLUMNS = [col for col in LIBERO_DISPLAY_COLUMNS if col not in MANDATORY_COLUMNS]
column_selector = gr.CheckboxGroup(
choices=SELECTABLE_COLUMNS,
value=SELECTABLE_COLUMNS,
label="Select optional columns to display",
interactive=True
)
gr.Markdown("**Always shown:** Model, Model Size, Paper")
with gr.Column(scale=1):
gr.Markdown("### πŸŽ₯ Model Video Demo")
gr.Markdown("Click on any model row in the table below to see its demo video")
video_display = gr.Video(
label="Demo video will appear here when you click on a model",
height=300,
autoplay=False,
show_label=True,
interactive=False,  # display-only; the video is set programmatically on row clicks
value=None
)
# Use a plain gr.Dataframe instead of the Leaderboard component so row-click events and manual column toggling stay simple
libero_leaderboard = gr.Dataframe(
value=get_libero_leaderboard()[LIBERO_DISPLAY_COLUMNS],
headers=LIBERO_DISPLAY_COLUMNS,
interactive=False,
wrap=True,
datatype=["html", "str", "number", "number", "number", "str", "number", "number", "str", "html"],
elem_id="libero-leaderboard",
)
# Helper text
gr.Markdown(
"""
**πŸ’‘ Tips**:
- Use the search box to find specific models
- **Click on SmolVLA scores** (Spatial, Object, Goal, 90, Long) to see task-specific demo videos above
- **Click on model names** to go directly to HuggingFace repositories
- 🎬 **Videos available**: SmolVLA task demos | **Pi0 videos**: Coming soon!
""",
elem_classes="markdown-text"
)
# Function to get datatype for a column (currently unused; kept for reference)
def get_column_datatype(column_name):
"""Return the appropriate datatype for each column"""
if column_name in ["Model", "Paper"]:
return "html" # Contains HTML links
elif column_name in ["Spatial", "Object", "Goal", "Long", "Average"]:
return "number"
elif column_name == "90":
return "str" # Can contain "--"
else:
return "str" # Default for Model Size, Available, etc.
# Function to filter and update the table - using a simpler approach
def update_table(search_term, selected_columns):
df = get_libero_leaderboard()
# Filter by search term
if search_term:
mask = df['Model'].str.contains(search_term, case=False, na=False)
df = df[mask]
# Handle column filtering by replacing hidden columns with empty strings
# This keeps the datatype array stable while hiding unwanted data
result_df = df[LIBERO_DISPLAY_COLUMNS].copy()
# Always include mandatory columns + selected optional columns
MANDATORY_COLUMNS = ["Model", "Model Size", "Paper"]
SELECTABLE_COLUMNS = [col for col in LIBERO_DISPLAY_COLUMNS if col not in MANDATORY_COLUMNS]
# Hide unselected optional columns by replacing their content with empty strings
if selected_columns is not None:
for col in SELECTABLE_COLUMNS:
if col not in selected_columns:
result_df[col] = "" # Hide the column content but keep the structure
return result_df
# Function to handle row selection and display video
def show_video(evt: gr.SelectData):
try:
print(f"Leaderboard click event: {evt}")
if hasattr(evt, 'index') and evt.index is not None:
if isinstance(evt.index, (list, tuple)) and len(evt.index) >= 2:
row_idx = evt.index[0]
col_idx = evt.index[1]
else:
row_idx = evt.index
col_idx = 0
print(f"Selected row: {row_idx}, column: {col_idx}")
# Map column indices to task names (based on LIBERO_DISPLAY_COLUMNS)
# Model, Model Size, Spatial, Object, Goal, 90, Long, Average, Available, Paper
task_mapping = {
2: "Spatial", # Spatial column
3: "Object", # Object column
4: "Goal", # Goal column
5: "90", # 90 column
6: "Long" # Long column
}
# Only show video when clicking on score columns (columns 2-6 are the LIBERO scores)
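# NOTE: row_idx refers to the currently displayed table; with the two fixed rows this
# matches LIBERO_DATA, but a search filter that hides rows would shift the mapping.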
if col_idx in task_mapping and row_idx < len(LIBERO_DATA):
# Extract model name from HTML link
model_html = LIBERO_DATA[row_idx][0]
if "smolvla" in model_html.lower():
model_name = "SmolVLA"
elif "pi0" in model_html.lower():
model_name = "Pi0"
else:
model_name = "SmolVLA" # default
task_name = task_mapping[col_idx]
print(f"Model selected: {model_name}, Task: {task_name}")
video_path = get_video_by_model_and_task(model_name, task_name)
print(f"Video path returned: {video_path}")
if video_path:
return video_path
else:
# Return None to clear the video display and show a message in console
print(f"Videos coming soon for {model_name}!")
return None
print("Click on a score column (Spatial, Object, Goal, 90, Long) to see task-specific video")
return None
except Exception as e:
print(f"Error in show_video: {e}")
return None
# Connect the controls to table updates
search_box.change(update_table, inputs=[search_box, column_selector], outputs=libero_leaderboard)
column_selector.change(update_table, inputs=[search_box, column_selector], outputs=libero_leaderboard)
# Connect the leaderboard selection to video display
libero_leaderboard.select(show_video, outputs=video_display)
with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=1):
# About LIBERO
gr.Markdown(
"""
### πŸ“– About LIBERO
LIBERO is a **benchmark suite** for evaluating **Vision-Language-Action (VLA)** models across a variety of robotics tasks.
It provides a standardized setup so researchers and developers can compare models fairly.
### πŸ”— Key Resources
πŸ“„ **LIBERO Paper**: [LIBERO: Benchmarking Knowledge Transfer for Lifelong Robot Learning](https://arxiv.org/abs/2306.03310)
πŸ’» **Original LIBERO Repository**: [Lifelong-Robot-Learning/LIBERO](https://github.com/Lifelong-Robot-Learning/LIBERO)
### πŸ“Š Evaluation Metrics
- πŸ“Š Each task suite column shows the **success rate** for that specific suite (0.0 - 1.0)
- πŸ“ **Model Size**: Parameter count (e.g., 1B, 3B)
- πŸ“ˆ **Average**: Mean success rate across the task suites with reported scores (see the sketch below)
- βœ… **Available**: Whether a trained checkpoint is released or the score is paper-reported only
- πŸ“„ **Paper**: Link to the model's research paper
- πŸŽ₯ **Video**: Click a score cell in a model's row to see a task demo video, if available
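For reference, a minimal sketch of how the Average can be reproduced, assuming it is the mean over the suites with reported scores (skipping `--` entries; numbers taken from the Pi0 row):
```python
import pandas as pd

# Per-suite success rates from the Pi0 row; None stands for the "--" entry.
suite_scores = {"Spatial": 0.90, "Object": 0.86, "Goal": 0.95, "90": None, "Long": 0.73}
average = pd.Series(suite_scores, dtype="float64").dropna().mean()
print(round(average, 2))  # 0.86, matching the Average column
```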
"""
)
# LIBERO Task Suites Description
gr.Markdown(
"""
### πŸ“‹ LIBERO Task Suites
LIBERO includes five task suites, each with different focuses:
- 🧭 **LIBERO-Spatial** (`libero_spatial`) – tasks that require reasoning about spatial relations
- 🎯 **LIBERO-Object** (`libero_object`) – tasks centered on manipulating different objects
- 🏁 **LIBERO-Goal** (`libero_goal`) – goal-conditioned tasks where the robot must adapt to changing targets
- ⚑ **LIBERO-90** (`libero_90`) – 90 short-horizon tasks from the LIBERO-100 collection
- πŸ”„ **LIBERO-Long** (`libero_10`) – 10 long-horizon tasks from the LIBERO-100 collection
"""
)
with gr.TabItem("πŸš€ How To Contribute! ", elem_id="llm-benchmark-tab-table", id=2):
# How to Contribute Section
gr.Markdown(
"""
# πŸš€ How to Contribute to LIBERO Leaderboard
To add your model to the LIBERO leaderboard, we suggest checking the docs for using LIBERO with [LeRobot](https://huggingface.co/docs/lerobot/libero).
As a quick overview, here are the steps:
**1. Train** on the LIBERO dataset:
πŸ‘‰ [HuggingFaceVLA/libero](https://huggingface.co/datasets/HuggingFaceVLA/libero) *(LeRobot-compatible preprocessed dataset)*
πŸ“ *Official dataset: [physical-intelligence/libero](https://huggingface.co/datasets/physical-intelligence/libero)*
**2. Evaluate** using `lerobot` with the following script:
```bash
#!/bin/bash
# Storage / caches
RAID=/raid/jade
export TRANSFORMERS_CACHE=$RAID/.cache/huggingface/transformers
export HF_HOME=$RAID/.cache/huggingface
export HF_DATASETS_CACHE=$RAID/.cache/huggingface/datasets
export HF_LEROBOT_HOME=$RAID/.cache/huggingface/lerobot
export WANDB_CACHE_DIR=$RAID/.cache/wandb
export TMPDIR=$RAID/.cache/tmp
mkdir -p $TMPDIR
export WANDB_MODE=offline
export TOKENIZERS_PARALLELISM=false
export MUJOCO_GL=egl
export CUDA_VISIBLE_DEVICES=2
# Configuration
POLICY_PATH="/raid/jade/models/smolvla_pipe"
TASK=libero_spatial
ENV_TYPE="libero"
BATCH_SIZE=1
N_EPISODES=1
N_ACTION_STEPS=10
# Run evaluation
python src/lerobot/scripts/eval.py \\
--policy.path="$POLICY_PATH" \\
--env.type="$ENV_TYPE" \\
--eval.batch_size="$BATCH_SIZE" \\
--eval.n_episodes="$N_EPISODES" \\
--env.task=$TASK \\
--env.max_parallel_tasks=10 \\
--policy.n_action_steps=$N_ACTION_STEPS
```
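The script above evaluates a single task suite; the leaderboard's **Average** covers all five. Below is a minimal sketch of a sweep (a hypothetical wrapper reusing the flags from the script above; `POLICY_PATH` is a placeholder):
```python
# Sketch: run the same eval command once per LIBERO suite.
# POLICY_PATH is a placeholder; carry over the remaining flags from the script above.
import subprocess

SUITES = ["libero_spatial", "libero_object", "libero_goal", "libero_90", "libero_10"]
for task in SUITES:
    subprocess.run(
        [
            "python", "src/lerobot/scripts/eval.py",
            "--policy.path=POLICY_PATH",
            "--env.type=libero",
            f"--env.task={task}",
            "--env.max_parallel_tasks=10",
        ],
        check=True,  # abort the sweep if a suite fails
    )
```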
**3. Submit your results** by opening a GitHub issue.
We'll add your model + video to the leaderboard!
### πŸ“‹ Dataset Information
When training on LIBERO tasks, make sure your dataset parquet and metadata keys follow the LeRobot convention.
The environment expects the following keys (a sanity-check sketch follows the list):
- `observation.state` β†’ 8-dim agent state
- `observation.images.image` β†’ main camera (agentview_image)
- `observation.images.image2` β†’ wrist camera (robot0_eye_in_hand_image)
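A minimal sketch for that sanity check (assuming the `LeRobotDataset` API; the exact import path may differ across `lerobot` versions):
```python
# Sketch: verify the expected observation keys in the preprocessed dataset.
# Assumes the LeRobotDataset API; adjust the import path to your lerobot version.
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

ds = LeRobotDataset("HuggingFaceVLA/libero")
sample = ds[0]
for key in ("observation.state", "observation.images.image", "observation.images.image2"):
    assert key in sample, f"missing key: {key}"
print(sample["observation.state"].shape)  # expected: 8-dim agent state
```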
⚠️ **Important**: Cleaning the dataset upfront is more efficient than remapping keys inside the code. To avoid potential mismatches and key errors, we provide a preprocessed LIBERO dataset that is fully compatible with the current LeRobot codebase and requires no additional manipulation.
**Installation** (after following [LeRobot installation](https://huggingface.co/docs/lerobot/en/installation)):
```bash
pip install -e ".[libero]"
export MUJOCO_GL=egl # for headless servers (HPC, cloud)
```
---
""",
elem_classes="markdown-text"
)
'''
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
'''
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()