|
|
\documentclass{beamer} |
|
|
|
|
|
|
|
|
|
|
|
\usetheme{Madrid} |
|
|
\usecolortheme{default} |
|
|
|
|
|
|
|
|
\usepackage[utf8]{inputenc} |
|
|
\usepackage[T1]{fontenc} |
|
|
\usepackage{amsmath, amssymb, amsfonts} |
|
|
\usepackage{booktabs} |
|
|
\usepackage{graphicx} |
|
|
\usepackage{hyperref} |
|
|
\usepackage{bm} |
|
|
|
|
|
|
|
|
\newcommand{\KL}{D_{\mathrm{KL}}} |
|
|
\newcommand{\figref}[1]{Figure~\ref{#1}}
|
|
|
|
|
\title[Meta-Safe RL]{A CMDP-within-online framework for Meta-Safe Reinforcement Learning} |
|
|
\author{Vanshaj Khattar\inst{1} \and Yuhao Ding\inst{2} \and Bilgehan Sel\inst{1} \and Javad Lavaei\inst{2} \and Ming Jin\inst{1}} |
|
|
\institute[VT \& UCB]{ |
|
|
\inst{1} Virginia Tech \\ |
|
|
\inst{2} UC Berkeley |
|
|
} |
|
|
\date{\today} |
|
|
|
|
|
|
|
|
\setbeamerfont{caption}{size=\scriptsize} |
|
|
\begin{document} |
|
|
\begin{frame} |
|
|
\titlepage |
|
|
\end{frame} |
|
|
\begin{frame}{Outline} |
|
|
\tableofcontents |
|
|
\end{frame} |
|
|
\section{Motivation} |
|
|
\begin{frame}{Motivation: Why Meta-Safe RL?} |
|
|
\begin{block}{Background: Meta-Reinforcement Learning (Meta-RL)} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item Meta-RL enables agents to learn new tasks quickly with limited experience. |
|
|
\item It is a ``learning-to-learn'' framework that has proven successful in robotics, federated learning, and other domains.
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{The Problem: Safety is Critical} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item Many real-world applications have \alert{safety constraints} that must not be violated (e.g., robotics, autonomous driving). |
|
|
\item Existing Meta-RL methods do not adequately address these constraints. |
|
|
\item Safe RL problems are often modeled as \alert{Constrained Markov Decision Processes (CMDPs)}, but standard CMDP algorithms do not generalize efficiently to new tasks.
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Our Goal} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item Develop a principled framework, \alert{Meta-Safe RL (Meta-SRL)}, that combines the fast adaptation of meta-learning with the safety guarantees of Safe RL. |
|
|
\item Provide the \alert{first provable guarantees} for learning across multiple safe RL tasks. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\section{Related Work} |
|
|
\begin{frame}{Related Work} |
|
|
\begin{itemize} |
|
|
\item \textbf{Meta-Reinforcement Learning:} |
|
|
\begin{itemize} |
|
|
\item Focuses on learning initial conditions, hyperparameters, etc., for fast adaptation. |
|
|
\item Most work is for \alert{unconstrained} environments. |
|
|
\end{itemize} |
|
|
\item \textbf{Online Meta-Learning:} |
|
|
\begin{itemize} |
|
|
\item Provides theoretical frameworks, often for convex and decomposable loss functions. |
|
|
\item Our work extends this to the \alert{nonconvex and complex} setting of CMDPs. |
|
|
\end{itemize} |
|
|
\item \textbf{Safe RL and CMDPs:} |
|
|
\begin{itemize} |
|
|
\item A rich field with many algorithms (e.g., primal-dual, policy-based like \alert{CRPO}). |
|
|
\item However, these are designed for a \alert{single task} and are not built to generalize or adapt quickly to unseen tasks. |
|
|
\end{itemize} |
|
|
\end{itemize} |
|
|
\end{frame} |
|
|
\section{Method} |
|
|
\begin{frame}{Method: CMDP-within-Online Framework} |
|
|
\begin{block}{Core Idea} |
|
|
\footnotesize |
|
|
|
|
\begin{itemize} |
|
|
\item A \alert{meta-learner} (online algorithm) operates over a sequence of CMDP tasks. |
|
|
\item For each task $t$, the meta-learner provides an initial policy $\alert{\pi_{t,0}}$ and a learning rate $\alert{\alpha_t}$ to a \alert{within-task} Safe RL algorithm (e.g., CRPO). |
|
|
\item The goal is to minimize the \textbf{Task-Averaged Optimality Gap (TAOG)} and the \textbf{Task-Averaged Constraint Violation (TACV)}, defined below.
|
|
\end{itemize} |
|
|
\end{block} |
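
\begin{block}{Task-Averaged Metrics}
\scriptsize
Concretely, with $R_{t,0}$ and $R_{t,i}$ the within-task optimality gap and constraint violation of task $t$ (bounded on the next slide), the task-averaged metrics take the form
\begin{equation*}
\text{TAOG} = \frac{1}{T}\sum_{t=1}^{T} R_{t,0},
\qquad
\text{TACV} = \frac{1}{T}\sum_{t=1}^{T} R_{t,i}, \quad i = 1,\dots,p.
\end{equation*}
\end{block}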
|
|
\begin{figure} |
|
|
\centering |
|
|
\includegraphics[width=0.3\textwidth]{illustrate.pdf} |
|
|
\caption{Conceptual illustration of the meta-learning process.} |
|
|
\label{fig:method_concept} |
|
|
\end{figure} |
|
|
\end{frame} |
|
|
\begin{frame}{Method: The Within-Task Algorithm (CRPO)} |
|
|
|
|
|
\begin{block}{Constrained Markov Decision Process (CMDP)} |
|
|
\footnotesize |
|
|
For each task $t$, the agent aims to solve: |
|
|
\begin{equation*} |
|
|
\max_{\pi} \; J_{t,0}(\pi) \quad \text{s.t.} \quad \alert{J_{t,i}(\pi) \leq d_{t,i}}, \quad \forall\, i = 1,\dots,p
|
|
\end{equation*} |
|
|
where $J_{t,0}$ is the expected reward, $J_{t,1},\dots,J_{t,p}$ are the expected costs, and $d_{t,i}$ are the corresponding constraint thresholds.
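For instance, in a discounted formulation, $J_{t,i}(\pi) = \mathbb{E}_{\pi, P_t}\big[\sum_{h=0}^{\infty} \gamma^h \, c_{t,i}(s_h, a_h)\big]$ with $c_{t,0}$ taken to be the reward $r_t$ (the exact horizon/discount convention follows the within-task setting).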
|
|
\end{block} |
|
|
|
|
|
\begin{block}{CRPO Algorithm \& Regret} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item We use the Constraint-Rectified Policy Optimization (\alert{CRPO}) algorithm. |
|
|
\item The single-task optimality gap ($R_0$) and constraint violation ($R_i$) are bounded by: |
|
|
\begin{equation*} |
|
|
R_0,\, R_i \;\leq\; \mathcal{O}\!\left( \frac{\mathbb{E}_{s \sim \nu_t^*}\big[\alert{\KL(\pi_t^* \,\|\, \pi_{t,0})}\big]}{\alpha_t M} + \alpha_t \right)
|
|
\end{equation*} |
|
|
\item \textbf{Key Insight:} The performance depends heavily on the KL-divergence between the optimal policy $\pi_t^*$ and the initial policy $\pi_{t,0}$. |
|
|
\item Our meta-learner optimizes this upper bound by choosing a good initialization $\alert{\pi_{t,0}}$ and learning rate $\alert{\alpha_t}$ (see the balance below).
|
|
\end{itemize} |
|
|
\end{block} |
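\footnotesize
Balancing the two terms of the bound (suppressing constants), with $D_t = \mathbb{E}_{s \sim \nu_t^*}[\KL(\pi_t^* \,\|\, \pi_{t,0})]$, gives $\alpha_t \asymp \sqrt{D_t/M}$ and hence $R_0, R_i \lesssim \sqrt{D_t/M}$: a good initialization and a well-chosen learning rate shrink both the optimality gap and the constraint violation.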
|
|
\end{frame} |
|
|
\begin{frame}{Method: The Inexact Framework} |
|
|
|
|
|
\begin{block}{Challenge: Unknown Optimal Policies} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item In practice, the optimal policy $\alert{\pi_t^*}$ and its state distribution $\alert{\nu_t^*}$ are unknown. |
|
|
\item We only have access to a suboptimal policy $\alert{\hat{\pi}_t}$ and collected trajectory data $\alert{\mathcal{D}_t}$. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Solution: Estimate and Bound the Error} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \textbf{Estimate:} Use the suboptimal policy $\hat{\pi}_t$ and estimate its state distribution $\hat{\nu}_t$ from data $\mathcal{D}_t$ using \alert{DualDICE}. |
|
|
\item \textbf{Inexact Loss:} The meta-learner optimizes an inexact loss function: |
|
|
\[ \hat{f}_{t}(\phi) = \mathbb{E}_{\hat{\nu}_t}\big[\KL(\hat{\pi}_t \,\|\, \phi)\big] \]
|
|
\item \textbf{Bound the Error:} We prove a bound on the estimation error: |
|
|
\[ \Big| \mathbb{E}_{\nu_t^*}\big[\KL(\pi_t^* \,\|\, \phi)\big] - \mathbb{E}_{\hat{\nu}_t}\big[\KL(\hat{\pi}_t \,\|\, \phi)\big] \Big| \leq \alert{\epsilon_t} \]
|
|
This bound (Thm. 3.1) is derived using novel techniques from \alert{tame geometry}. |
|
|
\end{itemize} |
|
|
\end{block} |
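\footnotesize
Consequently, writing $f_t(\phi) := \mathbb{E}_{\nu_t^*}[\KL(\pi_t^* \,\|\, \phi)]$ for the exact loss, $\big|\sum_{t} f_t(\phi) - \sum_{t} \hat{f}_t(\phi)\big| \leq \sum_{t} \epsilon_t$ for any fixed $\phi$, so regret accumulated on the inexact losses controls regret on the exact ones up to the total estimation error.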
|
|
\end{frame} |
|
|
\begin{frame}{Method: Dynamic Regret \& Adaptive Learning Rates} |
|
|
|
|
|
\begin{block}{Challenge: Adapting to Dynamic Environments} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item A fixed meta-initialization may not be optimal if the environment changes over time. |
|
|
\item Setting the learning rate $\alpha_t$ optimally requires knowledge of future tasks. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Solution: Separate Online Learners} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item We decompose the regret upper bound into two components. |
|
|
\item We use two parallel Online Gradient Descent (OGD) algorithms: |
|
|
\begin{enumerate} |
|
|
\item \textbf{INIT}: Learns the policy initialization $\alert{\pi_{t,0}}$ by minimizing $\hat{f}_{t}^{\mathrm{init}}(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t \,\|\, \phi)]$.
|
|
\item \textbf{SIM}: Learns the learning rate $\alert{\alpha_t}$ by minimizing its own loss term $\hat{f}_t^{\mathrm{sim}}(\kappa)$.
|
|
\end{enumerate} |
|
|
\item This allows the framework to adapt both the policy initialization and the learning rate online, without knowing task properties in advance (a minimal code sketch follows on the next slide).
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
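
\begin{frame}[fragile]{Method: INIT Learner, a Minimal Sketch}
\footnotesize
A minimal, self-contained sketch (not our exact implementation) of one OGD step of the \textbf{INIT} learner on the inexact loss $\hat{f}_{t}^{\mathrm{init}}$, assuming tabular softmax policies parameterized by logits; \texttt{pi\_hat} and \texttt{nu\_hat} stand in for the within-task CRPO output and the DualDICE distribution estimate. The SIM learner updates $\alpha_t$ analogously on its own loss.
\begin{verbatim}
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def init_loss_and_grad(phi, pi_hat, nu_hat):
    # Inexact loss: sum_s nu_hat(s) * KL(pi_hat(.|s) || softmax(phi)(.|s)).
    pi_phi = softmax(phi)
    kl = (pi_hat * (np.log(pi_hat) - np.log(pi_phi))).sum(axis=1)
    # Gradient w.r.t. logits phi: nu_hat(s) * (pi_phi - pi_hat) per state.
    return nu_hat @ kl, nu_hat[:, None] * (pi_phi - pi_hat)

rng = np.random.default_rng(0)
S, A, eta = 5, 3, 0.5
phi = np.zeros((S, A))                      # meta-initialization (logits)
pi_hat = softmax(rng.normal(size=(S, A)))   # suboptimal within-task policy
nu_hat = np.full(S, 1.0 / S)                # estimated state distribution
loss, grad = init_loss_and_grad(phi, pi_hat, nu_hat)
phi -= eta * grad                           # one online gradient descent step
\end{verbatim}
\end{frame}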
|
|
\section{Innovation} |
|
|
\begin{frame}{Our Innovations} |
|
|
\begin{block}{Novel Framework and Guarantees} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item The \alert{first provable guarantees} for Meta-Safe RL, establishing bounds on task-averaged optimality gap (TAOG) and constraint violation (TACV). |
|
|
\item The regret bounds explicitly improve with \alert{task-similarity} ($\hat{D}^*$) or \alert{task-relatedness} ($\hat{V}_\psi$). |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Practical and Adaptive Algorithm} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \textbf{Inexact framework}: Works with suboptimal policies and estimates distributions using \alert{DualDICE}, making it practical. |
|
|
\item \textbf{Adaptive learning}: The meta-learner adapts both policy initialization and learning rates for each task, handling dynamic environments. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Technical Contributions} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item New analysis of the \alert{optimization landscape of CMDPs} using tame geometry to bound the distance between optimal and suboptimal policies. |
|
|
\item Extended analysis for \alert{inexact online gradient descent} to handle dynamic regret with biased gradient estimates. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\begin{frame} |
|
|
\centering |
|
|
\Huge |
|
|
Experimental Evaluation |
|
|
\end{frame} |
|
|
\section{Experimental Method} |
|
|
\begin{frame}{Experimental Method} |
|
|
\begin{block}{Objective} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item To empirically validate the effectiveness of our \alert{Meta-SRL} framework against standard meta-learning baselines. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Baselines for Comparison} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \alert{Random Initialization}: Standard CRPO with a new random policy for each task. |
|
|
\item \alert{Pre-trained}: Initialize with the final policy from the previous task. |
|
|
\item \alert{Simple Averaging}: Offline average of all previously learned policies. |
|
|
\item \alert{Follow the Average Leader (FAL)}: Online average of all previously learned policies. |
|
|
\end{itemize} |
|
|
\end{block} |
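\footnotesize
Schematically (one natural reading, in policy/parameter space): \alert{Pre-trained} sets $\pi_{t,0} = \hat{\pi}_{t-1}$, while \alert{Simple Averaging} and \alert{FAL} set $\pi_{t,0}$ to the average $\tfrac{1}{t-1}\sum_{s<t}\hat{\pi}_s$ of previously learned policies, computed offline and online, respectively.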
|
|
|
|
|
\begin{block}{Task Generation} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item We generate a sequence of related CMDP tasks by sampling from a distribution over environment parameters (e.g., transition dynamics, reward functions). |
|
|
\item We test under two conditions: \alert{high task-similarity} and \alert{low task-similarity}. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\section{Experimental Setting} |
|
|
\begin{frame}{Experimental Setting} |
|
|
\begin{block}{Environments} |
|
|
\footnotesize |
|
|
We evaluate on environments of increasing complexity, each augmented with safety constraints:
|
|
\begin{itemize} |
|
|
\item \textbf{OpenAI Gym:} |
|
|
\begin{itemize} |
|
|
\item \alert{FrozenLake}: Discrete state space, $T=10$ tasks. |
|
|
\item \alert{Acrobot}: Continuous state space, $T=50$ tasks. |
|
|
\end{itemize} |
|
|
\item \textbf{MuJoCo:} |
|
|
\begin{itemize} |
|
|
\item \alert{Half-Cheetah}: High-dimensional continuous control, $T=100$ tasks. Constraint on head height. |
|
|
\item \alert{Humanoid}: Very high-dimensional, $T=250$ tasks. Constraint on joint angles for smooth motion. |
|
|
\end{itemize} |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\section{Experimental Results} |
|
|
\begin{frame}{Experimental Results: Low Task-Similarity} |
|
|
\begin{columns}[T] |
|
|
\begin{column}{0.5\textwidth} |
|
|
\centering |
|
|
\textbf{FrozenLake} |
|
|
\includegraphics[width=1\textwidth]{FrozenLake/FrozenLakeLowSimilarity.pdf} |
|
|
\end{column} |
|
|
\begin{column}{0.5\textwidth} |
|
|
\centering |
|
|
\textbf{Acrobot} |
|
|
\includegraphics[width=1\textwidth]{Acrobot/Acrobot_low_similarity2.pdf} |
|
|
\end{column} |
|
|
\end{columns} |
|
|
|
|
|
\begin{block}{Observations} |
|
|
\footnotesize |
|
|
|
|
\begin{itemize} |
|
|
\item In settings with low task similarity, \alert{Meta-SRL} (our method) consistently learns faster and more safely. |
|
|
\item It achieves higher rewards while rapidly satisfying the safety constraints (driving constraint violation to zero). |
|
|
\item Simpler baselines like \alert{FAL} and \alert{Pre-trained} struggle to satisfy constraints or learn good policies. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\begin{frame}{Experimental Results: MuJoCo Environments} |
|
|
\centering |
|
|
\textbf{Half-Cheetah (Low Task-Similarity)} |
|
|
\begin{figure} |
|
|
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahReward_low_task_similarity_broken_axis.pdf} |
|
|
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahCost_low_task_similarity.pdf} |
|
|
\caption{Reward (top) and constraint violation (bottom) for Half-Cheetah. Our method (Meta-SRL) learns a high-reward policy while keeping the constraint violation below the threshold (blue line).} |
|
|
\label{fig:halfcheetah} |
|
|
\end{figure} |
|
|
\end{frame} |
|
|
\section{Ablation Experiment} |
|
|
\begin{frame}{Ablation Analysis} |
|
|
While we did not run a dedicated ablation study, the comparisons against the baselines isolate the contributions of Meta-SRL's key components.
|
|
|
|
|
\begin{block}{Meta-SRL vs. FAL / Simple Averaging} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \textbf{Ablated Component:} The intelligent meta-update (using \alert{DualDICE} estimates and \alert{OGD} on the regret bound). |
|
|
\item \textbf{Result:} Meta-SRL significantly outperforms simple averaging, showing that a weighted, adaptive update is crucial and superior to naive averaging. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Meta-SRL vs. Pre-trained} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \textbf{Ablated Component:} Learning from a history of multiple tasks. The pre-trained baseline only uses the most recent task. |
|
|
\item \textbf{Result:} Meta-SRL is more robust, especially in low-similarity settings, demonstrating the benefit of aggregating knowledge from diverse past experiences. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Conclusion} |
|
|
\footnotesize |
|
|
The full \alert{Meta-SRL} model, with its inexact estimation and adaptive learning, is critical for achieving strong performance and safety. |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\section{Deficiencies} |
|
|
\begin{frame}{Limitations of the Current Method} |
|
|
\begin{itemize} |
|
|
\item \textbf{Algorithm-Specific Guarantees:} |
|
|
\begin{itemize} |
|
|
\item Our theoretical framework is built upon the \alert{CRPO} algorithm. |
|
|
\item Extending it to other within-task Safe RL algorithms (e.g., primal-dual methods) would require a new analysis of their specific regret bounds. |
|
|
\end{itemize} |
|
|
\bigskip |
|
|
\item \textbf{No Hard Safety Guarantees During Learning:} |
|
|
\begin{itemize} |
|
|
\item The framework minimizes task-averaged constraint violation, achieving safety \textit{on average} and \textit{asymptotically}. |
|
|
\item It does not guarantee \alert{zero constraint violation} at every step during the learning process, which may be a requirement for highly critical systems. |
|
|
\end{itemize} |
|
|
\end{itemize} |
|
|
\end{frame} |
|
|
\section{Future Research} |
|
|
\begin{frame}{Future Research Directions} |
|
|
\begin{itemize} |
|
|
\item \textbf{Meta-SRL with Zero-Violation Guarantees:} |
|
|
\begin{itemize} |
|
|
\item Designing frameworks that can provide hard safety constraints throughout the learning phase, possibly by integrating pessimistic or certified approaches. |
|
|
\end{itemize} |
|
|
\bigskip |
|
|
\item \textbf{Extension to More Complex Scenarios:} |
|
|
\begin{itemize} |
|
|
\item \alert{Non-stationary environments} where the task distribution itself may shift over time. |
|
|
\item \alert{Multi-agent settings}, where agents must learn to coordinate safely and adapt to each other's policies. |
|
|
\end{itemize} |
|
|
\bigskip |
|
|
\item \textbf{Fairness and Socially Responsible AI:} |
|
|
\begin{itemize} |
|
|
\item Adapting the framework to handle \alert{fairness constraints}, ensuring that RL agents do not produce biased or discriminatory outcomes in non-stationary environments. |
|
|
\end{itemize} |
|
|
\end{itemize} |
|
|
\end{frame} |
|
|
\section{End} |
|
|
\begin{frame} |
|
|
\centering |
|
|
\Huge |
|
|
Thank You! |
|
|
\vfill |
|
|
\Large |
|
|
Questions? |
|
|
\end{frame} |
|
|
\end{document} |