\documentclass{beamer}
% Theme and Color
\usetheme{Madrid}
\usecolortheme{default}
% Packages
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{amsmath, amssymb, amsfonts}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{bm} % For bold math symbols
% Custom commands from the source text for consistency
\newcommand{\KL}{D_{\mathrm{KL}}}
\def\figref#1{Figure~\ref{#1}}
\title[Meta-Safe RL]{A CMDP-within-online framework for Meta-Safe Reinforcement Learning}
\author{Vanshaj Khattar\inst{1} \and Yuhao Ding\inst{2} \and Bilgehan Sel\inst{1} \and Javad Lavaei\inst{2} \and Ming Jin\inst{1}}
\institute[VT \& UCB]{
\inst{1} Virginia Tech \\
\inst{2} UC Berkeley
}
\date{\today}
\setbeamerfont{caption}{size=\scriptsize}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Outline}
\tableofcontents
\end{frame}
\section{Motivation}
\begin{frame}{Motivation: Why Meta-Safe RL?}
\begin{block}{Background: Meta-Reinforcement Learning (Meta-RL)}
\footnotesize
\begin{itemize}
\item Meta-RL enables agents to learn new tasks quickly with limited experience.
\item It is a ``learning-to-learn'' framework that has proven successful in robotics, federated learning, and other domains.
\end{itemize}
\end{block}
\begin{block}{The Problem: Safety is Critical}
\footnotesize
\begin{itemize}
\item Many real-world applications have \alert{safety constraints} that must not be violated (e.g., robotics, autonomous driving).
\item Existing Meta-RL methods do not adequately address these constraints.
\item Safe RL problems are often modeled as \alert{Constrained Markov Decision Processes (CMDPs)}, but standard CMDP algorithms don't generalize efficiently to new tasks.
\end{itemize}
\end{block}
\begin{block}{Our Goal}
\footnotesize
\begin{itemize}
\item Develop a principled framework, \alert{Meta-Safe RL (Meta-SRL)}, that combines the fast adaptation of meta-learning with the safety guarantees of Safe RL.
\item Provide the \alert{first provable guarantees} for learning across multiple safe RL tasks.
\end{itemize}
\end{block}
\end{frame}
\section{Related Work}
\begin{frame}{Related Work}
\begin{itemize}
\item \textbf{Meta-Reinforcement Learning:}
\begin{itemize}
\item Focuses on learning initial conditions, hyperparameters, etc., for fast adaptation.
\item Most work is for \alert{unconstrained} environments.
\end{itemize}
\item \textbf{Online Meta-Learning:}
\begin{itemize}
\item Provides theoretical frameworks, often for convex and decomposable loss functions.
\item Our work extends this to the \alert{nonconvex and complex} setting of CMDPs.
\end{itemize}
\item \textbf{Safe RL and CMDPs:}
\begin{itemize}
\item A rich field with many algorithms (e.g., primal-dual, policy-based like \alert{CRPO}).
\item However, these are designed for a \alert{single task} and are not built to generalize or adapt quickly to unseen tasks.
\end{itemize}
\end{itemize}
\end{frame}
\section{Method}
\begin{frame}{Method: CMDP-within-Online Framework}
\begin{block}{Core Idea}
\footnotesize
\begin{itemize}
\item A \alert{meta-learner} (online algorithm) operates over a sequence of CMDP tasks.
\item For each task $t$, the meta-learner provides an initial policy $\alert{\pi_{t,0}}$ and a learning rate $\alert{\alpha_t}$ to a \alert{within-task} Safe RL algorithm (e.g., CRPO).
\item The goal is to minimize the \textbf{Task-Averaged Optimality Gap (TAOG)} and \textbf{Task-Averaged Constraint Violation (TACV)}, sketched below.
\end{itemize}
\end{block}
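% Added sketch (not verbatim from the paper): TAOG/TACV written as plain task averages,
% with R_{t,0}, R_{t,i} denoting the per-task optimality gap and constraint violation.
\begin{block}{Sketch: Task-Averaged Objectives}
\scriptsize
As an illustrative sketch (notation assumed here), with $R_{t,0}$ and $R_{t,i}$ the optimality gap and constraint violation of task $t$ over a sequence of $T$ tasks:
\begin{equation*}
\overline{R}_0 = \frac{1}{T}\sum_{t=1}^{T} R_{t,0} \;\;\text{(TAOG)}, \qquad
\overline{R}_i = \frac{1}{T}\sum_{t=1}^{T} R_{t,i} \;\;\text{(TACV)}.
\end{equation*}
\end{block}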
\begin{figure}
\centering
\includegraphics[width=0.3\textwidth]{illustrate.pdf}
\caption{Conceptual illustration of the meta-learning process.}
\label{fig:method_concept}
\end{figure}
\end{frame}
\begin{frame}{Method: The Within-Task Algorithm (CRPO)}
% Corresponds to the source text Section 2.1
\begin{block}{Constrained Markov Decision Process (CMDP)}
\footnotesize
For each task $t$, the agent aims to solve:
\begin{equation*}
\max_{\pi} \; J_{t,0}(\pi) \quad \text{s.t.} \quad \alert{J_{t,i}(\pi) \leq d_{t,i}}, \quad \forall i = 1, \dots, p
\end{equation*}
where $J_{t,0}$ is the expected reward and $J_{t,i}$ are expected costs.
\end{block}
\begin{block}{CRPO Algorithm \& Regret}
\footnotesize
\begin{itemize}
\item We use the Constraint-Rectified Policy Optimization (\alert{CRPO}) algorithm.
\item The single-task optimality gap ($R_0$) and constraint violation ($R_i$) after $M$ within-task iterations are bounded by:
\begin{equation*}
R_0, R_i \leq \mathcal{O}\left( \frac{\mathbb{E}_{s \sim \nu_t^*}[\alert{\KL(\pi_t^*|\pi_{t,0})}]}{\alpha_t M} + \alpha_t \right)
\end{equation*}
\item \textbf{Key Insight:} The performance depends heavily on the KL-divergence between the optimal policy $\pi_t^*$ and the initial policy $\pi_{t,0}$.
\item Our meta-learner optimizes this upper bound by choosing a good initialization $\alert{\pi_{t,0}}$ and learning rate $\alert{\alpha_t}$ (see the balancing sketch on the next slide).
\end{itemize}
\end{block}
\end{frame}
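\begin{frame}{Method: Balancing the CRPO Bound (Sketch)}
% Added for intuition: a standard step-size balancing calculation over alpha_t.
% Constants are suppressed and K_t is introduced here as shorthand; this is an
% illustrative simplification, not the paper's exact statement.
\footnotesize
For intuition, write $K_t := \mathbb{E}_{s \sim \nu_t^*}[\KL(\pi_t^*|\pi_{t,0})]$ and treat the bound as $\frac{K_t}{\alpha_t M} + \alpha_t$ (constants suppressed; an illustrative simplification). Minimizing over $\alpha_t$ gives
\begin{equation*}
\alpha_t \propto \sqrt{\frac{K_t}{M}} \quad \Longrightarrow \quad R_0,\, R_i \lesssim \mathcal{O}\!\left(\sqrt{\frac{K_t}{M}}\right).
\end{equation*}
Hence a meta-learned initialization $\pi_{t,0}$ that keeps $K_t$ small tightens both the optimality gap and the constraint violation, which is exactly what the meta-learner targets.
\end{frame}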
\begin{frame}{Method: The Inexact Framework}
% Corresponds to the source text Section 3.1
\begin{block}{Challenge: Unknown Optimal Policies}
\footnotesize
\begin{itemize}
\item In practice, the optimal policy $\alert{\pi_t^*}$ and its state distribution $\alert{\nu_t^*}$ are unknown.
\item We only have access to a suboptimal policy $\alert{\hat{\pi}_t}$ and collected trajectory data $\alert{\mathcal{D}_t}$.
\end{itemize}
\end{block}
\begin{block}{Solution: Estimate and Bound the Error}
\footnotesize
\begin{itemize}
\item \textbf{Estimate:} Use the suboptimal policy $\hat{\pi}_t$ and estimate its state distribution $\hat{\nu}_t$ from data $\mathcal{D}_t$ using \alert{DualDICE}.
\item \textbf{Inexact Loss:} The meta-learner optimizes an inexact loss function:
$$ \hat{f}_{t}(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)] $$
\item \textbf{Bound the Error:} We prove a bound on the estimation error:
$$ |\mathbb{E}_{\nu_t^*}[\KL(\pi_t^*|\phi)] - \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)]| \leq \alert{\epsilon_t} $$
This bound (Thm.~3.1) is derived using novel techniques from \alert{tame geometry}; the next slide sketches why it suffices.
\end{itemize}
\end{block}
\end{frame}
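\begin{frame}{Method: Why the Error Bound Suffices (Sketch)}
% Added for intuition: a simple triangle-inequality consequence of |f_t - \hat f_t| <= eps_t.
% This is not the paper's exact regret statement.
\footnotesize
Write $f_t(\phi) = \mathbb{E}_{\nu_t^*}[\KL(\pi_t^*|\phi)]$ and $\hat{f}_t(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)]$. If $|f_t(\phi) - \hat{f}_t(\phi)| \leq \epsilon_t$ for all $\phi$, then for any comparator $\phi^*$,
\begin{equation*}
\sum_{t=1}^{T} f_t(\phi_t) - \sum_{t=1}^{T} f_t(\phi^*) \;\leq\; \sum_{t=1}^{T} \hat{f}_t(\phi_t) - \sum_{t=1}^{T} \hat{f}_t(\phi^*) + 2\sum_{t=1}^{T} \epsilon_t .
\end{equation*}
So the regret measured on the true (unknown) losses exceeds the regret on the inexact losses by at most $2\sum_{t} \epsilon_t$; this simple consequence is shown here only for intuition, while the paper's guarantees are stated in terms of TAOG and TACV.
\end{frame}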
\begin{frame}{Method: Dynamic Regret \& Adaptive Learning Rates}
% Corresponds to the source text Section 3.3
\begin{block}{Challenge: Adapting to Dynamic Environments}
\footnotesize
\begin{itemize}
\item A fixed meta-initialization may not be optimal if the environment changes over time.
\item Setting the learning rate $\alpha_t$ optimally requires knowledge of future tasks.
\end{itemize}
\end{block}
\begin{block}{Solution: Separate Online Learners}
\footnotesize
\begin{itemize}
\item We decompose the regret upper bound into two components.
\item We use two parallel Online Gradient Descent (OGD) algorithms:
\begin{enumerate}
\item \textbf{INIT}: Learns the policy initialization $\alert{\pi_{t,0}}$ by minimizing $\hat{f}_{t}^{init}(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t|\phi)]$.
\item \textbf{SIM}: Learns the learning rate $\alert{\alpha_t}$ by minimizing its own loss term $\hat{f}_t^{sim}(\kappa)$.
\end{enumerate}
\item This allows the framework to adapt both the policy initialization and the learning rate online, without knowing task properties in advance (updates sketched on the next slide).
\end{itemize}
\end{block}
\end{frame}
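\begin{frame}{Method: OGD Updates (Sketch)}
% Minimal sketch of the two parallel projected OGD updates. The projection sets
% Phi, K and step sizes eta_init, eta_sim are illustrative assumptions, not the
% paper's exact parameterization.
\footnotesize
A minimal sketch of the two parallel updates (projection sets $\Phi$, $\mathcal{K}$ and step sizes $\eta_{\mathrm{init}}$, $\eta_{\mathrm{sim}}$ are illustrative assumptions):
\begin{align*}
\phi_{t+1} &= \Pi_{\Phi}\big( \phi_t - \eta_{\mathrm{init}} \, \nabla \hat{f}_t^{\,init}(\phi_t) \big) && \text{(INIT)} \\
\kappa_{t+1} &= \Pi_{\mathcal{K}}\big( \kappa_t - \eta_{\mathrm{sim}} \, \nabla \hat{f}_t^{\,sim}(\kappa_t) \big) && \text{(SIM)}
\end{align*}
Each learner only needs the loss revealed after task $t$, so no knowledge of future tasks is required.
\end{frame}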
\section{Innovation}
\begin{frame}{Our Innovations}
\begin{block}{Novel Framework and Guarantees}
\footnotesize
\begin{itemize}
\item The \alert{first provable guarantees} for Meta-Safe RL, establishing bounds on task-averaged optimality gap (TAOG) and constraint violation (TACV).
\item The regret bounds explicitly improve with \alert{task-similarity} ($\hat{D}^*$) or \alert{task-relatedness} ($\hat{V}_\psi$).
\end{itemize}
\end{block}
\begin{block}{Practical and Adaptive Algorithm}
\footnotesize
\begin{itemize}
\item \textbf{Inexact framework}: Works with suboptimal policies and estimates distributions using \alert{DualDICE}, making it practical.
\item \textbf{Adaptive learning}: The meta-learner adapts both policy initialization and learning rates for each task, handling dynamic environments.
\end{itemize}
\end{block}
\begin{block}{Technical Contributions}
\footnotesize
\begin{itemize}
\item New analysis of the \alert{optimization landscape of CMDPs} using tame geometry to bound the distance between optimal and suboptimal policies.
\item Extended analysis for \alert{inexact online gradient descent} to handle dynamic regret with biased gradient estimates.
\end{itemize}
\end{block}
\end{frame}
\begin{frame}
\centering
\Huge
Experimental Evaluation
\end{frame}
\section{Experimental Method}
\begin{frame}{Experimental Method}
\begin{block}{Objective}
\footnotesize
\begin{itemize}
\item To empirically validate the effectiveness of our \alert{Meta-SRL} framework against standard meta-learning baselines.
\end{itemize}
\end{block}
\begin{block}{Baselines for Comparison}
\footnotesize
\begin{itemize}
\item \alert{Random Initialization}: Standard CRPO with a new random policy for each task.
\item \alert{Pre-trained}: Initialize with the final policy from the previous task.
\item \alert{Simple Averaging}: Offline average of all previously learned policies.
\item \alert{Follow the Average Leader (FAL)}: Online average of all previously learned policies.
\end{itemize}
\end{block}
\begin{block}{Task Generation}
\footnotesize
\begin{itemize}
\item We generate a sequence of related CMDP tasks by sampling from a distribution over environment parameters (e.g., transition dynamics, reward functions).
\item We test under two conditions: \alert{high task-similarity} and \alert{low task-similarity}.
\end{itemize}
\end{block}
\end{frame}
\section{Experimental Setting}
\begin{frame}{Experimental Setting}
\begin{block}{Environments}
\footnotesize
We use a range of classic control environments with added safety constraints:
\begin{itemize}
\item \textbf{OpenAI Gym:}
\begin{itemize}
\item \alert{FrozenLake}: Discrete state space, $T=10$ tasks.
\item \alert{Acrobot}: Continuous state space, $T=50$ tasks.
\end{itemize}
\item \textbf{MuJoCo:}
\begin{itemize}
\item \alert{Half-Cheetah}: High-dimensional continuous control, $T=100$ tasks. Constraint on head height.
\item \alert{Humanoid}: Very high-dimensional, $T=250$ tasks. Constraint on joint angles for smooth motion.
\end{itemize}
\end{itemize}
\end{block}
\end{frame}
\section{Experimental Results}
\begin{frame}{Experimental Results: Low Task-Similarity}
\begin{columns}[T]
\begin{column}{0.5\textwidth}
\centering
\textbf{FrozenLake}
\includegraphics[width=1\textwidth]{FrozenLake/FrozenLakeLowSimilarity.pdf}
\end{column}
\begin{column}{0.5\textwidth}
\centering
\textbf{Acrobot}
\includegraphics[width=1\textwidth]{Acrobot/Acrobot_low_similarity2.pdf}
\end{column}
\end{columns}
\begin{block}{Observations}
\footnotesize
\begin{itemize}
\item In settings with low task similarity, \alert{Meta-SRL} (our method) consistently learns faster and more safely.
\item It achieves higher rewards while rapidly satisfying the safety constraints (driving constraint violation to zero).
\item Simpler baselines like \alert{FAL} and \alert{Pre-trained} struggle to satisfy constraints or learn good policies.
\end{itemize}
\end{block}
\end{frame}
\begin{frame}{Experimental Results: MuJoCo Environments}
\centering
\textbf{Half-Cheetah (Low Task-Similarity)}
\begin{figure}
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahReward_low_task_similarity_broken_axis.pdf}
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahCost_low_task_similarity.pdf}
\caption{Reward (top) and constraint violation (bottom) for Half-Cheetah. Our method (Meta-SRL) learns a high-reward policy while keeping the constraint violation below the threshold (blue line).}
\label{fig:halfcheetah}
\end{figure}
\end{frame}
\section{Ablation Experiment}
\begin{frame}{Ablation Analysis}
Although the experiments do not include a dedicated ablation study, comparing Meta-SRL against the baselines isolates the contribution of its key components.
\begin{block}{Meta-SRL vs. FAL / Simple Averaging}
\footnotesize
\begin{itemize}
\item \textbf{Ablated Component:} The intelligent meta-update (using \alert{DualDICE} estimates and \alert{OGD} on the regret bound).
\item \textbf{Result:} Meta-SRL significantly outperforms simple averaging, showing that a weighted, adaptive update is crucial and superior to naive averaging.
\end{itemize}
\end{block}
\begin{block}{Meta-SRL vs. Pre-trained}
\footnotesize
\begin{itemize}
\item \textbf{Ablated Component:} Learning from a history of multiple tasks. The pre-trained baseline only uses the most recent task.
\item \textbf{Result:} Meta-SRL is more robust, especially in low-similarity settings, demonstrating the benefit of aggregating knowledge from diverse past experiences.
\end{itemize}
\end{block}
\begin{block}{Conclusion}
\footnotesize
The full \alert{Meta-SRL} model, with its inexact estimation and adaptive learning, is critical for achieving strong performance and safety.
\end{block}
\end{frame}
\section{Deficiencies}
\begin{frame}{Limitations of the Current Method}
\begin{itemize}
\item \textbf{Algorithm-Specific Guarantees:}
\begin{itemize}
\item Our theoretical framework is built upon the \alert{CRPO} algorithm.
\item Extending it to other within-task Safe RL algorithms (e.g., primal-dual methods) would require a new analysis of their specific regret bounds.
\end{itemize}
\bigskip
\item \textbf{No Hard Safety Guarantees During Learning:}
\begin{itemize}
\item The framework minimizes task-averaged constraint violation, achieving safety \textit{on average} and \textit{asymptotically}.
\item It does not guarantee \alert{zero constraint violation} at every step during the learning process, which may be a requirement for highly critical systems.
\end{itemize}
\end{itemize}
\end{frame}
\section{Future Research}
\begin{frame}{Future Research Directions}
\begin{itemize}
\item \textbf{Meta-SRL with Zero-Violation Guarantees:}
\begin{itemize}
\item Designing frameworks that can provide hard safety constraints throughout the learning phase, possibly by integrating pessimistic or certified approaches.
\end{itemize}
\bigskip
\item \textbf{Extension to More Complex Scenarios:}
\begin{itemize}
\item \alert{Non-stationary environments} where the task distribution itself may shift over time.
\item \alert{Multi-agent settings}, where agents must learn to coordinate safely and adapt to each other's policies.
\end{itemize}
\bigskip
\item \textbf{Fairness and Socially Responsible AI:}
\begin{itemize}
\item Adapting the framework to handle \alert{fairness constraints}, ensuring that RL agents do not produce biased or discriminatory outcomes in non-stationary environments.
\end{itemize}
\end{itemize}
\end{frame}
\section{End}
\begin{frame}
\centering
\Huge
Thank You!
\vfill
\Large
Questions?
\end{frame}
\end{document}