|
|
\documentclass{beamer} |
|
|
|
|
|
|
|
|
|
|
|
\usetheme{Madrid} |
|
|
\usecolortheme{default} |
|
|
|
|
|
|
|
|
\usepackage[utf8]{inputenc} |
|
|
\usepackage[T1]{fontenc} |
|
|
\usepackage{amsmath, amssymb, amsfonts} |
|
|
\usepackage{booktabs} |
|
|
\usepackage{graphicx} |
|
|
\usepackage{hyperref} |
|
|
\usepackage{bm} |
|
|
|
|
|
|
|
|
\newcommand{\KL}{D_{\mathrm{KL}}} |
|
|
\newcommand{\figref}[1]{Figure~\ref{#1}}
|
|
|
|
|
\title[Meta-Safe RL]{A CMDP-within-online framework for Meta-Safe Reinforcement Learning} |
|
|
\author{Vanshaj Khattar\inst{1} \and Yuhao Ding\inst{2} \and Bilgehan Sel\inst{1} \and Javad Lavaei\inst{2} \and Ming Jin\inst{1}} |
|
|
\institute[VT \& UCB]{ |
|
|
\inst{1} Virginia Tech \\ |
|
|
\inst{2} UC Berkeley |
|
|
} |
|
|
\date{\today} |
|
|
|
|
|
|
|
|
\setbeamerfont{caption}{size=\scriptsize} |
|
|
\begin{document} |
|
|
\begin{frame} |
|
|
\titlepage |
|
|
\end{frame} |
|
|
\begin{frame}{Outline} |
|
|
\tableofcontents |
|
|
\end{frame} |
|
|
\section{Motivation} |
|
|
\begin{frame}{Motivation: Why Meta-Safe RL?} |
|
|
\begin{block}{Background: Meta-Reinforcement Learning (Meta-RL)} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item Meta-RL enables agents to learn new tasks quickly with limited experience. |
|
|
\item It is a ``learning-to-learn'' framework that has proven successful in robotics, federated learning, and other domains.
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{The Problem: Safety is Critical} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item Many real-world applications have \alert{safety constraints} that must not be violated (e.g., robotics, autonomous driving). |
|
|
\item Existing Meta-RL methods do not adequately address these constraints. |
|
|
\item Safe RL problems are often modeled as \alert{Constrained Markov Decision Processes (CMDPs)}, but standard CMDP algorithms do not generalize efficiently to new tasks.
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Our Goal} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item Develop a principled framework, \alert{Meta-Safe RL (Meta-SRL)}, that combines the fast adaptation of meta-learning with the safety guarantees of Safe RL. |
|
|
\item Provide the \alert{first provable guarantees} for learning across multiple safe RL tasks. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\section{Related Work} |
|
|
\begin{frame}{Related Work} |
|
|
\begin{itemize} |
|
|
\item \textbf{Meta-Reinforcement Learning:} |
|
|
\begin{itemize} |
|
|
\item Focuses on learning initial conditions, hyperparameters, etc., for fast adaptation. |
|
|
\item Most work is for \alert{unconstrained} environments. |
|
|
\end{itemize} |
|
|
\item \textbf{Online Meta-Learning:} |
|
|
\begin{itemize} |
|
|
\item Provides theoretical frameworks, often for convex and decomposable loss functions. |
|
|
\item Our work extends this to the \alert{nonconvex and complex} setting of CMDPs. |
|
|
\end{itemize} |
|
|
\item \textbf{Safe RL and CMDPs:} |
|
|
\begin{itemize} |
|
|
\item A rich field with many algorithms (e.g., primal-dual, policy-based like \alert{CRPO}). |
|
|
\item However, these are designed for a \alert{single task} and are not built to generalize or adapt quickly to unseen tasks. |
|
|
\end{itemize} |
|
|
\end{itemize} |
|
|
\end{frame} |
|
|
\section{Method} |
|
|
\begin{frame}{Method: CMDP-within-Online Framework} |
|
|
\begin{block}{Core Idea} |
|
|
\footnotesize |
|
|
|
|
\begin{itemize} |
|
|
\item A \alert{meta-learner} (online algorithm) operates over a sequence of CMDP tasks. |
|
|
\item For each task $t$, the meta-learner provides an initial policy $\alert{\pi_{t,0}}$ and a learning rate $\alert{\alpha_t}$ to a \alert{within-task} Safe RL algorithm (e.g., CRPO). |
|
|
\item The goal is to minimize the \textbf{Task-Averaged Optimality Gap (TAOG)} and the \textbf{Task-Averaged Constraint Violation (TACV)}, defined below.
|
|
\end{itemize} |
|
|
\end{block} |
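
\begin{block}{Task-Averaged Metrics}
\scriptsize
Concretely, with $R_{t,0}$ and $R_{t,i}$ the within-task optimality gap and constraint violation of task $t$ (bounded on the next slide), the task-averaged metrics take the form
\begin{equation*}
\text{TAOG} = \frac{1}{T}\sum_{t=1}^{T} R_{t,0},
\qquad
\text{TACV} = \frac{1}{T}\sum_{t=1}^{T} R_{t,i}, \quad i = 1,\dots,p.
\end{equation*}
\end{block}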
|
|
\begin{figure} |
|
|
\centering |
|
|
\includegraphics[width=0.3\textwidth]{illustrate.pdf} |
|
|
\caption{Conceptual illustration of the meta-learning process.} |
|
|
\label{fig:method_concept} |
|
|
\end{figure} |
|
|
\end{frame} |
|
|
\begin{frame}{Method: The Within-Task Algorithm (CRPO)} |
|
|
|
|
|
\begin{block}{Constrained Markov Decision Process (CMDP)} |
|
|
\footnotesize |
|
|
For each task $t$, the agent aims to solve: |
|
|
\begin{equation*} |
|
|
\max_{\pi} \; J_{t,0}(\pi) \quad \text{s.t.} \quad \alert{J_{t,i}(\pi) \leq d_{t,i}}, \quad \forall\, i = 1,\dots,p
|
|
\end{equation*} |
|
|
where $J_{t,0}$ is the expected reward, $J_{t,1},\dots,J_{t,p}$ are the expected costs, and $d_{t,i}$ are the corresponding constraint thresholds.
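For instance, in a discounted formulation, $J_{t,i}(\pi) = \mathbb{E}_{\pi, P_t}\big[\sum_{h=0}^{\infty} \gamma^h \, c_{t,i}(s_h, a_h)\big]$ with $c_{t,0}$ taken to be the reward $r_t$ (the exact horizon/discount convention follows the within-task setting).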
|
|
\end{block} |
|
|
|
|
|
\begin{block}{CRPO Algorithm \& Regret} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item We use the Constraint-Rectified Policy Optimization (\alert{CRPO}) algorithm. |
|
|
\item The single-task optimality gap ($R_0$) and constraint violation ($R_i$) are bounded by: |
|
|
\begin{equation*} |
|
|
R_0,\, R_i \;\leq\; \mathcal{O}\!\left( \frac{\mathbb{E}_{s \sim \nu_t^*}\big[\alert{\KL(\pi_t^* \,\|\, \pi_{t,0})}\big]}{\alpha_t M} + \alpha_t \right)
|
|
\end{equation*} |
|
|
\item \textbf{Key Insight:} The performance depends heavily on the KL-divergence between the optimal policy $\pi_t^*$ and the initial policy $\pi_{t,0}$. |
|
|
\item Our meta-learner optimizes this upper bound by choosing a good initialization $\alert{\pi_{t,0}}$ and learning rate $\alert{\alpha_t}$ (see the balance below).
|
|
\end{itemize} |
|
|
\end{block} |
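\footnotesize
Balancing the two terms of the bound (suppressing constants), with $D_t = \mathbb{E}_{s \sim \nu_t^*}[\KL(\pi_t^* \,\|\, \pi_{t,0})]$, gives $\alpha_t \asymp \sqrt{D_t/M}$ and hence $R_0, R_i \lesssim \sqrt{D_t/M}$: a good initialization and a well-chosen learning rate shrink both the optimality gap and the constraint violation.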
|
|
\end{frame} |
|
|
\begin{frame}{Method: The Inexact Framework} |
|
|
|
|
|
\begin{block}{Challenge: Unknown Optimal Policies} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item In practice, the optimal policy $\alert{\pi_t^*}$ and its state distribution $\alert{\nu_t^*}$ are unknown. |
|
|
\item We only have access to a suboptimal policy $\alert{\hat{\pi}_t}$ and collected trajectory data $\alert{\mathcal{D}_t}$. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Solution: Estimate and Bound the Error} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \textbf{Estimate:} Use the suboptimal policy $\hat{\pi}_t$ and estimate its state distribution $\hat{\nu}_t$ from data $\mathcal{D}_t$ using \alert{DualDICE}. |
|
|
\item \textbf{Inexact Loss:} The meta-learner optimizes an inexact loss function: |
|
|
\[ \hat{f}_{t}(\phi) = \mathbb{E}_{\hat{\nu}_t}\big[\KL(\hat{\pi}_t \,\|\, \phi)\big] \]
|
|
\item \textbf{Bound the Error:} We prove a bound on the estimation error: |
|
|
\[ \Big| \mathbb{E}_{\nu_t^*}\big[\KL(\pi_t^* \,\|\, \phi)\big] - \mathbb{E}_{\hat{\nu}_t}\big[\KL(\hat{\pi}_t \,\|\, \phi)\big] \Big| \leq \alert{\epsilon_t} \]
|
|
This bound (Thm. 3.1) is derived using novel techniques from \alert{tame geometry}. |
|
|
\end{itemize} |
|
|
\end{block} |
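\footnotesize
Consequently, writing $f_t(\phi) := \mathbb{E}_{\nu_t^*}[\KL(\pi_t^* \,\|\, \phi)]$ for the exact loss, $\big|\sum_{t} f_t(\phi) - \sum_{t} \hat{f}_t(\phi)\big| \leq \sum_{t} \epsilon_t$ for any fixed $\phi$, so regret accumulated on the inexact losses controls regret on the exact ones up to the total estimation error.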
|
|
\end{frame} |
|
|
\begin{frame}{Method: Dynamic Regret \& Adaptive Learning Rates} |
|
|
|
|
|
\begin{block}{Challenge: Adapting to Dynamic Environments} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item A fixed meta-initialization may not be optimal if the environment changes over time. |
|
|
\item Setting the learning rate $\alpha_t$ optimally requires knowledge of future tasks. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Solution: Separate Online Learners} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item We decompose the regret upper bound into two components. |
|
|
\item We use two parallel Online Gradient Descent (OGD) algorithms: |
|
|
\begin{enumerate} |
|
|
\item \textbf{INIT}: Learns the policy initialization $\alert{\pi_{t,0}}$ by minimizing $\hat{f}_{t}^{\mathrm{init}}(\phi) = \mathbb{E}_{\hat{\nu}_t}[\KL(\hat{\pi}_t \,\|\, \phi)]$.
|
|
\item \textbf{SIM}: Learns the learning rate $\alert{\alpha_t}$ by minimizing its own loss term $\hat{f}_t^{\mathrm{sim}}(\kappa)$.
|
|
\end{enumerate} |
|
|
\item This allows the framework to adapt both the policy initialization and the learning rate online, without knowing task properties in advance (a minimal code sketch follows on the next slide).
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
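
\begin{frame}[fragile]{Method: INIT Learner, a Minimal Sketch}
\footnotesize
A minimal, self-contained sketch (not our exact implementation) of one OGD step of the \textbf{INIT} learner on the inexact loss $\hat{f}_{t}^{\mathrm{init}}$, assuming tabular softmax policies parameterized by logits; \texttt{pi\_hat} and \texttt{nu\_hat} stand in for the within-task CRPO output and the DualDICE distribution estimate. The SIM learner updates $\alpha_t$ analogously on its own loss.
\begin{verbatim}
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def init_loss_and_grad(phi, pi_hat, nu_hat):
    # Inexact loss: sum_s nu_hat(s) * KL(pi_hat(.|s) || softmax(phi)(.|s)).
    pi_phi = softmax(phi)
    kl = (pi_hat * (np.log(pi_hat) - np.log(pi_phi))).sum(axis=1)
    # Gradient w.r.t. logits phi: nu_hat(s) * (pi_phi - pi_hat) per state.
    return nu_hat @ kl, nu_hat[:, None] * (pi_phi - pi_hat)

rng = np.random.default_rng(0)
S, A, eta = 5, 3, 0.5
phi = np.zeros((S, A))                      # meta-initialization (logits)
pi_hat = softmax(rng.normal(size=(S, A)))   # suboptimal within-task policy
nu_hat = np.full(S, 1.0 / S)                # estimated state distribution
loss, grad = init_loss_and_grad(phi, pi_hat, nu_hat)
phi -= eta * grad                           # one online gradient descent step
\end{verbatim}
\end{frame}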
|
|
\section{Innovation} |
|
|
\begin{frame}{Our Innovations} |
|
|
\begin{block}{Novel Framework and Guarantees} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item The \alert{first provable guarantees} for Meta-Safe RL, establishing bounds on task-averaged optimality gap (TAOG) and constraint violation (TACV). |
|
|
\item The regret bounds explicitly improve with \alert{task-similarity} ($\hat{D}^*$) or \alert{task-relatedness} ($\hat{V}_\psi$). |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Practical and Adaptive Algorithm} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \textbf{Inexact framework}: Works with suboptimal policies and estimates distributions using \alert{DualDICE}, making it practical. |
|
|
\item \textbf{Adaptive learning}: The meta-learner adapts both policy initialization and learning rates for each task, handling dynamic environments. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Technical Contributions} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item New analysis of the \alert{optimization landscape of CMDPs} using tame geometry to bound the distance between optimal and suboptimal policies. |
|
|
\item Extended analysis for \alert{inexact online gradient descent} to handle dynamic regret with biased gradient estimates. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\begin{frame} |
|
|
\centering |
|
|
\Huge |
|
|
Experimental Evaluation |
|
|
\end{frame} |
|
|
\section{Experimental Method} |
|
|
\begin{frame}{Experimental Method} |
|
|
\begin{block}{Objective} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item To empirically validate the effectiveness of our \alert{Meta-SRL} framework against standard meta-learning baselines. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Baselines for Comparison} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \alert{Random Initialization}: Standard CRPO with a new random policy for each task. |
|
|
\item \alert{Pre-trained}: Initialize with the final policy from the previous task. |
|
|
\item \alert{Simple Averaging}: Offline average of all previously learned policies. |
|
|
\item \alert{Follow the Average Leader (FAL)}: Online average of all previously learned policies. |
|
|
\end{itemize} |
|
|
\end{block} |
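\footnotesize
Schematically (one natural reading, in policy/parameter space): \alert{Pre-trained} sets $\pi_{t,0} = \hat{\pi}_{t-1}$, while \alert{Simple Averaging} and \alert{FAL} set $\pi_{t,0}$ to the average $\tfrac{1}{t-1}\sum_{s<t}\hat{\pi}_s$ of previously learned policies, computed offline and online, respectively.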
|
|
|
|
|
\begin{block}{Task Generation} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item We generate a sequence of related CMDP tasks by sampling from a distribution over environment parameters (e.g., transition dynamics, reward functions). |
|
|
\item We test under two conditions: \alert{high task-similarity} and \alert{low task-similarity}. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\section{Experimental Setting} |
|
|
\begin{frame}{Experimental Setting} |
|
|
\begin{block}{Environments} |
|
|
\footnotesize |
|
|
We evaluate on environments of increasing complexity, each augmented with safety constraints:
|
|
\begin{itemize} |
|
|
\item \textbf{OpenAI Gym:} |
|
|
\begin{itemize} |
|
|
\item \alert{FrozenLake}: Discrete state space, $T=10$ tasks. |
|
|
\item \alert{Acrobot}: Continuous state space, $T=50$ tasks. |
|
|
\end{itemize} |
|
|
\item \textbf{MuJoCo:} |
|
|
\begin{itemize} |
|
|
\item \alert{Half-Cheetah}: High-dimensional continuous control, $T=100$ tasks. Constraint on head height. |
|
|
\item \alert{Humanoid}: Very high-dimensional, $T=250$ tasks. Constraint on joint angles for smooth motion. |
|
|
\end{itemize} |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\section{Experimental Results} |
|
|
\begin{frame}{Experimental Results: Low Task-Similarity} |
|
|
\begin{columns}[T] |
|
|
\begin{column}{0.5\textwidth} |
|
|
\centering |
|
|
\textbf{FrozenLake} |
|
|
\includegraphics[width=1\textwidth]{FrozenLake/FrozenLakeLowSimilarity.pdf} |
|
|
\end{column} |
|
|
\begin{column}{0.5\textwidth} |
|
|
\centering |
|
|
\textbf{Acrobot} |
|
|
\includegraphics[width=1\textwidth]{Acrobot/Acrobot_low_similarity2.pdf} |
|
|
\end{column} |
|
|
\end{columns} |
|
|
|
|
|
\begin{block}{Observations} |
|
|
\footnotesize |
|
|
|
|
\begin{itemize} |
|
|
\item In settings with low task similarity, \alert{Meta-SRL} (our method) consistently learns faster and more safely. |
|
|
\item It achieves higher rewards while rapidly satisfying the safety constraints (driving constraint violation to zero). |
|
|
\item Simpler baselines like \alert{FAL} and \alert{Pre-trained} struggle to satisfy constraints or learn good policies. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\begin{frame}{Experimental Results: MuJoCo Environments} |
|
|
\centering |
|
|
\textbf{Half-Cheetah (Low Task-Similarity)} |
|
|
\begin{figure} |
|
|
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahReward_low_task_similarity_broken_axis.pdf} |
|
|
\includegraphics[width=0.6\textwidth]{HalfCheetah/HalfCheetahCost_low_task_similarity.pdf} |
|
|
\caption{Reward (top) and constraint violation (bottom) for Half-Cheetah. Our method (Meta-SRL) learns a high-reward policy while keeping the constraint violation below the threshold (blue line).} |
|
|
\label{fig:halfcheetah} |
|
|
\end{figure} |
|
|
\end{frame} |
|
|
\section{Ablation Experiment} |
|
|
\begin{frame}{Ablation Analysis} |
|
|
While we did not run a dedicated ablation study, the comparisons against the baselines isolate the contributions of Meta-SRL's key components.
|
|
|
|
|
\begin{block}{Meta-SRL vs. FAL / Simple Averaging} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \textbf{Ablated Component:} The intelligent meta-update (using \alert{DualDICE} estimates and \alert{OGD} on the regret bound). |
|
|
\item \textbf{Result:} Meta-SRL significantly outperforms simple averaging, showing that a weighted, adaptive update is crucial and superior to naive averaging. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Meta-SRL vs. Pre-trained} |
|
|
\footnotesize |
|
|
\begin{itemize} |
|
|
\item \textbf{Ablated Component:} Learning from a history of multiple tasks. The pre-trained baseline only uses the most recent task. |
|
|
\item \textbf{Result:} Meta-SRL is more robust, especially in low-similarity settings, demonstrating the benefit of aggregating knowledge from diverse past experiences. |
|
|
\end{itemize} |
|
|
\end{block} |
|
|
|
|
|
\begin{block}{Conclusion} |
|
|
\footnotesize |
|
|
The full \alert{Meta-SRL} model, with its inexact estimation and adaptive learning, is critical for achieving strong performance and safety. |
|
|
\end{block} |
|
|
\end{frame} |
|
|
\section{Deficiencies} |
|
|
\begin{frame}{Limitations of the Current Method} |
|
|
\begin{itemize} |
|
|
\item \textbf{Algorithm-Specific Guarantees:} |
|
|
\begin{itemize} |
|
|
\item Our theoretical framework is built upon the \alert{CRPO} algorithm. |
|
|
\item Extending it to other within-task Safe RL algorithms (e.g., primal-dual methods) would require a new analysis of their specific regret bounds. |
|
|
\end{itemize} |
|
|
\bigskip |
|
|
\item \textbf{No Hard Safety Guarantees During Learning:} |
|
|
\begin{itemize} |
|
|
\item The framework minimizes task-averaged constraint violation, achieving safety \textit{on average} and \textit{asymptotically}. |
|
|
\item It does not guarantee \alert{zero constraint violation} at every step during the learning process, which may be a requirement for highly critical systems. |
|
|
\end{itemize} |
|
|
\end{itemize} |
|
|
\end{frame} |
|
|
\section{Future Research} |
|
|
\begin{frame}{Future Research Directions} |
|
|
\begin{itemize} |
|
|
\item \textbf{Meta-SRL with Zero-Violation Guarantees:} |
|
|
\begin{itemize} |
|
|
\item Designing frameworks that can provide hard safety constraints throughout the learning phase, possibly by integrating pessimistic or certified approaches. |
|
|
\end{itemize} |
|
|
\bigskip |
|
|
\item \textbf{Extension to More Complex Scenarios:} |
|
|
\begin{itemize} |
|
|
\item \alert{Non-stationary environments} where the task distribution itself may shift over time. |
|
|
\item \alert{Multi-agent settings}, where agents must learn to coordinate safely and adapt to each other's policies. |
|
|
\end{itemize} |
|
|
\bigskip |
|
|
\item \textbf{Fairness and Socially Responsible AI:} |
|
|
\begin{itemize} |
|
|
\item Adapting the framework to handle \alert{fairness constraints}, ensuring that RL agents do not produce biased or discriminatory outcomes in non-stationary environments. |
|
|
\end{itemize} |
|
|
\end{itemize} |
|
|
\end{frame} |
|
|
\section{End} |
|
|
\begin{frame} |
|
|
\centering |
|
|
\Huge |
|
|
Thank You! |
|
|
\vfill |
|
|
\Large |
|
|
Questions? |
|
|
\end{frame} |
|
|
\end{document} |