diff --git a/flash_attn/benchmark.html b/flash_attn/benchmark.html new file mode 100644 index 0000000000000000000000000000000000000000..70d0e46ff13160963d3b931674761f5a5b7430a5 --- /dev/null +++ b/flash_attn/benchmark.html @@ -0,0 +1,4253 @@ + + + + + + benchmark + + + + + + + +
+
+
light
+
reset
+ +
+
+ +
+
Generated on:
+
+ Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36 +
+
+ +
+
+

title: "Flash Attention Benchmark" +author: "uvnote" +theme: "dark" +syntax_theme: "monokai" +show_line_numbers: true +collapse_code: false +custom_css: | + #output-setup { + overflow-x: auto; + } + .cell-output { + overflow: scroll; + } + .cell-stdout { + width: max-content; + overflow: scroll; + } + .cell-stderr { + width: max-content; + overflow: scroll; + max-height: 300px; + }

+
+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 50.28s | FAILED + | + +Raw +
+
+
+
# /// script
+# dependencies = [
+#   "numpy",
+#   "torch",
+#   "kernels",
+#   "pandas",
+#   "matplotlib"
+# ]
+# ///
+# Benchmarking common shapes for a Flux 1024x1024px image plus varying text sequence lengths
+
+import functools
+import os
+import pathlib
+
+import matplotlib.pyplot as plt
+import torch
+import torch._dynamo.config
+import triton
+import triton.language as tl
+
+try:
+    from flash_attn import flash_attn_func
+except Exception:
+    flash_attn_func = None
+    print("Flash Attention 2 not found.")
+
+try:
+    from flash_attn_interface import flash_attn_func as flash_attn_3_func
+except Exception:
+    flash_attn_3_func = None
+    print("Flash Attention 3 not found.")
+
+try:
+    from kernels import get_kernel
+    hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn")
+    hf_kernels_flash_attn_3 = get_kernel("kernels-community/flash-attn3")
+except Exception:
+    hf_kernels_flash_attn = None
+    hf_kernels_flash_attn_3 = None
+    print("HF Kernels not found.")
+
+try:
+    from sageattention import sageattn_qk_int8_pv_fp16_cuda, sageattn_qk_int8_pv_fp16_triton, sageattn_qk_int8_pv_fp8_cuda_sm90
+except Exception:
+    sageattn_qk_int8_pv_fp16_cuda = None
+    sageattn_qk_int8_pv_fp16_triton = None
+    sageattn_qk_int8_pv_fp8_cuda_sm90 = None
+    print("SageAttention not found.")
+
+try:
+    from transformer_engine.pytorch.attention import DotProductAttention
+except Exception:
+    DotProductAttention = None
+    print("Transformer Engine not found.")
+
+try:
+    import xformers.ops as xops
+except Exception:
+    xops = None
+    print("xFormers not found.")
+
+
+plt.rcParams.update({
+    "figure.figsize": (12, 10),
+    "figure.dpi": 120,
+    "font.size": 10,
+    "axes.titlesize": 12,
+    "axes.labelsize": 14,
+    "xtick.labelsize": 10,
+    "ytick.labelsize": 10,
+    "legend.fontsize": 8,
+    "axes.grid": True,
+    "grid.alpha": 0.3,
+    "grid.linestyle": "--",
+    "lines.linewidth": 2.0,
+    "lines.markersize": 6,
+    "legend.frameon": True,
+    "legend.framealpha": 0.9,
+    "legend.loc": "best",
+    "axes.spines.top": False,
+    "axes.spines.right": False,
+})
+
+
+# We want to compare the best compiled version for each specific shape (dynamic=False)
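+# Each new sequence length triggers a fresh compile (dynamic=False), so raise the recompile cache limit well above the default.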
+torch._dynamo.config.cache_size_limit = 10000
+
+# We need suppress_errors for FA3 to work under torch.compile: frames that fail to compile fall back to eager mode.
+# I can't seem to get it to work any other way, so any suggestions are welcome!
+torch._dynamo.config.suppress_errors = True
+
+output_dir = pathlib.Path("dump_attention_benchmark")
+output_dir.mkdir(parents=True, exist_ok=True)
+
+batch_size = 1
+num_attention_heads = 24
+attention_head_dim = 128
+image_sequence_length = 4096  # 1024x1024px
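+# 4096 image tokens: 1024x1024 px -> 128x128 latent (8x VAE downsample) -> 64x64 patches after 2x2 packing (Flux).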
+text_sequence_lengths = [128, 256, 320, 384, 448, 512]
+sequence_lengths = [image_sequence_length + i for i in text_sequence_lengths]
+
+
+def _attention_torch(query, key, value, *, backend):
+    query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
+    with torch.nn.attention.sdpa_kernel(backend):
+        out = torch.nn.functional.scaled_dot_product_attention(query, key, value)
+    out = out.transpose(1, 2).contiguous()
+    return out
+
+
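+# Compile each variant once at module scope and call it through a thin wrapper so the compiled artifact is reused across calls.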
+_compiled_attention_torch_default = torch.compile(_attention_torch, mode="default", fullgraph=True, dynamic=False)
+def _attention_torch_compile_default(query, key, value, *, backend):
+    return _compiled_attention_torch_default(query, key, value, backend=backend)
+
+
+_compiled_attention_torch_max_autotune = torch.compile(_attention_torch, mode="max-autotune", fullgraph=True, dynamic=False)
+def _attention_torch_compile_max_autotune(query, key, value, *, backend):
+    return _compiled_attention_torch_max_autotune(query, key, value, backend=backend)
+
+
+def _attention_flash_attn_2(query, key, value):
+    return flash_attn_func(query, key, value)
+
+
+_compiled_flash_attn_2_default = torch.compile(_attention_flash_attn_2, mode="default", fullgraph=True, dynamic=False)
+def _attention_flash_attn_2_compile_default(query, key, value):
+    return _compiled_flash_attn_2_default(query, key, value)
+
+
+_compiled_flash_attn_2_max_autotune = torch.compile(_attention_flash_attn_2, mode="max-autotune", fullgraph=True, dynamic=False)
+def _attention_flash_attn_2_compile_max_autotune(query, key, value):
+    return _compiled_flash_attn_2_max_autotune(query, key, value)
+
+
+# Wrap FA3 in a custom op so it stays compatible with fullgraph=True tracing
+@torch.library.custom_op("flash_attn_3::_flash_attn_forward", mutates_args=(), device_types="cuda")
+def _wrapped_flash_attn_3(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
+    out, lse = flash_attn_3_func(query, key, value)
+    return out
+
+
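+# The fake (meta) implementation lets torch.compile infer output shape and dtype without launching the CUDA kernel.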
+@torch.library.register_fake("flash_attn_3::_flash_attn_forward")
+def _(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
+    return torch.empty_like(query)
+
+
+def _attention_flash_attn_3(query, key, value):
+    out = _wrapped_flash_attn_3(query, key, value)
+    return out
+
+
+_compiled_flash_attn_3_default = torch.compile(_attention_flash_attn_3, mode="default", fullgraph=True, dynamic=False)
+def _attention_flash_attn_3_compile_default(query, key, value):
+    return _compiled_flash_attn_3_default(query, key, value)
+
+
+_compiled_flash_attn_3_max_autotune = torch.compile(_attention_flash_attn_3, mode="max-autotune", fullgraph=True, dynamic=False)
+def _attention_flash_attn_3_compile_max_autotune(query, key, value):
+    return _compiled_flash_attn_3_max_autotune(query, key, value)
+
+
+def _attention_hf_kernels_flash_attn(query, key, value):
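+    # The hub kernel's fwd returns a tuple; the attention output is the first element.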
+    return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0]
+
+
+def _attention_hf_kernels_flash_attn3(query, key, value):
+    return hf_kernels_flash_attn_3.flash_attn_func(query, key, value, causal=False)[0]
+
+
+def _attention_sageattn_qk_int8_pv_fp16_cuda(query, key, value):
+    return sageattn_qk_int8_pv_fp16_cuda(query, key, value, tensor_layout="NHD")
+
+
+def _attention_sageattn_qk_int8_pv_fp16_triton(query, key, value):
+    return sageattn_qk_int8_pv_fp16_triton(query, key, value, tensor_layout="NHD")
+
+
+def _attention_sageattn_qk_int8_pv_fp8_cuda_sm90(query, key, value):
+    return sageattn_qk_int8_pv_fp8_cuda_sm90(query, key, value, tensor_layout="NHD")
+
+
+if DotProductAttention is not None:
+    def set_te_backend(backend):
+        # must be applied before first use of
+        # transformer_engine.pytorch.attention
+        os.environ["NVTE_FLASH_ATTN"] = '0'
+        os.environ["NVTE_FUSED_ATTN"] = '0'
+        os.environ["NVTE_UNFUSED_ATTN"] = '0'
+        if backend == 'flash':
+            os.environ["NVTE_FLASH_ATTN"] = '1'
+        if backend == 'fused':
+            os.environ["NVTE_FUSED_ATTN"] = '1'
+        if backend == 'unfused':
+            os.environ["NVTE_UNFUSED_ATTN"] = '1'
+
+    set_te_backend("fused")
+    te_attn_fn = DotProductAttention(
+        num_attention_heads=num_attention_heads,
+        kv_channels=attention_head_dim,
+        qkv_format="bshd",
+        attn_mask_type="no_mask",
+    )
+else:
+    def te_attn_fn(query, key, value):
+        raise RuntimeError("Transformer Engine is not available. Please install it for TE-based attention.")
+
+def _attention_te(query, key, value):
+    out = te_attn_fn(query, key, value)
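+    # TE returns heads and head_dim flattened into one hidden dimension; unflatten to match the other backends.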
+    out = out.unflatten(2, (num_attention_heads, attention_head_dim))
+    return out
+
+
+# Cannot fullgraph compile TE
+_compiled_te_attn_fn_default = torch.compile(_attention_te, mode="default", fullgraph=False, dynamic=False)
+def _attention_te_compile_default(query, key, value):
+    return _compiled_te_attn_fn_default(query, key, value)
+
+
+# Cannot fullgraph compile TE
+_compiled_te_attn_fn_max_autotune = torch.compile(_attention_te, mode="max-autotune", fullgraph=False, dynamic=False)
+def _attention_te_compile_max_autotune(query, key, value):
+    return _compiled_te_attn_fn_max_autotune(query, key, value)
+
+
+def _attention_xformers(query, key, value):
+    return xops.memory_efficient_attention(query, key, value)
+
+
+_compiled_xformers_default = torch.compile(_attention_xformers, mode="default", fullgraph=True, dynamic=False)
+def _attention_xformers_compile_default(query, key, value):
+    return _compiled_xformers_default(query, key, value)
+
+
+_compiled_xformers_max_autotune = torch.compile(_attention_xformers, mode="max-autotune", fullgraph=True, dynamic=False)
+def _attention_xformers_compile_max_autotune(query, key, value):
+    return _compiled_xformers_max_autotune(query, key, value)
+
+
+attention_ops = {}
+attention_ops["torch_cudnn"] = functools.partial(_attention_torch, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION)
+attention_ops["torch_cudnn_compile_d"] = functools.partial(_attention_torch_compile_default, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION)
+attention_ops["torch_cudnn_compile_ma"] = functools.partial(_attention_torch_compile_max_autotune, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION)
+attention_ops["torch_flash"] = functools.partial(_attention_torch, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION)
+attention_ops["torch_flash_compile_d"] = functools.partial(_attention_torch_compile_default, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION)
+attention_ops["torch_flash_compile_ma"] = functools.partial(_attention_torch_compile_max_autotune, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION)
+if hf_kernels_flash_attn is not None:
+    attention_ops["hf_flash_attn"] = _attention_hf_kernels_flash_attn
+    attention_ops["hf_flash_attn3"] = _attention_hf_kernels_flash_attn3
+if flash_attn_func is not None:
+    attention_ops["flash_attn_2"] = _attention_flash_attn_2
+    attention_ops["flash_attn_2_compile_d"] = _attention_flash_attn_2_compile_default
+    attention_ops["flash_attn_2_compile_ma"] = _attention_flash_attn_2_compile_max_autotune
+if flash_attn_3_func is not None:
+    attention_ops["flash_attn_3"] = _attention_flash_attn_3
+    attention_ops["flash_attn_3_compile_d"] = _attention_flash_attn_3_compile_default
+    attention_ops["flash_attn_3_compile_ma"] = _attention_flash_attn_3_compile_max_autotune
+if sageattn_qk_int8_pv_fp16_cuda is not None:
+    attention_ops["sageattn_qk_int8_pv_fp16_cuda"] = _attention_sageattn_qk_int8_pv_fp16_cuda
+    attention_ops["sageattn_qk_int8_pv_fp16_triton"] = _attention_sageattn_qk_int8_pv_fp16_triton
+    if torch.cuda.get_device_capability()[0] >= 9:
+        attention_ops["sageattn_qk_int8_pv_fp8_cuda_sm90"] = _attention_sageattn_qk_int8_pv_fp8_cuda_sm90
+if DotProductAttention is not None:
+    attention_ops["te_fused"] = _attention_te
+    attention_ops["te_fused_compile_d"] = _attention_te_compile_default
+    attention_ops["te_fused_compile_ma"] = _attention_te_compile_max_autotune
+if xops is not None:
+    attention_ops["xformers"] = _attention_xformers
+    attention_ops["xformers_compile_d"] = _attention_xformers_compile_default
+    attention_ops["xformers_compile_ma"] = _attention_xformers_compile_max_autotune
+
+
+def get_color_and_linestyle(n: int) -> list[tuple[str, str]]:
+    colors = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#a65628", "#f781bf", "#999999"]
+    line_styles = ["-", ":", "-.", "--"]
+    if n > len(colors) * len(line_styles):
+        raise ValueError(f"Required {n=} styles but maximum is {len(colors) * len(line_styles)}")
+    styles = []
+    for i in range(n):
+        color = colors[i % len(colors)]
+        linestyle = line_styles[i // len(colors)]
+        styles.append((color, linestyle))
+    return styles
+
+
+def correctness():
+    for seq_len in sequence_lengths:
+        shape = (batch_size, seq_len, num_attention_heads, attention_head_dim)
+        print(f"\n\n===== Testing shape: {shape} =====")
+
+        query = torch.randn(shape, device="cuda", dtype=torch.float32)
+        key = torch.randn(shape, device="cuda", dtype=torch.float32)
+        value = torch.randn(shape, device="cuda", dtype=torch.float32)
+
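+        # The reference ("golden truth") runs in fp32 with the exact MATH backend; candidate kernels run in bf16,
+        # so the reported errors include both kernel differences and bf16 rounding.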
+        golden_truth = _attention_torch(query, key, value, backend=torch.nn.attention.SDPBackend.MATH)
+        query, key, value = (x.bfloat16() for x in (query, key, value))
+
+        for name, fn in attention_ops.items():
+            out = fn(query, key, value)
+            absdiff = (out - golden_truth).abs()
+            absmax = torch.max(absdiff)
+            mae = torch.mean(absdiff)
+            mse = torch.mean((golden_truth - out) ** 2)
+            print(f"{name:<30}: absmax={absmax:.6f}, mae={mae:.6f}, mse={mse:.6f}")
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["seq_len"],
+        x_vals=sequence_lengths,
+        x_log=False,
+        line_arg="provider",
+        line_vals=list(attention_ops.keys()),
+        line_names=list(attention_ops.keys()),
+        ylabel="Time (ms)",
+        styles=get_color_and_linestyle(len(attention_ops)),
+        plot_name="Attention Benchmark",
+        args={},
+    )
+)
+def benchmark_fn(seq_len: int, provider: str):
+    torch.manual_seed(0)
+
+    shape = (batch_size, seq_len, num_attention_heads, attention_head_dim)
+    query = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16)
+    key = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16)
+    value = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16)
+
+    fn = attention_ops[provider]
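+    # do_bench interprets warmup/rep as times in milliseconds and returns timings at the requested quantiles (median, 20th, 80th).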
+    ms, min_ms, max_ms = triton.testing.do_bench(
+        lambda: fn(query, key, value),
+        warmup=3,
+        rep=10,
+        quantiles=[0.5, 0.2, 0.8],
+    )
+    return ms, max_ms, min_ms
+
+
+with torch.inference_mode():
+    correctness()
+    benchmark_fn.run(print_data=True, save_path=output_dir.as_posix())
+
+ +
+
+
+
+
Flash Attention 2 not found. +Flash Attention 3 not found. +SageAttention not found. +Transformer Engine not found. +xFormers not found. + + +===== Testing shape: (1, 4224, 24, 128) ===== +torch_cudnn : absmax=0.001547, mae=0.000075, mse=0.000000 +
+
+
▶ UV Install Logs
+ +
+
Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s] +Fetching 20 files: 5%|▌ | 1/20 [00:00<00:08, 2.21it/s] +Fetching 20 files: 10%|█ | 2/20 [00:02<00:21, 1.17s/it] +Fetching 20 files: 100%|██████████| 20/20 [00:02<00:00, 9.41it/s] + +Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s] +Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 5.28it/s] +Fetching 4 files: 50%|█████ | 2/4 [00:02<00:02, 1.15s/it] +Fetching 4 files: 100%|██████████| 4/4 [00:02<00:00, 1.99it/s] +/tmp/tmpyw1le_3d/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory + 5 | #include <Python.h> + | ^~~~~~~~~~ +compilation terminated. +Traceback (most recent call last): + File "/repo/flash_attn/.uvnote/cells/benchmark.py", line 340, in <module> + correctness() + File "/repo/flash_attn/.uvnote/cells/benchmark.py", line 299, in correctness + out = fn(query, key, value) + ^^^^^^^^^^^^^^^^^^^^^ + File "/repo/flash_attn/.uvnote/cells/benchmark.py", line 114, in _attention_torch_compile_default + return _compiled_attention_torch_default(query, key, value, backend=backend) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 749, in compile_wrapper + raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/compile_fx.py", line 923, in _compile_fx_inner + raise InductorError(e, currentframe()).with_traceback( + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/compile_fx.py", line 907, in _compile_fx_inner + mb_compiled_graph = fx_codegen_and_compile( + ^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/compile_fx.py", line 1578, in fx_codegen_and_compile + return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/compile_fx.py", line 1456, in codegen_and_compile + compiled_module = graph.compile_to_module() + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/graph.py", line 2293, in compile_to_module + return self._compile_to_module() + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/graph.py", line 2299, in _compile_to_module + self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen() + ^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/graph.py", line 2238, in codegen + self.scheduler.codegen() + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/scheduler.py", line 4598, in codegen + else self._codegen(self.nodes) + ^^^^^^^^^^^^^^^^^^^^^^^^^ + File 
"/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/scheduler.py", line 4750, in _codegen + self.get_backend(device).codegen_node(node) + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/cuda_combined_scheduling.py", line 107, in codegen_node + return self._triton_scheduling.codegen_node(node) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/simd.py", line 1371, in codegen_node + return self.codegen_node_schedule( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/simd.py", line 1424, in codegen_node_schedule + src_code = kernel.codegen_kernel() + ^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py", line 3677, in codegen_kernel + **self.inductor_meta_common(), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/_inductor/codegen/triton.py", line 3501, in inductor_meta_common + "backend_hash": torch.utils._triton.triton_hash_with_backend(), + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/utils/_triton.py", line 165, in triton_hash_with_backend + backend = triton_backend() + ^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/torch/utils/_triton.py", line 157, in triton_backend + target = driver.active.get_current_target() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/driver.py", line 30, in __getattr__ + return getattr(self._initialize_obj(), name) + ^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/driver.py", line 26, in _initialize_obj + self._obj = self._init_fn() + ^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/driver.py", line 12, in _create_driver + return active_drivers[0]() + ^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 715, in __init__ + self.utils = CudaUtils() # TODO: make static + ^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 62, in __init__ + mod = compile_module_from_src( + ^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/build.py", line 88, in compile_module_from_src + so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or []) + 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/runtime/build.py", line 51, in _build + subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL) + File "/usr/lib/python3.11/subprocess.py", line 413, in check_call + raise CalledProcessError(retcode, cmd) +torch._inductor.exc.InductorError: CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpyw1le_3d/cuda_utils.c', '-O3', '-shared', '-fPIC', '-Wno-psabi', '-o', '/tmp/tmpyw1le_3d/cuda_utils.cpython-311-x86_64-linux-gnu.so', '-lcuda', '-L/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/backends/nvidia/lib', '-L/usr/lib/x86_64-linux-gnu', '-I/tmp/uvnote-run-08by6gh7/home/.cache/uv/environments-v2/benchmark-bfbc462482636f25/lib/python3.11/site-packages/triton/backends/nvidia/include', '-I/tmp/tmpyw1le_3d', '-I/usr/include/python3.11']' returned non-zero exit status 1. + +Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
+
+
+
+ + + \ No newline at end of file diff --git a/flash_attn/cells/benchmark.py b/flash_attn/cells/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..e5caf7572b9a79185f0d090fff80e62027a77237 --- /dev/null +++ b/flash_attn/cells/benchmark.py @@ -0,0 +1,341 @@ +# /// script +# dependencies = [ +# "numpy", +# "torch", +# "kernels", +# "pandas", +# "matplotlib" +# ] +# /// +# Benchmarking common shapes for Flux 1024x1024px image + varying text sequence lengths + +import functools +import os +import pathlib + +import matplotlib.pyplot as plt +import torch +import torch._dynamo.config +import triton +import triton.language as tl + +try: + from flash_attn import flash_attn_func +except: + flash_attn_func = None + print("Flash Attention 2 not found.") + +try: + from flash_attn_interface import flash_attn_func as flash_attn_3_func +except: + flash_attn_3_func = None + print("Flash Attention 3 not found.") + +try: + from kernels import get_kernel + hf_kernels_flash_attn = get_kernel("kernels-community/flash-attn") + hf_kernels_flash_attn_3 = get_kernel("kernels-community/flash-attn3") +except: + hf_kernels_flash_attn = None + hf_kernels_flash_attn_3 = None + print("HF Kernels not found.") + +try: + from sageattention import sageattn_qk_int8_pv_fp16_cuda, sageattn_qk_int8_pv_fp16_triton, sageattn_qk_int8_pv_fp8_cuda_sm90 +except: + sageattn_qk_int8_pv_fp16_cuda = None + sageattn_qk_int8_pv_fp16_triton = None + sageattn_qk_int8_pv_fp8_cuda_sm90 = None + print("SageAttention not found.") + +try: + from transformer_engine.pytorch.attention import DotProductAttention +except: + DotProductAttention = None + print("Transformer Engine not found.") + +try: + import xformers.ops as xops +except: + xops = None + print("xFormers not found.") + + +plt.rcParams.update({ + "figure.figsize": (12, 10), + "figure.dpi": 120, + "font.size": 10, + "axes.titlesize": 12, + "axes.labelsize": 14, + "xtick.labelsize": 10, + "ytick.labelsize": 10, + "legend.fontsize": 8, + "axes.grid": True, + "grid.alpha": 0.3, + "grid.linestyle": "--", + "lines.linewidth": 2.0, + "lines.markersize": 6, + "legend.frameon": True, + "legend.framealpha": 0.9, + "legend.loc": "best", + "axes.spines.top": False, + "axes.spines.right": False, +}) + + +# We want to compare the best compiled version for each specific shape (dynamic=False) +torch._dynamo.config.cache_size_limit = 10000 + +# We need to suppress_errors for FA3 to work. It makes it run in eager mode. +# I can't seem to get it to work any other way under torch.compile, so any suggestions are welcome! 
+torch._dynamo.config.suppress_errors = True + +output_dir = pathlib.Path("dump_attention_benchmark") +output_dir.mkdir(parents=True, exist_ok=True) + +batch_size = 1 +num_attention_heads = 24 +attention_head_dim = 128 +image_sequence_length = 4096 # 1024x1024px +text_sequence_lengths = [128, 256, 320, 384, 448, 512] +sequence_lengths = [image_sequence_length + i for i in text_sequence_lengths] + + +def _attention_torch(query, key, value, *, backend): + query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value)) + with torch.nn.attention.sdpa_kernel(backend): + out = torch.nn.functional.scaled_dot_product_attention(query, key, value) + out = out.transpose(1, 2).contiguous() + return out + + +_compiled_attention_torch_default = torch.compile(_attention_torch, mode="default", fullgraph=True, dynamic=False) +def _attention_torch_compile_default(query, key, value, *, backend): + return _compiled_attention_torch_default(query, key, value, backend=backend) + + +_compiled_attention_torch_max_autotune = torch.compile(_attention_torch, mode="max-autotune", fullgraph=True, dynamic=False) +def _attention_torch_compile_max_autotune(query, key, value, *, backend): + return _compiled_attention_torch_max_autotune(query, key, value, backend=backend) + + +def _attention_flash_attn_2(query, key, value): + return flash_attn_func(query, key, value) + + +_compiled_flash_attn_2_default = torch.compile(_attention_flash_attn_2, mode="default", fullgraph=True, dynamic=False) +def _attention_flash_attn_2_compile_default(query, key, value): + return _compiled_flash_attn_2_default(query, key, value) + + +_compiled_flash_attn_2_max_autotune = torch.compile(_attention_flash_attn_2, mode="max-autotune", fullgraph=True, dynamic=False) +def _attention_flash_attn_2_compile_max_autotune(query, key, value): + return _compiled_flash_attn_2_max_autotune(query, key, value) + + +# For fullgraph=True tracing to be compatible +@torch.library.custom_op("flash_attn_3::_flash_attn_forward", mutates_args=(), device_types="cuda") +def _wrapped_flash_attn_3(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor: + out, lse = flash_attn_3_func(query, key, value) + return out + + +@torch.library.register_fake("flash_attn_3::_flash_attn_forward") +def _(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor: + return torch.empty_like(query) + + +def _attention_flash_attn_3(query, key, value): + out = _wrapped_flash_attn_3(query, key, value) + return out + + +_compiled_flash_attn_3_default = torch.compile(_attention_flash_attn_3, mode="default", fullgraph=True, dynamic=False) +def _attention_flash_attn_3_compile_default(query, key, value): + return _compiled_flash_attn_3_default(query, key, value) + + +_compiled_flash_attn_3_max_autotune = torch.compile(_attention_flash_attn_3, mode="max-autotune", fullgraph=True, dynamic=False) +def _attention_flash_attn_3_compile_max_autotune(query, key, value): + return _compiled_flash_attn_3_max_autotune(query, key, value) + + +def _attention_hf_kernels_flash_attn(query, key, value): + return hf_kernels_flash_attn.fwd(query, key, value, is_causal=False)[0] + + +def _attention_hf_kernels_flash_attn3(query, key, value): + return hf_kernels_flash_attn_3.flash_attn_func(query, key, value, causal=False)[0] + + +def _attention_sageattn_qk_int8_pv_fp16_cuda(query, key, value): + return sageattn_qk_int8_pv_fp16_cuda(query, key, value, tensor_layout="NHD") + + +def _attention_sageattn_qk_int8_pv_fp16_triton(query, key, value): + return 
sageattn_qk_int8_pv_fp16_triton(query, key, value, tensor_layout="NHD") + + +def _attention_sageattn_qk_int8_pv_fp8_cuda_sm90(query, key, value): + return sageattn_qk_int8_pv_fp8_cuda_sm90(query, key, value, tensor_layout="NHD") + + +if DotProductAttention is not None: + def set_te_backend(backend): + # must be applied before first use of + # transformer_engine.pytorch.attention + os.environ["NVTE_FLASH_ATTN"] = '0' + os.environ["NVTE_FUSED_ATTN"] = '0' + os.environ["NVTE_UNFUSED_ATTN"] = '0' + if backend == 'flash': + os.environ["NVTE_FLASH_ATTN"] = '1' + if backend == 'fused': + os.environ["NVTE_FUSED_ATTN"] = '1' + if backend == 'unfused': + os.environ["NVTE_UNFUSED_ATTN"] = '1' + + set_te_backend("fused") + te_attn_fn = DotProductAttention( + num_attention_heads=num_attention_heads, + kv_channels=attention_head_dim, + qkv_format="bshd", + attn_mask_type="no_mask", + ) +else: + def te_attn_fn(query, key, value): + raise RuntimeError("Transformer Engine is not available. Please install it for TE-based attention.") + +def _attention_te(query, key, value): + out = te_attn_fn(query, key, value) + out = out.unflatten(2, (num_attention_heads, attention_head_dim)) + return out + + +# Cannot fullgraph compile TE +_compiled_te_attn_fn_default = torch.compile(_attention_te, mode="default", fullgraph=False, dynamic=False) +def _attention_te_compile_default(query, key, value): + return _compiled_te_attn_fn_default(query, key, value) + + +# Cannot fullgraph compile TE +_compiled_te_attn_fn_max_autotune = torch.compile(_attention_te, mode="max-autotune", fullgraph=False, dynamic=False) +def _attention_te_compile_max_autotune(query, key, value): + return _compiled_te_attn_fn_max_autotune(query, key, value) + + +def _attention_xformers(query, key, value): + return xops.memory_efficient_attention(query, key, value) + + +_compiled_xformers_default = torch.compile(_attention_xformers, mode="default", fullgraph=True, dynamic=False) +def _attention_xformers_compile_default(query, key, value): + return _compiled_xformers_default(query, key, value) + + +_compiled_xformers_max_autotune = torch.compile(_attention_xformers, mode="max-autotune", fullgraph=True, dynamic=False) +def _attention_xformers_compile_max_autotune(query, key, value): + return _compiled_xformers_max_autotune(query, key, value) + + +attention_ops = {} +attention_ops["torch_cudnn"] = functools.partial(_attention_torch, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION) +attention_ops["torch_cudnn_compile_d"] = functools.partial(_attention_torch_compile_default, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION) +attention_ops["torch_cudnn_compile_ma"] = functools.partial(_attention_torch_compile_max_autotune, backend=torch.nn.attention.SDPBackend.CUDNN_ATTENTION) +attention_ops["torch_flash"] = functools.partial(_attention_torch, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION) +attention_ops["torch_flash_compile_d"] = functools.partial(_attention_torch_compile_default, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION) +attention_ops["torch_flash_compile_ma"] = functools.partial(_attention_torch_compile_max_autotune, backend=torch.nn.attention.SDPBackend.FLASH_ATTENTION) +if hf_kernels_flash_attn is not None: + attention_ops["hf_flash_attn"] = _attention_hf_kernels_flash_attn + attention_ops["hf_flash_attn3"] = _attention_hf_kernels_flash_attn3 +if flash_attn_func is not None: + attention_ops["flash_attn_2"] = _attention_flash_attn_2 + attention_ops["flash_attn_2_compile_d"] = _attention_flash_attn_2_compile_default + 
attention_ops["flash_attn_2_compile_ma"] = _attention_flash_attn_2_compile_max_autotune +if flash_attn_3_func is not None: + attention_ops["flash_attn_3"] = _attention_flash_attn_3 + attention_ops["flash_attn_3_compile_d"] = _attention_flash_attn_3_compile_default + attention_ops["flash_attn_3_compile_ma"] = _attention_flash_attn_3_compile_max_autotune +if sageattn_qk_int8_pv_fp16_cuda is not None: + attention_ops["sageattn_qk_int8_pv_fp16_cuda"] = _attention_sageattn_qk_int8_pv_fp16_cuda + attention_ops["sageattn_qk_int8_pv_fp16_triton"] = _attention_sageattn_qk_int8_pv_fp16_triton + if torch.cuda.get_device_capability()[0] >= 9: + attention_ops["sageattn_qk_int8_pv_fp8_cuda_sm90"] = _attention_sageattn_qk_int8_pv_fp8_cuda_sm90 +if DotProductAttention is not None: + attention_ops["te_fused"] = _attention_te + attention_ops["te_fused_compile_d"] = _attention_te_compile_default + attention_ops["te_fused_compile_ma"] = _attention_te_compile_max_autotune +if xops is not None: + attention_ops["xformers"] = _attention_xformers + attention_ops["xformers_compile_d"] = _attention_xformers_compile_default + attention_ops["xformers_compile_ma"] = _attention_xformers_compile_max_autotune + + +def get_color_and_linestyle(n: int) -> tuple[str, str]: + colors = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#a65628", "#f781bf", "#999999"] + line_styles = ["-", ":", "-.", "--"] + if n > len(colors) * len(line_styles): + raise ValueError(f"Required {n=} styles but maximum is {len(colors) * len(line_styles)}") + styles = [] + for i in range(n): + color = colors[i % len(colors)] + linestyle = line_styles[i // len(colors)] + styles.append((color, linestyle)) + return styles + + +def correctness(): + for seq_len in sequence_lengths: + shape = (batch_size, seq_len, num_attention_heads, attention_head_dim) + print(f"\n\n===== Testing shape: {shape} =====") + + query = torch.randn(shape, device="cuda", dtype=torch.float32) + key = torch.randn(shape, device="cuda", dtype=torch.float32) + value = torch.randn(shape, device="cuda", dtype=torch.float32) + + golden_truth = _attention_torch(query, key, value, backend=torch.nn.attention.SDPBackend.MATH) + query, key, value = (x.bfloat16() for x in (query, key, value)) + + for name, fn in attention_ops.items(): + out = fn(query, key, value) + absdiff = (out - golden_truth).abs() + absmax = torch.max(absdiff) + mae = torch.mean(absdiff) + mse = torch.mean((golden_truth - out) ** 2) + print(f"{name:<30}: absmax={absmax:.6f}, mae={mae:.6f}, mse={mse:.6f}") + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["seq_len"], + x_vals=sequence_lengths, + x_log=False, + line_arg="provider", + line_vals=list(attention_ops.keys()), + line_names=[x.removeprefix("solution_") for x in attention_ops.keys()], + ylabel="Time (ms)", + styles=get_color_and_linestyle(len(attention_ops)), + plot_name="Attention Benchmark", + args={}, + ) +) +def benchmark_fn(seq_len: int, provider: str): + torch.manual_seed(0) + + shape = (batch_size, seq_len, num_attention_heads, attention_head_dim) + query = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16) + key = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16) + value = torch.randn(shape, device="cuda", dtype=torch.bfloat16) * torch.randint(1, 5, shape, device="cuda", dtype=torch.bfloat16) + + fn = attention_ops[provider] + ms, min_ms, max_ms = triton.testing.do_bench( + lambda: 
fn(query, key, value), + warmup=3, + rep=10, + quantiles=[0.5, 0.2, 0.8], + ) + return ms, max_ms, min_ms + + +with torch.inference_mode(): + correctness() + benchmark_fn.run(print_data=True, save_path=output_dir.as_posix()) diff --git a/flash_attn/index.html b/flash_attn/index.html new file mode 100644 index 0000000000000000000000000000000000000000..398172379434c672102a2bd0e4175dcb8e06f75e --- /dev/null +++ b/flash_attn/index.html @@ -0,0 +1,24 @@ + + + + + Directory Index + + + +

Index of /flash_attn

+ + + \ No newline at end of file diff --git a/index.html b/index.html index 689cacba8fe49eeacaab52239a8357160121c5b8..0c248cc83161f77a6f15eafdaf3758bde0650b7a 100644 --- a/index.html +++ b/index.html @@ -17,8 +17,8 @@

Index of /

\ No newline at end of file diff --git a/moe_benchmarks/megablocks/cells/forward_and_backward.py b/moe_benchmarks/megablocks/cells/forward_and_backward.py new file mode 100644 index 0000000000000000000000000000000000000000..a8ac420c8a43009eb857f3a7889b4f79ad5a1191 --- /dev/null +++ b/moe_benchmarks/megablocks/cells/forward_and_backward.py @@ -0,0 +1,196 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "accelerate>=1.10.1", +# "torch>=2.7.0", +# "kernels==0.10.0", +# "transformers@https://github.com/huggingface/transformers.git", +# "ipdb>=0.13.13", +# "matplotlib>=3.7.2", +# "numpy>=1.24.3", +# ] +# /// + +import torch +from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config +import time +import torch.nn as nn +from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub +import sys +import torch.profiler +import gc +import logging +from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm + +# remove liger kernel for testing +replace_kernel_forward_from_hub(GptOssRMSNorm, None) + +# set to debug logging +logging.basicConfig(level=logging.INFO) + +def reset_peak_memory_stats(): + """Clear CUDA cache and reset memory allocation counters.""" + torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats() + gc.collect() + +def get_memory_stats(): + """Get current and peak CUDA memory usage.""" + if not torch.cuda.is_available(): + return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} + return { + "allocated_gb": torch.cuda.memory_allocated() / 1e9, + "peak_gb": torch.cuda.max_memory_allocated() / 1e9, + "reserved_gb": torch.cuda.memory_reserved() / 1e9, + } + +def override_kernel_layer_name(cls_name: str, value) -> bool: + """Helper to dynamically override the kernel_layer_name in a model class.""" + for mod in sys.modules.values(): + if mod is None: + continue + obj = getattr(mod, cls_name, None) + if isinstance(obj, type) and issubclass(obj, nn.Module): + setattr(obj, "kernel_layer_name", value) + print(f"Overrode {cls_name}.kernel_layer_name to {value}") + return True + return False + + +# Init the model the normal way +model_id = "openai/gpt-oss-20b" +tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) +quantization_config = Mxfp4Config(dequantize=True) + +model = GptOssForCausalLM.from_pretrained( + model_id, + dtype="bfloat16", + device_map="auto", + use_kernels=True, + quantization_config=quantization_config, +).eval() + +messages = [ + {"role": "system", "content": "What is Tensor Parallelism?"}, +] + +inputs = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + return_tensors="pt", + return_dict=True, + reasoning_effort="low", +).to("cuda") + +max_tokens = 128 # Reduced to help with memory usage + +# Clear memory before backward pass +reset_peak_memory_stats() +print(f"Pre-generation memory: {get_memory_stats()}") + +# forward and backward pass +with torch.autograd.set_grad_enabled(True): + start_time = time.perf_counter() + generated = model.generate( + **inputs, + max_new_tokens=max_tokens, + do_sample=False, + temperature=None, + ) + end_time = time.perf_counter() + print(tokenizer.decode(generated[0], skip_special_tokens=False)) + print(f"Generation took {end_time - start_time:.2f} seconds") + print(f"Post-generation memory: {get_memory_stats()}") + + # Use gradient checkpointing to reduce memory usage + if hasattr(model, 'gradient_checkpointing_enable'): + model.gradient_checkpointing_enable() + print("Enabled 
gradient checkpointing") + + # Reduce sequence length if needed for memory + max_seq_len = 512 # Limit sequence length for backward pass + if generated.size(1) > max_seq_len: + print(f"Truncating sequence from {generated.size(1)} to {max_seq_len} tokens") + full_sequence = generated[:, -max_seq_len:] + else: + full_sequence = generated + + # Get model outputs for the full sequence + model.train() # Enable dropout and other training behaviors + + try: + outputs = model( + input_ids=full_sequence, + labels=full_sequence, # This will compute loss internally + return_dict=True + ) + print(f"Post-forward memory: {get_memory_stats()}") + + # If model doesn't compute loss, compute it manually + if outputs.loss is None: + shift_logits = outputs.logits[..., :-1, :].contiguous() + shift_labels = full_sequence[..., 1:].contiguous() + + # Use CrossEntropyLoss with ignore_index for padding tokens + loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100) + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1) + ) + else: + loss = outputs.loss + + print(f"Loss: {loss.item():.4f}") + + # Clear intermediate tensors to save memory + del outputs + torch.cuda.empty_cache() + + # Perform backward pass with memory management + print("Running backward pass...") + print(f"Pre-backward memory: {get_memory_stats()}") + + loss.backward() + print(f"Post-backward memory: {get_memory_stats()}") + + except torch.cuda.OutOfMemoryError as e: + print(f"OOM during forward/backward pass: {e}") + print("Try reducing max_tokens or max_seq_len") + raise + + # Calculate gradient statistics and print sample gradients + total_norm = 0.0 + param_count = 0 + grad_samples = {} + + for name, p in model.named_parameters(): + if p.grad is not None: + param_count += 1 + grad_norm = p.grad.data.norm(2).item() + total_norm += grad_norm ** 2 + + # Collect gradient statistics for key layers + if any(key in name for key in ['embed', 'lm_head', 'mlp.up', 'mlp.down', 'self_attn.q_proj', 'norm']): + grad_samples[name] = { + 'norm': grad_norm, + 'mean': p.grad.data.mean().item(), + 'std': p.grad.data.std().item(), + 'max': p.grad.data.max().item(), + 'min': p.grad.data.min().item(), + } + + total_norm = total_norm ** 0.5 + + print(f"\nGradient norm: {total_norm:.4f}") + print(f"Parameters with gradients: {param_count}") + + # Print sample gradients from important layers + print("\nSample gradient statistics:") + for i, (name, stats) in enumerate(list(grad_samples.items())[:10]): + print(f" {name[:60]:<60} | norm: {stats['norm']:.4e} | mean: {stats['mean']:.4e} | std: {stats['std']:.4e}") + + # Optional: zero gradients for next iteration + model.zero_grad() + model.eval() # Switch back to eval mode + diff --git a/moe_benchmarks/megablocks/megablocks_only.html b/moe_benchmarks/megablocks/megablocks_only.html index 8606aa9eddbd37a08f18ccfdeb910a8caa1cf0b5..2a81ff4825c4828a6c1b4b0e16548e5a18bc2114 100644 --- a/moe_benchmarks/megablocks/megablocks_only.html +++ b/moe_benchmarks/megablocks/megablocks_only.html @@ -3710,7 +3710,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-6.11.0-1018-azure-x86_64-with-glibc2.39 + Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
@@ -3724,122 +3724,219 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:

Next we can run with Megablocks kernels enabled.

Forward

First, we run a forward pass with Megablocks kernels.

-
+

Forward and Backward

+

Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.

+
-▼ code -▼ output - ▶ uv-logs +▼ code +▼ output + ▶ uv-logs | -Cell: forward_only | 118.48s | FAILED - | - -Raw +Cell: forward_and_backward | 19.43s | FAILED + | + +Raw
-
+
-
-1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17 -18 -19 -20 -21 -22 -23 -24 -25 -26 -27 -28 -29 -30 -31 -32 -33 -34 -35 -36 -37 -38 -39 -40 -41 -42 -43 -44 -45 -46 -47 -48 -49 -50 -51 -52 -53 -54 -55 -56 -57 -58 -59 -60 -61 -62 -63 -64 -65 -66 -67 -68 -69 -70 -71 -72 -73 -74 -75 -76 -77 -78 -79 -80 -81 -82 -83 -84 -85 -86 -87 -88 -89 -90 -91 -92 -93 -94 -95 -96 -97 -98 -99 -100 -101 +
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +189 +190 +191 +192 +193 +194 +195 +196
# /// script
@@ -3866,7 +3963,7 @@ Cell: forward_only | 118.48s | FAILED
 import logging
 from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
 
-
+# remove liger kernel for testing 
 replace_kernel_forward_from_hub(GptOssRMSNorm, None)
 
 # set to debug logging
@@ -3907,8 +4004,6 @@ Cell: forward_only | 118.48s | FAILED
 tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
 quantization_config = Mxfp4Config(dequantize=True)
 
-
-
 model = GptOssForCausalLM.from_pretrained(
     model_id,
     dtype="bfloat16",
@@ -3929,9 +4024,14 @@ Cell: forward_only | 118.48s | FAILED
     reasoning_effort="low",
 ).to("cuda")
 
-max_tokens = 256
+max_tokens = 128  # Reduced to help with memory usage
+
+# Clear memory before backward pass
+reset_peak_memory_stats()
+print(f"Pre-generation memory: {get_memory_stats()}")
 
-with torch.inference_mode():
+# forward and backward pass
+with torch.autograd.set_grad_enabled(True):
     start_time = time.perf_counter()
     generated = model.generate(
         **inputs,
@@ -3940,144 +4040,124 @@ Cell: forward_only | 118.48s | FAILED
         temperature=None,
     )
     end_time = time.perf_counter()
-
-print(tokenizer.decode(generated[0], skip_special_tokens=False))
-print(f"Generation took {end_time - start_time:.2f} seconds")
+    print(tokenizer.decode(generated[0], skip_special_tokens=False))
+    print(f"Generation took {end_time - start_time:.2f} seconds")
+    print(f"Post-generation memory: {get_memory_stats()}")
+
+    # Use gradient checkpointing to reduce memory usage
+    if hasattr(model, 'gradient_checkpointing_enable'):
+        model.gradient_checkpointing_enable()
+        print("Enabled gradient checkpointing")
+
+    # Reduce sequence length if needed for memory
+    max_seq_len = 512  # Limit sequence length for backward pass
+    if generated.size(1) > max_seq_len:
+        print(f"Truncating sequence from {generated.size(1)} to {max_seq_len} tokens")
+        full_sequence = generated[:, -max_seq_len:]
+    else:
+        full_sequence = generated
+
+    # Get model outputs for the full sequence
+    model.train()  # Enable dropout and other training behaviors
+
+    try:
+        outputs = model(
+            input_ids=full_sequence,
+            labels=full_sequence,  # This will compute loss internally
+            return_dict=True
+        )
+        print(f"Post-forward memory: {get_memory_stats()}")
+
+        # If model doesn't compute loss, compute it manually
+        if outputs.loss is None:
+            shift_logits = outputs.logits[..., :-1, :].contiguous()
+            shift_labels = full_sequence[..., 1:].contiguous()
+
+            # Use CrossEntropyLoss with ignore_index for padding tokens
+            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else -100)
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)),
+                shift_labels.view(-1)
+            )
+        else:
+            loss = outputs.loss
+
+        print(f"Loss: {loss.item():.4f}")
+
+        # Clear intermediate tensors to save memory
+        del outputs
+        torch.cuda.empty_cache()
+
+        # Perform backward pass with memory management
+        print("Running backward pass...")
+        print(f"Pre-backward memory: {get_memory_stats()}")
+
+        loss.backward()
+        print(f"Post-backward memory: {get_memory_stats()}")
+
+    except torch.cuda.OutOfMemoryError as e:
+        print(f"OOM during forward/backward pass: {e}")
+        print("Try reducing max_tokens or max_seq_len")
+        raise
+
+    # Calculate gradient statistics and print sample gradients
+    total_norm = 0.0
+    param_count = 0
+    grad_samples = {}
+
+    for name, p in model.named_parameters():
+        if p.grad is not None:
+            param_count += 1
+            grad_norm = p.grad.data.norm(2).item()
+            total_norm += grad_norm ** 2
+
+            # Collect gradient statistics for key layers
+            if any(key in name for key in ['embed', 'lm_head', 'mlp.up', 'mlp.down', 'self_attn.q_proj', 'norm']):
+                grad_samples[name] = {
+                    'norm': grad_norm,
+                    'mean': p.grad.data.mean().item(),
+                    'std': p.grad.data.std().item(),
+                    'max': p.grad.data.max().item(),
+                    'min': p.grad.data.min().item(),
+                }
+
+    total_norm = total_norm ** 0.5
+
+    print(f"\nGradient norm: {total_norm:.4f}")
+    print(f"Parameters with gradients: {param_count}")
+
+    # Print sample gradients from important layers
+    print("\nSample gradient statistics:")
+    for i, (name, stats) in enumerate(list(grad_samples.items())[:10]):
+        print(f"  {name[:60]:<60} | norm: {stats['norm']:.4e} | mean: {stats['mean']:.4e} | std: {stats['std']:.4e}")
+
+    # Optional: zero gradients for next iteration
+    model.zero_grad()
+    model.eval()  # Switch back to eval mode
 
-
+
-
-
-
▶ UV Install Logs
-
- -

Forward and Backward

-

Next, we run a forward and backward pass with Megablocks kernels enabled. This should be more memory efficient and allow us to complete the backward pass without running out of memory.

diff --git a/moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json b/moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b222e58061a4ff0233f1bab85f7d2c289d162f3e --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json @@ -0,0 +1,24 @@ +{ + "implementation": "binned_results", + "config": { + "warmup": 10, + "iters": 50, + "device": "cuda", + "dtype": "torch.float32", + "tokens": 100, + "vary_inputs": true + }, + "stats": { + "avg_ms": 36.06324691992995, + "min_ms": 33.29206800026441, + "max_ms": 38.40615900026023, + "std_ms": 1.258567678508065, + "p50_ms": 36.21510599987232, + "p95_ms": 37.524451049966956, + "p99_ms": 38.03603995002959, + "num_iters": 50, + "tokens_per_s": 2772.906172925215, + "throughput_variance": 98.28636435515342 + }, + "output_sum": 3.97190523147583 +} \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json b/moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json new file mode 100644 index 0000000000000000000000000000000000000000..491d61dbedf6649b5665666f8408fd1d61d51144 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json @@ -0,0 +1,24 @@ +{ + "implementation": "gptoss_results", + "config": { + "warmup": 10, + "iters": 50, + "device": "cuda", + "dtype": "torch.float32", + "tokens": 100, + "vary_inputs": true + }, + "stats": { + "avg_ms": 45.286630379978305, + "min_ms": 38.91367899996112, + "max_ms": 49.84392799997295, + "std_ms": 3.2326168009526866, + "p50_ms": 45.42240999990099, + "p95_ms": 49.729684149951936, + "p99_ms": 49.82545450991893, + "num_iters": 50, + "tokens_per_s": 2208.1572234663554, + "throughput_variance": 161.27578702324564 + }, + "output_sum": 11.53223705291748 +} \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json b/moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d899fe9a06bb4ca74cd58fe9746a5942bbd236 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json @@ -0,0 +1,24 @@ +{ + "implementation": "gptoss_training_results", + "config": { + "warmup": 10, + "iters": 50, + "device": "cuda", + "dtype": "torch.float32", + "tokens": 100, + "vary_inputs": true + }, + "stats": { + "avg_ms": 46.01034353989235, + "min_ms": 39.20698799993261, + "max_ms": 51.09754699969926, + "std_ms": 3.2594474712819497, + "p50_ms": 46.132551999562565, + "p95_ms": 50.721096600273086, + "p99_ms": 51.0080171399477, + "num_iters": 50, + "tokens_per_s": 2173.4243282338675, + "throughput_variance": 158.68467070353637 + }, + "output_sum": 11.53223705291748 +} \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json b/moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ec2f20c34ce683f571a322b29e917480b9e73939 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json @@ -0,0 +1,24 @@ +{ + "implementation": "yamoe_results", + "config": { + "warmup": 10, + "iters": 50, + "device": "cuda", + "dtype": "torch.float32", + "tokens": 100, + "vary_inputs": true + }, + "stats": { + "avg_ms": 4.2510544400101935, + "min_ms": 
4.144352999901457, + "max_ms": 4.320155999266717, + "std_ms": 0.02873328656403644, + "p50_ms": 4.2539659998510615, + "p95_ms": 4.2857709999225335, + "p99_ms": 4.306132199617423, + "num_iters": 50, + "tokens_per_s": 23523.575482547854, + "throughput_variance": 160.28680309512873 + }, + "output_sum": 3.97190523147583 +} \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc index d5df252f2e6edaec8717d49f0fe7d72b278c362e..d4071be109b86c510b35c06dedc5c8c3e35bfe86 100644 Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ diff --git a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc index be5ea0a48cedabb22eac9d1ef3f5b0422d87c5c2..8eaaeeb579d95786f08f8a033ac563063f8e58ba 100644 Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ diff --git a/moe_benchmarks/megablocks_yamoe/cells/binned_run.py b/moe_benchmarks/megablocks_yamoe/cells/binned_run.py new file mode 100644 index 0000000000000000000000000000000000000000..fe9e54316e7380bc60d7bb62459498e450575b31 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/binned_run.py @@ -0,0 +1,195 @@ +# /// script +# dependencies = [ +# "torch", +# "numpy", +# ] +# /// + +import torch +from torch import nn +from torch.nn import functional as F +from bench_utils import to_dtype, tensor_stats, set_seed, bench_context +from config import ( + NUM_EXPERTS, HIDDEN_SIZE, TOP_K, + BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE, + WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED +) +from pathlib import Path +import os + +# Discover the upstream artifact directory from env +data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.') + +router_weight = torch.load(Path(data_dir) / 'router_weight.pt') +router_bias = torch.load(Path(data_dir) / 'router_bias.pt') +gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt') +gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt') +down_proj = torch.load(Path(data_dir) / 'down_proj.pt') +down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt') + +print("Loaded shared weights from artifacts") +print(f"Router weight sum: {router_weight.sum().item():.6f}") +print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}") +print(f"Down sum: {down_proj.sum().item():.6f}") + +def binned_gather(x, indices, bins, expert_capacity, top_k): + E, H = bins.shape[0], x.shape[1] + out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype) + for e in range(E): + start = 0 if e == 0 else bins[e - 1] + end = bins[e] + n = min(end - start, expert_capacity) + for i in range(n): + flat_pos = indices[start + i] + tok = flat_pos // top_k + out[e, i] = x[tok] + return out + +def binned_scatter(x, indices, weights, bins, expert_capacity, top_k): + E, C, H = x.shape + N = indices.shape[0] // top_k + out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device) + for e in range(E): + start = 0 if e == 0 else bins[e - 1] + end = bins[e] + n = end - start + if n == 0: + continue + take = min(n, expert_capacity) + for i in range(take): + flat_pos = indices[start + i] + tok = flat_pos // top_k + slot = flat_pos % top_k + scale = 
weights[flat_pos] if weights is not None else 1.0 + out[tok, slot] = x[e, i] * scale + return out.sum(dim=1) + +def sort_tokens_by_expert(router_indices, num_experts): + flat_indices = router_indices.flatten() + sorted_values, sorted_indices = torch.sort(flat_indices) + tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts) + bins = torch.cumsum(tokens_per_expert, dim=0) + return sorted_indices, sorted_values, bins, tokens_per_expert + +def binned_experts_ref( + hidden_states, + router_indices, + routing_weights, + gate_up_proj, + gate_up_proj_bias, + down_proj, + down_proj_bias, + expert_capacity, +): + B, S, H = hidden_states.shape + E, K = routing_weights.shape[1], router_indices.shape[1] + + indices, _, bins, _ = sort_tokens_by_expert(router_indices, E) + x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K) + + gate_up = torch.bmm(x, gate_up_proj) + gate_up += gate_up_proj_bias[..., None, :] + + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + + # clamp to limit + limit = 7.0 + gate = gate.clamp(min=None, max=limit) + up = up.clamp(min=-limit, max=limit) + + glu = gate * torch.sigmoid(gate * 1.702) + x = (up + 1) * glu + x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :] + + # build routing weights aligned to (token, slot) + flat_dense = routing_weights.view(-1, E) + flat_router = router_indices.view(-1, K) + selected = torch.gather(flat_dense, 1, flat_router).reshape(-1) + + # scatter back + y = binned_scatter(x, indices, selected, bins, expert_capacity, K) + + return y.view(B, S, H) + +class BinnedRouter(nn.Module): + def __init__(self, router_weight, router_bias): + super().__init__() + self.top_k = TOP_K + self.num_experts = NUM_EXPERTS + self.hidden_dim = HIDDEN_SIZE + self.weight = nn.Parameter(router_weight.clone()) + self.bias = nn.Parameter(router_bias.clone()) + + def forward(self, hidden_states): + hidden_states = hidden_states.reshape(-1, self.hidden_dim) + router_logits = F.linear(hidden_states, self.weight, self.bias) + router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1) + router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype) + router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value) + return router_scores, router_indices + +def ceil_div(a, b): + return (a + b - 1) // b + +class BinnedMoEMLP(nn.Module): + def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias): + super().__init__() + self.router = BinnedRouter(router_weight, router_bias) + self.num_experts = NUM_EXPERTS + self.hidden_size = HIDDEN_SIZE + self.top_k = TOP_K + + # Expert weights - use the loaded weights + self.gate_up_proj = nn.Parameter(gate_up_proj.clone()) + self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone()) + self.down_proj = nn.Parameter(down_proj.clone()) + self.down_proj_bias = nn.Parameter(down_proj_bias.clone()) + + def forward(self, hidden_states): + router_scores, router_indices = self.router(hidden_states) + batch_size = hidden_states.shape[0] + expert_capacity = ceil_div(batch_size * self.top_k, self.num_experts) + + output = binned_experts_ref( + hidden_states, + router_indices, + router_scores, + self.gate_up_proj, + self.gate_up_proj_bias, + self.down_proj, + self.down_proj_bias, + expert_capacity, + ) + + return output, router_scores + +# Run the model +set_seed(GENERAL_SEED) + +device = torch.device(DEVICE) +dtype = to_dtype(DTYPE) + +print("\n=== Binned 
Implementation ===") +# Initialize model with loaded weights +model = BinnedMoEMLP( + router_weight.to(device), + router_bias.to(device), + gate_up_proj.to(device), + gate_up_proj_bias.to(device), + down_proj.to(device), + down_proj_bias.to(device) +).to(device=device) + +print(f"Router weight sum: {model.router.weight.sum().item():.6f}") +print(f"Gate/up proj sum: {model.gate_up_proj.sum().item():.6f}") +print(f"Down proj sum: {model.down_proj.sum().item():.6f}") + +# Generate the same input as Yamoe +set_seed(INPUT_SEED) +x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1 + +# Benchmark the model with varied inputs to prevent caching artifacts +tokens = BATCH_SIZE * SEQ_LEN +with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="binned_results.json", vary_inputs=True) as bench: + output, stats = bench(model, x) + print(f"\nOutput sum: {output[0].sum().item():.6f}") \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/gptoss_run.py b/moe_benchmarks/megablocks_yamoe/cells/gptoss_run.py new file mode 100644 index 0000000000000000000000000000000000000000..5a1532dabff53ecb068ddd4354c545f0cea2d72b --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/gptoss_run.py @@ -0,0 +1,147 @@ +# /// script +# dependencies = [ +# "torch", +# "numpy", +# ] +# /// + +import torch +from torch import nn +from torch.nn import functional as F +from bench_utils import to_dtype, tensor_stats, set_seed, bench_context +from config import ( + NUM_EXPERTS, HIDDEN_SIZE, TOP_K, + BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE, + WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED +) +from pathlib import Path +import os + +# Discover the upstream artifact directory from env +data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.') + +router_weight = torch.load(Path(data_dir) / 'router_weight.pt') +router_bias = torch.load(Path(data_dir) / 'router_bias.pt') +gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt') +gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt') +down_proj = torch.load(Path(data_dir) / 'down_proj.pt') +down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt') + +print("Loaded shared weights from artifacts") +print(f"Router weight sum: {router_weight.sum().item():.6f}") +print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}") +print(f"Down sum: {down_proj.sum().item():.6f}") + +class GptOssRouter(nn.Module): + def __init__(self, router_weight, router_bias): + super().__init__() + self.top_k = TOP_K + self.num_experts = NUM_EXPERTS + self.hidden_dim = HIDDEN_SIZE + self.weight = nn.Parameter(router_weight.clone()) + self.bias = nn.Parameter(router_bias.clone()) + + def forward(self, hidden_states): + hidden_states = hidden_states.reshape(-1, self.hidden_dim) + router_logits = F.linear(hidden_states, self.weight, self.bias) + router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1) + router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype) + router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value) + return router_scores, router_indices + +class GptOssExperts(nn.Module): + def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias): + super().__init__() + self.num_experts = NUM_EXPERTS + self.hidden_size = HIDDEN_SIZE + self.expert_dim = self.hidden_size + self.gate_up_proj = nn.Parameter(gate_up_proj.clone()) + self.gate_up_proj_bias = 
nn.Parameter(gate_up_proj_bias.clone()) + self.down_proj = nn.Parameter(down_proj.clone()) + self.down_proj_bias = nn.Parameter(down_proj_bias.clone()) + self.alpha = 1.702 + self.limit = 7.0 + + def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor: + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape(-1, self.hidden_size) + num_experts = routing_weights.shape[1] + + if hidden_states.device.type == "cpu" or self.training: + next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device) + with torch.no_grad(): + expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts) + expert_mask = expert_mask.permute(2, 1, 0) + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + + for expert_idx in expert_hit[:]: + expert_idx = expert_idx[0] + with torch.no_grad(): + _, token_idx = torch.where(expert_mask[expert_idx]) + current_state = hidden_states[token_idx] + gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx] + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + gated_output = (up + 1) * glu + out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx] + weighted_output = out * routing_weights[token_idx, expert_idx, None] + next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype)) + next_states = next_states.view(batch_size, -1, self.hidden_size) + else: + hidden_states = hidden_states.repeat(num_experts, 1) + hidden_states = hidden_states.view(num_experts, -1, self.hidden_size) + gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :] + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + next_states = torch.bmm(((up + 1) * glu), self.down_proj) + next_states = next_states + self.down_proj_bias[..., None, :] + next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size) + next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None] + next_states = next_states.sum(dim=0) + return next_states + +class GptOssMoEMLP(nn.Module): + def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias): + super().__init__() + self.router = GptOssRouter(router_weight, router_bias) + self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias) + + def forward(self, hidden_states): + router_scores, router_indices = self.router(hidden_states) + routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores) + return routed_out, router_scores + +# Run the model +set_seed(GENERAL_SEED) + +device = torch.device(DEVICE) +dtype = to_dtype(DTYPE) + +print("\n=== GPT-OSS Implementation ===") +# Initialize model with loaded weights +model = GptOssMoEMLP( + router_weight.to(device), + router_bias.to(device), + gate_up_proj.to(device), + gate_up_proj_bias.to(device), + down_proj.to(device), + down_proj_bias.to(device) +).to(device=device) + +print(f"Router weight sum: {model.router.weight.sum().item():.6f}") +print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}") +print(f"Down proj 
sum: {model.experts.down_proj.sum().item():.6f}") + +# Generate the same input as other implementations +set_seed(INPUT_SEED) +x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1 + +# Benchmark the model with varied inputs to prevent caching artifacts +tokens = BATCH_SIZE * SEQ_LEN +with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="gptoss_results.json", vary_inputs=True) as bench: + output, stats = bench(model, x) + print(f"\nOutput sum: {output[0].sum().item():.6f}") \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/gptoss_training_run.py b/moe_benchmarks/megablocks_yamoe/cells/gptoss_training_run.py new file mode 100644 index 0000000000000000000000000000000000000000..f18731a74bfa546e612addbaab9e3ff5ec5d26dc --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/gptoss_training_run.py @@ -0,0 +1,138 @@ +# /// script +# dependencies = [ +# "torch", +# "numpy", +# ] +# /// + +import torch +from torch import nn +from torch.nn import functional as F +from bench_utils import to_dtype, tensor_stats, set_seed, bench_context +from config import ( + NUM_EXPERTS, HIDDEN_SIZE, TOP_K, + BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE, + WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED +) +from pathlib import Path +import os + +# Discover the upstream artifact directory from env +data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.') + +router_weight = torch.load(Path(data_dir) / 'router_weight.pt') +router_bias = torch.load(Path(data_dir) / 'router_bias.pt') +gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt') +gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt') +down_proj = torch.load(Path(data_dir) / 'down_proj.pt') +down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt') + +print("Loaded shared weights from artifacts") +print(f"Router weight sum: {router_weight.sum().item():.6f}") +print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}") +print(f"Down sum: {down_proj.sum().item():.6f}") + +class GptOssTrainingRouter(nn.Module): + def __init__(self, router_weight, router_bias): + super().__init__() + self.top_k = TOP_K + self.num_experts = NUM_EXPERTS + self.hidden_dim = HIDDEN_SIZE + self.weight = nn.Parameter(router_weight.clone()) + self.bias = nn.Parameter(router_bias.clone()) + + def forward(self, hidden_states): + hidden_states = hidden_states.reshape(-1, self.hidden_dim) + router_logits = F.linear(hidden_states, self.weight, self.bias) + router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1) + router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype) + router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value) + return router_scores, router_indices + +class GptOssTrainingExperts(nn.Module): + def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias): + super().__init__() + self.num_experts = NUM_EXPERTS + self.hidden_size = HIDDEN_SIZE + self.expert_dim = self.hidden_size + self.gate_up_proj = nn.Parameter(gate_up_proj.clone()) + self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone()) + self.down_proj = nn.Parameter(down_proj.clone()) + self.down_proj_bias = nn.Parameter(down_proj_bias.clone()) + self.alpha = 1.702 + self.limit = 7.0 + + def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor: + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape(-1, 
self.hidden_size) + num_experts = routing_weights.shape[1] + + # Force training mode path (expert loop instead of batched) + next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device) + with torch.no_grad(): + expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts) + expert_mask = expert_mask.permute(2, 1, 0) + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + + for expert_idx in expert_hit[:]: + expert_idx = expert_idx[0] + with torch.no_grad(): + _, token_idx = torch.where(expert_mask[expert_idx]) + current_state = hidden_states[token_idx] + gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx] + gate, up = gate_up[..., ::2], gate_up[..., 1::2] + gate = gate.clamp(min=None, max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + gated_output = (up + 1) * glu + out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx] + weighted_output = out * routing_weights[token_idx, expert_idx, None] + next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype)) + next_states = next_states.view(batch_size, -1, self.hidden_size) + return next_states + +class GptOssTrainingMoEMLP(nn.Module): + def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias): + super().__init__() + self.router = GptOssTrainingRouter(router_weight, router_bias) + self.experts = GptOssTrainingExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias) + + def forward(self, hidden_states): + router_scores, router_indices = self.router(hidden_states) + routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores) + return routed_out, router_scores + +# Run the model +set_seed(GENERAL_SEED) + +device = torch.device(DEVICE) +dtype = to_dtype(DTYPE) + +print("\n=== GPT-OSS Implementation (Training Mode - Expert Loop) ===") +# Initialize model with loaded weights and force training mode +model = GptOssTrainingMoEMLP( + router_weight.to(device), + router_bias.to(device), + gate_up_proj.to(device), + gate_up_proj_bias.to(device), + down_proj.to(device), + down_proj_bias.to(device) +).to(device=device) + +# Set to training mode to force expert loop path +model.train() + +print(f"Router weight sum: {model.router.weight.sum().item():.6f}") +print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}") +print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}") +print(f"Model training mode: {model.training}") + +# Generate the same input as other implementations +set_seed(INPUT_SEED) +x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1 + +# Benchmark the model with varied inputs to prevent caching artifacts +tokens = BATCH_SIZE * SEQ_LEN +with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="gptoss_training_results.json", vary_inputs=True) as bench: + output, stats = bench(model, x) + print(f"\nOutput sum: {output[0].sum().item():.6f}") \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/megablocks_run.py b/moe_benchmarks/megablocks_yamoe/cells/megablocks_run.py new file mode 100644 index 0000000000000000000000000000000000000000..a18723cb66c892119c0a9e88d8c2a140a6354a00 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/megablocks_run.py @@ -0,0 +1,103 @@ +# /// script +# dependencies = [ +# 
"torch", +# "numpy", +# "kernels", +# ] +# /// + +import torch +from torch import nn +from torch.nn import functional as F +from kernels import get_kernel, get_local_kernel +from bench_utils import to_dtype, tensor_stats, set_seed, bench_context +from config import ( + NUM_EXPERTS, HIDDEN_SIZE, TOP_K, + BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE, + WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED +) +from pathlib import Path +from collections import namedtuple +import os + +# Discover the upstream artifact directory from env +data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.') + +print(f"Loading weights from: {data_dir}") + +router_weight = torch.load(Path(data_dir) / 'router_weight.pt') +router_bias = torch.load(Path(data_dir) / 'router_bias.pt') +gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt') +gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt') +down_proj = torch.load(Path(data_dir) / 'down_proj.pt') +down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt') + +print("Loaded shared weights from artifacts") +print(f"Router weight sum: {router_weight.sum().item():.6f}") +print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}") +print(f"Down sum: {down_proj.sum().item():.6f}") + +def build_megablocks_model(device: torch.device): + # Download optimized kernels from the Hugging Face hub + megablocks = get_kernel("kernels-community/megablocks", revision="v0.0.2") + model = megablocks.layers.MegaBlocksMoeMLP() + + # Create attribute container for expert weights + model.experts = namedtuple( + "Experts", ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"] + ) + + # Use loaded router weights for consistency + model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device) + with torch.no_grad(): + model.router.weight.copy_(router_weight) + model.router.bias.copy_(router_bias) + + # Attach loaded expert weights to the experts container + e = model.experts + e.alpha = 1.702 + e.capacity_factor = 32 + e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device)) + e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device)) + e.down_proj = torch.nn.Parameter(down_proj.clone().to(device)) + e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device)) + e.hidden_size = HIDDEN_SIZE + + # Log weight statistics for comparison + print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}") + print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}") + print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}") + + return model + +# Create a wrapper to match the interface of other implementations +class MegaBlocksMoEWrapper(nn.Module): + def __init__(self, megablocks_model): + super().__init__() + self.model = megablocks_model + + def forward(self, hidden_states): + # MegaBlocks expects input in the format (batch, seq_len, hidden_dim) + output, dummy_routing_weights = self.model(hidden_states) + return output, dummy_routing_weights + +# Run the model +set_seed(GENERAL_SEED) + +device = torch.device(DEVICE) +dtype = to_dtype(DTYPE) + +print("\n=== MegaBlocks Implementation ===") +# Build MegaBlocks model with loaded weights +megablocks_model = build_megablocks_model(device) +model = MegaBlocksMoEWrapper(megablocks_model).to(device=device) + +# Generate the same input as other implementations +set_seed(INPUT_SEED) +x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, 
device=device, dtype=dtype) * 0.1 + +# Benchmark the model with varied inputs to prevent caching artifacts +tokens = BATCH_SIZE * SEQ_LEN +with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="megablocks_results.json", vary_inputs=True) as bench: + output, stats = bench(model, x) + print(f"\nOutput sum: {output[0].sum().item():.6f}") \ No newline at end of file diff --git a/moe_benchmarks/megablocks_yamoe/cells/setup.py b/moe_benchmarks/megablocks_yamoe/cells/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..6d7f386417ca59470f5e6404d26b64a6d1fd6f39 --- /dev/null +++ b/moe_benchmarks/megablocks_yamoe/cells/setup.py @@ -0,0 +1,116 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "accelerate>=1.10.1", +# "torch>=2.7.0", +# "kernels==0.10.0", +# "transformers@https://github.com/huggingface/transformers.git", +# "ipdb>=0.13.13", +# "matplotlib>=3.7.2", +# "numpy>=1.24.3", +# ] +# /// + +import torch +from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config +import time +import torch.nn as nn +from kernels import register_kernel_mapping, Mode, LayerRepository +import sys +import torch.profiler +import gc +import logging + +# set to debug logging +logging.basicConfig(level=logging.INFO) + +def reset_peak_memory_stats(): + """Clear CUDA cache and reset memory allocation counters.""" + torch.cuda.empty_cache() + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats() + gc.collect() + +def get_memory_stats(): + """Get current and peak CUDA memory usage.""" + if not torch.cuda.is_available(): + return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} + return { + "allocated_gb": torch.cuda.memory_allocated() / 1e9, + "peak_gb": torch.cuda.max_memory_allocated() / 1e9, + "reserved_gb": torch.cuda.memory_reserved() / 1e9, + } + +def override_kernel_layer_name(cls_name: str, value) -> bool: + """Helper to dynamically override the kernel_layer_name in a model class.""" + for mod in sys.modules.values(): + if mod is None: + continue + obj = getattr(mod, cls_name, None) + if isinstance(obj, type) and issubclass(obj, nn.Module): + setattr(obj, "kernel_layer_name", value) + print(f"Overrode {cls_name}.kernel_layer_name to {value}") + return True + return False + + +# Init the model the normal way +model_id = "openai/gpt-oss-20b" +tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) +quantization_config = Mxfp4Config(dequantize=True) + + +from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode + +from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm + +replace_kernel_forward_from_hub(GptOssMLP, "Yamoe") +replace_kernel_forward_from_hub(GptOssRMSNorm, None) +custom_mapping = { + "Yamoe": { + "cuda": { + Mode.INFERENCE: LayerRepository( + repo_id="drbh/yamoe", + layer_name="Yamoe", + revision="v0.3.0", + ) + } + } +} +register_kernel_mapping(custom_mapping) + + +model = GptOssForCausalLM.from_pretrained( + model_id, + dtype="bfloat16", + device_map="auto", + use_kernels=True, + quantization_config=quantization_config, +).eval() + +messages = [ + {"role": "system", "content": "What is Tensor Parallelism?"}, +] + +inputs = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + return_tensors="pt", + return_dict=True, + reasoning_effort="low", +).to("cuda") + +max_tokens = 256 + +with torch.inference_mode(): + start_time = time.perf_counter() + generated = model.generate( + **inputs, + 
max_new_tokens=max_tokens, + do_sample=False, + temperature=None, + ) + end_time = time.perf_counter() + +print(tokenizer.decode(generated[0], skip_special_tokens=False)) +print(f"Generation took {end_time - start_time:.2f} seconds") diff --git a/moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html b/moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html index d483be109634d9f2c6ca41723356d82e1bf2cfa1..c4126222a6acb3c8c1746f92615d27aaf4909fb6 100644 --- a/moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html +++ b/moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html @@ -3710,61 +3710,288 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-6.11.0-1018-azure-x86_64-with-glibc2.39 + Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
-
+

Comparison of Megablocks and Yamoe Kernels

+

This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.

+

Megablocks kernel

+

Yamoe Kernel

+
-▼ code -▼ output - ▶ uv-logs +▼ code +▼ output + ▶ uv-logs | -Cell: nv | 0.07s | FAILED - | - -Raw +Cell: setup | 19.20s | FAILED + | + +Raw
-
+
-
-1 -2 -3 +
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116
-
import subprocess
-
-print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
+
# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "accelerate>=1.10.1",
+#     "torch>=2.7.0",
+#     "kernels==0.10.0",
+#     "transformers@https://github.com/huggingface/transformers.git",
+#     "ipdb>=0.13.13",
+#     "matplotlib>=3.7.2",
+#     "numpy>=1.24.3",
+# ]
+# ///
+
+import torch
+from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+import time
+import torch.nn as nn
+from kernels import register_kernel_mapping, Mode, LayerRepository
+import sys
+import torch.profiler
+import gc
+import logging
+
+# set to debug logging
+logging.basicConfig(level=logging.INFO)
+
+def reset_peak_memory_stats():
+    """Clear CUDA cache and reset memory allocation counters."""
+    torch.cuda.empty_cache()
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+    gc.collect()
+
+def get_memory_stats():
+    """Get current and peak CUDA memory usage."""
+    if not torch.cuda.is_available():
+        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+    return {
+        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+    }
+
+def override_kernel_layer_name(cls_name: str, value) -> bool:
+    """Helper to dynamically override the kernel_layer_name in a model class."""
+    for mod in sys.modules.values():
+        if mod is None:
+            continue
+        obj = getattr(mod, cls_name, None)
+        if isinstance(obj, type) and issubclass(obj, nn.Module):
+            setattr(obj, "kernel_layer_name", value)
+            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+            return True
+    return False
+
+
+# Init the model the normal way
+model_id = "openai/gpt-oss-20b"
+tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+quantization_config = Mxfp4Config(dequantize=True)
+
+
+from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
+
+from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
+
+replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")
+replace_kernel_forward_from_hub(GptOssRMSNorm, None)
+custom_mapping = {
+    "Yamoe": {
+        "cuda": {
+            Mode.INFERENCE: LayerRepository(
+                repo_id="drbh/yamoe",
+                layer_name="Yamoe",
+                revision="v0.3.0",
+            )
+        }
+    }
+}
+register_kernel_mapping(custom_mapping)
+
+
+model = GptOssForCausalLM.from_pretrained(
+    model_id,
+    dtype="bfloat16",
+    device_map="auto",
+    use_kernels=True,
+    quantization_config=quantization_config,
+).eval()
+
+messages = [
+    {"role": "system", "content": "What is Tensor Parallelism?"},
+]
+
+inputs = tokenizer.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    return_tensors="pt",
+    return_dict=True,
+    reasoning_effort="low",
+).to("cuda")
+
+max_tokens = 256
+
+with torch.inference_mode():
+    start_time = time.perf_counter()
+    generated = model.generate(
+        **inputs,
+        max_new_tokens=max_tokens,
+        do_sample=False,
+        temperature=None,
+    )
+    end_time = time.perf_counter()
+
+print(tokenizer.decode(generated[0], skip_special_tokens=False))
+print(f"Generation took {end_time - start_time:.2f} seconds")
 
-
+
-
-
Traceback (most recent call last): - File "/home/runner/work/kernels-uvnotes/kernels-uvnotes/moe_benchmarks/megablocks_yamoe/.uvnote/cells/nv.py", line 3, in <module> - print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/opt/hostedtoolcache/Python/3.11.13/x64/lib/python3.11/subprocess.py", line 548, in run - with Popen(*popenargs, **kwargs) as process: - ^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/opt/hostedtoolcache/Python/3.11.13/x64/lib/python3.11/subprocess.py", line 1026, in __init__ - self._execute_child(args, executable, preexec_fn, close_fds, - File "/opt/hostedtoolcache/Python/3.11.13/x64/lib/python3.11/subprocess.py", line 1955, in _execute_child - raise child_exception_type(errno_num, err_msg, err_filename) -FileNotFoundError: [Errno 2] No such file or directory: 'nvidia-smi' +
+
Downloading cpython-3.13.7-linux-x86_64-gnu (download) (32.0MiB) + Downloading cpython-3.13.7-linux-x86_64-gnu (download) + Updating https://github.com/huggingface/transformers.git (HEAD) + Updated https://github.com/huggingface/transformers.git (449533af73874470e914a203391635e04ac2ffc8) + × No solution found when resolving script dependencies: + ╰─▶ Because only transformers==4.57.0.dev0 is available and + transformers==4.57.0.dev0 depends on huggingface-hub==1.0.0rc1, + we can conclude that all versions of transformers depend on + huggingface-hub==1.0.0rc1. + And because kernels==0.10.0 depends on huggingface-hub>=0.26.0,<1.0, + we can conclude that kernels==0.10.0 and all versions of transformers + are incompatible. + And because you require kernels==0.10.0 and transformers, we can + conclude that your requirements are unsatisfiable.
- -

Comparison of Megablocks and Yamoe Kernels

-

This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.

-

Megablocks kernel

-

Yamoe Kernel

diff --git a/moe_benchmarks/megablocks_yamoe/torch_profile.html b/moe_benchmarks/megablocks_yamoe/torch_profile.html index 03274be1af151bba4833da45e7954d7de1f9a558..ec3f276d4f2ffdf0354ae0f539751c7b01f73f61 100644 --- a/moe_benchmarks/megablocks_yamoe/torch_profile.html +++ b/moe_benchmarks/megablocks_yamoe/torch_profile.html @@ -3708,7 +3708,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
Generated on:
- Linux x86_64 | Linux-6.11.0-1018-azure-x86_64-with-glibc2.39 + Linux x86_64 | Linux-6.12.40-64.114.amzn2023.x86_64-x86_64-with-glibc2.36
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: ▼ output ▶ uv-logs | -Cell: utils | deps: torch, numpy | 3.06s +Cell: utils | deps: torch, numpy | 34.59s | Raw @@ -3794,7 +3794,43 @@ Cell: utils | deps: torch, numpy | 3.06s
▶ UV Install Logs
@@ -3807,7 +3843,7 @@ Installed 26 packages in 253ms ▼ output ▶ uv-logs | -Cell: bench_utils | deps: torch, numpy | 13.67s +Cell: bench_utils | deps: torch, numpy | 35.65s | Raw @@ -4295,13 +4331,43 @@ Cell: bench_utils | deps: torch, numpy | 13.67s
▶ UV Install Logs
@@ -4315,7 +4381,7 @@ Installed 26 packages in 259ms ▼ output ▶ uv-logs | -Cell: config | deps: torch, numpy | 3.02s +Cell: config | deps: torch, numpy | 34.53s | Raw @@ -4375,7 +4441,43 @@ Cell: config | deps: torch, numpy | 3.02s
▶ UV Install Logs
@@ -4388,7 +4490,7 @@ Installed 26 packages in 243ms ▼ output ▶ uv-logs | -Cell: save_data | deps: torch, numpy | 11.90s +Cell: save_data | deps: torch, numpy | 39.05s | Raw @@ -4476,38 +4578,74 @@ Cell: save_data | deps: torch, numpy | 11.90s
Saved shared weights to artifacts -Router weight sum: 12.588735 +Router weight sum: 12.588732 Gate/up sum: 1026.601807 -Down sum: 206.729279 +Down sum: 206.729263
▶ UV Install Logs

Yamoe Implementation

This section runs the Yamoe MoE implementation with optimized Triton kernels.

-
+
▼ code ▼ output ▶ uv-logs | -Cell: yamoe_run | deps: torch, kernels, numpy | 4.02s | FAILED +Cell: yamoe_run | deps: torch, kernels, numpy | 39.19s | Raw @@ -4778,38 +4916,1811 @@ Cell: yamoe_run | deps: torch, kernels, numpy | 4.02s | FAILED
-
Loading weights from: /home/runner/work/kernels-uvnotes/kernels-uvnotes/moe_benchmarks/megablocks_yamoe/.uvnote/cache/57bbe537b6c3412d45373a8967728666b60b8687c5d1f5d0decc3ba51923edde +
Loading weights from: /repo/moe_benchmarks/megablocks_yamoe/.uvnote/cache/f8744f31d9cf720409852d42748815c6d61f005a2a9b297b7b9bf986ed98bb90 Loaded shared weights from artifacts -Router weight sum: 12.588735 +Router weight sum: 12.588732 Gate/up sum: 1026.601807 -Down sum: 206.729279 +Down sum: 206.729263 === Yamoe Implementation === +Router weight sum: 12.588732 +Gate/up proj sum: 1026.601807 +Down proj sum: 206.729340 + +┌─ Benchmark Configuration ─────────────────────────────┐ +│ Warmup: 10 Iters: 50 │ +│ Tokens: 100 │ +│ Input Variation: Enabled (prevents caching artifacts) │ +└────────────────────────────────────────────────────────┘ + +Base Input: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.486445, 0.446746], mean=-0.000048, std=0.099986, norm=33.936142 +Input Variation: +0.001 * iteration (deterministic) + +Warming up (10 iterations)... +Benchmarking (50 iterations)... + Progress: 20% complete (avg: 4.253 ms) + Progress: 40% complete (avg: 4.250 ms) + Progress: 60% complete (avg: 4.250 ms) + Progress: 80% complete (avg: 4.251 ms) + +Output tensors: + Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791 + Auxiliary: shape=(100, 128), dtype=torch.float32, device=cuda:0, range=[0.000000, 0.302948], mean=0.007812, std=0.043553, norm=5.005893 + +━━━━━━━━━━━━━━━━━━━━ Benchmark Results ━━━━━━━━━━━━━━━━━━━━ +Iterations: 50 + +Latency Statistics: + Average: 4.251 ms + Min: 4.144 ms + Max: 4.320 ms + Std Dev: 0.029 ms + +Percentiles: + P50 (median): 4.254 ms + P95: 4.286 ms + P99: 4.306 ms + +Throughput: + Tokens/sec: 23523.6 + Std Dev: 160.3 +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +Saved benchmark results to yamoe_results.json + +Output sum: 3.971905
▶ UV Install Logs
+
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s] +Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 3.18it/s] +Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 3.84it/s] +Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 7.53it/s]
+
+

Artifacts:

+yamoe_results.json
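The auxiliary tensor reported above (shape (100, 128), mean 0.0078 ≈ 1/128) appears to be the same dense router-score matrix that the reference cells below construct explicitly: take the top-k router logits, softmax over only those k values, then scatter them back into a dense (tokens, num_experts) matrix. A minimal sketch of that routing step with toy sizes, not the benchmark's actual config:

import torch
import torch.nn.functional as F

# Toy routing sketch: top-k logits, softmax over just the kept values,
# scatter back to a dense (tokens, num_experts) score matrix.
tokens, num_experts, top_k = 4, 8, 2
logits = torch.randn(tokens, num_experts)
top_vals, top_idx = torch.topk(logits, top_k, dim=-1)
top_vals = F.softmax(top_vals, dim=1)
scores = torch.zeros_like(logits).scatter_(1, top_idx, top_vals)
assert torch.allclose(scores.sum(dim=1), torch.ones(tokens))  # each row sums to 1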
-
Traceback (most recent call last): - File "/home/runner/work/kernels-uvnotes/kernels-uvnotes/moe_benchmarks/megablocks_yamoe/.uvnote/cells/yamoe_run.py", line 115, in <module> - router_weight.to(device), - ^^^^^^^^^^^^^^^^^^^^^^^^ - File "/home/runner/work/_temp/setup-uv-cache/environments-v2/yamoe-run-07f6c9b004377cec/lib/python3.11/site-packages/torch/cuda/__init__.py", line 412, in _lazy_init - torch._C._cuda_init() -RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

Binned Implementation

This section runs the binned implementation that manually handles token gathering/scattering.

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: binned_run | deps: torch, numpy | 39.23s + | + +Raw +
+
+
+
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +141 +142 +143 +144 +145 +146 +147 +148 +149 +150 +151 +152 +153 +154 +155 +156 +157 +158 +159 +160 +161 +162 +163 +164 +165 +166 +167 +168 +169 +170 +171 +172 +173 +174 +175 +176 +177 +178 +179 +180 +181 +182 +183 +184 +185 +186 +187 +188 +
+
+
import torch
+from torch import nn
+from torch.nn import functional as F
+from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+def binned_gather(x, indices, bins, expert_capacity, top_k):
+    E, H = bins.shape[0], x.shape[1]
+    out = torch.zeros((E, expert_capacity, H), device=x.device, dtype=x.dtype)
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]
+        end = bins[e]
+        n = min(end - start, expert_capacity)
+        for i in range(n):
+            flat_pos = indices[start + i]
+            tok = flat_pos // top_k
+            out[e, i] = x[tok]
+    return out
+
+def binned_scatter(x, indices, weights, bins, expert_capacity, top_k):
+    E, C, H = x.shape
+    N = indices.shape[0] // top_k
+    out = torch.zeros((N, top_k, H), dtype=x.dtype, device=x.device)
+    for e in range(E):
+        start = 0 if e == 0 else bins[e - 1]
+        end = bins[e]
+        n = end - start
+        if n == 0:
+            continue
+        take = min(n, expert_capacity)
+        for i in range(take):
+            flat_pos = indices[start + i]
+            tok = flat_pos // top_k
+            slot = flat_pos % top_k
+            scale = weights[flat_pos] if weights is not None else 1.0
+            out[tok, slot] = x[e, i] * scale
+    return out.sum(dim=1)
+
+def sort_tokens_by_expert(router_indices, num_experts):
+    flat_indices = router_indices.flatten()
+    sorted_values, sorted_indices = torch.sort(flat_indices)
+    tokens_per_expert = torch.bincount(sorted_values, minlength=num_experts)
+    bins = torch.cumsum(tokens_per_expert, dim=0)
+    return sorted_indices, sorted_values, bins, tokens_per_expert
+
+def binned_experts_ref(
+    hidden_states,
+    router_indices,
+    routing_weights,
+    gate_up_proj,
+    gate_up_proj_bias,
+    down_proj,
+    down_proj_bias,
+    expert_capacity,
+):
+    B, S, H = hidden_states.shape
+    E, K = routing_weights.shape[1], router_indices.shape[1]
+
+    indices, _, bins, _ = sort_tokens_by_expert(router_indices, E)
+    x = binned_gather(hidden_states.view(-1, H), indices, bins, expert_capacity, K)
+
+    gate_up = torch.bmm(x, gate_up_proj) 
+    gate_up += gate_up_proj_bias[..., None, :]
+
+    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+
+    # clamp to limit
+    limit = 7.0
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
+
+    glu = gate * torch.sigmoid(gate * 1.702)
+    x = (up + 1) * glu
+    x = torch.bmm(x, down_proj) + down_proj_bias[..., None, :]
+
+    # build routing weights aligned to (token, slot)
+    flat_dense = routing_weights.view(-1, E)
+    flat_router = router_indices.view(-1, K)
+    selected = torch.gather(flat_dense, 1, flat_router).reshape(-1)
+
+    # scatter back
+    y = binned_scatter(x, indices, selected, bins, expert_capacity, K)
+
+    return y.view(B, S, H)
+
+class BinnedRouter(nn.Module):
+    def __init__(self, router_weight, router_bias):
+        super().__init__()
+        self.top_k = TOP_K
+        self.num_experts = NUM_EXPERTS
+        self.hidden_dim = HIDDEN_SIZE
+        self.weight = nn.Parameter(router_weight.clone())
+        self.bias = nn.Parameter(router_bias.clone())
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices
+
+def ceil_div(a, b):
+    return (a + b - 1) // b
+
+class BinnedMoEMLP(nn.Module):
+    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.router = BinnedRouter(router_weight, router_bias)
+        self.num_experts = NUM_EXPERTS
+        self.hidden_size = HIDDEN_SIZE
+        self.top_k = TOP_K
+
+        # Expert weights - use the loaded weights
+        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+        self.down_proj = nn.Parameter(down_proj.clone())
+        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(hidden_states)
+        batch_size = hidden_states.shape[0]
+        expert_capacity = ceil_div(batch_size * self.top_k, self.num_experts)
+
+        output = binned_experts_ref(
+            hidden_states,
+            router_indices,
+            router_scores,
+            self.gate_up_proj,
+            self.gate_up_proj_bias,
+            self.down_proj,
+            self.down_proj_bias,
+            expert_capacity,
+        )
+
+        return output, router_scores
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE)
+dtype = to_dtype(DTYPE)
+
+print("\n=== Binned Implementation ===")
+# Initialize model with loaded weights
+model = BinnedMoEMLP(
+    router_weight.to(device),
+    router_bias.to(device),
+    gate_up_proj.to(device),
+    gate_up_proj_bias.to(device),
+    down_proj.to(device),
+    down_proj_bias.to(device)
+).to(device=device)
+
+print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+print(f"Gate/up proj sum: {model.gate_up_proj.sum().item():.6f}")
+print(f"Down proj sum: {model.down_proj.sum().item():.6f}")
+
+# Generate the same input as Yamoe
+set_seed(INPUT_SEED)
+x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+# Benchmark the model with varied inputs to prevent caching artifacts
+tokens = BATCH_SIZE * SEQ_LEN
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="binned_results.json", vary_inputs=True) as bench:
+    output, stats = bench(model, x)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
+
+ +
+
+
+
+
+
Loaded shared weights from artifacts +Router weight sum: 12.588732 +Gate/up sum: 1026.601807 +Down sum: 206.729263 + +=== Binned Implementation === +Router weight sum: 12.588732 +Gate/up proj sum: 1026.601807 +Down proj sum: 206.729340 + +┌─ Benchmark Configuration ─────────────────────────────┐ +│ Warmup: 10 Iters: 50 │ +│ Tokens: 100 │ +│ Input Variation: Enabled (prevents caching artifacts) │ +└────────────────────────────────────────────────────────┘ + +Base Input: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.486445, 0.446746], mean=-0.000048, std=0.099986, norm=33.936142 +Input Variation: +0.001 * iteration (deterministic) + +Warming up (10 iterations)... +Benchmarking (50 iterations)... + Progress: 20% complete (avg: 37.503 ms) + Progress: 40% complete (avg: 37.304 ms) + Progress: 60% complete (avg: 36.964 ms) + Progress: 80% complete (avg: 36.508 ms) + +Output tensors: + Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791 + Auxiliary: shape=(100, 128), dtype=torch.float32, device=cuda:0, range=[0.000000, 0.302948], mean=0.007812, std=0.043553, norm=5.005893 + +━━━━━━━━━━━━━━━━━━━━ Benchmark Results ━━━━━━━━━━━━━━━━━━━━ +Iterations: 50 + +Latency Statistics: + Average: 36.063 ms + Min: 33.292 ms + Max: 38.406 ms + Std Dev: 1.259 ms + +Percentiles: + P50 (median): 36.215 ms + P95: 37.524 ms + P99: 38.036 ms + +Throughput: + Tokens/sec: 2772.9 + Std Dev: 98.3 +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +Saved benchmark results to binned_results.json + +Output sum: 3.971905 +
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+binned_results.json +
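The binned cell above and the GPT-OSS cells below share the same clamped, gated activation: even columns of the fused projection are the gate, odd columns are up, the gate is capped from above at 7.0, up is capped on both sides, and the output is (up + 1) * gate * sigmoid(1.702 * gate). A standalone sketch of just that activation, with toy shapes:

import torch

limit, alpha = 7.0, 1.702                          # same constants as the cells in this note
gate_up = torch.randn(5, 8)                        # (tokens, 2 * expert_dim), toy sizes

gate, up = gate_up[..., ::2], gate_up[..., 1::2]   # de-interleave gate and up columns
gate = gate.clamp(min=None, max=limit)             # cap gate from above only
up = up.clamp(min=-limit, max=limit)               # cap up symmetrically
glu = gate * torch.sigmoid(gate * alpha)           # SiLU-style gating with alpha = 1.702
out = (up + 1) * glu
print(out.shape)                                   # torch.Size([5, 4])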
+
+
+

GPT-OSS Implementation

This section runs the GPT-OSS MoE implementation with manual expert loop handling.

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: gptoss_run | deps: torch, numpy | 39.77s + | + +Raw +
+
+
+
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 +100 +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +113 +114 +115 +116 +117 +118 +119 +120 +121 +122 +123 +124 +125 +126 +127 +128 +129 +130 +131 +132 +133 +134 +135 +136 +137 +138 +139 +140 +
+
+
import torch
+from torch import nn
+from torch.nn import functional as F
+from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+class GptOssRouter(nn.Module):
+    def __init__(self, router_weight, router_bias):
+        super().__init__()
+        self.top_k = TOP_K
+        self.num_experts = NUM_EXPERTS
+        self.hidden_dim = HIDDEN_SIZE
+        self.weight = nn.Parameter(router_weight.clone())
+        self.bias = nn.Parameter(router_bias.clone())
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices
+
+class GptOssExperts(nn.Module):
+    def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.num_experts = NUM_EXPERTS
+        self.hidden_size = HIDDEN_SIZE
+        self.expert_dim = self.hidden_size
+        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+        self.down_proj = nn.Parameter(down_proj.clone())
+        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+        self.alpha = 1.702
+        self.limit = 7.0
+
+    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)
+        num_experts = routing_weights.shape[1]
+
+        if hidden_states.device.type == "cpu" or self.training:
+            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+            with torch.no_grad():
+                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+                expert_mask = expert_mask.permute(2, 1, 0)
+                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+            for expert_idx in expert_hit[:]:
+                expert_idx = expert_idx[0]
+                with torch.no_grad():
+                    _, token_idx = torch.where(expert_mask[expert_idx])
+                current_state = hidden_states[token_idx]
+                gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+                gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+                gate = gate.clamp(min=None, max=self.limit)
+                up = up.clamp(min=-self.limit, max=self.limit)
+                glu = gate * torch.sigmoid(gate * self.alpha)
+                gated_output = (up + 1) * glu
+                out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
+                weighted_output = out * routing_weights[token_idx, expert_idx, None]
+                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+            next_states = next_states.view(batch_size, -1, self.hidden_size)
+        else:
+            hidden_states = hidden_states.repeat(num_experts, 1)
+            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
+            gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
+            gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+            gate = gate.clamp(min=None, max=self.limit)
+            up = up.clamp(min=-self.limit, max=self.limit)
+            glu = gate * torch.sigmoid(gate * self.alpha)
+            next_states = torch.bmm(((up + 1) * glu), self.down_proj)
+            next_states = next_states + self.down_proj_bias[..., None, :]
+            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
+            next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
+            next_states = next_states.sum(dim=0)
+        return next_states
+
+class GptOssMoEMLP(nn.Module):
+    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.router = GptOssRouter(router_weight, router_bias)
+        self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(hidden_states)
+        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
+        return routed_out, router_scores
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE)
+dtype = to_dtype(DTYPE)
+
+print("\n=== GPT-OSS Implementation ===")
+# Initialize model with loaded weights
+model = GptOssMoEMLP(
+    router_weight.to(device),
+    router_bias.to(device),
+    gate_up_proj.to(device),
+    gate_up_proj_bias.to(device),
+    down_proj.to(device),
+    down_proj_bias.to(device)
+).to(device=device)
+
+print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
+print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")
+
+# Generate the same input as other implementations
+set_seed(INPUT_SEED)
+x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+# Benchmark the model with varied inputs to prevent caching artifacts
+tokens = BATCH_SIZE * SEQ_LEN
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="gptoss_results.json", vary_inputs=True) as bench:
+    output, stats = bench(model, x)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
+
+ +
+
+
+
+
+
Loaded shared weights from artifacts +Router weight sum: 12.588732 +Gate/up sum: 1026.601807 +Down sum: 206.729263 + +=== GPT-OSS Implementation === +Router weight sum: 12.588732 +Gate/up proj sum: 1026.601807 +Down proj sum: 206.729340 + +┌─ Benchmark Configuration ─────────────────────────────┐ +│ Warmup: 10 Iters: 50 │ +│ Tokens: 100 │ +│ Input Variation: Enabled (prevents caching artifacts) │ +└────────────────────────────────────────────────────────┘ + +Base Input: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.486445, 0.446746], mean=-0.000048, std=0.099986, norm=33.936142 +Input Variation: +0.001 * iteration (deterministic) + +Warming up (10 iterations)... +Benchmarking (50 iterations)... + Progress: 20% complete (avg: 48.905 ms) + Progress: 40% complete (avg: 48.717 ms) + Progress: 60% complete (avg: 47.570 ms) + Progress: 80% complete (avg: 46.370 ms) + +Output tensors: + Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560 + Auxiliary: shape=(100, 128), dtype=torch.float32, device=cuda:0, range=[0.000000, 0.302948], mean=0.007812, std=0.043553, norm=5.005893 + +━━━━━━━━━━━━━━━━━━━━ Benchmark Results ━━━━━━━━━━━━━━━━━━━━ +Iterations: 50 + +Latency Statistics: + Average: 45.287 ms + Min: 38.914 ms + Max: 49.844 ms + Std Dev: 3.233 ms + +Percentiles: + P50 (median): 45.422 ms + P95: 49.730 ms + P99: 49.825 ms + +Throughput: + Tokens/sec: 2208.2 + Std Dev: 161.3 +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +Saved benchmark results to gptoss_results.json + +Output sum: 11.532237 +
+
+
▶ UV Install Logs
+ +
+
+

Artifacts:

+gptoss_results.json +
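Per the saved JSON above, Yamoe averages about 4.25 ms per forward versus roughly 36.1 ms for the binned loop and 45 to 46 ms for the two GPT-OSS paths, i.e. roughly an 8x to 11x gap on this shape. A small sketch for tabulating the four *_results.json artifacts side by side; it assumes the files have been collected into one directory:

import json
from pathlib import Path

# Assumes the yamoe/binned/gptoss/gptoss_training *_results.json files sit in results_dir.
results_dir = Path(".")
for path in sorted(results_dir.glob("*_results.json")):
    data = json.loads(path.read_text())
    s = data["stats"]
    print(f"{data['implementation']:<24} avg {s['avg_ms']:7.3f} ms   "
          f"p95 {s['p95_ms']:7.3f} ms   {s['tokens_per_s']:9.1f} tok/s")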
+
+
+

GPT-OSS Implementation (Training Mode)

This section runs the GPT-OSS MoE implementation with training mode enabled to force the expert loop path.

Cell: gptoss_training_run | deps: torch, numpy | 40.24s
+import torch
+from torch import nn
+from torch.nn import functional as F
+from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+class GptOssTrainingRouter(nn.Module):
+    def __init__(self, router_weight, router_bias):
+        super().__init__()
+        self.top_k = TOP_K
+        self.num_experts = NUM_EXPERTS
+        self.hidden_dim = HIDDEN_SIZE
+        self.weight = nn.Parameter(router_weight.clone())
+        self.bias = nn.Parameter(router_bias.clone())
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices
+
+class GptOssTrainingExperts(nn.Module):
+    def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.num_experts = NUM_EXPERTS
+        self.hidden_size = HIDDEN_SIZE
+        self.expert_dim = self.hidden_size
+        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+        self.down_proj = nn.Parameter(down_proj.clone())
+        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+        self.alpha = 1.702
+        self.limit = 7.0
+
+    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)
+        num_experts = routing_weights.shape[1]
+
+        # Force training mode path (expert loop instead of batched)
+        next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+        with torch.no_grad():
+            expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+            expert_mask = expert_mask.permute(2, 1, 0)
+            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+        for expert_idx in expert_hit[:]:
+            expert_idx = expert_idx[0]
+            with torch.no_grad():
+                _, token_idx = torch.where(expert_mask[expert_idx])
+            current_state = hidden_states[token_idx]
+            gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+            # Gate and up projections are interleaved along the last dimension
+            gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+            # Clamp activations for numerical stability before gating
+            gate = gate.clamp(min=None, max=self.limit)
+            up = up.clamp(min=-self.limit, max=self.limit)
+            # Sigmoid-gated linear unit; alpha = 1.702 approximates GELU
+            glu = gate * torch.sigmoid(gate * self.alpha)
+            gated_output = (up + 1) * glu
+            out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
+            weighted_output = out * routing_weights[token_idx, expert_idx, None]
+            next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+        next_states = next_states.view(batch_size, -1, self.hidden_size)
+        return next_states
+
+class GptOssTrainingMoEMLP(nn.Module):
+    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.router = GptOssTrainingRouter(router_weight, router_bias)
+        self.experts = GptOssTrainingExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(hidden_states)
+        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
+        return routed_out, router_scores
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE)
+dtype = to_dtype(DTYPE)
+
+print("\n=== GPT-OSS Implementation (Training Mode - Expert Loop) ===")
+# Initialize model with loaded weights and force training mode
+model = GptOssTrainingMoEMLP(
+    router_weight.to(device),
+    router_bias.to(device),
+    gate_up_proj.to(device),
+    gate_up_proj_bias.to(device),
+    down_proj.to(device),
+    down_proj_bias.to(device)
+).to(device=device)
+
+# Set to training mode to force expert loop path
+model.train()
+
+print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
+print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")
+print(f"Model training mode: {model.training}")
+
+# Generate the same input as other implementations
+set_seed(INPUT_SEED)
+x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+# Benchmark the model with varied inputs to prevent caching artifacts
+tokens = BATCH_SIZE * SEQ_LEN
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="gptoss_training_results.json", vary_inputs=True) as bench:
+    output, stats = bench(model, x)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
+
Loaded shared weights from artifacts
Router weight sum: 12.588732
Gate/up sum: 1026.601807
Down sum: 206.729263

=== GPT-OSS Implementation (Training Mode - Expert Loop) ===
Router weight sum: 12.588732
Gate/up proj sum: 1026.601807
Down proj sum: 206.729340
Model training mode: True

┌─ Benchmark Configuration ─────────────────────────────┐
│ Warmup: 10  Iters: 50                                  │
│ Tokens: 100                                            │
│ Input Variation: Enabled (prevents caching artifacts)  │
└────────────────────────────────────────────────────────┘

Base Input: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.486445, 0.446746], mean=-0.000048, std=0.099986, norm=33.936142
Input Variation: +0.001 * iteration (deterministic)

Warming up (10 iterations)...
Benchmarking (50 iterations)...
  Progress: 20% complete (avg: 49.963 ms)
  Progress: 40% complete (avg: 49.344 ms)
  Progress: 60% complete (avg: 48.274 ms)
  Progress: 80% complete (avg: 47.165 ms)

Output tensors:
  Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
  Auxiliary: shape=(100, 128), dtype=torch.float32, device=cuda:0, range=[0.000000, 0.302948], mean=0.007812, std=0.043553, norm=5.005893

━━━━━━━━━━━━━━━━━━━━ Benchmark Results ━━━━━━━━━━━━━━━━━━━━
Iterations: 50

Latency Statistics:
  Average: 46.010 ms
  Min: 39.207 ms
  Max: 51.098 ms
  Std Dev: 3.259 ms

Percentiles:
  P50 (median): 46.133 ms
  P95: 50.721 ms
  P99: 51.008 ms

Throughput:
  Tokens/sec: 2173.4
  Std Dev: 158.7
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Saved benchmark results to gptoss_training_results.json

Output sum: 11.532237

MegaBlocks Implementation

This section runs the MegaBlocks MoE implementation using optimized kernels fetched from the Hugging Face Hub via kernels.get_kernel, which downloads a prebuilt binary matching the local torch/CUDA build.

Cell: megablocks_run | deps: torch, numpy, kernels | 40.58s | FAILED
+import torch
+from torch import nn
+from torch.nn import functional as F
+from kernels import get_kernel, get_local_kernel
+from bench_utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+from collections import namedtuple
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+print(f"Loading weights from: {data_dir}")
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+def build_megablocks_model(device: torch.device):
+    # Download optimized kernels from the Hugging Face hub
+    megablocks = get_kernel("kernels-community/megablocks", revision="v0.0.2")
+    model = megablocks.layers.MegaBlocksMoeMLP()
+
+    # Create attribute container for expert weights
+    model.experts = namedtuple(
+        "Experts", ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"]
+    )
+
+    # Use loaded router weights for consistency
+    model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device)
+    with torch.no_grad():
+        model.router.weight.copy_(router_weight)
+        model.router.bias.copy_(router_bias)
+
+    # Attach loaded expert weights to the experts container
+    e = model.experts
+    e.alpha = 1.702
+    e.capacity_factor = 32
+    e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device))
+    e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device))
+    e.down_proj = torch.nn.Parameter(down_proj.clone().to(device))
+    e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device))
+    e.hidden_size = HIDDEN_SIZE
+
+    # Log weight statistics for comparison
+    print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}")
+    print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}")
+    print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}")
+
+    return model
+
+# Create a wrapper to match the interface of other implementations
+class MegaBlocksMoEWrapper(nn.Module):
+    def __init__(self, megablocks_model):
+        super().__init__()
+        self.model = megablocks_model
+
+    def forward(self, hidden_states):
+        # MegaBlocks expects input in the format (batch, seq_len, hidden_dim)
+        output, dummy_routing_weights = self.model(hidden_states)
+        return output, dummy_routing_weights
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE)
+dtype = to_dtype(DTYPE)
+
+print("\n=== MegaBlocks Implementation ===")
+# Build MegaBlocks model with loaded weights
+megablocks_model = build_megablocks_model(device)
+model = MegaBlocksMoEWrapper(megablocks_model).to(device=device)
+
+# Generate the same input as other implementations
+set_seed(INPUT_SEED)
+x = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=device, dtype=dtype) * 0.1
+
+# Benchmark the model with varied inputs to prevent caching artifacts
+tokens = BATCH_SIZE * SEQ_LEN
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens, save_json="megablocks_results.json", vary_inputs=True) as bench:
+    output, stats = bench(model, x)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
+
Loading weights from: /repo/moe_benchmarks/megablocks_yamoe/.uvnote/cache/f8744f31d9cf720409852d42748815c6d61f005a2a9b297b7b9bf986ed98bb90
Loaded shared weights from artifacts
Router weight sum: 12.588732
Gate/up sum: 1026.601807
Down sum: 206.729263

=== MegaBlocks Implementation ===
[MegaBlocks] Router weight sum: 12.588732
[MegaBlocks] Gate/up projection shape: (128, 1152, 2304), sum: 1026.601807
[MegaBlocks] Down projection shape: (128, 1152, 1152), sum: 206.729340

┌─ Benchmark Configuration ─────────────────────────────┐
│ Warmup: 10  Iters: 50                                  │
│ Tokens: 100                                            │
│ Input Variation: Enabled (prevents caching artifacts)  │
└────────────────────────────────────────────────────────┘

Base Input: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.486445, 0.446746], mean=-0.000048, std=0.099986, norm=33.936142
Input Variation: +0.001 * iteration (deterministic)

Warming up (10 iterations)...
Fetching 66 files:   0%|          | 0/66 [00:00<?, ?it/s]
Fetching 66 files:   2%|▏         | 1/66 [00:00<00:24,  2.66it/s]
Fetching 66 files:  14%|█▎        | 9/66 [00:00<00:02, 20.99it/s]
Fetching 66 files:  24%|██▍       | 16/66 [00:00<00:01, 31.57it/s]
Fetching 66 files:  32%|███▏      | 21/66 [00:01<00:02, 17.74it/s]
Fetching 66 files:  53%|█████▎    | 35/66 [00:01<00:01, 29.20it/s]
Fetching 66 files:  71%|███████   | 47/66 [00:01<00:00, 40.39it/s]
Fetching 66 files:  85%|████████▍ | 56/66 [00:01<00:00, 43.01it/s]
Fetching 66 files:  97%|█████████▋| 64/66 [00:01<00:00, 47.82it/s]
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 35.14it/s]
/tmp/tmpsyirxqys/cuda_utils.c:5:10: fatal error: Python.h: No such file or directory
    5 | #include <Python.h>
      |          ^~~~~~~~~~
compilation terminated.
Traceback (most recent call last):
  File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py", line 102, in <module>
    output, stats = bench(model, x)
  File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py", line 189, in runner
    result, times_s = _bench_engine(call, warmup=warmup, iters=iters, device=device, dtype=dtype, input_gen=input_gen)
  File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py", line 96, in _bench_engine
    _ = call(input_gen())
  File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/bench_utils.py", line 177, in <lambda>
    call = lambda x: fn(x, *args[1:], **kwargs)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/repo/moe_benchmarks/megablocks_yamoe/.uvnote/cells/megablocks_run.py", line 81, in forward
    output, dummy_routing_weights = self.model(hidden_states)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 896, in forward
    output, expert_weights_out, *_ = moe_forward(
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 730, in moe_forward
    x, tokens_per_expert = forward_fn(**forward_args)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 457, in forward_once
    x = permute_and_compute(
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/layers.py", line 401, in permute_and_compute
    x = ops.binned_gather(x, indices, bins, expert_capacity, top_k)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/torch/autograd/function.py", line 576, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/stk_autocast.py", line 30, in decorate_fwd
    return fwd(*args, **kwargs)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/ops/binned_gather.py", line 26, in forward
    return kernels.binned_gather(x, indices, None, bins, bin_size, top_k)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/huggingface/hub/models--kernels-community--megablocks/snapshots/e0fb1437de3f8d7079c4da13be8cb64dc0cfcdd5/build/torch28-cxx11-cu128-x86_64-linux/megablocks/backend/kernels.py", line 419, in binned_gather
    _binned_copy[(num_experts, expert_capacity)](
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/jit.py", line 390, in <lambda>
    return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 239, in run
    benchmark()
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 228, in benchmark
    timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 228, in <dictcomp>
    timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 160, in _bench
    return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
  File "/usr/lib/python3.11/functools.py", line 1001, in __get__
    val = self.func(instance)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/autotuner.py", line 121, in do_bench
    return driver.active.get_benchmarker()
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py", line 30, in __getattr__
    return getattr(self._initialize_obj(), name)
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py", line 26, in _initialize_obj
    self._obj = self._init_fn()
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/driver.py", line 12, in _create_driver
    return active_drivers[0]()
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 715, in __init__
    self.utils = CudaUtils()  # TODO: make static
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/driver.py", line 62, in __init__
    mod = compile_module_from_src(
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py", line 88, in compile_module_from_src
    so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
  File "/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/runtime/build.py", line 51, in _build
    subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
  File "/usr/lib/python3.11/subprocess.py", line 413, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['/usr/bin/gcc', '/tmp/tmpsyirxqys/cuda_utils.c', '-O3', '-shared', '-fPIC', '-Wno-psabi', '-o', '/tmp/tmpsyirxqys/cuda_utils.cpython-311-x86_64-linux-gnu.so', '-lcuda', '-L/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/lib', '-L/usr/lib/x86_64-linux-gnu', '-I/tmp/uvnote-run-4n1mby1e/home/.cache/uv/environments-v2/megablocks-run-8802ebf6d3566120/lib/python3.11/site-packages/triton/backends/nvidia/include', '-I/tmp/tmpsyirxqys', '-I/usr/include/python3.11']' returned non-zero exit status 1.

Performance Visualization

This section reads all benchmark results and creates a comprehensive performance comparison chart.
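The plotting cell itself is not included here. A minimal sketch of what it could look like follows, assuming each *_results.json written above stores the printed latency statistics under keys like "avg_ms" and "p95_ms"; the schema emitted by bench_context(save_json=...) is not shown, so treat the key names and the output filename as assumptions.

import json
from pathlib import Path

import matplotlib.pyplot as plt

# Result files written by the cells above; missing ones (e.g. the failed
# MegaBlocks run) are skipped instead of crashing the chart.
result_files = {
    "GPT-OSS (batched)": "gptoss_results.json",
    "GPT-OSS (expert loop)": "gptoss_training_results.json",
    "MegaBlocks": "megablocks_results.json",
}

# NOTE: the key names below ("avg_ms", "p95_ms") are assumed, not taken from
# bench_utils; adjust them to whatever save_json actually emits.
stats = {}
for label, fname in result_files.items():
    path = Path(fname)
    if path.exists():
        stats[label] = json.loads(path.read_text())

labels = list(stats.keys())
avg = [stats[name].get("avg_ms", float("nan")) for name in labels]
p95 = [stats[name].get("p95_ms", float("nan")) for name in labels]

# Grouped bar chart: average vs. p95 latency per implementation
fig, ax = plt.subplots(figsize=(8, 4))
x = range(len(labels))
ax.bar([i - 0.2 for i in x], avg, width=0.4, label="avg latency (ms)")
ax.bar([i + 0.2 for i in x], p95, width=0.4, label="p95 latency (ms)")
ax.set_xticks(list(x))
ax.set_xticklabels(labels)
ax.set_ylabel("latency (ms)")
ax.set_title("MoE MLP forward latency (1x100x1152, float32)")
ax.legend()
fig.tight_layout()
fig.savefig("moe_latency_comparison.png", dpi=150)

Skipping missing files keeps the chart usable even when one of the runs, like the MegaBlocks cell above, fails.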