drbh (HF Staff) committed (verified)
Commit 782c694 · 1 Parent(s): b7b6ff8

Upload folder using huggingface_hub
site/artifacts/charts/benchmark_dashboard.png ADDED
site/artifacts/charts/latency.png ADDED
site/artifacts/charts/memory.png ADDED
site/artifacts/charts/throughput.png ADDED
site/artifacts/setup/benchmark_avg_tokens_per_sec.txt ADDED
@@ -0,0 +1 @@
+ 5.301658854167735
site/artifacts/setup/benchmark_dashboard.png ADDED
site/artifacts/setup/benchmark_memory.txt ADDED
@@ -0,0 +1 @@
+ 9.398672896,9.414898176,10.334765056
site/artifacts/setup/benchmark_times.txt ADDED
@@ -0,0 +1,5 @@
+ 12.075035744113848
+ 12.0710428240709
+ 12.070115809096023
+ 12.070908240042627
+ 12.071364195086062
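Taken together, the setup artifacts are self-consistent: with the 64-token generation length that charts.py assumes (max_tokens = 64), each ~12.07 s run works out to roughly 5.3 tokens/s, matching benchmark_avg_tokens_per_sec.txt. A minimal sanity check, assuming throughput is simply tokens generated divided by wall-clock time:

# Sketch: recompute average throughput from the values above.
times = [
    12.075035744113848,
    12.0710428240709,
    12.070115809096023,
    12.070908240042627,
    12.071364195086062,
]
max_tokens = 64  # generation length assumed in charts.py
per_run = [max_tokens / t for t in times]
print(sum(per_run) / len(per_run))  # ~5.30, matching benchmark_avg_tokens_per_sec.txt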
site/cells/charts.py ADDED
@@ -0,0 +1,140 @@
+ # /// script
+ # dependencies = [
+ #   "matplotlib",
+ #   "numpy",
+ # ]
+ # ///
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import os
+
+ # Get the input path from the UVNOTE_INPUT_SETUP env var
+ setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
+ print(f"Reading benchmark data from: {setup_path}")
+
+ num_runs = 5
+ max_tokens = 64
+ times = []
+ with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
+     for line in f:
+         times.append(float(line.strip()))
+
+ # Summary statistics derived from the measured run times
+ avg_time = sum(times) / len(times)
+ min_time = min(times)
+ max_time = max(times)
+
+ with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
+     avg_tokens_per_sec = float(f.read().strip())
+
+ # benchmark_memory.txt holds one comma-separated line; assumed order: allocated, peak, reserved (GB)
+ memory_file = os.path.join(setup_path, "benchmark_memory.txt")
+ with open(memory_file, "r") as f:
+     allocated_gb, peak_gb, reserved_gb = (float(x) for x in f.read().strip().split(","))
+ final_mem = {"allocated_gb": allocated_gb, "peak_gb": peak_gb, "reserved_gb": reserved_gb}
+
+ # Minimal brutalist palette (dark theme): grayscale + 1 accent
+ ACCENT = '#5ec8f8'  # calm cyan-blue accent
+ FG = '#e6e6e6'      # light gray text/lines
+ MUTED = '#9aa0a6'   # muted gray for secondary
+ GRID = '#333333'    # grid lines
+
+ # Styling tuned for clarity, high contrast, few colors
+ plt.style.use('dark_background')
+ plt.rcParams['figure.facecolor'] = 'none'
+ plt.rcParams['axes.facecolor'] = 'none'
+ plt.rcParams['savefig.facecolor'] = 'none'
+ plt.rcParams['savefig.transparent'] = True
+ plt.rcParams['font.family'] = 'monospace'
+ plt.rcParams['font.weight'] = 'bold'
+ plt.rcParams['axes.linewidth'] = 3
+ plt.rcParams['grid.linewidth'] = 2
+ plt.rcParams['lines.linewidth'] = 3
+ plt.rcParams['patch.linewidth'] = 2
+
+ # Prepare data
+ runs = list(range(1, len(times) + 1))
+ tokens_per_sec_all = [max_tokens / t for t in times]
+
+ # Chart 1: Throughput Performance
+ fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
+ fig1.patch.set_alpha(0)
+ ax1.patch.set_alpha(0)
+
+ ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
+          markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
+ ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
+ ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
+             label=f'AVG: {avg_tokens_per_sec:.1f}')
+ ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
+ ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
+ ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
+ ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
+ ax1.tick_params(colors=FG, labelsize=12)
+ legend1 = ax1.legend(frameon=False, loc='lower right')
+ for text in legend1.get_texts():
+     text.set_color(FG)
+     text.set_fontweight('bold')
+ plt.tight_layout()
+ plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
+ plt.show()
+
+ # Chart 2: Generation Latency
+ fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
+ fig2.patch.set_alpha(0)
+ ax2.patch.set_alpha(0)
+
+ bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
+ bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
+ ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
+             label=f'AVG: {avg_time:.2f}s')
+ for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
+     ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
+              color=FG, fontweight='bold', fontsize=11)
+ ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
+ ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
+ ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
+ ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
+ ax2.tick_params(colors=FG, labelsize=12)
+ ax2.set_ylim(0, max(times) * 1.15)
+ legend2 = ax2.legend(frameon=False, loc='upper right')
+ for text in legend2.get_texts():
+     text.set_color(FG)
+     text.set_fontweight('bold')
+ plt.tight_layout()
+ plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
+ plt.show()
+
+ # Chart 3: Memory Usage
+ fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
+ fig3.patch.set_alpha(0)
+ ax3.patch.set_alpha(0)
+
+ memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
+ memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
+ colors_mem = [MUTED, ACCENT, FG]
+ bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
+ for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
+     ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
+              color=FG, fontweight='bold', fontsize=13)
+ ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
+ ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
+ ax3.set_xlim(0, max(memory_values) * 1.3)
+ ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
+ ax3.tick_params(colors=FG, labelsize=12)
+ ax3.set_yticks(range(len(memory_labels)))
+ ax3.set_yticklabels(memory_labels, fontweight='bold')
+ plt.tight_layout()
+ plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
+ plt.show()
+
+ print("\n📊 Charts saved as:")
+ print("  • throughput.png")
+ print("  • latency.png")
+ print("  • memory.png")
+ print("\nBenchmark Summary:")
+ print(f"  avg tokens/sec: {avg_tokens_per_sec:.1f}")
+ print(f"  min time: {min_time:.3f}s")
+ print(f"  max time: {max_time:.3f}s")
+ print(f"  peak memory: {final_mem['peak_gb']:.2f}GB")
site/cells/forward_and_backward.py ADDED
@@ -0,0 +1,102 @@
+ # /// script
+ # requires-python = ">=3.12"
+ # dependencies = [
+ #   "accelerate>=1.10.1",
+ #   "torch>=2.7.0",
+ #   "kernels==0.10.0",
+ #   "transformers@https://github.com/huggingface/transformers.git",
+ #   "ipdb>=0.13.13",
+ #   "matplotlib>=3.7.2",
+ #   "numpy>=1.24.3",
+ # ]
+ # ///
+
+ import torch
+ from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+ import time
+ import torch.nn as nn
+ from kernels import register_kernel_mapping, replace_kernel_forward_from_hub, Mode, LayerRepository
+ import sys
+ import torch.profiler
+ import gc
+ import logging
+ from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm
+
+ # remove liger kernel for testing
+ replace_kernel_forward_from_hub(GptOssRMSNorm, None)
+
+ # set to debug logging
+ logging.basicConfig(level=logging.INFO)
+
+ def reset_peak_memory_stats():
+     """Clear CUDA cache and reset memory allocation counters."""
+     torch.cuda.empty_cache()
+     if torch.cuda.is_available():
+         torch.cuda.reset_peak_memory_stats()
+     gc.collect()
+
+ def get_memory_stats():
+     """Get current and peak CUDA memory usage."""
+     if not torch.cuda.is_available():
+         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+     return {
+         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+     }
+
+ def override_kernel_layer_name(cls_name: str, value) -> bool:
+     """Helper to dynamically override the kernel_layer_name in a model class."""
+     for mod in sys.modules.values():
+         if mod is None:
+             continue
+         obj = getattr(mod, cls_name, None)
+         if isinstance(obj, type) and issubclass(obj, nn.Module):
+             setattr(obj, "kernel_layer_name", value)
+             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+             return True
+     return False
+
+
+ # Init the model the normal way
+ model_id = "openai/gpt-oss-20b"
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+ quantization_config = Mxfp4Config(dequantize=True)
+
+ model = GptOssForCausalLM.from_pretrained(
+     model_id,
+     dtype="bfloat16",
+     device_map="auto",
+     use_kernels=True,
+     quantization_config=quantization_config,
+     training=True,
+ ).eval()
+
+ messages = [
+     {"role": "system", "content": "What is Tensor Parallelism?"},
+ ]
+
+ inputs = tokenizer.apply_chat_template(
+     messages,
+     add_generation_prompt=True,
+     return_tensors="pt",
+     return_dict=True,
+     reasoning_effort="low",
+ ).to("cuda")
+
+ max_tokens = 512
+
+
+ # forward and backward pass
+ with torch.autograd.set_grad_enabled(True):
+     start_time = time.perf_counter()
+     generated = model.generate(
+         **inputs,
+         max_new_tokens=max_tokens,
+         do_sample=False,
+         temperature=None,
+     )
+     end_time = time.perf_counter()
+ print(tokenizer.decode(generated[0], skip_special_tokens=False))
+ print(f"Generation took {end_time - start_time:.2f} seconds")
+
site/cells/forward_only.py ADDED
@@ -0,0 +1,96 @@
+ # /// script
+ # requires-python = ">=3.12"
+ # dependencies = [
+ #   "accelerate>=1.10.1",
+ #   "torch>=2.7.0",
+ #   "kernels==0.10.0",
+ #   "transformers@https://github.com/huggingface/transformers.git",
+ #   "ipdb>=0.13.13",
+ #   "matplotlib>=3.7.2",
+ #   "numpy>=1.24.3",
+ # ]
+ # ///
+
+ import torch
+ from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+ import time
+ import torch.nn as nn
+ from kernels import register_kernel_mapping, Mode, LayerRepository
+ import sys
+ import torch.profiler
+ import gc
+ import logging
+
+ # set to debug logging
+ logging.basicConfig(level=logging.INFO)
+
+ def reset_peak_memory_stats():
+     """Clear CUDA cache and reset memory allocation counters."""
+     torch.cuda.empty_cache()
+     if torch.cuda.is_available():
+         torch.cuda.reset_peak_memory_stats()
+     gc.collect()
+
+ def get_memory_stats():
+     """Get current and peak CUDA memory usage."""
+     if not torch.cuda.is_available():
+         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+     return {
+         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+     }
+
+ def override_kernel_layer_name(cls_name: str, value) -> bool:
+     """Helper to dynamically override the kernel_layer_name in a model class."""
+     for mod in sys.modules.values():
+         if mod is None:
+             continue
+         obj = getattr(mod, cls_name, None)
+         if isinstance(obj, type) and issubclass(obj, nn.Module):
+             setattr(obj, "kernel_layer_name", value)
+             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+             return True
+     return False
+
+
+ # Init the model the normal way
+ model_id = "openai/gpt-oss-20b"
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+ quantization_config = Mxfp4Config(dequantize=True)
+
+
+ model = GptOssForCausalLM.from_pretrained(
+     model_id,
+     dtype="bfloat16",
+     device_map="auto",
+     use_kernels=True,
+     quantization_config=quantization_config,
+ ).eval()
+
+ messages = [
+     {"role": "system", "content": "What is Tensor Parallelism?"},
+ ]
+
+ inputs = tokenizer.apply_chat_template(
+     messages,
+     add_generation_prompt=True,
+     return_tensors="pt",
+     return_dict=True,
+     reasoning_effort="low",
+ ).to("cuda")
+
+ max_tokens = 512
+
+ with torch.inference_mode():
+     start_time = time.perf_counter()
+     generated = model.generate(
+         **inputs,
+         max_new_tokens=max_tokens,
+         do_sample=False,
+         temperature=None,
+     )
+     end_time = time.perf_counter()
+
+ print(tokenizer.decode(generated[0], skip_special_tokens=False))
+ print(f"Generation took {end_time - start_time:.2f} seconds")
site/cells/setup.py ADDED
@@ -0,0 +1,116 @@
+ # /// script
+ # requires-python = ">=3.12"
+ # dependencies = [
+ #   "accelerate>=1.10.1",
+ #   "torch>=2.7.0",
+ #   "kernels==0.10.0",
+ #   "transformers@https://github.com/huggingface/transformers.git",
+ #   "ipdb>=0.13.13",
+ #   "matplotlib>=3.7.2",
+ #   "numpy>=1.24.3",
+ # ]
+ # ///
+
+ import torch
+ from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+ import time
+ import torch.nn as nn
+ from kernels import register_kernel_mapping, Mode, LayerRepository
+ import sys
+ import torch.profiler
+ import gc
+ import logging
+
+ # set to debug logging
+ logging.basicConfig(level=logging.INFO)
+
+ def reset_peak_memory_stats():
+     """Clear CUDA cache and reset memory allocation counters."""
+     torch.cuda.empty_cache()
+     if torch.cuda.is_available():
+         torch.cuda.reset_peak_memory_stats()
+     gc.collect()
+
+ def get_memory_stats():
+     """Get current and peak CUDA memory usage."""
+     if not torch.cuda.is_available():
+         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+     return {
+         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+     }
+
+ def override_kernel_layer_name(cls_name: str, value) -> bool:
+     """Helper to dynamically override the kernel_layer_name in a model class."""
+     for mod in sys.modules.values():
+         if mod is None:
+             continue
+         obj = getattr(mod, cls_name, None)
+         if isinstance(obj, type) and issubclass(obj, nn.Module):
+             setattr(obj, "kernel_layer_name", value)
+             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+             return True
+     return False
+
+
+ # Init the model the normal way
+ model_id = "openai/gpt-oss-20b"
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+ quantization_config = Mxfp4Config(dequantize=True)
+
+
+ from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
+
+ from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
+
+ replace_kernel_forward_from_hub(GptOssMLP, "Yamoe")  # direct, type-safe
+ replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
+ custom_mapping = {
+     "Yamoe": {
+         "cuda": {
+             Mode.INFERENCE: LayerRepository(
+                 repo_id="drbh/yamoe",
+                 layer_name="Yamoe",
+                 revision="v0.3.0",
+             )
+         }
+     }
+ }
+ register_kernel_mapping(custom_mapping)
+
+
+ model = GptOssForCausalLM.from_pretrained(
+     model_id,
+     dtype="bfloat16",
+     device_map="auto",
+     use_kernels=True,
+     quantization_config=quantization_config,
+ ).eval()
+
+ messages = [
+     {"role": "system", "content": "What is Tensor Parallelism?"},
+ ]
+
+ inputs = tokenizer.apply_chat_template(
+     messages,
+     add_generation_prompt=True,
+     return_tensors="pt",
+     return_dict=True,
+     reasoning_effort="low",
+ ).to("cuda")
+
+ max_tokens = 512
+
+ with torch.inference_mode():
+     start_time = time.perf_counter()
+     generated = model.generate(
+         **inputs,
+         max_new_tokens=max_tokens,
+         do_sample=False,
+         temperature=None,
+     )
+     end_time = time.perf_counter()
+
+ print(tokenizer.decode(generated[0], skip_special_tokens=False))
+ print(f"Generation took {end_time - start_time:.2f} seconds")
site/cells/setup2.py ADDED
@@ -0,0 +1,115 @@
+ # /// script
+ # requires-python = ">=3.12"
+ # dependencies = [
+ #   "accelerate>=1.10.1",
+ #   "torch>=2.7.0",
+ #   "kernels==0.10.0",
+ #   "transformers@https://github.com/huggingface/transformers.git",
+ #   "ipdb>=0.13.13",
+ #   "matplotlib>=3.7.2",
+ #   "numpy>=1.24.3",
+ # ]
+ # ///
+
+ import torch
+ from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+ import time
+ import torch.nn as nn
+ from kernels import register_kernel_mapping, Mode, LayerRepository
+ import sys
+ import torch.profiler
+ import gc
+ import logging
+
+ # set to debug logging
+ logging.basicConfig(level=logging.INFO)
+
+ def reset_peak_memory_stats():
+     """Clear CUDA cache and reset memory allocation counters."""
+     torch.cuda.empty_cache()
+     if torch.cuda.is_available():
+         torch.cuda.reset_peak_memory_stats()
+     gc.collect()
+
+ def get_memory_stats():
+     """Get current and peak CUDA memory usage."""
+     if not torch.cuda.is_available():
+         return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
+     return {
+         "allocated_gb": torch.cuda.memory_allocated() / 1e9,
+         "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
+         "reserved_gb": torch.cuda.memory_reserved() / 1e9,
+     }
+
+ def override_kernel_layer_name(cls_name: str, value) -> bool:
+     """Helper to dynamically override the kernel_layer_name in a model class."""
+     for mod in sys.modules.values():
+         if mod is None:
+             continue
+         obj = getattr(mod, cls_name, None)
+         if isinstance(obj, type) and issubclass(obj, nn.Module):
+             setattr(obj, "kernel_layer_name", value)
+             print(f"Overrode {cls_name}.kernel_layer_name to {value}")
+             return True
+     return False
+
+
+ # Init the model the normal way
+ model_id = "openai/gpt-oss-20b"
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
+ quantization_config = Mxfp4Config(dequantize=True)
+
+
+ from kernels import replace_kernel_forward_from_hub, register_kernel_mapping, LayerRepository, Mode
+
+ from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP, GptOssRMSNorm
+
+ replace_kernel_forward_from_hub(GptOssRMSNorm, None)  # direct, type-safe
+ custom_mapping = {
+     "Yamoe": {
+         "cuda": {
+             Mode.INFERENCE: LayerRepository(
+                 repo_id="drbh/yamoe",
+                 layer_name="Yamoe",
+                 revision="v0.3.0",
+             )
+         }
+     }
+ }
+ register_kernel_mapping(custom_mapping)
+
+
+ model = GptOssForCausalLM.from_pretrained(
+     model_id,
+     dtype="bfloat16",
+     device_map="auto",
+     use_kernels=True,
+     quantization_config=quantization_config,
+ ).eval()
+
+ messages = [
+     {"role": "system", "content": "What is Tensor Parallelism?"},
+ ]
+
+ inputs = tokenizer.apply_chat_template(
+     messages,
+     add_generation_prompt=True,
+     return_tensors="pt",
+     return_dict=True,
+     reasoning_effort="low",
+ ).to("cuda")
+
+ max_tokens = 512
+
+ with torch.inference_mode():
+     start_time = time.perf_counter()
+     generated = model.generate(
+         **inputs,
+         max_new_tokens=max_tokens,
+         do_sample=False,
+         temperature=None,
+     )
+     end_time = time.perf_counter()
+
+ print(tokenizer.decode(generated[0], skip_special_tokens=False))
+ print(f"Generation took {end_time - start_time:.2f} seconds")
site/megablocks_only.html ADDED
The diff for this file is too large to render. See raw diff
 
site/note.html ADDED
The diff for this file is too large to render. See raw diff
 
site/note_test_override.html ADDED
The diff for this file is too large to render. See raw diff