Cell: setup | 304.89s

▼ code
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "accelerate>=1.10.1",
#     "torch>=2.7.0",
#     "kernels==0.10.0",
#     "transformers@https://github.com/huggingface/transformers.git",
#     "ipdb>=0.13.13",
#     "matplotlib>=3.7.2",
#     "numpy>=1.24.3",
# ]
# ///

import gc
import sys
import time

import torch
import torch.nn as nn
import torch.profiler
from kernels import register_kernel_mapping, Mode, LayerRepository
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config


def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()


def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }


def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


def run_generation(model, inputs, max_tokens=64):
    """Run a single greedy generation pass and measure its duration."""
    with torch.inference_mode():
        start_time = time.perf_counter()
        generated = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            temperature=None,
        )
        end_time = time.perf_counter()
    return generated, end_time - start_time


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)

# Now we want to add some custom kernel mapping
custom_mapping = dict(
    Yamoe=dict(
        cuda={
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            ),
        },
    )
)
# First add the mapping
register_kernel_mapping(custom_mapping)
# Then override the layer name in the model class
override_kernel_layer_name("GptOssMLP", "Yamoe")
# TODO: remove this line once RMSNorm is working
override_kernel_layer_name("GptOssRMSNorm", None)

## Normal model stuff
model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

print("\n=== Running Benchmarks ===")
print(f"Model: {model_id}")
print(f"Device: {torch.cuda.get_device_name()}")
print(f"Initial memory: {get_memory_stats()}\n")

# Warmup
print("Running warmup...")
for _ in range(2):
    _ = run_generation(model, inputs, max_tokens=16)

reset_peak_memory_stats()

# Benchmark runs
num_runs = 5
max_tokens = 64
times = []

print(f"\nRunning {num_runs} benchmark iterations with {max_tokens} tokens...")
for i in range(num_runs):
    reset_peak_memory_stats()
    generated, elapsed = run_generation(model, inputs, max_tokens)
    times.append(elapsed)
    mem_stats = get_memory_stats()
    tokens_per_sec = max_tokens / elapsed
    print(f"Run {i+1}: {elapsed:.3f}s ({tokens_per_sec:.1f} tok/s) | Peak: {mem_stats['peak_gb']:.2f}GB")

# Statistics
avg_time = sum(times) / len(times)
min_time = min(times)
max_time = max(times)
avg_tokens_per_sec = max_tokens / avg_time

print("\n=== Benchmark Results ===")
print(f"Average: {avg_time:.3f}s ({avg_tokens_per_sec:.1f} tok/s)")
print(f"Min: {min_time:.3f}s | Max: {max_time:.3f}s")

# Final memory stats
final_mem = get_memory_stats()
print("\nFinal Memory:")
print(f"  Allocated: {final_mem['allocated_gb']:.2f}GB")
print(f"  Peak: {final_mem['peak_gb']:.2f}GB")
print(f"  Reserved: {final_mem['reserved_gb']:.2f}GB")

print("\n=== Running with Profiler ===")
reset_peak_memory_stats()

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    generated, elapsed = run_generation(model, inputs, max_tokens=64)
    print(f"Generation time: {elapsed:.2f} seconds")

# Print profiler results
print("\n=== Top 10 CUDA operations by time ===")
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

print("\n=== Top 10 operations by memory ===")
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))

# Memory stats
mem_stats = get_memory_stats()
print(f"\nPeak Memory: {mem_stats['peak_gb']:.2f}GB")

# Save trace if needed
prof.export_chrome_trace("trace.json")
print("\nProfile trace saved to trace.json")

# Decode and print output
print("\nGenerated text:")
print(tokenizer.decode(generated[0][inputs["input_ids"].shape[-1]:]))

# Save times and memory stats for charting
with open("benchmark_times.txt", "w") as f:
    for t in times:
        f.write(f"{t}\n")
with open("benchmark_memory.txt", "w") as f:
    f.write(f"{final_mem['allocated_gb']},{final_mem['peak_gb']},{final_mem['reserved_gb']}\n")
# Save avg_tokens_per_sec for charting
with open("benchmark_avg_tokens_per_sec.txt", "w") as f:
    f.write(f"{avg_tokens_per_sec}\n")
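Note that the script header pins matplotlib and numpy but the cell itself never plots; the three benchmark_*.txt files are written "for charting" in a later step. A minimal sketch of such a chart, assuming only the one-float-per-line format of benchmark_times.txt written above (the output filename and styling are illustrative, not part of the cell):

import matplotlib

matplotlib.use("Agg")  # headless rendering; no display attached to the runner
import matplotlib.pyplot as plt

# One elapsed time (seconds) per line, as written by the benchmark cell.
with open("benchmark_times.txt") as f:
    times = [float(line) for line in f if line.strip()]

runs = range(1, len(times) + 1)
fig, ax = plt.subplots(figsize=(6, 3))
ax.bar(runs, times, color="steelblue")
ax.axhline(sum(times) / len(times), color="firebrick", linestyle="--", label="mean")
ax.set_xlabel("Run")
ax.set_ylabel("Latency (s)")
ax.set_title("gpt-oss-20b generation latency (64 new tokens)")
ax.legend()
fig.tight_layout()
fig.savefig("benchmark_times.png", dpi=150)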
▼ output
Overrode GptOssMLP.kernel_layer_name to Yamoe
Overrode GptOssRMSNorm.kernel_layer_name to None
=== Running Benchmarks ===
Model: openai/gpt-oss-20b
Device: NVIDIA L4
Initial memory: {'allocated_gb': 9.390148608, 'peak_gb': 15.5643264, 'reserved_gb': 17.177772032}
Running warmup...
Running 5 benchmark iterations with 64 tokens...
Run 1: 12.075s (5.3 tok/s) | Peak: 9.41GB
Run 2: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 3: 12.070s (5.3 tok/s) | Peak: 9.41GB
Run 4: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 5: 12.071s (5.3 tok/s) | Peak: 9.41GB
=== Benchmark Results ===
Average: 12.072s (5.3 tok/s)
Min: 12.070s | Max: 12.075s
Final Memory:
Allocated: 9.40GB
Peak: 9.41GB
Reserved: 10.33GB
=== Running with Profiler ===
Generation time: 12.73 seconds
=== Top 10 CUDA operations by time ===
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
_yamoe_74a2acb_dirty::experts 1.40% 148.156ms 66.87% 7.074s 4.606ms 52.388ms 0.46% 10.583s 6.890ms 0 B -2.98 KB 18.88 MB -2.11 GB 1536
aten::bmm 1.25% 132.560ms 1.75% 185.015ms 29.803us 10.486s 91.79% 10.486s 1.689ms 0 B 0 B 63.12 MB 63.12 MB 6208
void cutlass::Kernel2<cutlass_80_wmma_tensorop_bf16_... 0.00% 0.000us 0.00% 0.000us 0.000us 10.319s 90.32% 10.319s 3.412ms 0 B 0 B 0 B 0 B 3024
aten::linear 0.54% 57.566ms 3.78% 399.802ms 51.627us 0.000us 0.00% 645.165ms 83.312us 0 B 0 B 76.88 MB 0 B 7744
aten::addmm 1.81% 191.354ms 2.57% 272.095ms 35.429us 352.039ms 3.08% 352.151ms 45.853us 0 B 0 B 52.31 MB 52.31 MB 7680
std::enable_if<!(false), void>::type internal::gemvx... 0.00% 0.000us 0.00% 0.000us 0.000us 344.917ms 3.02% 344.917ms 74.982us 0 B 0 B 0 B 0 B 4600
aten::matmul 0.31% 32.441ms 1.72% 181.712ms 56.785us 0.000us 0.00% 303.821ms 94.944us 0 B 0 B 87.68 MB 0 B 3200
std::enable_if<!(false), void>::type internal::gemvx... 0.00% 0.000us 0.00% 0.000us 0.000us 293.850ms 2.57% 293.850ms 97.173us 0 B 0 B 0 B 0 B 3024
aten::mm 0.01% 1.506ms 0.02% 2.161ms 33.768us 293.014ms 2.56% 293.014ms 4.578ms 0 B 0 B 24.56 MB 24.56 MB 64
ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn 0.00% 0.000us 0.00% 0.000us 0.000us 102.278ms 0.90% 102.278ms 4.262ms 0 B 0 B 0 B 0 B 24
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.579s
Self CUDA time total: 11.424s
=== Top 10 operations by memory ===
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
aten::empty 0.68% 72.026ms 0.68% 72.026ms 4.244us 0.000us 0.00% 0.000us 0.000us 296 B 296 B 3.49 GB 3.49 GB 16973
aten::clamp 0.46% 48.185ms 0.69% 72.630ms 15.762us 10.269ms 0.09% 10.269ms 2.229us 0 B 0 B 616.69 MB 616.69 MB 4608
aten::mul 1.76% 186.048ms 2.93% 310.383ms 14.181us 47.780ms 0.42% 47.792ms 2.184us 784 B 784 B 554.93 MB 554.93 MB 21888
aten::cat 0.78% 82.030ms 1.22% 129.113ms 16.536us 17.028ms 0.15% 17.030ms 2.181us 0 B 0 B 387.88 MB 387.88 MB 7808
aten::sigmoid 0.09% 9.855ms 0.16% 16.652ms 10.841us 2.889ms 0.03% 2.889ms 1.881us 0 B 0 B 307.97 MB 307.97 MB 1536
aten::empty_strided 1.08% 114.498ms 1.10% 116.720ms 5.564us 0.000us 0.00% 0.000us 0.000us 0 B 0 B 216.60 MB 216.60 MB 20979
aten::add 0.93% 97.861ms 1.56% 164.673ms 15.047us 16.394ms 0.14% 16.395ms 1.498us 0 B 0 B 91.03 MB 91.03 MB 10944
aten::pow 0.36% 38.271ms 0.55% 58.020ms 18.501us 4.117ms 0.04% 4.117ms 1.313us 0 B 0 B 75.58 MB 75.58 MB 3136
aten::bmm 1.25% 132.560ms 1.75% 185.015ms 29.803us 10.486s 91.79% 10.486s 1.689ms 0 B 0 B 63.12 MB 63.12 MB 6208
aten::sub 0.51% 53.869ms 0.82% 87.218ms 13.626us 9.277ms 0.08% 9.355ms 1.461us 0 B 0 B 53.04 MB 53.01 MB 6401
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.579s
Self CUDA time total: 11.424s
Peak Memory: 9.41GB
Profile trace saved to trace.json
Generated text:
<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are
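Reading the profile above: nearly all of the device time sits in aten::bmm under the _yamoe_74a2acb_dirty::experts op (10.486s self CUDA of 11.424s total, 91.79%), so the batched expert matmuls dominate end-to-end latency. If you want to slice the profile programmatically instead of eyeballing the tables, prof.key_averages() is iterable; a small sketch, assuming the prof object from the cell above (attribute names per torch's FunctionEventAvg; self CUDA times are reported in microseconds):

# Rank the averaged profiler events by self CUDA time.
events = prof.key_averages()
top = sorted(events, key=lambda e: e.self_cuda_time_total, reverse=True)[:5]
for e in top:
    # self_cuda_time_total is in microseconds; convert to seconds for printing
    print(f"{e.key[:60]:60s} {e.self_cuda_time_total / 1e6:8.3f}s  x{e.count}")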
▶ UV Install Logs
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
Fetching 3 files: 33%|███▎ | 1/3 [00:11<00:23, 11.59s/it]
Fetching 3 files: 67%|██████▋ | 2/3 [00:16<00:07, 7.73s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:16<00:00, 5.54s/it]
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards: 33%|███▎ | 1/3 [00:03<00:06, 3.23s/it]
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:06<00:03, 3.15s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.50s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.68s/it]
Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:00, 5.23it/s]
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 6.19it/s]
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 12.15it/s]
/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
warnings.warn(
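The "No kernel mapping found for layer `None`" warning is expected: it comes from the override_kernel_layer_name("GptOssRMSNorm", None) workaround in the cell, which deliberately leaves RMSNorm on the stock forward path. Once a working RMSNorm kernel exists, the TODO line would be replaced by a real mapping along these lines; this is a hypothetical sketch reusing the cell's own API calls, and repo_id, layer_name, and revision are placeholders, not a published kernel:

# Hypothetical: extend custom_mapping once an RMSNorm kernel is available.
custom_mapping["YamoeRMSNorm"] = dict(
    cuda={
        Mode.INFERENCE: LayerRepository(
            repo_id="drbh/yamoe",       # placeholder repo
            layer_name="YamoeRMSNorm",  # placeholder layer name
            revision="v0.3.0",          # placeholder revision
        ),
    },
)
register_kernel_mapping(custom_mapping)
# Replaces the override_kernel_layer_name("GptOssRMSNorm", None) workaround
override_kernel_layer_name("GptOssRMSNorm", "YamoeRMSNorm")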