Cell: nv | 0.71s
import subprocess
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
Wed Sep 24 20:58:22 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A10G                    On  |   00000000:00:1B.0 Off |                    0 |
|  0%   32C    P8             27W /  300W |       0MiB /  23028MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A10G                    On  |   00000000:00:1C.0 Off |                    0 |
|  0%   32C    P8             25W /  300W |       0MiB /  23028MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   2  NVIDIA A10G                    On  |   00000000:00:1D.0 Off |                    0 |
|  0%   32C    P8             28W /  300W |       0MiB /  23028MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   3  NVIDIA A10G                    On  |   00000000:00:1E.0 Off |                    0 |
|  0%   32C    P8             27W /  300W |       0MiB /  23028MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
No Kernels
First, we run the model without any custom kernels to get a reference point.
Forward
Cell: no_kernels | 107.24s
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "accelerate>=1.10.1",
#     "torch>=2.7.0",
#     "kernels==0.10.0",
#     "transformers@https://github.com/huggingface/transformers.git",
#     "ipdb>=0.13.13",
#     "matplotlib>=3.7.2",
#     "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository, replace_kernel_forward_from_hub
import sys
import torch.profiler
import gc
import logging
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssRMSNorm

# log at INFO level
logging.basicConfig(level=logging.INFO)

def reset_peak_memory_stats():
    """Clear CUDA cache and reset memory allocation counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)

model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=False,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

max_tokens = 256

with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-09-24

Reasoning: low

# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions

What is Tensor Parallelism?

<|end|><|start|>assistant<|channel|>analysis<|message|>Need to explain tensor parallelism.<|end|><|start|>assistant<|channel|>final<|message|>**Tensor Parallelism** is a technique used to distribute the computation of a single tensor (or a set of tensors) across multiple devices (GPUs, TPUs, or other accelerators) in order to speed up large‑scale deep‑learning training and inference. It is one of several parallelism strategies—alongside data parallelism, pipeline parallelism, and model parallelism—designed to scale models that exceed the memory or compute capacity of a single device.

---

## 1. Why Tensor Parallelism?

- **Memory constraints**: Modern transformer models (e.g., GPT‑4, PaLM, LLaMA‑70B) can have billions of parameters. A single GPU may not hold the entire model or its intermediate activations.
- **Compute bottlenecks**: Even if memory were sufficient, the sheer number of floating‑point operations can make training prohibitively slow on a single device.
- **Scalability**: Tensor parallelism allows you to add more devices to reduce per‑device memory usage and accelerate matrix operations.

---

## 2. Core Idea

In tensor parallelism, a *tensor* (typically a weight matrix or an activation tensor
Generation took 25.73 seconds
Forward and Backward
Next, we'll attempt to run a forward and backward pass without any custom kernels. This will likely run out of memory since the default implementation is not optimized for memory usage.
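The source for that cell isn't reproduced in this section, only its failing output below. As a rough illustration, a memory-probing forward/backward pass along the lines of that cell might look like the following sketch, which reuses `model` and `tokenizer` from the previous cell and mirrors the log lines in the output; the prompt text and batch construction here are assumptions, not the original code.

# Hedged sketch of a forward/backward memory probe (not the exact cell).
# Assumes `model` and `tokenizer` from the previous cell are still in scope.
import torch

def memory_stats():
    # Current / peak / reserved allocation in GB on the current CUDA device.
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Hypothetical prompt; the real cell may use a much longer sequence.
batch = tokenizer("What is Tensor Parallelism?", return_tensors="pt").to("cuda")

try:
    # Forward pass with labels so the model returns a language-modeling loss.
    outputs = model(**batch, labels=batch["input_ids"])
    print(f"Post-forward memory: {memory_stats()}")
    print(f"Loss: {outputs.loss.item():.4f}")

    print("Running backward pass...")
    print(f"Pre-backward memory: {memory_stats()}")
    outputs.loss.backward()
    print(f"Post-backward memory: {memory_stats()}")
except torch.cuda.OutOfMemoryError as e:
    print(f"OOM during forward/backward pass: {e}")
    print("Try reducing max_tokens or max_seq_len")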
Cell: forward_and_backward_no_kernel | 99.38s | FAILED

Post-forward memory: {'allocated_gb': 9.487933952, 'peak_gb': …}
Loss: 1.9761
Running backward pass...
Pre-backward memory: {'allocated_gb': 9.405890048, 'peak_gb': 9.514059776, 'reserved_gb': 17.177772032}
OOM during forward/backward pass: CUDA out of memory. Tried to allocate 508.00 MiB. GPU 2 has a total capacity of 22.30 GiB of which 118.69 MiB is free. Process 25557 has 22.18 GiB memory in use. Of the allocated memory 21.52 GiB is allocated by PyTorch, and 357.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Try reducing max_tokens or max_seq_len
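The allocator hint in the error message (PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True) is worth knowing, although it is unlikely to rescue this particular run: only about 358 MiB is reserved but unallocated while the failed request is 508 MiB, so the problem is capacity rather than fragmentation. If you do want to try it, the setting has to be in place before the first CUDA allocation, for example:

import os

# Assumption: placed at the very top of the script, before anything is put on
# the GPU, so the CUDA caching allocator reads it when it initializes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # imported after the env var so the allocator picks it up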