PyTorch Native - Rotary Position Embeddings

GPU Info

▼ code ▼ output ▶ uv-logs | Cell: nv | 0.22s | Raw GitHub
import subprocess
# Run nvidia-smi once, capturing its stdout as text, and echo it so the
# GPU/driver state is recorded alongside the benchmark output.
smi_result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
print(smi_result.stdout)
Mon Nov 10 21:57:39 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
| N/A   26C    P0             88W /  350W |       0MiB /  46068MiB |     22%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

Rotary Embeddings Benchmark (PyTorch Native)

▼ code ▼ output ▶ uv-logs | Cell: benchmark | 38.43s | Raw GitHub
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark


def apply_rotary_torch(x1, x2, cos, sin, conj=False):
    """Rotate the pair ``(x1, x2)`` using the supplied ``cos``/``sin`` factors.

    Args:
        x1: First component of each pair to rotate.
        x2: Second component of each pair to rotate.
        cos: Cosine factors multiplied element-wise with the inputs.
        sin: Sine factors multiplied element-wise with the inputs.
        conj: When true, the sign of both ``sin`` terms is flipped, i.e. the
            rotation is applied in the opposite direction.

    Returns:
        A tuple ``(out1, out2)``:

        * ``conj=False``: ``(x1*cos - x2*sin, x1*sin + x2*cos)``
        * ``conj=True``:  ``(x1*cos + x2*sin, -x1*sin + x2*cos)``
    """
    if conj:
        # Conjugate form: same cosine terms, sine terms with opposite sign.
        return x1 * cos + x2 * sin, -x1 * sin + x2 * cos

    return x1 * cos - x2 * sin, x1 * sin + x2 * cos


def torch_rotary(query, key, cos, sin, conj=False):
    """Apply the rotary transform to ``query`` and ``key`` out of place.

    The rotary width is taken from the last dimension of ``cos``. Along the
    last axis of each input, the slice ``[:rotary_dim]`` is paired with the
    slice ``[rotary_dim : 2 * rotary_dim]`` and the pair is rotated via
    ``apply_rotary_torch``. Any trailing features beyond ``2 * rotary_dim``
    are carried over unchanged from the cloned input.

    Args:
        query: Query tensor; left unmodified (a clone is rotated).
        key: Key tensor; left unmodified (a clone is rotated).
        cos: Cosine factors; ``cos.shape[-1]`` defines the rotary width.
        sin: Sine factors, used together with ``cos``.
        conj: Forwarded to ``apply_rotary_torch`` to select the conjugate
            rotation.

    Returns:
        ``(q_out, k_out)`` — rotated copies of ``query`` and ``key``.
    """
    width = cos.shape[-1]
    lower = slice(None, width)
    upper = slice(width, 2 * width)

    # Work on copies so the caller's tensors are never written to.
    outputs = (query.clone(), key.clone())

    for rotated in outputs:
        # Both halves are computed before either slice is overwritten, so the
        # second half is always derived from the un-rotated first half.
        new_lower, new_upper = apply_rotary_torch(
            rotated[..., lower], rotated[..., upper], cos, sin, conj
        )
        rotated[..., lower] = new_lower
        rotated[..., upper] = new_upper

    q_out, k_out = outputs
    return q_out, k_out


# Hand the eager PyTorch implementation to the shared benchmark harness.
# NOTE(review): run_benchmark is imported from kernels_benchmark_tools; it
# presumably builds the rotary workloads and times/profiles impl_func on each
# of them — confirm against the harness before relying on this.
run_benchmark(
    kernel_type=KernelTypeEnum.ROTARY,
    impl_name="torch_eager",  # name this implementation is registered under
    impl_tags={"family": "pytorch", "backend": "eager"},
    impl_func=torch_rotary,  # callable taking (query, key, cos, sin, conj=False)
)
Running rotary benchmark on cuda with 24 workloads.

======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.315ms      1474.39%       1.315ms       1.315ms             1  
                                            torch_eager         7.00%     401.548us        82.40%       4.729ms       4.729ms       0.000us         0.00%      90.432us      90.432us             1  
                                              aten::mul         3.25%     186.430us         5.35%     307.044us      12.793us      46.943us        52.62%      46.943us       1.956us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.943us        52.62%      46.943us       1.956us            24  
                                            aten::copy_         2.48%     142.261us        48.48%       2.782ms     154.576us      29.122us        32.64%      30.338us       1.685us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.433us        25.14%      22.433us       1.869us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.151us        14.74%      13.151us       1.096us            12  
                                            aten::clone         0.88%      50.441us        59.65%       3.423ms     570.575us       0.000us         0.00%       7.905us       1.318us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.689us         7.50%       6.689us       1.115us             6  
                                              aten::sub         0.82%      47.350us         1.28%      73.411us      12.235us       6.591us         7.39%       6.591us       1.098us             6  
                                              aten::add         0.64%      36.811us         1.04%      59.601us       9.934us       6.560us         7.35%       6.560us       1.093us             6  
                                Activity Buffer Request        39.92%       2.291ms        39.92%       2.291ms       2.291ms       1.216us         1.36%       1.216us       1.216us             1  
                                    aten::empty_strided        16.52%     948.386us        16.52%     948.386us     158.064us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         1.38%      78.980us         1.38%      78.980us      13.163us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.46%      83.925us         1.86%     106.703us       4.446us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.40%      22.778us         0.40%      22.778us       0.949us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.66%     439.430us         7.66%     439.430us       9.155us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize        17.60%       1.010ms        17.60%       1.010ms       1.010ms       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 5.740ms
Self CUDA time total: 89.216us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     967.576us      1072.55%     967.576us     967.576us             1  
                                            torch_eager        10.80%     301.919us        99.80%       2.790ms       2.790ms       0.000us         0.00%      91.365us      91.365us             1  
                                              aten::mul         5.82%     162.824us         9.87%     275.997us      11.500us      47.523us        52.68%      47.523us       1.980us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.523us        52.68%      47.523us       1.980us            24  
                                            aten::copy_         4.18%     116.751us        70.01%       1.957ms     108.723us      29.282us        32.46%      30.434us       1.691us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.497us        24.94%      22.497us       1.875us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.408us        14.86%      13.408us       1.117us            12  
                                            aten::clone         0.79%      22.172us        66.92%       1.871ms     311.782us       0.000us         0.00%       7.937us       1.323us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.785us         7.52%       6.785us       1.131us             6  
                                              aten::add         1.23%      34.361us         2.02%      56.562us       9.427us       6.720us         7.45%       6.720us       1.120us             6  
                                              aten::sub         1.36%      38.010us         2.19%      61.310us      10.218us       6.688us         7.41%       6.688us       1.115us             6  
                                Activity Buffer Request        61.66%       1.724ms        61.66%       1.724ms       1.724ms       1.152us         1.28%       1.152us       1.152us             1  
                                    aten::empty_strided         1.16%      32.541us         1.16%      32.541us       5.424us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.01%      56.260us         2.01%      56.260us       9.377us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.34%      65.363us         2.94%      82.214us       3.426us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.60%      16.851us         0.60%      16.851us       0.702us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.84%     219.114us         7.84%     219.114us       4.565us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.20%       5.580us         0.20%       5.580us       5.580us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.795ms
Self CUDA time total: 90.213us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     927.639us       987.31%     927.639us     927.639us             1  
                                            torch_eager        10.07%     282.335us        99.80%       2.798ms       2.798ms       0.000us         0.00%      95.268us      95.268us             1  
                                              aten::mul         5.75%     161.290us         9.68%     271.373us      11.307us      48.769us        51.91%      48.769us       2.032us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.769us        51.91%      48.769us       2.032us            24  
                                            aten::copy_         3.66%     102.626us        71.21%       1.996ms     110.912us      30.720us        32.70%      32.032us       1.780us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.39%      22.912us       1.909us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.467us        15.40%      14.467us       1.206us            12  
                                            aten::clone         0.79%      22.060us        68.41%       1.918ms     319.628us       0.000us         0.00%       9.120us       1.520us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         8.31%       7.808us       1.301us             6  
                                              aten::sub         1.36%      38.040us         2.18%      61.002us      10.167us       7.265us         7.73%       7.265us       1.211us             6  
                                              aten::add         1.15%      32.220us         1.90%      53.280us       8.880us       7.202us         7.67%       7.202us       1.200us             6  
                                Activity Buffer Request        63.51%       1.780ms        63.51%       1.780ms       1.780ms       1.312us         1.40%       1.312us       1.312us             1  
                                    aten::empty_strided         1.12%      31.490us         1.12%      31.490us       5.248us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         1.87%      52.452us         1.87%      52.452us       8.742us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.25%      63.104us         2.86%      80.042us       3.335us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.60%      16.938us         0.60%      16.938us       0.706us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.67%     215.090us         7.67%     215.090us       4.481us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.20%       5.470us         0.20%       5.470us       5.470us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.803ms
Self CUDA time total: 93.956us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     918.847us       904.69%     918.847us     918.847us             1  
                                            torch_eager        11.08%     278.185us        99.79%       2.506ms       2.506ms       0.000us         0.00%     102.877us     102.877us             1  
                                              aten::mul         6.15%     154.372us        10.54%     264.762us      11.032us      52.638us        51.83%      52.638us       2.193us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.638us        51.83%      52.638us       2.193us            24  
                                            aten::copy_         4.16%     104.580us        68.26%       1.714ms      95.219us      32.416us        31.92%      33.728us       1.874us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.641us        24.26%      24.641us       2.053us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.511us        16.26%      16.511us       1.376us            12  
                                            aten::clone         0.84%      21.090us        65.15%       1.636ms     272.671us       0.000us         0.00%       9.087us       1.514us             6  
                                              aten::sub         1.51%      38.031us         2.44%      61.190us      10.198us       8.288us         8.16%       8.288us       1.381us             6  
                                              aten::add         1.29%      32.470us         2.19%      54.880us       9.147us       8.223us         8.10%       8.223us       1.371us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.775us         7.66%       7.775us       1.296us             6  
                                Activity Buffer Request        52.27%       1.312ms        52.27%       1.312ms       1.312ms       1.312us         1.29%       1.312us       1.312us             1  
                                    aten::empty_strided         1.29%      32.302us         1.29%      32.302us       5.384us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.44%     236.943us         9.44%     236.943us      39.491us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.53%      63.496us         3.16%      79.393us       3.308us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.63%      15.897us         0.63%      15.897us       0.662us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.60%     215.892us         8.60%     215.892us       4.498us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.21%       5.340us         0.21%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.511ms
Self CUDA time total: 101.565us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     991.709us      1060.94%     991.709us     991.709us             1  
                                            torch_eager        10.56%     336.649us        99.82%       3.183ms       3.183ms       0.000us         0.00%      94.755us      94.755us             1  
                                              aten::mul         5.20%     165.794us         8.73%     278.295us      11.596us      48.674us        52.07%      48.674us       2.028us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.674us        52.07%      48.674us       2.028us            24  
                                            aten::copy_         3.76%     119.863us        72.07%       2.298ms     127.674us      30.622us        32.76%      31.902us       1.772us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.878us        24.47%      22.878us       1.907us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.179us        15.17%      14.179us       1.182us            12  
                                            aten::clone         0.88%      28.161us        69.55%       2.218ms     369.616us       0.000us         0.00%       9.024us       1.504us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         8.28%       7.744us       1.291us             6  
                                              aten::sub         1.28%      40.920us         2.05%      65.511us      10.918us       7.138us         7.64%       7.138us       1.190us             6  
                                              aten::add         1.05%      33.330us         1.81%      57.620us       9.603us       7.041us         7.53%       7.041us       1.173us             6  
                                Activity Buffer Request        55.60%       1.773ms        55.60%       1.773ms       1.773ms       1.280us         1.37%       1.280us       1.280us             1  
                                    aten::empty_strided         1.06%      33.640us         1.06%      33.640us       5.607us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        10.74%     342.585us        10.74%     342.585us      57.097us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.14%      68.349us         2.66%      84.959us       3.540us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.52%      16.610us         0.52%      16.610us       0.692us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.03%     224.072us         7.03%     224.072us       4.668us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       5.590us         0.18%       5.590us       5.590us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 3.189ms
Self CUDA time total: 93.475us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     941.177us       926.36%     941.177us     941.177us             1  
                                            torch_eager         9.56%     295.804us        99.83%       3.088ms       3.088ms       0.000us         0.00%     102.911us     102.911us             1  
                                              aten::mul         5.03%     155.643us         8.60%     265.986us      11.083us      52.802us        51.97%      52.802us       2.200us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.802us        51.97%      52.802us       2.200us            24  
                                            aten::copy_         3.66%     113.330us        73.34%       2.269ms     126.052us      32.447us        31.94%      33.759us       1.876us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.703us        24.31%      24.703us       2.059us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.350us        16.09%      16.350us       1.363us            12  
                                            aten::clone         0.71%      21.820us        70.53%       2.182ms     363.694us       0.000us         0.00%       9.056us       1.509us             6  
                                              aten::sub         1.30%      40.120us         2.07%      63.950us      10.658us       8.223us         8.09%       8.223us       1.370us             6  
                                              aten::add         1.17%      36.201us         1.90%      58.931us       9.822us       8.127us         8.00%       8.127us       1.355us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.62%       7.744us       1.291us             6  
                                Activity Buffer Request        57.23%       1.771ms        57.23%       1.771ms       1.771ms       1.312us         1.29%       1.312us       1.312us             1  
                                    aten::empty_strided         0.98%      30.371us         0.98%      30.371us       5.062us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        10.40%     321.885us        10.40%     321.885us      53.647us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.12%      65.592us         2.67%      82.622us       3.443us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.55%      17.030us         0.55%      17.030us       0.710us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.11%     219.985us         7.11%     219.985us       4.583us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.17%       5.340us         0.17%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 3.094ms
Self CUDA time total: 101.599us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.963us       782.64%     943.963us     943.963us             1  
                                            torch_eager         9.85%     301.136us        99.82%       3.051ms       3.051ms       0.000us         0.00%     122.468us     122.468us             1  
                                              aten::mul         5.14%     157.189us         8.67%     264.988us      11.041us      61.985us        51.39%      61.985us       2.583us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.985us        51.39%      61.985us       2.583us            24  
                                            aten::copy_         3.53%     107.981us        72.58%       2.218ms     123.247us      39.362us        32.64%      41.218us       2.290us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.802us        23.88%      28.802us       2.400us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.265us        15.97%      19.265us       1.605us            12  
                                            aten::clone         0.97%      29.629us        70.14%       2.144ms     357.356us       0.000us         0.00%      12.416us       2.069us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.560us         8.76%      10.560us       1.760us             6  
                                              aten::add         1.14%      34.930us         1.90%      58.161us       9.693us       9.633us         7.99%       9.633us       1.606us             6  
                                              aten::sub         1.25%      38.210us         2.05%      62.510us      10.418us       9.632us         7.99%       9.632us       1.605us             6  
                                Activity Buffer Request        57.00%       1.742ms        57.00%       1.742ms       1.742ms       1.856us         1.54%       1.856us       1.856us             1  
                                    aten::empty_strided         1.01%      31.021us         1.01%      31.021us       5.170us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        10.03%     306.454us        10.03%     306.454us      51.076us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.23%      68.242us         2.79%      85.430us       3.560us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.56%      17.188us         0.56%      17.188us       0.716us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.10%     217.131us         7.10%     217.131us       4.524us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       5.390us         0.18%       5.390us       5.390us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 3.057ms
Self CUDA time total: 120.612us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     928.245us       538.18%     928.245us     928.245us             1  
                                            torch_eager        19.14%     292.425us        99.66%       1.523ms       1.523ms       0.000us         0.00%     175.325us     175.325us             1  
                                              aten::mul        10.16%     155.270us        17.20%     262.742us      10.948us      89.630us        51.97%      89.630us       3.735us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.630us        51.97%      89.630us       3.735us            24  
                                            aten::copy_         6.82%     104.170us        46.76%     714.441us      39.691us      57.920us        33.58%      60.768us       3.376us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.768us        23.64%      40.768us       3.397us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.927us        14.45%      24.927us       2.077us            12  
                                            aten::clone         1.34%      20.471us        41.24%     630.180us     105.030us       0.000us         0.00%      20.000us       3.333us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.152us         9.94%      17.152us       2.859us             6  
                                              aten::sub         2.56%      39.072us         4.07%      62.112us      10.352us      12.480us         7.24%      12.480us       2.080us             6  
                                              aten::add         2.20%      33.610us         3.65%      55.810us       9.302us      12.447us         7.22%      12.447us       2.075us             6  
                                Activity Buffer Request        16.69%     254.944us        16.69%     254.944us     254.944us       2.848us         1.65%       2.848us       2.848us             1  
                                    aten::empty_strided         2.04%      31.181us         2.04%      31.181us       5.197us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        19.06%     291.294us        19.06%     291.294us      48.549us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.37%      66.700us         5.47%      83.522us       3.480us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.10%      16.822us         1.10%      16.822us       0.701us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.19%     216.745us        14.19%     216.745us       4.516us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.34%       5.240us         0.34%       5.240us       5.240us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.528ms
Self CUDA time total: 172.477us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     923.899us       767.46%     923.899us     923.899us             1  
                                            torch_eager        19.14%     287.798us        99.65%       1.499ms       1.499ms       0.000us         0.00%     122.144us     122.144us             1  
                                              aten::mul        10.49%     157.698us        17.70%     266.255us      11.094us      61.982us        51.49%      61.982us       2.583us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.982us        51.49%      61.982us       2.583us            24  
                                            aten::copy_         6.99%     105.118us        46.36%     697.187us      38.733us      39.264us        32.62%      41.024us       2.279us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.832us        23.95%      28.832us       2.403us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.138us        15.90%      19.138us       1.595us            12  
                                            aten::clone         1.32%      19.822us        40.79%     613.519us     102.253us       0.000us         0.00%      12.192us       2.032us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.67%      10.432us       1.739us             6  
                                              aten::sub         2.51%      37.801us         4.08%      61.341us      10.224us       9.570us         7.95%       9.570us       1.595us             6  
                                              aten::add         2.16%      32.471us         3.63%      54.661us       9.110us       9.568us         7.95%       9.568us       1.595us             6  
                                Activity Buffer Request        16.71%     251.314us        16.71%     251.314us     251.314us       1.760us         1.46%       1.760us       1.760us             1  
                                    aten::empty_strided         2.00%      30.060us         2.00%      30.060us       5.010us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        18.58%     279.394us        18.58%     279.394us      46.566us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.31%      64.750us         5.43%      81.609us       3.400us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.12%      16.859us         1.12%      16.859us       0.702us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.34%     215.648us        14.34%     215.648us       4.493us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       5.220us         0.35%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.504ms
Self CUDA time total: 120.384us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.259us       547.68%     943.259us     943.259us             1  
                                            torch_eager         9.82%     293.988us        99.82%       2.988ms       2.988ms       0.000us         0.00%     175.075us     175.075us             1  
                                              aten::mul         5.17%     154.631us         8.81%     263.742us      10.989us      89.536us        51.99%      89.536us       3.731us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.536us        51.99%      89.536us       3.731us            24  
                                            aten::copy_         3.66%     109.570us        72.53%       2.171ms     120.590us      57.795us        33.56%      60.643us       3.369us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.835us        23.71%      40.835us       3.403us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.896us        14.46%      24.896us       2.075us            12  
                                            aten::clone         0.74%      22.030us        69.74%       2.087ms     347.874us       0.000us         0.00%      19.808us       3.301us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us         9.85%      16.960us       2.827us             6  
                                              aten::add         1.10%      32.890us         1.87%      55.840us       9.307us      12.481us         7.25%      12.481us       2.080us             6  
                                              aten::sub         1.28%      38.273us         2.11%      63.142us      10.524us      12.415us         7.21%      12.415us       2.069us             6  
                                Activity Buffer Request        58.02%       1.736ms        58.02%       1.736ms       1.736ms       2.848us         1.65%       2.848us       2.848us             1  
                                    aten::empty_strided         1.00%      30.050us         1.00%      30.050us       5.008us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         8.83%     264.325us         8.83%     264.325us      44.054us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.36%      70.650us         2.95%      88.161us       3.673us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.59%      17.511us         0.59%      17.511us       0.730us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.26%     217.282us         7.26%     217.282us       4.527us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       5.289us         0.18%       5.289us       5.289us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.993ms
Self CUDA time total: 172.227us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     922.006us       322.11%     922.006us     922.006us             1  
                                            torch_eager        19.42%     278.764us        99.64%       1.431ms       1.431ms       0.000us         0.00%     304.543us     304.543us             1  
                                              aten::mul        10.68%     153.400us        18.09%     259.803us      10.825us     134.112us        46.85%     134.112us       5.588us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     134.112us        46.85%     134.112us       5.588us            24  
                                            aten::copy_         7.65%     109.831us        44.83%     643.670us      35.759us     111.232us        38.86%     129.536us       7.196us            18  
                                            aten::clone         1.43%      20.539us        38.82%     557.349us      92.892us       0.000us         0.00%      72.160us      12.027us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.376us        20.04%      57.376us       4.781us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.856us        18.82%      53.856us       8.976us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.895us        14.29%      40.895us       3.408us            12  
                                              aten::sub         2.68%      38.501us         4.30%      61.692us      10.282us      20.543us         7.18%      20.543us       3.424us             6  
                                              aten::add         2.29%      32.829us         3.81%      54.730us       9.122us      20.352us         7.11%      20.352us       3.392us             6  
                                Activity Buffer Request        16.08%     230.904us        16.08%     230.904us     230.904us      18.304us         6.39%      18.304us      18.304us             1  
                                    aten::empty_strided         2.06%      29.601us         2.06%      29.601us       4.933us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        16.83%     241.674us        16.83%     241.674us      40.279us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.51%      64.754us         5.69%      81.743us       3.406us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.18%      16.989us         1.18%      16.989us       0.708us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.82%     212.756us        14.82%     212.756us       4.432us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.36%       5.240us         0.36%       5.240us       5.240us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.436ms
Self CUDA time total: 286.239us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     970.352us       169.72%     970.352us     970.352us             1  
                                            torch_eager        19.50%     289.365us        99.64%       1.478ms       1.478ms       0.000us         0.00%     595.480us     595.480us             1  
                                            aten::copy_         7.05%     104.551us        43.31%     642.598us      35.700us     273.596us        47.85%     297.340us      16.519us            18  
                                              aten::mul        11.63%     172.532us        19.46%     288.666us      12.028us     232.863us        40.73%     232.863us       9.703us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.863us        40.73%     232.863us       9.703us            24  
                                            aten::clone         1.45%      21.521us        37.67%     558.878us      93.146us       0.000us         0.00%     205.949us      34.325us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.205us        31.87%     182.205us      30.367us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.391us        15.98%      91.391us       7.616us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.277us        11.42%      65.277us       5.440us            12  
                                              aten::sub         2.70%      40.111us         4.36%      64.701us      10.784us      32.768us         5.73%      32.768us       5.461us             6  
                                              aten::add         2.31%      34.320us         3.88%      57.510us       9.585us      32.509us         5.69%      32.509us       5.418us             6  
                                Activity Buffer Request        17.48%     259.324us        17.48%     259.324us     259.324us      23.744us         4.15%      23.744us      23.744us             1  
                                    aten::empty_strided         2.00%      29.720us         2.00%      29.720us       4.953us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        14.68%     217.742us        14.68%     217.742us      36.290us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.50%      66.694us         5.68%      84.252us       3.511us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.18%      17.558us         1.18%      17.558us       0.732us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.16%     224.895us        15.16%     224.895us       4.685us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.36%       5.340us         0.36%       5.340us       5.340us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.484ms
Self CUDA time total: 571.736us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     936.155us      1011.59%     936.155us     936.155us             1  
                                            torch_eager         9.66%     281.404us        99.82%       2.908ms       2.908ms       0.000us         0.00%      93.663us      93.663us             1  
                                              aten::mul         5.48%     159.764us         9.36%     272.564us      11.357us      49.568us        53.56%      49.568us       2.065us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.568us        53.56%      49.568us       2.065us            24  
                                            aten::copy_         3.70%     107.711us        72.25%       2.105ms     116.944us      29.407us        31.78%      30.527us       1.696us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.591us        24.41%      22.591us       1.883us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.568us        14.66%      13.568us       1.131us            12  
                                            aten::clone         0.74%      21.551us        69.34%       2.020ms     336.695us       0.000us         0.00%       7.936us       1.323us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us         7.37%       6.816us       1.136us             6  
                                              aten::sub         1.31%      38.128us         2.13%      61.912us      10.319us       6.815us         7.36%       6.815us       1.136us             6  
                                              aten::add         1.08%      31.450us         1.84%      53.600us       8.933us       6.753us         7.30%       6.753us       1.126us             6  
                                Activity Buffer Request        59.75%       1.741ms        59.75%       1.741ms       1.741ms       1.120us         1.21%       1.120us       1.120us             1  
                                    aten::empty_strided         1.04%      30.170us         1.04%      30.170us       5.028us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         6.73%     196.044us         6.73%     196.044us      32.674us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.24%      65.300us         2.82%      82.022us       3.418us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.57%      16.722us         0.57%      16.722us       0.697us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.53%     219.305us         7.53%     219.305us       4.569us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       5.160us         0.18%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.913ms
Self CUDA time total: 92.543us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     918.262us       956.86%     918.262us     918.262us             1  
                                            torch_eager        20.02%     274.163us        99.62%       1.364ms       1.364ms       0.000us         0.00%      97.279us      97.279us             1  
                                              aten::mul        11.52%     157.766us        19.39%     265.646us      11.069us      51.167us        53.32%      51.167us       2.132us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.167us        53.32%      51.167us       2.132us            24  
                                            aten::copy_         7.76%     106.268us        42.02%     575.576us      31.976us      30.720us        32.01%      32.033us       1.780us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        23.88%      22.912us       1.909us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.079us        14.67%      14.079us       1.173us            12  
                                            aten::clone         1.48%      20.322us        36.02%     493.298us      82.216us       0.000us         0.00%       9.121us       1.520us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         8.14%       7.808us       1.301us             6  
                                              aten::sub         2.81%      38.541us         4.49%      61.481us      10.247us       7.072us         7.37%       7.072us       1.179us             6  
                                              aten::add         2.42%      33.131us         4.04%      55.302us       9.217us       7.007us         7.30%       7.007us       1.168us             6  
                                Activity Buffer Request        16.17%     221.544us        16.17%     221.544us     221.544us       1.313us         1.37%       1.313us       1.313us             1  
                                    aten::empty_strided         2.33%      31.950us         2.33%      31.950us       5.325us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.69%     187.513us        13.69%     187.513us      31.252us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.61%      63.101us         5.84%      79.961us       3.332us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.23%      16.860us         1.23%      16.860us       0.702us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.57%     213.242us        15.57%     213.242us       4.443us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.38%       5.270us         0.38%       5.270us       5.270us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.370ms
Self CUDA time total: 95.966us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     929.528us       892.96%     929.528us     929.528us             1  
                                            torch_eager        20.25%     278.528us        99.63%       1.370ms       1.370ms       0.000us         0.00%     105.439us     105.439us             1  
                                              aten::mul        11.59%     159.422us        19.60%     269.583us      11.233us      55.326us        53.15%      55.326us       2.305us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.326us        53.15%      55.326us       2.305us            24  
                                            aten::copy_         7.64%     105.130us        41.59%     572.021us      31.779us      32.351us        31.08%      33.695us       1.872us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.607us        23.64%      24.607us       2.051us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.418us        15.77%      16.418us       1.368us            12  
                                            aten::clone         1.49%      20.431us        35.49%     488.057us      81.343us       0.000us         0.00%       9.088us       1.515us             6  
                                              aten::sub         2.60%      35.723us         4.36%      59.953us       9.992us       8.258us         7.93%       8.258us       1.376us             6  
                                              aten::add         2.46%      33.770us         4.07%      55.940us       9.323us       8.160us         7.84%       8.160us       1.360us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.44%       7.744us       1.291us             6  
                                Activity Buffer Request        16.10%     221.454us        16.10%     221.454us     221.454us       1.344us         1.29%       1.344us       1.344us             1  
                                    aten::empty_strided         2.25%      30.990us         2.25%      30.990us       5.165us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.30%     182.863us        13.30%     182.863us      30.477us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.81%      66.212us         6.02%      82.825us       3.451us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.21%      16.613us         1.21%      16.613us       0.692us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.93%     219.135us        15.93%     219.135us       4.565us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.37%       5.090us         0.37%       5.090us       5.090us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.375ms
Self CUDA time total: 104.095us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     943.134us       762.57%     943.134us     943.134us             1  
                                            torch_eager         9.91%     288.756us        99.81%       2.907ms       2.907ms       0.000us         0.00%     125.503us     125.503us             1  
                                              aten::mul         5.47%     159.428us         9.14%     266.247us      11.094us      65.088us        52.63%      65.088us       2.712us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.088us        52.63%      65.088us       2.712us            24  
                                            aten::copy_         3.82%     111.411us        72.08%       2.100ms     116.650us      39.391us        31.85%      41.215us       2.290us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.799us        23.29%      28.799us       2.400us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.200us        15.52%      19.200us       1.600us            12  
                                            aten::clone         0.71%      20.821us        69.14%       2.014ms     335.649us       0.000us         0.00%      12.416us       2.069us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.592us         8.56%      10.592us       1.765us             6  
                                              aten::sub         1.35%      39.440us         2.20%      63.980us      10.663us       9.632us         7.79%       9.632us       1.605us             6  
                                              aten::add         1.16%      33.802us         1.92%      55.961us       9.327us       9.568us         7.74%       9.568us       1.595us             6  
                                Activity Buffer Request        59.81%       1.742ms        59.81%       1.742ms       1.742ms       1.824us         1.47%       1.824us       1.824us             1  
                                    aten::empty_strided         1.06%      30.871us         1.06%      30.871us       5.145us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         6.32%     184.202us         6.32%     184.202us      30.700us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.20%      64.120us         2.78%      80.888us       3.370us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.58%      16.768us         0.58%      16.768us       0.699us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.39%     215.298us         7.39%     215.298us       4.485us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.19%       5.660us         0.19%       5.660us       5.660us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.913ms
Self CUDA time total: 123.679us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     926.451us       888.37%     926.451us     926.451us             1  
                                            torch_eager        20.56%     277.090us        99.61%       1.342ms       1.342ms       0.000us         0.00%     105.599us     105.599us             1  
                                              aten::mul        11.75%     158.363us        19.88%     267.883us      11.162us      55.423us        53.14%      55.423us       2.309us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.423us        53.14%      55.423us       2.309us            24  
                                            aten::copy_         7.94%     107.035us        40.62%     547.383us      30.410us      32.352us        31.02%      33.664us       1.870us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        23.63%      24.640us       2.053us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.512us        15.83%      16.512us       1.376us            12  
                                            aten::clone         1.47%      19.840us        34.29%     462.099us      77.016us       0.000us         0.00%       9.024us       1.504us             6  
                                              aten::sub         2.93%      39.461us         4.68%      63.054us      10.509us       8.287us         7.95%       8.287us       1.381us             6  
                                              aten::add         2.50%      33.680us         4.16%      56.100us       9.350us       8.225us         7.89%       8.225us       1.371us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.712us         7.39%       7.712us       1.285us             6  
                                Activity Buffer Request        14.74%     198.654us        14.74%     198.654us     198.654us       1.312us         1.26%       1.312us       1.312us             1  
                                    aten::empty_strided         2.26%      30.481us         2.26%      30.481us       5.080us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.39%     180.523us        13.39%     180.523us      30.087us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.73%      63.708us         5.98%      80.630us       3.360us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.26%      16.922us         1.26%      16.922us       0.705us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.08%     216.704us        16.08%     216.704us       4.515us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.39%       5.231us         0.39%       5.231us       5.231us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.348ms
Self CUDA time total: 104.287us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     931.662us       754.64%     931.662us     931.662us             1  
                                            torch_eager        20.88%     278.302us        99.60%       1.328ms       1.328ms       0.000us         0.00%     125.281us     125.281us             1  
                                              aten::mul        11.71%     156.112us        20.55%     273.936us      11.414us      65.153us        52.77%      65.153us       2.715us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.153us        52.77%      65.153us       2.715us            24  
                                            aten::copy_         7.95%     105.951us        39.52%     526.779us      29.265us      39.169us        31.73%      40.993us       2.277us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.737us        23.28%      28.737us       2.395us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.135us        15.50%      19.135us       1.595us            12  
                                            aten::clone         1.44%      19.200us        33.27%     443.406us      73.901us       0.000us         0.00%      12.256us       2.043us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.45%      10.432us       1.739us             6  
                                              aten::sub         2.81%      37.440us         4.58%      61.110us      10.185us       9.632us         7.80%       9.632us       1.605us             6  
                                              aten::add         2.52%      33.611us         4.17%      55.611us       9.268us       9.503us         7.70%       9.503us       1.584us             6  
                                Activity Buffer Request        13.21%     176.083us        13.21%     176.083us     176.083us       1.824us         1.48%       1.824us       1.824us             1  
                                    aten::empty_strided         2.29%      30.570us         2.29%      30.570us       5.095us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.82%     184.192us        13.82%     184.192us      30.699us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.90%      65.274us         6.16%      82.123us       3.422us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.26%      16.849us         1.26%      16.849us       0.702us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.81%     224.047us        16.81%     224.047us       4.668us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.40%       5.310us         0.40%       5.310us       5.310us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.333ms
Self CUDA time total: 123.457us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.092us       532.26%     944.092us     944.092us             1  
                                            torch_eager         9.66%     282.874us        99.81%       2.921ms       2.921ms       0.000us         0.00%     180.253us     180.253us             1  
                                              aten::mul         5.51%     161.402us         9.28%     271.603us      11.317us      95.040us        53.58%      95.040us       3.960us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.040us        53.58%      95.040us       3.960us            24  
                                            aten::copy_         3.62%     106.065us        72.07%       2.109ms     117.193us      57.663us        32.51%      60.543us       3.364us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.703us        22.95%      40.703us       3.392us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.670us        13.91%      24.670us       2.056us            12  
                                            aten::clone         0.77%      22.428us        69.22%       2.026ms     337.680us       0.000us         0.00%      19.840us       3.307us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us         9.56%      16.960us       2.827us             6  
                                              aten::add         1.16%      34.010us         1.95%      57.150us       9.525us      12.383us         6.98%      12.383us       2.064us             6  
                                              aten::sub         1.32%      38.563us         2.15%      62.972us      10.495us      12.287us         6.93%      12.287us       2.048us             6  
                                Activity Buffer Request        59.97%       1.755ms        59.97%       1.755ms       1.755ms       2.880us         1.62%       2.880us       2.880us             1  
                                    aten::empty_strided         1.05%      30.691us         1.05%      30.691us       5.115us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         6.31%     184.633us         6.31%     184.633us      30.772us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.32%      67.977us         2.88%      84.170us       3.507us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.55%      16.193us         0.55%      16.193us       0.675us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.56%     221.262us         7.56%     221.262us       4.610us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.19%       5.669us         0.19%       5.669us       5.669us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.927ms
Self CUDA time total: 177.373us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     956.029us       320.35%     956.029us     956.029us             1  
                                            torch_eager        10.28%     306.488us        99.82%       2.977ms       2.977ms       0.000us         0.00%     316.194us     316.194us             1  
                                              aten::mul         5.10%     152.001us         8.95%     266.845us      11.119us     146.560us        49.11%     146.560us       6.107us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.560us        49.11%     146.560us       6.107us            24  
                                            aten::copy_         3.72%     110.901us        71.64%       2.137ms     118.718us     110.754us        37.11%     128.514us       7.140us            18  
                                            aten::clone         0.97%      28.901us        68.99%       2.058ms     342.957us       0.000us         0.00%      70.944us      11.824us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.570us        19.29%      57.570us       4.797us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.184us        17.82%      53.184us       8.864us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.120us        13.78%      41.120us       3.427us            12  
                                              aten::add         1.16%      34.740us         1.93%      57.500us       9.583us      20.641us         6.92%      20.641us       3.440us             6  
                                              aten::sub         1.34%      39.998us         2.18%      65.101us      10.850us      20.479us         6.86%      20.479us       3.413us             6  
                                Activity Buffer Request        59.58%       1.777ms        59.58%       1.777ms       1.777ms      17.760us         5.95%      17.760us      17.760us             1  
                                    aten::empty_strided         1.05%      31.260us         1.05%      31.260us       5.210us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         6.26%     186.663us         6.26%     186.663us      31.111us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.24%      66.809us         2.82%      84.238us       3.510us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.58%      17.429us         0.58%      17.429us       0.726us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.54%     224.919us         7.54%     224.919us       4.686us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       5.469us         0.18%       5.469us       5.469us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.983ms
Self CUDA time total: 298.434us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     916.392us       515.61%     916.392us     916.392us             1  
                                            torch_eager        19.58%     274.201us        99.60%       1.394ms       1.394ms       0.000us         0.00%     180.610us     180.610us             1  
                                              aten::mul        11.24%     157.371us        18.87%     264.183us      11.008us      95.074us        53.49%      95.074us       3.961us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      95.074us        53.49%      95.074us       3.961us            24  
                                            aten::copy_         7.77%     108.775us        43.49%     608.863us      33.826us      57.825us        32.54%      60.705us       3.373us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.897us        23.01%      40.897us       3.408us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.831us        13.97%      24.831us       2.069us            12  
                                            aten::clone         1.40%      19.580us        37.38%     523.368us      87.228us       0.000us         0.00%      19.808us       3.301us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.928us         9.52%      16.928us       2.821us             6  
                                              aten::add         2.38%      33.360us         4.00%      56.040us       9.340us      12.416us         6.99%      12.416us       2.069us             6  
                                              aten::sub         2.76%      38.582us         4.39%      61.472us      10.245us      12.415us         6.99%      12.415us       2.069us             6  
                                Activity Buffer Request        18.14%     253.955us        18.14%     253.955us     253.955us       2.880us         1.62%       2.880us       2.880us             1  
                                    aten::empty_strided         2.13%      29.860us         2.13%      29.860us       4.977us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.38%     187.273us        13.38%     187.273us      31.212us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.53%      63.391us         5.73%      80.293us       3.346us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.21%      16.902us         1.21%      16.902us       0.704us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.09%     211.242us        15.09%     211.242us       4.401us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.40%       5.600us         0.40%       5.600us       5.600us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.400ms
Self CUDA time total: 177.730us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     934.618us       312.71%     934.618us     934.618us             1  
                                            torch_eager        20.60%     280.895us        99.62%       1.358ms       1.358ms       0.000us         0.00%     316.921us     316.921us             1  
                                              aten::mul        11.57%     157.759us        19.61%     267.373us      11.141us     146.460us        49.00%     146.460us       6.102us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     146.460us        49.00%     146.460us       6.102us            24  
                                            aten::copy_         8.07%     110.072us        41.19%     561.700us      31.206us     111.966us        37.46%     130.013us       7.223us            18  
                                            aten::clone         1.51%      20.600us        34.77%     474.096us      79.016us       0.000us         0.00%      72.670us      12.112us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.343us        19.19%      57.343us       4.779us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      54.623us        18.28%      54.623us       9.104us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.448us        13.53%      40.448us       3.371us            12  
                                              aten::add         2.59%      35.260us         4.22%      57.590us       9.598us      20.288us         6.79%      20.288us       3.381us             6  
                                              aten::sub         2.60%      35.410us         4.30%      58.621us       9.770us      20.160us         6.75%      20.160us       3.360us             6  
                                Activity Buffer Request        14.73%     200.853us        14.73%     200.853us     200.853us      18.047us         6.04%      18.047us      18.047us             1  
                                    aten::empty_strided         2.18%      29.660us         2.18%      29.660us       4.943us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.85%     188.823us        13.85%     188.823us      31.471us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.75%      64.754us         6.01%      81.922us       3.413us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.26%      17.168us         1.26%      17.168us       0.715us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.92%     217.107us        15.92%     217.107us       4.523us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.38%       5.180us         0.38%       5.180us       5.180us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.364ms
Self CUDA time total: 298.874us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     956.919us       161.50%     956.919us     956.919us             1  
                                            torch_eager        21.30%     289.504us        99.57%       1.353ms       1.353ms       0.000us         0.00%     616.281us     616.281us             1  
                                            aten::copy_         7.84%     106.532us        38.89%     528.548us      29.364us     278.013us        46.92%     301.788us      16.766us            18  
                                              aten::mul        11.95%     162.407us        20.79%     282.469us      11.770us     248.703us        41.97%     248.703us      10.363us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     248.703us        41.97%     248.703us      10.363us            24  
                                            aten::clone         1.53%      20.799us        32.73%     444.735us      74.123us       0.000us         0.00%     210.204us      35.034us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     186.429us        31.46%     186.429us      31.072us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      91.584us        15.46%      91.584us       7.632us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.790us        11.10%      65.790us       5.483us            12  
                                              aten::add         2.44%      33.161us         4.08%      55.501us       9.250us      32.927us         5.56%      32.927us       5.488us             6  
                                              aten::sub         2.95%      40.030us         4.74%      64.440us      10.740us      32.863us         5.55%      32.863us       5.477us             6  
                                Activity Buffer Request        13.07%     177.663us        13.07%     177.663us     177.663us      23.775us         4.01%      23.775us      23.775us             1  
                                    aten::empty_strided         2.15%      29.270us         2.15%      29.270us       4.878us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        13.63%     185.172us        13.63%     185.172us      30.862us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.83%      65.662us         6.08%      82.660us       3.444us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.25%      16.998us         1.25%      16.998us       0.708us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.63%     225.993us        16.63%     225.993us       4.708us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.43%       5.780us         0.43%       5.780us       5.780us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.359ms
Self CUDA time total: 592.506us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager        12.69%     276.287us        61.52%       1.340ms       1.340ms       0.000us         0.00%       1.863ms       1.863ms             1  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.835ms       102.22%       1.835ms       1.835ms             1  
                                            aten::copy_         5.01%     109.060us        24.98%     544.137us      30.230us     806.007us        44.89%     873.590us      48.533us            18  
                                              aten::mul         7.11%     154.844us        12.06%     262.604us      10.942us     842.615us        46.93%     842.615us      35.109us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     842.615us        46.93%     842.615us      35.109us            24  
                                            aten::clone         1.01%      22.000us        21.12%     459.916us      76.653us       0.000us         0.00%     622.361us     103.727us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     554.778us        30.90%     554.778us      92.463us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     251.229us        13.99%     251.229us      20.936us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     146.939us         8.18%     146.939us      12.245us            12  
                                              aten::sub         1.90%      41.421us         3.00%      65.411us      10.902us      88.573us         4.93%      88.573us      14.762us             6  
                                Activity Buffer Request         8.49%     184.983us         8.49%     184.983us     184.983us      67.583us         3.76%      67.583us      67.583us             1  
                                              aten::add         1.54%      33.561us         2.59%      56.461us       9.410us      58.366us         3.25%      58.366us       9.728us             6  
                                    aten::empty_strided         1.42%      30.960us         1.42%      30.960us       5.160us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         8.70%     189.543us         8.70%     189.543us      31.591us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.99%      65.113us         3.77%      82.061us       3.419us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.78%      16.948us         0.78%      16.948us       0.706us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         9.88%     215.201us         9.88%     215.201us       4.483us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize        38.48%     838.063us        38.48%     838.063us     838.063us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.178ms
Self CUDA time total: 1.796ms


impl                     wl                  p50(ms)  ok
torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
torch_eager              cuda_B1_S128_H32_D64_R32     0.23  True
torch_eager              cuda_B1_S128_H8_D128_R64     0.23  True
torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
torch_eager              cuda_B1_S2048_H32_D128_R64     0.23  True
torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
torch_eager              cuda_B1_S512_H8_D128_R64     0.23  True
torch_eager              cuda_B1_S512_H8_D64_R32     0.23  True
torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
torch_eager              cuda_B2_S2048_H32_D128_R64     0.65  True
torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
torch_eager              cuda_B2_S512_H32_D64_R32     0.23  True
torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
torch_eager              cuda_B2_S512_H8_D64_R32     0.23  True
▶ UV Install Logs

Artifacts:

rotary.jsonl