PyTorch Native - Rotary Position Embeddings

GPU Info

▼ code ▼ output ▶ uv-logs | Cell: nv | 0.26s | Raw GitHub
import subprocess
# Run nvidia-smi, capturing its output as text, and echo only its stdout
# into the cell output.
smi_result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
print(smi_result.stdout)
Wed Oct 29 04:13:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
| N/A   30C    P0            116W /  350W |       0MiB /  46068MiB |     67%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

Rotary Embeddings Benchmark (PyTorch Native)

▼ code ▼ output ▶ uv-logs | Cell: benchmark | 3.91s | Raw GitHub
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark


def apply_rotary_torch(x1, x2, cos, sin, conj=False):
    """Rotate the pair ``(x1, x2)`` by the angle given through ``cos``/``sin``.

    This is the reference (unfused) formulation, built only from
    elementwise ``*``, ``+`` and ``-``.

    Args:
        x1: First component of each pair to rotate.
        x2: Second component of each pair to rotate.
        cos: Cosine factors multiplied against ``x1`` and ``x2``.
        sin: Sine factors multiplied against ``x1`` and ``x2``.
        conj: When true, apply the rotation with the sign of the ``sin``
            terms flipped (the inverse rotation).

    Returns:
        A ``(out1, out2)`` tuple:
        ``(x1*cos - x2*sin, x1*sin + x2*cos)`` by default, or
        ``(x1*cos + x2*sin, -x1*sin + x2*cos)`` when ``conj`` is true.
    """
    if conj:
        # Conjugate form: same rotation with the sine terms negated.
        return x1 * cos + x2 * sin, -x1 * sin + x2 * cos
    return x1 * cos - x2 * sin, x1 * sin + x2 * cos


def torch_rotary(query, key, cos, sin, conj=False):
    """Apply the reference rotary transform to ``query`` and ``key``.

    The rotary width is taken from ``cos.shape[-1]``. Along the last axis
    of each tensor, the first ``rotary_dim`` entries and the following
    ``rotary_dim`` entries are treated as the two halves of the rotated
    pair; anything past ``2 * rotary_dim`` is carried over unchanged.

    Args:
        query: Query tensor; it is cloned, never written to.
        key: Key tensor; it is cloned, never written to.
        cos: Cosine factors forwarded to ``apply_rotary_torch``.
        sin: Sine factors forwarded to ``apply_rotary_torch``.
        conj: Forwarded to ``apply_rotary_torch`` to select the
            conjugate rotation.

    Returns:
        A ``(q_out, k_out)`` tuple of rotated copies of the inputs.
    """
    width = cos.shape[-1]
    lower = slice(None, width)
    upper = slice(width, 2 * width)

    # Work on copies so the caller's tensors are left untouched.
    rotated_q = query.clone()
    rotated_k = key.clone()

    # Query first, then key, each handled the same way.
    for buf in (rotated_q, rotated_k):
        # Both rotated halves are computed as fresh tensors before either
        # one is written back into the buffer's slices.
        new_lower, new_upper = apply_rotary_torch(
            buf[..., lower], buf[..., upper], cos, sin, conj
        )
        buf[..., lower] = new_lower
        buf[..., upper] = new_upper

    return rotated_q, rotated_k


# Hand the eager PyTorch implementation to the shared benchmark driver,
# labelled "torch_eager" and tagged with its family/backend.
run_benchmark(
    impl_func=torch_rotary,
    impl_name="torch_eager",
    impl_tags={"family": "pytorch", "backend": "eager"},
    kernel_type=KernelTypeEnum.ROTARY,
)
Running rotary benchmark on cuda with 24 workloads.

======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.037ms      1163.70%       1.037ms       1.037ms             1  
                                            torch_eager        14.48%     388.465us        99.71%       2.675ms       2.675ms       0.000us         0.00%      90.368us      90.368us             1  
                                              aten::mul         6.24%     167.371us        10.81%     289.974us      12.082us      46.850us        52.55%      46.850us       1.952us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.850us        52.55%      46.850us       1.952us            24  
                                            aten::copy_         3.95%     106.042us        62.52%       1.677ms      93.189us      29.055us        32.59%      30.271us       1.682us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.431us        25.16%      22.431us       1.869us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.247us        14.86%      13.247us       1.104us            12  
                                            aten::clone         1.33%      35.811us        60.70%       1.628ms     271.409us       0.000us         0.00%       7.840us       1.307us             6  
                                              aten::sub         1.70%      45.710us         2.68%      71.932us      11.989us       6.688us         7.50%       6.688us       1.115us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.624us         7.43%       6.624us       1.104us             6  
                                              aten::add         1.31%      35.129us         2.15%      57.710us       9.618us       6.559us         7.36%       6.559us       1.093us             6  
                                Activity Buffer Request        53.28%       1.429ms        53.28%       1.429ms       1.429ms       1.216us         1.36%       1.216us       1.216us             1  
                                    aten::empty_strided         2.07%      55.651us         2.07%      55.651us       9.275us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.67%      71.682us         2.67%      71.682us      11.947us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.88%      77.398us         3.66%      98.099us       4.087us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.77%      20.701us         0.77%      20.701us       0.863us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         9.01%     241.667us         9.01%     241.667us       5.035us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.29%       7.810us         0.29%       7.810us       7.810us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.683ms
Self CUDA time total: 89.152us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     974.701us      1078.26%     974.701us     974.701us             1  
                                            torch_eager        13.04%     331.863us        99.80%       2.539ms       2.539ms       0.000us         0.00%      91.516us      91.516us             1  
                                              aten::mul         6.08%     154.764us        10.71%     272.436us      11.351us      47.740us        52.81%      47.740us       1.989us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.740us        52.81%      47.740us       1.989us            24  
                                            aten::copy_         4.22%     107.278us        65.67%       1.671ms      92.831us      29.344us        32.46%      30.464us       1.692us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.528us        24.92%      22.528us       1.877us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.312us        14.73%      13.312us       1.109us            12  
                                            aten::clone         1.12%      28.494us        62.70%       1.595ms     265.883us       0.000us         0.00%       7.936us       1.323us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.816us         7.54%       6.816us       1.136us             6  
                                              aten::sub         1.50%      38.242us         2.49%      63.402us      10.567us       6.688us         7.40%       6.688us       1.115us             6  
                                              aten::add         1.20%      30.490us         2.06%      52.342us       8.724us       6.624us         7.33%       6.624us       1.104us             6  
                                Activity Buffer Request        56.69%       1.442ms        56.69%       1.442ms       1.442ms       1.120us         1.24%       1.120us       1.120us             1  
                                    aten::empty_strided         1.23%      31.410us         1.23%      31.410us       5.235us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.23%      56.711us         2.23%      56.711us       9.452us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.71%      68.925us         3.47%      88.365us       3.682us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.76%      19.440us         0.76%      19.440us       0.810us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         9.01%     229.327us         9.01%     229.327us       4.778us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.20%       5.130us         0.20%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.544ms
Self CUDA time total: 90.396us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     992.026us      1054.84%     992.026us     992.026us             1  
                                            torch_eager        13.38%     342.168us        99.79%       2.552ms       2.552ms       0.000us         0.00%      95.357us      95.357us             1  
                                              aten::mul         6.15%     157.234us        10.75%     274.750us      11.448us      48.894us        51.99%      48.894us       2.037us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.894us        51.99%      48.894us       2.037us            24  
                                            aten::copy_         4.01%     102.532us        65.33%       1.670ms      92.800us      30.817us        32.77%      32.129us       1.785us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.977us        24.43%      22.977us       1.915us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.334us        15.24%      14.334us       1.194us            12  
                                            aten::clone         1.05%      26.950us        62.23%       1.591ms     265.191us       0.000us         0.00%       9.152us       1.525us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.840us         8.34%       7.840us       1.307us             6  
                                              aten::sub         1.50%      38.270us         2.44%      62.460us      10.410us       7.198us         7.65%       7.198us       1.200us             6  
                                              aten::add         1.23%      31.400us         2.10%      53.770us       8.962us       7.136us         7.59%       7.136us       1.189us             6  
                                Activity Buffer Request        56.41%       1.442ms        56.41%       1.442ms       1.442ms       1.312us         1.40%       1.312us       1.312us             1  
                                    aten::empty_strided         1.23%      31.530us         1.23%      31.530us       5.255us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.22%      56.682us         2.22%      56.682us       9.447us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.75%      70.221us         3.50%      89.542us       3.731us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.76%      19.321us         0.76%      19.321us       0.805us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         9.11%     232.827us         9.11%     232.827us       4.851us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.21%       5.280us         0.21%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.557ms
Self CUDA time total: 94.045us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     963.417us       953.96%     963.417us     963.417us             1  
                                            torch_eager        11.52%     317.176us        99.82%       2.749ms       2.749ms       0.000us         0.00%     102.303us     102.303us             1  
                                              aten::mul         5.45%     150.206us         9.82%     270.557us      11.273us      52.736us        52.22%      52.736us       2.197us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.736us        52.22%      52.736us       2.197us            24  
                                            aten::copy_         3.72%     102.545us        68.70%       1.892ms     105.120us      32.255us        31.94%      33.567us       1.865us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.511us        24.27%      24.511us       2.043us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.000us        15.84%      16.000us       1.333us            12  
                                            aten::clone         1.08%      29.720us        65.99%       1.817ms     302.902us       0.000us         0.00%       9.056us       1.509us             6  
                                              aten::sub         1.33%      36.580us         2.29%      63.082us      10.514us       8.000us         7.92%       8.000us       1.333us             6  
                                              aten::add         1.19%      32.640us         2.06%      56.790us       9.465us       8.000us         7.92%       8.000us       1.333us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.744us         7.67%       7.744us       1.291us             6  
                                Activity Buffer Request        52.30%       1.440ms        52.30%       1.440ms       1.440ms       1.312us         1.30%       1.312us       1.312us             1  
                                    aten::empty_strided         1.15%      31.721us         1.15%      31.721us       5.287us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        10.21%     281.246us        10.21%     281.246us      46.874us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.50%      68.838us         3.19%      87.951us       3.665us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.69%      19.113us         0.69%      19.113us       0.796us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.68%     239.024us         8.68%     239.024us       4.980us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       4.960us         0.18%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.754ms
Self CUDA time total: 100.991us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     994.264us      1059.01%     994.264us     994.264us             1  
                                            torch_eager        12.10%     336.594us        99.83%       2.776ms       2.776ms       0.000us         0.00%      95.197us      95.197us             1  
                                              aten::mul         5.53%     153.843us         9.85%     273.965us      11.415us      48.927us        52.11%      48.927us       2.039us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.927us        52.11%      48.927us       2.039us            24  
                                            aten::copy_         3.84%     106.831us        68.28%       1.899ms     105.502us      30.784us        32.79%      32.095us       1.783us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.911us        24.40%      22.911us       1.909us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.175us        15.10%      14.175us       1.181us            12  
                                            aten::clone         0.99%      27.653us        65.29%       1.816ms     302.643us       0.000us         0.00%       9.184us       1.531us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.873us         8.39%       7.873us       1.312us             6  
                                              aten::add         1.11%      30.890us         1.95%      54.150us       9.025us       7.103us         7.57%       7.103us       1.184us             6  
                                              aten::sub         1.31%      36.550us         2.21%      61.372us      10.229us       7.072us         7.53%       7.072us       1.179us             6  
                                Activity Buffer Request        52.41%       1.458ms        52.41%       1.458ms       1.458ms       1.311us         1.40%       1.311us       1.311us             1  
                                    aten::empty_strided         1.15%      31.950us         1.15%      31.950us       5.325us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.52%     264.666us         9.52%     264.666us      44.111us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.56%      71.249us         3.30%      91.758us       3.823us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.74%      20.509us         0.74%      20.509us       0.855us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.56%     238.154us         8.56%     238.154us       4.962us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.17%       4.831us         0.17%       4.831us       4.831us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.781ms
Self CUDA time total: 93.886us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     947.484us       938.47%     947.484us     947.484us             1  
                                            torch_eager        10.88%     292.632us        99.82%       2.684ms       2.684ms       0.000us         0.00%     102.274us     102.274us             1  
                                              aten::mul         5.59%     150.412us         9.99%     268.638us      11.193us      52.575us        52.07%      52.575us       2.191us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.575us        52.07%      52.575us       2.191us            24  
                                            aten::copy_         3.76%     101.124us        69.31%       1.864ms     103.538us      32.417us        32.11%      33.730us       1.874us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        24.41%      24.640us       2.053us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.969us        15.82%      15.969us       1.331us            12  
                                            aten::clone         0.88%      23.678us        66.40%       1.785ms     297.581us       0.000us         0.00%       9.090us       1.515us             6  
                                              aten::add         1.17%      31.492us         2.09%      56.082us       9.347us       8.001us         7.92%       8.001us       1.333us             6  
                                              aten::sub         1.33%      35.751us         2.27%      61.172us      10.195us       7.968us         7.89%       7.968us       1.328us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.777us         7.70%       7.777us       1.296us             6  
                                Activity Buffer Request        53.61%       1.442ms        53.61%       1.442ms       1.442ms       1.313us         1.30%       1.313us       1.313us             1  
                                    aten::empty_strided         1.16%      31.231us         1.16%      31.231us       5.205us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.49%     255.066us         9.49%     255.066us      42.511us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.55%      68.470us         3.23%      86.863us       3.619us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.68%      18.393us         0.68%      18.393us       0.766us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.71%     234.118us         8.71%     234.118us       4.877us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       4.960us         0.18%       4.960us       4.960us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.689ms
Self CUDA time total: 100.961us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.015ms       841.96%       1.015ms       1.015ms             1  
                                            torch_eager        12.11%     330.713us        99.82%       2.726ms       2.726ms       0.000us         0.00%     122.270us     122.270us             1  
                                              aten::mul         5.81%     158.614us        10.12%     276.274us      11.511us      62.015us        51.46%      62.015us       2.584us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.015us        51.46%      62.015us       2.584us            24  
                                            aten::copy_         3.83%     104.612us        67.54%       1.845ms     102.474us      39.328us        32.63%      41.088us       2.283us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.832us        23.92%      28.832us       2.403us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.167us        15.90%      19.167us       1.597us            12  
                                            aten::clone         0.82%      22.270us        64.60%       1.764ms     294.026us       0.000us         0.00%      12.256us       2.043us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us         8.71%      10.496us       1.749us             6  
                                              aten::add         1.30%      35.623us         2.23%      60.872us      10.145us       9.600us         7.97%       9.600us       1.600us             6  
                                              aten::sub         1.39%      37.930us         2.30%      62.752us      10.459us       9.567us         7.94%       9.567us       1.594us             6  
                                Activity Buffer Request        51.93%       1.418ms        51.93%       1.418ms       1.418ms       1.760us         1.46%       1.760us       1.760us             1  
                                    aten::empty_strided         1.39%      37.931us         1.39%      37.931us       6.322us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.20%     251.364us         9.20%     251.364us      41.894us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.57%      70.176us         3.31%      90.509us       3.771us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.74%      20.333us         0.74%      20.333us       0.847us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.72%     238.202us         8.72%     238.202us       4.963us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       4.991us         0.18%       4.991us       4.991us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.731ms
Self CUDA time total: 120.510us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     969.687us       565.36%     969.687us     969.687us             1  
                                            torch_eager        11.93%     323.252us        99.82%       2.704ms       2.704ms       0.000us         0.00%     174.431us     174.431us             1  
                                              aten::mul         5.73%     155.191us        10.09%     273.452us      11.394us      89.149us        51.98%      89.149us       3.715us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.149us        51.98%      89.149us       3.715us            24  
                                            aten::copy_         3.81%     103.212us        67.97%       1.841ms     102.304us      57.504us        33.53%      60.417us       3.357us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.448us        23.58%      40.448us       3.371us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.865us        14.50%      24.865us       2.072us            12  
                                            aten::clone         1.01%      27.391us        65.07%       1.763ms     293.813us       0.000us         0.00%      19.969us       3.328us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.056us         9.94%      17.056us       2.843us             6  
                                              aten::sub         1.36%      36.973us         2.33%      63.083us      10.514us      12.448us         7.26%      12.448us       2.075us             6  
                                              aten::add         1.19%      32.138us         2.00%      54.180us       9.030us      12.417us         7.24%      12.417us       2.069us             6  
                                Activity Buffer Request        52.59%       1.425ms        52.59%       1.425ms       1.425ms       2.913us         1.70%       2.913us       2.913us             1  
                                    aten::empty_strided         1.13%      30.731us         1.13%      30.731us       5.122us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.09%     246.234us         9.09%     246.234us      41.039us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.62%      70.850us         3.34%      90.602us       3.775us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.73%      19.752us         0.73%      19.752us       0.823us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.62%     233.633us         8.62%     233.633us       4.867us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       4.920us         0.18%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.709ms
Self CUDA time total: 171.518us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     954.052us       791.00%     954.052us     954.052us             1  
                                            torch_eager        19.97%     292.412us        99.63%       1.459ms       1.459ms       0.000us         0.00%     122.437us     122.437us             1  
                                              aten::mul        10.56%     154.645us        18.69%     273.576us      11.399us      62.020us        51.42%      62.020us       2.584us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.020us        51.42%      62.020us       2.584us            24  
                                            aten::copy_         7.03%     103.000us        43.41%     635.575us      35.310us      39.424us        32.69%      41.248us       2.292us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.88%      28.800us       2.400us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.169us        15.89%      19.169us       1.597us            12  
                                            aten::clone         1.44%      21.120us        37.54%     549.571us      91.595us       0.000us         0.00%      12.448us       2.075us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.624us         8.81%      10.624us       1.771us             6  
                                              aten::add         2.24%      32.821us         3.84%      56.284us       9.381us       9.600us         7.96%       9.600us       1.600us             6  
                                              aten::sub         2.67%      39.093us         4.44%      64.973us      10.829us       9.569us         7.93%       9.569us       1.595us             6  
                                Activity Buffer Request        15.36%     224.935us        15.36%     224.935us     224.935us       1.824us         1.51%       1.824us       1.824us             1  
                                    aten::empty_strided         1.99%      29.111us         1.99%      29.111us       4.852us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        16.48%     241.265us        16.48%     241.265us      40.211us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.62%      67.580us         5.85%      85.721us       3.572us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.24%      18.141us         1.24%      18.141us       0.756us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.03%     234.649us        16.03%     234.649us       4.889us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.37%       5.351us         0.37%       5.351us       5.351us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.464ms
Self CUDA time total: 120.613us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     956.920us       558.23%     956.920us     956.920us             1  
                                            torch_eager        19.50%     289.238us        99.68%       1.478ms       1.478ms       0.000us         0.00%     174.235us     174.235us             1  
                                              aten::mul        10.48%     155.363us        18.66%     276.703us      11.529us      89.180us        52.02%      89.180us       3.716us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.180us        52.02%      89.180us       3.716us            24  
                                            aten::copy_         6.89%     102.110us        44.09%     653.841us      36.324us      57.375us        33.47%      60.191us       3.344us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.415us        23.58%      40.415us       3.368us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.864us        14.50%      24.864us       2.072us            12  
                                            aten::clone         1.47%      21.742us        38.41%     569.623us      94.937us       0.000us         0.00%      19.776us       3.296us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.960us         9.89%      16.960us       2.827us             6  
                                              aten::add         2.10%      31.093us         3.60%      53.332us       8.889us      12.512us         7.30%      12.512us       2.085us             6  
                                              aten::sub         2.55%      37.851us         4.17%      61.831us      10.305us      12.352us         7.21%      12.352us       2.059us             6  
                                Activity Buffer Request        16.56%     245.575us        16.56%     245.575us     245.575us       2.816us         1.64%       2.816us       2.816us             1  
                                    aten::empty_strided         2.00%      29.651us         2.00%      29.651us       4.942us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        16.13%     239.165us        16.13%     239.165us      39.861us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.82%      71.554us         6.20%      91.934us       3.831us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.37%      20.380us         1.37%      20.380us       0.849us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.82%     234.550us        15.82%     234.550us       4.886us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.32%       4.730us         0.32%       4.730us       4.730us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.483ms
Self CUDA time total: 171.419us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     977.978us       346.82%     977.978us     977.978us             1  
                                            torch_eager        11.12%     340.956us        99.84%       3.061ms       3.061ms       0.000us         0.00%     300.126us     300.126us             1  
                                              aten::mul         4.97%     152.432us         8.78%     269.242us      11.218us     132.256us        46.90%     132.256us       5.511us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     132.256us        46.90%     132.256us       5.511us            24  
                                            aten::copy_         3.32%     101.920us        71.12%       2.181ms     121.149us     108.702us        38.55%     126.846us       7.047us            18  
                                            aten::clone         0.96%      29.312us        68.68%       2.106ms     350.996us       0.000us         0.00%      69.855us      11.642us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      56.991us        20.21%      56.991us       4.749us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      51.711us        18.34%      51.711us       8.618us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.024us        14.55%      41.024us       3.419us            12  
                                              aten::sub         1.25%      38.245us         2.06%      63.315us      10.553us      20.608us         7.31%      20.608us       3.435us             6  
                                              aten::add         1.02%      31.345us         1.82%      55.786us       9.298us      20.416us         7.24%      20.416us       3.403us             6  
                                Activity Buffer Request        57.69%       1.769ms        57.69%       1.769ms       1.769ms      18.144us         6.43%      18.144us      18.144us             1  
                                    aten::empty_strided         1.06%      32.360us         1.06%      32.360us       5.393us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         7.88%     241.465us         7.88%     241.465us      40.244us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.31%      70.749us         2.93%      89.730us       3.739us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.62%      18.981us         0.62%      18.981us       0.791us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.66%     234.833us         7.66%     234.833us       4.892us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.16%       4.769us         0.16%       4.769us       4.769us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 3.066ms
Self CUDA time total: 281.982us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     965.746us       171.28%     965.746us     965.746us             1  
                                            torch_eager        20.15%     293.418us        99.65%       1.451ms       1.451ms       0.000us         0.00%     587.545us     587.545us             1  
                                            aten::copy_         6.98%     101.683us        42.57%     619.773us      34.432us     272.605us        48.35%     296.317us      16.462us            18  
                                              aten::mul        10.89%     158.509us        19.23%     280.051us      11.669us     225.082us        39.92%     225.082us       9.378us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     225.082us        39.92%     225.082us       9.378us            24  
                                            aten::clone         1.41%      20.520us        36.78%     535.511us      89.252us       0.000us         0.00%     206.046us      34.341us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     182.334us        32.34%     182.334us      30.389us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.271us        16.01%      90.271us       7.523us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      66.146us        11.73%      66.146us       5.512us            12  
                                              aten::sub         2.75%      40.021us         4.44%      64.623us      10.771us      33.857us         6.00%      33.857us       5.643us             6  
                                              aten::add         2.25%      32.703us         3.82%      55.604us       9.267us      32.289us         5.73%      32.289us       5.381us             6  
                                Activity Buffer Request        15.08%     219.615us        15.08%     219.615us     219.615us      23.712us         4.21%      23.712us      23.712us             1  
                                    aten::empty_strided         2.09%      30.380us         2.09%      30.380us       5.063us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        16.00%     233.025us        16.00%     233.025us      38.838us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.65%      67.660us         5.95%      86.582us       3.608us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.30%      18.922us         1.30%      18.922us       0.788us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.11%     234.495us        16.11%     234.495us       4.885us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       5.030us         0.35%       5.030us       5.030us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.456ms
Self CUDA time total: 563.833us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     970.000us      1049.25%     970.000us     970.000us             1  
                                            torch_eager        21.04%     308.715us        99.66%       1.462ms       1.462ms       0.000us         0.00%      93.567us      93.567us             1  
                                              aten::mul        10.47%     153.593us        18.60%     272.905us      11.371us      49.631us        53.69%      49.631us       2.068us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.631us        53.69%      49.631us       2.068us            24  
                                            aten::copy_         7.22%     105.943us        42.60%     624.955us      34.720us      29.345us        31.74%      30.465us       1.693us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.592us        24.44%      22.592us       1.883us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.471us        14.57%      13.471us       1.123us            12  
                                            aten::clone         1.54%      22.631us        36.99%     542.672us      90.445us       0.000us         0.00%       7.873us       1.312us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.753us         7.30%       6.753us       1.126us             6  
                                              aten::sub         2.47%      36.281us         4.16%      61.001us      10.167us       6.751us         7.30%       6.751us       1.125us             6  
                                              aten::add         2.12%      31.122us         3.62%      53.173us       8.862us       6.720us         7.27%       6.720us       1.120us             6  
                                Activity Buffer Request        15.54%     227.975us        15.54%     227.975us     227.975us       1.120us         1.21%       1.120us       1.120us             1  
                                    aten::empty_strided         2.05%      30.140us         2.05%      30.140us       5.023us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        15.29%     224.265us        15.29%     224.265us      37.378us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.74%      69.541us         6.04%      88.642us       3.693us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.30%      19.101us         1.30%      19.101us       0.796us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.87%     232.855us        15.87%     232.855us       4.851us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.34%       4.941us         0.34%       4.941us       4.941us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.467ms
Self CUDA time total: 92.447us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     961.400us       998.83%     961.400us     961.400us             1  
                                            torch_eager        11.86%     316.997us        99.82%       2.667ms       2.667ms       0.000us         0.00%      97.565us      97.565us             1  
                                              aten::mul         5.68%     151.840us        10.03%     267.904us      11.163us      51.071us        53.06%      51.071us       2.128us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.071us        53.06%      51.071us       2.128us            24  
                                            aten::copy_         3.83%     102.366us        67.99%       1.817ms     100.926us      30.911us        32.11%      32.223us       1.790us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.040us        23.94%      23.040us       1.920us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.271us        14.83%      14.271us       1.189us            12  
                                            aten::clone         1.08%      28.789us        65.14%       1.741ms     290.113us       0.000us         0.00%       9.183us       1.530us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.871us         8.18%       7.871us       1.312us             6  
                                              aten::add         1.17%      31.182us         2.13%      57.023us       9.504us       7.136us         7.41%       7.136us       1.189us             6  
                                              aten::sub         1.39%      37.021us         2.28%      60.881us      10.147us       7.135us         7.41%       7.135us       1.189us             6  
                                Activity Buffer Request        53.27%       1.423ms        53.27%       1.423ms       1.423ms       1.312us         1.36%       1.312us       1.312us             1  
                                    aten::empty_strided         1.17%      31.390us         1.17%      31.390us       5.232us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         8.40%     224.384us         8.40%     224.384us      37.397us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.55%      68.065us         3.28%      87.603us       3.650us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.73%      19.538us         0.73%      19.538us       0.814us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.69%     232.215us         8.69%     232.215us       4.838us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       4.791us         0.18%       4.791us       4.791us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.672ms
Self CUDA time total: 96.253us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     938.779us       906.86%     938.779us     938.779us             1  
                                            torch_eager        20.95%     294.336us        99.65%       1.400ms       1.400ms       0.000us         0.00%     104.832us     104.832us             1  
                                              aten::mul        10.92%     153.493us        19.14%     268.855us      11.202us      55.265us        53.39%      55.265us       2.303us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.265us        53.39%      55.265us       2.303us            24  
                                            aten::copy_         7.29%     102.391us        41.81%     587.481us      32.638us      32.287us        31.19%      33.599us       1.867us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.544us        23.71%      24.544us       2.045us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.968us        15.43%      15.968us       1.331us            12  
                                            aten::clone         1.48%      20.724us        35.96%     505.273us      84.212us       0.000us         0.00%       9.055us       1.509us             6  
                                              aten::sub         2.56%      35.921us         4.48%      63.011us      10.502us       8.000us         7.73%       8.000us       1.333us             6  
                                              aten::add         2.24%      31.440us         3.80%      53.431us       8.905us       7.968us         7.70%       7.968us       1.328us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.743us         7.48%       7.743us       1.290us             6  
                                Activity Buffer Request        14.48%     203.474us        14.48%     203.474us     203.474us       1.312us         1.27%       1.312us       1.312us             1  
                                    aten::empty_strided         2.08%      29.281us         2.08%      29.281us       4.880us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        15.64%     219.755us        15.64%     219.755us      36.626us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.61%      64.735us         5.90%      82.941us       3.456us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.30%      18.206us         1.30%      18.206us       0.759us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.11%     226.304us        16.11%     226.304us       4.715us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       4.920us         0.35%       4.920us       4.920us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.405ms
Self CUDA time total: 103.520us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     946.116us       766.76%     946.116us     946.116us             1  
                                            torch_eager        20.39%     290.555us        99.66%       1.420ms       1.420ms       0.000us         0.00%     125.184us     125.184us             1  
                                              aten::mul        10.89%     155.196us        19.03%     271.116us      11.296us      64.930us        52.62%      64.930us       2.705us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      64.930us        52.62%      64.930us       2.705us            24  
                                            aten::copy_         7.20%     102.573us        42.57%     606.535us      33.696us      39.295us        31.85%      41.087us       2.283us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.928us        23.44%      28.928us       2.411us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.167us        15.53%      19.167us       1.597us            12  
                                            aten::clone         1.46%      20.780us        36.49%     519.930us      86.655us       0.000us         0.00%      12.159us       2.026us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.367us         8.40%      10.367us       1.728us             6  
                                              aten::add         2.22%      31.661us         3.85%      54.881us       9.147us       9.632us         7.81%       9.632us       1.605us             6  
                                              aten::sub         2.54%      36.222us         4.30%      61.232us      10.205us       9.535us         7.73%       9.535us       1.589us             6  
                                Activity Buffer Request        15.30%     218.045us        15.30%     218.045us     218.045us       1.792us         1.45%       1.792us       1.792us             1  
                                    aten::empty_strided         2.05%      29.230us         2.05%      29.230us       4.872us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        15.35%     218.676us        15.35%     218.676us      36.446us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.69%      66.771us         6.02%      85.802us       3.575us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.34%      19.031us         1.34%      19.031us       0.793us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.24%     231.391us        16.24%     231.391us       4.821us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.34%       4.790us         0.34%       4.790us       4.790us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.425ms
Self CUDA time total: 123.392us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     984.379us       951.82%     984.379us     984.379us             1  
                                            torch_eager        12.07%     328.136us        99.82%       2.714ms       2.714ms       0.000us         0.00%     104.765us     104.765us             1  
                                              aten::mul         5.81%     158.021us        10.21%     277.512us      11.563us      55.167us        53.34%      55.167us       2.299us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.167us        53.34%      55.167us       2.299us            24  
                                            aten::copy_         3.85%     104.771us        67.79%       1.843ms     102.400us      32.352us        31.28%      33.696us       1.872us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.576us        23.76%      24.576us       2.048us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.902us        15.38%      15.902us       1.325us            12  
                                            aten::clone         1.05%      28.482us        64.81%       1.762ms     293.686us       0.000us         0.00%       9.120us       1.520us             6  
                                              aten::add         1.18%      32.072us         2.05%      55.622us       9.270us       7.966us         7.70%       7.966us       1.328us             6  
                                              aten::sub         1.34%      36.429us         2.30%      62.454us      10.409us       7.936us         7.67%       7.936us       1.323us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.776us         7.52%       7.776us       1.296us             6  
                                Activity Buffer Request        53.37%       1.451ms        53.37%       1.451ms       1.451ms       1.344us         1.30%       1.344us       1.344us             1  
                                    aten::empty_strided         1.13%      30.791us         1.13%      30.791us       5.132us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         8.01%     217.895us         8.01%     217.895us      36.316us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.53%      68.900us         3.23%      87.945us       3.664us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.70%      19.045us         0.70%      19.045us       0.794us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.78%     238.656us         8.78%     238.656us       4.972us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       4.760us         0.18%       4.760us       4.760us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.719ms
Self CUDA time total: 103.421us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     975.859us       788.60%     975.859us     975.859us             1  
                                            torch_eager        11.99%     325.892us        99.81%       2.713ms       2.713ms       0.000us         0.00%     125.537us     125.537us             1  
                                              aten::mul         5.63%     152.991us        10.00%     271.842us      11.327us      65.056us        52.57%      65.056us       2.711us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.056us        52.57%      65.056us       2.711us            24  
                                            aten::copy_         3.75%     101.941us        67.93%       1.846ms     102.570us      39.393us        31.83%      41.185us       2.288us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.961us        23.40%      28.961us       2.413us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.296us        15.59%      19.296us       1.608us            12  
                                            aten::clone         1.11%      30.152us        65.06%       1.768ms     294.730us       0.000us         0.00%      12.224us       2.037us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.432us         8.43%      10.432us       1.739us             6  
                                              aten::sub         1.31%      35.640us         2.30%      62.642us      10.440us       9.696us         7.84%       9.696us       1.616us             6  
                                              aten::add         1.19%      32.290us         2.11%      57.400us       9.567us       9.600us         7.76%       9.600us       1.600us             6  
                                Activity Buffer Request        53.71%       1.460ms        53.71%       1.460ms       1.460ms       1.792us         1.45%       1.792us       1.792us             1  
                                    aten::empty_strided         1.14%      30.851us         1.14%      30.851us       5.142us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         7.91%     214.935us         7.91%     214.935us      35.822us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.54%      69.161us         3.23%      87.912us       3.663us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.69%      18.751us         0.69%      18.751us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.85%     240.634us         8.85%     240.634us       5.013us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.19%       5.031us         0.19%       5.031us       5.031us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.718ms
Self CUDA time total: 123.745us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     974.260us       552.15%     974.260us     974.260us             1  
                                            torch_eager        19.78%     293.688us        99.66%       1.480ms       1.480ms       0.000us         0.00%     179.361us     179.361us             1  
                                              aten::mul        10.71%     158.995us        19.78%     293.648us      12.235us      94.434us        53.52%      94.434us       3.935us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.434us        53.52%      94.434us       3.935us            24  
                                            aten::copy_         6.79%     100.834us        42.92%     637.126us      35.396us      57.375us        32.52%      60.287us       3.349us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.416us        22.91%      40.416us       3.368us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.640us        13.96%      24.640us       2.053us            12  
                                            aten::clone         1.76%      26.199us        37.53%     557.122us      92.854us       0.000us         0.00%      19.871us       3.312us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.959us         9.61%      16.959us       2.826us             6  
                                              aten::sub         2.42%      35.930us         4.08%      60.590us      10.098us      12.320us         6.98%      12.320us       2.053us             6  
                                              aten::add         2.11%      31.302us         3.66%      54.401us       9.067us      12.320us         6.98%      12.320us       2.053us             6  
                                Activity Buffer Request        16.99%     252.166us        16.99%     252.166us     252.166us       2.912us         1.65%       2.912us       2.912us             1  
                                    aten::empty_strided         2.00%      29.691us         2.00%      29.691us       4.948us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        14.50%     215.285us        14.50%     215.285us      35.881us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.45%      66.098us         5.67%      84.159us       3.507us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.22%      18.061us         1.22%      18.061us       0.753us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.92%     251.253us        16.92%     251.253us       5.234us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.34%       5.020us         0.34%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.485ms
Self CUDA time total: 176.449us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     953.139us       322.89%     953.139us     953.139us             1  
                                            torch_eager        20.45%     288.223us        99.65%       1.404ms       1.404ms       0.000us         0.00%     312.341us     312.341us             1  
                                              aten::mul        10.90%     153.585us        19.25%     271.218us      11.301us     144.345us        48.90%     144.345us       6.014us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     144.345us        48.90%     144.345us       6.014us            24  
                                            aten::copy_         7.18%     101.222us        41.60%     586.173us      32.565us     110.174us        37.32%     127.326us       7.074us            18  
                                            aten::clone         1.48%      20.790us        35.41%     498.991us      83.165us       0.000us         0.00%      70.207us      11.701us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.119us        19.35%      57.119us       4.760us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.055us        17.97%      53.055us       8.843us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.670us        13.78%      40.670us       3.389us            12  
                                              aten::sub         2.69%      37.950us         4.51%      63.611us      10.602us      20.448us         6.93%      20.448us       3.408us             6  
                                              aten::add         2.21%      31.201us         3.90%      54.891us       9.149us      20.222us         6.85%      20.222us       3.370us             6  
                                Activity Buffer Request        14.48%     203.984us        14.48%     203.984us     203.984us      17.152us         5.81%      17.152us      17.152us             1  
                                    aten::empty_strided         2.20%      31.071us         2.20%      31.071us       5.179us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        15.00%     211.404us        15.00%     211.404us      35.234us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.92%      69.344us         6.26%      88.243us       3.677us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.34%      18.899us         1.34%      18.899us       0.787us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.79%     236.547us        16.79%     236.547us       4.928us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       4.990us         0.35%       4.990us       4.990us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.409ms
Self CUDA time total: 295.189us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     955.061us       540.29%     955.061us     955.061us             1  
                                            torch_eager        20.13%     285.326us        99.66%       1.412ms       1.412ms       0.000us         0.00%     179.647us     179.647us             1  
                                              aten::mul        11.53%     163.362us        19.81%     280.694us      11.696us      94.558us        53.49%      94.558us       3.940us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.558us        53.49%      94.558us       3.940us            24  
                                            aten::copy_         7.22%     102.272us        41.72%     591.162us      32.842us      57.633us        32.60%      60.513us       3.362us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.610us        22.97%      40.610us       3.384us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.576us        13.90%      24.576us       2.048us            12  
                                            aten::clone         1.52%      21.581us        35.73%     506.321us      84.387us       0.000us         0.00%      19.903us       3.317us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.023us         9.63%      17.023us       2.837us             6  
                                              aten::add         2.20%      31.170us         3.74%      52.991us       8.832us      12.352us         6.99%      12.352us       2.059us             6  
                                              aten::sub         2.66%      37.720us         4.39%      62.161us      10.360us      12.224us         6.92%      12.224us       2.037us             6  
                                Activity Buffer Request        14.91%     211.305us        14.91%     211.305us     211.305us       2.880us         1.63%       2.880us       2.880us             1  
                                    aten::empty_strided         2.11%      29.970us         2.11%      29.970us       4.995us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        14.80%     209.714us        14.80%     209.714us      34.952us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.81%      68.154us         6.24%      88.396us       3.683us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.43%      20.242us         1.43%      20.242us       0.843us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.33%     231.465us        16.33%     231.465us       4.822us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.34%       4.860us         0.34%       4.860us       4.860us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.417ms
Self CUDA time total: 176.767us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     947.102us       319.90%     947.102us     947.102us             1  
                                            torch_eager        20.76%     285.746us        99.63%       1.371ms       1.371ms       0.000us         0.00%     313.885us     313.885us             1  
                                              aten::mul        11.26%     155.004us        19.90%     273.893us      11.412us     144.735us        48.89%     144.735us       6.031us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     144.735us        48.89%     144.735us       6.031us            24  
                                            aten::copy_         7.73%     106.340us        40.54%     558.012us      31.001us     110.624us        37.37%     128.447us       7.136us            18  
                                            aten::clone         1.60%      22.060us        34.25%     471.499us      78.583us       0.000us         0.00%      71.454us      11.909us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      56.993us        19.25%      56.993us       4.749us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.631us        18.11%      53.631us       8.939us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.703us        13.75%      40.703us       3.392us            12  
                                              aten::sub         2.65%      36.432us         4.41%      60.743us      10.124us      20.447us         6.91%      20.447us       3.408us             6  
                                              aten::add         2.33%      32.010us         4.20%      57.842us       9.640us      20.256us         6.84%      20.256us       3.376us             6  
                                Activity Buffer Request        13.03%     179.384us        13.03%     179.384us     179.384us      17.823us         6.02%      17.823us      17.823us             1  
                                    aten::empty_strided         2.15%      29.560us         2.15%      29.560us       4.927us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        14.91%     205.294us        14.91%     205.294us      34.216us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.76%      65.565us         6.07%      83.544us       3.481us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.31%      17.979us         1.31%      17.979us       0.749us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        17.15%     236.026us        17.15%     236.026us       4.917us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.37%       5.100us         0.37%       5.100us       5.100us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.377ms
Self CUDA time total: 296.062us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.189ms       203.30%       1.189ms       1.189ms             1  
                                            torch_eager        20.67%     348.290us        99.66%       1.679ms       1.679ms       0.000us         0.00%     608.543us     608.543us             1  
                                            aten::copy_         7.05%     118.714us        39.22%     660.725us      36.707us     268.894us        45.98%     292.638us      16.258us            18  
                                              aten::mul        12.17%     204.984us        20.89%     351.955us      14.665us     249.922us        42.74%     249.922us      10.413us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     249.922us        42.74%     249.922us      10.413us            24  
                                            aten::clone         1.51%      25.362us        33.77%     568.912us      94.819us       0.000us         0.00%     201.823us      33.637us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     178.079us        30.45%     178.079us      29.680us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.815us        15.53%      90.815us       7.568us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.983us        11.28%      65.983us       5.499us            12  
                                              aten::sub         2.59%      43.649us         4.45%      75.001us      12.500us      33.056us         5.65%      33.056us       5.509us             6  
                                              aten::add         2.76%      46.482us         4.73%      79.603us      13.267us      32.927us         5.63%      32.927us       5.488us             6  
                                Activity Buffer Request        13.27%     223.575us        13.27%     223.575us     223.575us      23.744us         4.06%      23.744us      23.744us             1  
                                    aten::empty_strided         2.16%      36.470us         2.16%      36.470us       6.078us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        14.48%     243.975us        14.48%     243.975us      40.662us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.76%      80.109us         6.03%     101.610us       4.234us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.28%      21.501us         1.28%      21.501us       0.896us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.97%     285.905us        16.97%     285.905us       5.956us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.34%       5.680us         0.34%       5.680us       5.680us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.685ms
Self CUDA time total: 584.799us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         9.32%     354.797us        80.27%       3.054ms       3.054ms       0.000us         0.00%       1.838ms       1.838ms             1  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.810ms       102.14%       1.810ms       1.810ms             1  
                                            aten::copy_         2.75%     104.734us        56.42%       2.147ms     119.270us     795.642us        44.90%     861.818us      47.879us            18  
                                              aten::mul         4.14%     157.684us         7.25%     275.917us      11.497us     828.220us        46.73%     828.220us      34.509us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     828.220us        46.73%     828.220us      34.509us            24  
                                            aten::clone         0.75%      28.679us        54.47%       2.072ms     345.404us       0.000us         0.00%     628.732us     104.789us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     562.556us        31.74%     562.556us      93.759us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     233.086us        13.15%     233.086us      19.424us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     148.320us         8.37%     148.320us      12.360us            12  
                                              aten::sub         1.03%      39.321us         1.69%      64.121us      10.687us      89.920us         5.07%      89.920us      14.987us             6  
                                Activity Buffer Request        46.12%       1.755ms        46.12%       1.755ms       1.755ms      66.176us         3.73%      66.176us      66.176us             1  
                                              aten::add         0.96%      36.600us         1.59%      60.490us      10.082us      58.400us         3.30%      58.400us       9.733us             6  
                                    aten::empty_strided         0.88%      33.672us         0.88%      33.672us       5.612us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         5.82%     221.424us         5.82%     221.424us      36.904us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         1.87%      71.004us         2.35%      89.583us       3.733us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.49%      18.579us         0.49%      18.579us       0.774us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         6.12%     232.984us         6.12%     232.984us       4.854us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize        19.73%     750.696us        19.73%     750.696us     750.696us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 3.805ms
Self CUDA time total: 1.772ms


impl                     wl                  p50(ms)  ok
torch_eager              cuda_B1_S128_H32_D128_R64     0.22  True
torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
torch_eager              cuda_B1_S128_H8_D64_R32     0.18  True
torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
torch_eager              cuda_B1_S2048_H32_D64_R32     0.22  True
torch_eager              cuda_B1_S2048_H8_D128_R64     0.22  True
torch_eager              cuda_B1_S2048_H8_D64_R32     0.22  True
torch_eager              cuda_B1_S512_H32_D128_R64     0.22  True
torch_eager              cuda_B1_S512_H32_D64_R32     0.22  True
torch_eager              cuda_B1_S512_H8_D128_R64     0.22  True
torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
torch_eager              cuda_B2_S128_H32_D128_R64     0.22  True
torch_eager              cuda_B2_S128_H32_D64_R32     0.22  True
torch_eager              cuda_B2_S128_H8_D128_R64     0.22  True
torch_eager              cuda_B2_S128_H8_D64_R32     0.22  True
torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
torch_eager              cuda_B2_S2048_H8_D64_R32     0.22  True
torch_eager              cuda_B2_S512_H32_D128_R64     0.22  True
torch_eager              cuda_B2_S512_H32_D64_R32     0.22  True
torch_eager              cuda_B2_S512_H8_D128_R64     0.22  True
torch_eager              cuda_B2_S512_H8_D64_R32     0.22  True

Artifacts:

rotary.jsonl