PyTorch Native - Rotary Position Embeddings

GPU Info

▼ code ▼ output ▶ uv-logs | Cell: nv | 0.23s | Raw GitHub
import subprocess
# Run `nvidia-smi`, capturing its output as text, then echo the captured
# stdout so the GPU report is recorded in the cell output.
smi_result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
print(smi_result.stdout)
Wed Oct 29 15:50:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.195.03             Driver Version: 570.195.03     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L40S                    On  |   00000000:4D:00.0 Off |                    0 |
| N/A   29C    P0             88W /  350W |       0MiB /  46068MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

Rotary Embeddings Benchmark (PyTorch Native)

▼ code ▼ output ▶ uv-logs | Cell: benchmark | 7.50s | Raw GitHub
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch==2.8.0",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
# ///
import torch
import sys
from kernels_benchmark_tools import KernelTypeEnum, run_benchmark


def apply_rotary_torch(x1, x2, cos, sin, conj=False):
    """Rotate the pair ``(x1, x2)`` by the angle described by ``cos``/``sin``.

    Args:
        x1: First component of the pair to rotate.
        x2: Second component of the pair to rotate.
        cos: Cosine factor multiplied element-wise with the components.
        sin: Sine factor multiplied element-wise with the components.
        conj: When truthy, apply the conjugate rotation (the sign of the
            ``sin`` terms is flipped).

    Returns:
        A ``(out1, out2)`` tuple holding the rotated components.
    """
    if conj:
        # Conjugate rotation: the sine terms change sign.
        return x1 * cos + x2 * sin, -x1 * sin + x2 * cos

    # Forward rotation.
    return x1 * cos - x2 * sin, x1 * sin + x2 * cos


def torch_rotary(query, key, cos, sin, conj=False):
    """Apply the rotary transform to copies of ``query`` and ``key``.

    The rotary width is taken from the last dimension of ``cos``. Along the
    last axis of each tensor, the first ``rotary_dim`` entries and the next
    ``rotary_dim`` entries are rotated as a pair via ``apply_rotary_torch``;
    any entries beyond ``2 * rotary_dim`` keep their cloned values.

    Args:
        query: Query tensor; it is cloned, not modified.
        key: Key tensor; it is cloned, not modified.
        cos: Cosine table; its last dimension defines ``rotary_dim``.
        sin: Sine table used together with ``cos``.
        conj: Forwarded to ``apply_rotary_torch`` to select the conjugate
            rotation.

    Returns:
        A ``(q_out, k_out)`` tuple of rotated copies.
    """
    width = cos.shape[-1]
    first_half = slice(None, width)
    second_half = slice(width, 2 * width)

    # Work on copies so the caller's tensors are left untouched.
    q_out, k_out = query.clone(), key.clone()

    # Query first, then key: rotate both halves and write them back in place.
    for target in (q_out, k_out):
        rotated_a, rotated_b = apply_rotary_torch(
            target[..., first_half], target[..., second_half], cos, sin, conj
        )
        target[..., first_half] = rotated_a
        target[..., second_half] = rotated_b

    return q_out, k_out


# Hand the eager PyTorch rotary implementation (`torch_rotary`) to the shared
# benchmark harness under the name "torch_eager".
eager_tags = {"family": "pytorch", "backend": "eager"}
run_benchmark(
    impl_func=torch_rotary,
    impl_name="torch_eager",
    impl_tags=eager_tags,
    kernel_type=KernelTypeEnum.ROTARY,
)
Running rotary benchmark on cuda with 24 workloads.

======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.033ms      1157.58%       1.033ms       1.033ms             1  
                                            torch_eager        14.26%     386.998us        99.70%       2.705ms       2.705ms       0.000us         0.00%      90.431us      90.431us             1  
                                              aten::mul         6.08%     164.867us        10.45%     283.577us      11.816us      46.976us        52.65%      46.976us       1.957us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      46.976us        52.65%      46.976us       1.957us            24  
                                            aten::copy_         3.96%     107.533us        62.14%       1.686ms      93.665us      28.959us        32.46%      30.175us       1.676us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.303us        25.00%      22.303us       1.859us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.280us        14.89%      13.280us       1.107us            12  
                                            aten::clone         1.58%      42.971us        61.19%       1.660ms     276.703us       0.000us         0.00%       7.872us       1.312us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.656us         7.46%       6.656us       1.109us             6  
                                              aten::sub         1.73%      46.871us         2.69%      72.911us      12.152us       6.656us         7.46%       6.656us       1.109us             6  
                                              aten::add         1.35%      36.531us         2.16%      58.672us       9.779us       6.624us         7.42%       6.624us       1.104us             6  
                                Activity Buffer Request        53.14%       1.442ms        53.14%       1.442ms       1.442ms       1.216us         1.36%       1.216us       1.216us             1  
                                    aten::empty_strided         2.28%      61.772us         2.28%      61.772us      10.295us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.73%      74.144us         2.73%      74.144us      12.357us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         3.20%      86.920us         4.13%     112.081us       4.670us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.93%      25.161us         0.93%      25.161us       1.048us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.45%     229.371us         8.45%     229.371us       4.779us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.30%       8.270us         0.30%       8.270us       8.270us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.713ms
Self CUDA time total: 89.215us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     920.346us      1019.88%     920.346us     920.346us             1  
                                            torch_eager        11.67%     287.669us        99.75%       2.459ms       2.459ms       0.000us         0.00%      91.392us      91.392us             1  
                                              aten::mul         5.97%     147.150us        10.47%     258.131us      10.755us      47.681us        52.84%      47.681us       1.987us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      47.681us        52.84%      47.681us       1.987us            24  
                                            aten::copy_         4.01%      98.743us        66.94%       1.650ms      91.665us      29.184us        32.34%      30.335us       1.685us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.433us        24.86%      22.433us       1.869us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.376us        14.82%      13.376us       1.115us            12  
                                            aten::clone         0.96%      23.772us        64.13%       1.581ms     263.446us       0.000us         0.00%       7.902us       1.317us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.751us         7.48%       6.751us       1.125us             6  
                                              aten::sub         1.51%      37.314us         2.51%      61.954us      10.326us       6.720us         7.45%       6.720us       1.120us             6  
                                              aten::add         1.33%      32.821us         2.21%      54.451us       9.075us       6.656us         7.38%       6.656us       1.109us             6  
                                Activity Buffer Request        58.20%       1.434ms        58.20%       1.434ms       1.434ms       1.151us         1.28%       1.151us       1.151us             1  
                                    aten::empty_strided         1.33%      32.830us         1.33%      32.830us       5.472us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.21%      54.420us         2.21%      54.420us       9.070us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.84%      69.900us         3.65%      89.853us       3.744us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.81%      19.953us         0.81%      19.953us       0.831us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.92%     219.731us         8.92%     219.731us       4.578us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.25%       6.050us         0.25%       6.050us       6.050us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.465ms
Self CUDA time total: 90.241us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.568us       966.47%     909.568us     909.568us             1  
                                            torch_eager        11.23%     276.876us        99.79%       2.460ms       2.460ms       0.000us         0.00%      95.424us      95.424us             1  
                                              aten::mul         6.27%     154.461us        10.66%     262.794us      10.950us      48.800us        51.85%      48.800us       2.033us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      48.800us        51.85%      48.800us       2.033us            24  
                                            aten::copy_         4.02%      99.094us        67.67%       1.668ms      92.677us      30.912us        32.85%      32.224us       1.790us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      23.008us        24.45%      23.008us       1.917us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.400us        15.30%      14.400us       1.200us            12  
                                            aten::clone         0.93%      22.950us        64.64%       1.593ms     265.583us       0.000us         0.00%       9.216us       1.536us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.904us         8.40%       7.904us       1.317us             6  
                                              aten::sub         1.56%      38.564us         2.52%      62.034us      10.339us       7.200us         7.65%       7.200us       1.200us             6  
                                              aten::add         1.24%      30.660us         2.12%      52.250us       8.708us       7.200us         7.65%       7.200us       1.200us             6  
                                Activity Buffer Request        58.87%       1.451ms        58.87%       1.451ms       1.451ms       1.312us         1.39%       1.312us       1.312us             1  
                                    aten::empty_strided         1.24%      30.531us         1.24%      30.531us       5.089us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         2.20%      54.240us         2.20%      54.240us       9.040us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.65%      65.401us         3.42%      84.323us       3.513us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.77%      18.922us         0.77%      18.922us       0.788us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.80%     216.993us         8.80%     216.993us       4.521us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.21%       5.190us         0.21%       5.190us       5.190us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.465ms
Self CUDA time total: 94.112us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     892.572us       880.74%     892.572us     892.572us             1  
                                            torch_eager        11.35%     283.366us        99.78%       2.492ms       2.492ms       0.000us         0.00%     102.687us     102.687us             1  
                                              aten::mul         5.93%     148.202us        10.19%     254.513us      10.605us      52.956us        52.25%      52.956us       2.207us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.956us        52.25%      52.956us       2.207us            24  
                                            aten::copy_         3.94%      98.395us        68.27%       1.705ms      94.725us      32.482us        32.05%      33.826us       1.879us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.641us        24.31%      24.641us       2.053us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.905us        15.69%      15.905us       1.325us            12  
                                            aten::clone         0.86%      21.380us        65.50%       1.636ms     272.651us       0.000us         0.00%       9.185us       1.531us             6  
                                              aten::add         1.24%      31.000us         2.12%      53.041us       8.840us       8.032us         7.93%       8.032us       1.339us             6  
                                              aten::sub         1.40%      35.052us         2.32%      58.022us       9.670us       7.873us         7.77%       7.873us       1.312us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.841us         7.74%       7.841us       1.307us             6  
                                Activity Buffer Request        52.43%       1.309ms        52.43%       1.309ms       1.309ms       1.344us         1.33%       1.344us       1.344us             1  
                                    aten::empty_strided         1.32%      33.071us         1.32%      33.071us       5.512us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.52%     237.764us         9.52%     237.764us      39.627us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.60%      64.825us         3.35%      83.624us       3.484us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.75%      18.799us         0.75%      18.799us       0.783us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.44%     210.793us         8.44%     210.793us       4.392us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.22%       5.611us         0.22%       5.611us       5.611us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.498ms
Self CUDA time total: 101.343us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     907.478us       966.25%     907.478us     907.478us             1  
                                            torch_eager        11.02%     305.318us        99.81%       2.765ms       2.765ms       0.000us         0.00%      95.230us      95.230us             1  
                                              aten::mul         5.24%     145.172us         9.20%     254.787us      10.616us      49.023us        52.20%      49.023us       2.043us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.023us        52.20%      49.023us       2.043us            24  
                                            aten::copy_         3.74%     103.536us        70.23%       1.945ms     108.067us      30.719us        32.71%      32.031us       1.779us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.912us        24.40%      22.912us       1.909us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.176us        15.09%      14.176us       1.181us            12  
                                            aten::clone         1.09%      30.110us        67.87%       1.880ms     313.329us       0.000us         0.00%       9.119us       1.520us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.807us         8.31%       7.807us       1.301us             6  
                                              aten::sub         1.24%      34.480us         2.10%      58.270us       9.712us       7.104us         7.56%       7.104us       1.184us             6  
                                              aten::add         1.09%      30.091us         1.87%      51.880us       8.647us       7.072us         7.53%       7.072us       1.179us             6  
                                Activity Buffer Request        52.12%       1.444ms        52.12%       1.444ms       1.444ms       1.312us         1.40%       1.312us       1.312us             1  
                                    aten::empty_strided         1.13%      31.430us         1.13%      31.430us       5.238us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        12.15%     336.439us        12.15%     336.439us      56.073us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.48%      68.768us         3.17%      87.719us       3.655us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.68%      18.951us         0.68%      18.951us       0.790us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.82%     216.674us         7.82%     216.674us       4.514us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.19%       5.210us         0.19%       5.210us       5.210us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.770ms
Self CUDA time total: 93.918us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     917.786us       906.49%     917.786us     917.786us             1  
                                            torch_eager        10.59%     290.695us        99.81%       2.741ms       2.741ms       0.000us         0.00%     102.558us     102.558us             1  
                                              aten::mul         5.39%     148.136us         9.30%     255.477us      10.645us      52.735us        52.09%      52.735us       2.197us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      52.735us        52.09%      52.735us       2.197us            24  
                                            aten::copy_         4.15%     114.085us        70.69%       1.941ms     107.839us      32.512us        32.11%      33.824us       1.879us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.704us        24.40%      24.704us       2.059us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.999us        15.80%      15.999us       1.333us            12  
                                            aten::clone         0.78%      21.500us        67.65%       1.858ms     309.627us       0.000us         0.00%       9.120us       1.520us             6  
                                              aten::sub         1.39%      38.270us         2.26%      62.070us      10.345us       8.063us         7.96%       8.063us       1.344us             6  
                                              aten::add         1.13%      31.111us         1.93%      52.881us       8.813us       7.936us         7.84%       7.936us       1.323us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.71%       7.808us       1.301us             6  
                                Activity Buffer Request        52.71%       1.447ms        52.71%       1.447ms       1.447ms       1.312us         1.30%       1.312us       1.312us             1  
                                    aten::empty_strided         1.19%      32.762us         1.19%      32.762us       5.460us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        11.56%     317.516us        11.56%     317.516us      52.919us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.38%      65.270us         3.07%      84.260us       3.511us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.69%      18.990us         0.69%      18.990us       0.791us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.83%     214.935us         7.83%     214.935us       4.478us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.19%       5.200us         0.19%       5.200us       5.200us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.746ms
Self CUDA time total: 101.246us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     896.601us       744.17%     896.601us     896.601us             1  
                                            torch_eager        10.66%     286.835us        99.81%       2.687ms       2.687ms       0.000us         0.00%     122.275us     122.275us             1  
                                              aten::mul         5.47%     147.118us         9.41%     253.291us      10.554us      61.985us        51.45%      61.985us       2.583us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      61.985us        51.45%      61.985us       2.583us            24  
                                            aten::copy_         3.72%     100.260us        70.38%       1.894ms     105.246us      39.265us        32.59%      41.057us       2.281us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.834us        23.93%      28.834us       2.403us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.233us        15.96%      19.233us       1.603us            12  
                                            aten::clone         0.83%      22.211us        67.89%       1.827ms     304.542us       0.000us         0.00%      12.223us       2.037us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.431us         8.66%      10.431us       1.738us             6  
                                              aten::add         1.14%      30.799us         1.94%      52.140us       8.690us       9.632us         7.99%       9.632us       1.605us             6  
                                              aten::sub         1.37%      36.770us         2.23%      59.970us       9.995us       9.601us         7.97%       9.601us       1.600us             6  
                                Activity Buffer Request        53.18%       1.431ms        53.18%       1.431ms       1.431ms       1.792us         1.49%       1.792us       1.792us             1  
                                    aten::empty_strided         1.21%      32.491us         1.21%      32.491us       5.415us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        11.26%     303.147us        11.26%     303.147us      50.525us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.49%      66.932us         3.17%      85.280us       3.553us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.68%      18.348us         0.68%      18.348us       0.765us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.81%     210.347us         7.81%     210.347us       4.382us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.19%       5.020us         0.19%       5.020us       5.020us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.692ms
Self CUDA time total: 120.483us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     885.202us       514.56%     885.202us     885.202us             1  
                                            torch_eager        18.81%     279.303us        99.64%       1.480ms       1.480ms       0.000us         0.00%     174.944us     174.944us             1  
                                              aten::mul         9.70%     144.115us        16.98%     252.116us      10.505us      89.439us        51.99%      89.439us       3.727us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.439us        51.99%      89.439us       3.727us            24  
                                            aten::copy_         6.85%     101.723us        47.28%     702.206us      39.011us      57.632us        33.50%      60.544us       3.364us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.608us        23.60%      40.608us       3.384us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.961us        14.51%      24.961us       2.080us            12  
                                            aten::clone         1.41%      20.892us        42.46%     630.635us     105.106us       0.000us         0.00%      19.936us       3.323us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.024us         9.90%      17.024us       2.837us             6  
                                              aten::add         2.07%      30.702us         3.51%      52.142us       8.690us      12.545us         7.29%      12.545us       2.091us             6  
                                              aten::sub         2.41%      35.732us         4.00%      59.442us       9.907us      12.416us         7.22%      12.416us       2.069us             6  
                                Activity Buffer Request        17.15%     254.675us        17.15%     254.675us     254.675us       2.912us         1.69%       2.912us       2.912us             1  
                                    aten::empty_strided         2.07%      30.780us         2.07%      30.780us       5.130us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        19.36%     287.456us        19.36%     287.456us      47.909us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.32%      64.164us         5.58%      82.803us       3.450us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.26%      18.639us         1.26%      18.639us       0.777us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.24%     211.503us        14.24%     211.503us       4.406us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.36%       5.410us         0.36%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.485ms
Self CUDA time total: 172.032us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     907.735us       751.64%     907.735us     907.735us             1  
                                            torch_eager        18.35%     272.536us        99.65%       1.480ms       1.480ms       0.000us         0.00%     122.527us     122.527us             1  
                                              aten::mul         9.89%     146.883us        17.48%     259.553us      10.815us      62.078us        51.40%      62.078us       2.587us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      62.078us        51.40%      62.078us       2.587us            24  
                                            aten::copy_         6.65%      98.730us        45.99%     682.885us      37.938us      39.328us        32.57%      41.088us       2.283us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.832us        23.87%      28.832us       2.403us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.361us        16.03%      19.361us       1.613us            12  
                                            aten::clone         2.58%      38.249us        42.54%     631.763us     105.294us       0.000us         0.00%      12.256us       2.043us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.496us         8.69%      10.496us       1.749us             6  
                                              aten::add         2.13%      31.663us         3.60%      53.483us       8.914us       9.728us         8.06%       9.728us       1.621us             6  
                                              aten::sub         2.35%      34.954us         3.91%      58.043us       9.674us       9.633us         7.98%       9.633us       1.605us             6  
                                Activity Buffer Request        16.88%     250.706us        16.88%     250.706us     250.706us       1.760us         1.46%       1.760us       1.760us             1  
                                    aten::empty_strided         2.15%      31.912us         2.15%      31.912us       5.319us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        18.48%     274.437us        18.48%     274.437us      45.739us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.31%      63.964us         5.59%      83.053us       3.461us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.29%      19.089us         1.29%      19.089us       0.795us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.59%     216.591us        14.59%     216.591us       4.512us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       5.220us         0.35%       5.220us       5.220us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.485ms
Self CUDA time total: 120.767us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     894.399us       519.81%     894.399us     894.399us             1  
                                            torch_eager        10.51%     278.801us        99.79%       2.648ms       2.648ms       0.000us         0.00%     174.911us     174.911us             1  
                                              aten::mul         5.47%     145.104us         9.49%     251.734us      10.489us      89.535us        52.04%      89.535us       3.731us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      89.535us        52.04%      89.535us       3.731us            24  
                                            aten::copy_         3.73%      98.901us        70.34%       1.866ms     103.682us      57.696us        33.53%      60.544us       3.364us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.704us        23.66%      40.704us       3.392us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.832us        14.43%      24.832us       2.069us            12  
                                            aten::clone         0.84%      22.190us        67.69%       1.796ms     299.337us       0.000us         0.00%      19.840us       3.307us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.992us         9.88%      16.992us       2.832us             6  
                                              aten::sub         1.44%      38.162us         2.33%      61.942us      10.324us      12.448us         7.23%      12.448us       2.075us             6  
                                              aten::add         1.15%      30.549us         1.97%      52.171us       8.695us      12.384us         7.20%      12.384us       2.064us             6  
                                Activity Buffer Request        54.02%       1.433ms        54.02%       1.433ms       1.433ms       2.848us         1.66%       2.848us       2.848us             1  
                                    aten::empty_strided         1.13%      30.052us         1.13%      30.052us       5.009us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        10.37%     275.065us        10.37%     275.065us      45.844us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.49%      65.991us         3.19%      84.601us       3.525us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.70%      18.610us         0.70%      18.610us       0.775us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.95%     211.023us         7.95%     211.023us       4.396us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.21%       5.640us         0.21%       5.640us       5.640us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.653ms
Self CUDA time total: 172.063us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     888.595us       313.84%     888.595us     888.595us             1  
                                            torch_eager        18.64%     271.692us        99.64%       1.452ms       1.452ms       0.000us         0.00%     301.536us     301.536us             1  
                                              aten::mul         9.98%     145.418us        17.29%     252.060us      10.503us     132.896us        46.94%     132.896us       5.537us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     132.896us        46.94%     132.896us       5.537us            24  
                                            aten::copy_         6.89%     100.362us        46.38%     676.084us      37.560us     109.376us        38.63%     127.776us       7.099us            18  
                                            aten::clone         1.48%      21.511us        41.22%     600.853us     100.142us       0.000us         0.00%      70.560us      11.760us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.216us        20.21%      57.216us       4.768us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.160us        18.42%      52.160us       8.693us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.864us        14.43%      40.864us       3.405us            12  
                                              aten::sub         2.41%      35.143us         4.02%      58.572us       9.762us      20.512us         7.24%      20.512us       3.419us             6  
                                              aten::add         2.12%      30.932us         3.62%      52.783us       8.797us      20.352us         7.19%      20.352us       3.392us             6  
                                Activity Buffer Request        16.97%     247.406us        16.97%     247.406us     247.406us      18.400us         6.50%      18.400us      18.400us             1  
                                    aten::empty_strided         2.15%      31.370us         2.15%      31.370us       5.228us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        18.35%     267.496us        18.35%     267.496us      44.583us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.85%      70.742us         6.06%      88.302us       3.679us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.20%      17.560us         1.20%      17.560us       0.732us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.59%     212.742us        14.59%     212.742us       4.432us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.36%       5.280us         0.36%       5.280us       5.280us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.458ms
Self CUDA time total: 283.136us



======================================================================
PROFILE TRACE: torch_eager | cuda_B1_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     944.856us       167.33%     944.856us     944.856us             1  
                                            torch_eager        19.10%     286.874us        99.66%       1.497ms       1.497ms       0.000us         0.00%     588.218us     588.218us             1  
                                            aten::copy_         6.48%      97.352us        44.49%     668.224us      37.124us     273.885us        48.50%     297.437us      16.524us            18  
                                              aten::mul        11.54%     173.280us        19.20%     288.361us      12.015us     224.990us        39.84%     224.990us       9.375us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     224.990us        39.84%     224.990us       9.375us            24  
                                            aten::clone         1.34%      20.121us        39.51%     593.393us      98.899us       0.000us         0.00%     206.910us      34.485us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     183.358us        32.47%     183.358us      30.560us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.527us        16.03%      90.527us       7.544us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.791us        11.65%      65.791us       5.483us            12  
                                              aten::sub         2.45%      36.872us         4.07%      61.073us      10.179us      33.407us         5.92%      33.407us       5.568us             6  
                                              aten::add         2.13%      32.018us         3.64%      54.631us       9.105us      32.384us         5.74%      32.384us       5.397us             6  
                                Activity Buffer Request        16.63%     249.816us        16.63%     249.816us     249.816us      23.552us         4.17%      23.552us      23.552us             1  
                                    aten::empty_strided         2.02%      30.350us         2.02%      30.350us       5.058us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        17.28%     259.545us        17.28%     259.545us      43.258us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.52%      67.913us         5.81%      87.211us       3.634us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.28%      19.298us         1.28%      19.298us       0.804us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.87%     223.406us        14.87%     223.406us       4.654us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.34%       5.141us         0.34%       5.141us       5.141us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.502ms
Self CUDA time total: 564.666us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     916.448us       990.96%     916.448us     916.448us             1  
                                            torch_eager        10.63%     281.892us        99.80%       2.647ms       2.647ms       0.000us         0.00%      93.601us      93.601us             1  
                                              aten::mul         5.58%     148.028us         9.67%     256.571us      10.690us      49.634us        53.67%      49.634us       2.068us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      49.634us        53.67%      49.634us       2.068us            24  
                                            aten::copy_         3.99%     105.971us        69.88%       1.854ms     102.991us      29.439us        31.83%      30.559us       1.698us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.655us        24.50%      22.655us       1.888us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      13.408us        14.50%      13.408us       1.117us            12  
                                            aten::clone         0.82%      21.802us        66.79%       1.772ms     295.325us       0.000us         0.00%       7.904us       1.317us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       6.784us         7.34%       6.784us       1.131us             6  
                                              aten::sub         1.36%      36.061us         2.24%      59.441us       9.907us       6.720us         7.27%       6.720us       1.120us             6  
                                              aten::add         1.25%      33.260us         2.10%      55.590us       9.265us       6.688us         7.23%       6.688us       1.115us             6  
                                Activity Buffer Request        54.00%       1.433ms        54.00%       1.433ms       1.433ms       1.120us         1.21%       1.120us       1.120us             1  
                                    aten::empty_strided         1.13%      29.861us         1.13%      29.861us       4.977us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.52%     252.488us         9.52%     252.488us      42.081us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.59%      68.801us         3.33%      88.471us       3.686us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.74%      19.670us         0.74%      19.670us       0.820us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         8.18%     216.965us         8.18%     216.965us       4.520us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.20%       5.410us         0.20%       5.410us       5.410us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.653ms
Self CUDA time total: 92.481us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     888.809us       923.35%     888.809us     888.809us             1  
                                            torch_eager        19.05%     273.129us        99.67%       1.429ms       1.429ms       0.000us         0.00%      97.571us      97.571us             1  
                                              aten::mul        10.09%     144.695us        17.61%     252.506us      10.521us      51.232us        53.22%      51.232us       2.135us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      51.232us        53.22%      51.232us       2.135us            24  
                                            aten::copy_         6.72%      96.301us        45.37%     650.385us      36.132us      30.786us        31.98%      32.098us       1.783us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      22.944us        23.84%      22.944us       1.912us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      14.241us        14.79%      14.241us       1.187us            12  
                                            aten::clone         1.39%      19.911us        40.43%     579.513us      96.586us       0.000us         0.00%       9.154us       1.526us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.842us         8.15%       7.842us       1.307us             6  
                                              aten::add         2.26%      32.360us         3.79%      54.320us       9.053us       7.136us         7.41%       7.136us       1.189us             6  
                                              aten::sub         2.55%      36.551us         4.17%      59.791us       9.965us       7.105us         7.38%       7.105us       1.184us             6  
                                Activity Buffer Request        16.56%     237.415us        16.56%     237.415us     237.415us       1.312us         1.36%       1.312us       1.312us             1  
                                    aten::empty_strided         2.18%      31.230us         2.18%      31.230us       5.205us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        17.96%     257.447us        17.96%     257.447us      42.908us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.71%      67.539us         6.11%      87.581us       3.649us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.40%      20.042us         1.40%      20.042us       0.835us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.80%     212.233us        14.80%     212.233us       4.422us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.33%       4.690us         0.33%       4.690us       4.690us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.434ms
Self CUDA time total: 96.259us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     903.536us       870.95%     903.536us     903.536us             1  
                                            torch_eager        18.87%     271.956us        99.65%       1.436ms       1.436ms       0.000us         0.00%     105.053us     105.053us             1  
                                              aten::mul        10.20%     146.935us        17.83%     256.897us      10.704us      55.262us        53.27%      55.262us       2.303us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.262us        53.27%      55.262us       2.303us            24  
                                            aten::copy_         6.83%      98.437us        45.05%     649.198us      36.067us      32.478us        31.31%      33.790us       1.877us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.670us        23.78%      24.670us       2.056us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      16.001us        15.42%      16.001us       1.333us            12  
                                            aten::clone         1.50%      21.580us        40.06%     577.333us      96.222us       0.000us         0.00%       9.120us       1.520us             6  
                                              aten::sub         2.49%      35.841us         4.72%      67.992us      11.332us       8.001us         7.71%       8.001us       1.333us             6  
                                              aten::add         2.31%      33.350us         3.86%      55.670us       9.278us       8.000us         7.71%       8.000us       1.333us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.53%       7.808us       1.301us             6  
                                Activity Buffer Request        16.46%     237.265us        16.46%     237.265us     237.265us       1.312us         1.26%       1.312us       1.312us             1  
                                    aten::empty_strided         2.16%      31.090us         2.16%      31.090us       5.182us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        17.50%     252.196us        17.50%     252.196us      42.033us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.40%      63.461us         5.67%      81.650us       3.402us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.26%      18.189us         1.26%      18.189us       0.758us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.66%     225.733us        15.66%     225.733us       4.703us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       5.060us         0.35%       5.060us       5.060us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.441ms
Self CUDA time total: 103.741us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S128_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     903.548us       729.80%     903.548us     903.548us             1  
                                            torch_eager        10.56%     280.674us        99.81%       2.652ms       2.652ms       0.000us         0.00%     125.567us     125.567us             1  
                                              aten::mul         5.49%     145.805us         9.46%     251.467us      10.478us      65.184us        52.65%      65.184us       2.716us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.184us        52.65%      65.184us       2.716us            24  
                                            aten::copy_         3.75%      99.563us        70.08%       1.862ms     103.468us      39.422us        31.84%      41.182us       2.288us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.928us        23.37%      28.928us       2.411us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.201us        15.51%      19.201us       1.600us            12  
                                            aten::clone         0.92%      24.379us        67.48%       1.793ms     298.872us       0.000us         0.00%      12.254us       2.042us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.494us         8.48%      10.494us       1.749us             6  
                                              aten::add         1.15%      30.622us         1.96%      52.162us       8.694us       9.633us         7.78%       9.633us       1.606us             6  
                                              aten::sub         1.45%      38.422us         2.36%      62.661us      10.443us       9.568us         7.73%       9.568us       1.595us             6  
                                Activity Buffer Request        54.94%       1.460ms        54.94%       1.460ms       1.460ms       1.760us         1.42%       1.760us       1.760us             1  
                                    aten::empty_strided         1.16%      30.801us         1.16%      30.801us       5.133us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         9.14%     242.866us         9.14%     242.866us      40.478us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.56%      67.990us         3.30%      87.783us       3.658us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.74%      19.793us         0.74%      19.793us       0.825us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.96%     211.432us         7.96%     211.432us       4.405us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.19%       5.160us         0.19%       5.160us       5.160us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.658ms
Self CUDA time total: 123.807us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     889.436us       855.74%     889.436us     889.436us             1  
                                            torch_eager        19.42%     274.045us        99.59%       1.406ms       1.406ms       0.000us         0.00%     105.282us     105.282us             1  
                                              aten::mul        10.41%     146.921us        18.18%     256.563us      10.690us      55.486us        53.38%      55.486us       2.312us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      55.486us        53.38%      55.486us       2.312us            24  
                                            aten::copy_         6.82%      96.302us        44.56%     628.895us      34.939us      32.513us        31.28%      33.857us       1.881us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      24.705us        23.77%      24.705us       2.059us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      15.939us        15.34%      15.939us       1.328us            12  
                                            aten::clone         1.41%      19.928us        39.46%     556.871us      92.812us       0.000us         0.00%       9.152us       1.525us             6  
                                              aten::sub         2.56%      36.082us         4.16%      58.744us       9.791us       7.970us         7.67%       7.970us       1.328us             6  
                                              aten::add         2.23%      31.511us         3.85%      54.282us       9.047us       7.969us         7.67%       7.969us       1.328us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us       7.808us         7.51%       7.808us       1.301us             6  
                                Activity Buffer Request        15.99%     225.676us        15.99%     225.676us     225.676us       1.344us         1.29%       1.344us       1.344us             1  
                                    aten::empty_strided         2.17%      30.631us         2.17%      30.631us       5.105us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        17.52%     247.335us        17.52%     247.335us      41.223us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.52%      63.850us         5.84%      82.475us       3.436us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.32%      18.625us         1.32%      18.625us       0.776us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.21%     214.657us        15.21%     214.657us       4.472us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.41%       5.810us         0.41%       5.810us       5.810us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.411ms
Self CUDA time total: 103.938us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     888.135us       717.15%     888.135us     888.135us             1  
                                            torch_eager        18.91%     268.465us        99.65%       1.415ms       1.415ms       0.000us         0.00%     125.666us     125.666us             1  
                                              aten::mul        10.15%     144.114us        17.70%     251.265us      10.469us      65.346us        52.77%      65.346us       2.723us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      65.346us        52.77%      65.346us       2.723us            24  
                                            aten::copy_         6.90%      97.992us        45.41%     644.725us      35.818us      39.328us        31.76%      41.152us       2.286us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      28.800us        23.26%      28.800us       2.400us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      19.168us        15.48%      19.168us       1.597us            12  
                                            aten::clone         1.46%      20.690us        40.33%     572.532us      95.422us       0.000us         0.00%      12.352us       2.059us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      10.528us         8.50%      10.528us       1.755us             6  
                                              aten::add         2.19%      31.029us         3.69%      52.390us       8.732us       9.600us         7.75%       9.600us       1.600us             6  
                                              aten::sub         2.50%      35.469us         4.13%      58.580us       9.763us       9.568us         7.73%       9.568us       1.595us             6  
                                Activity Buffer Request        15.69%     222.765us        15.69%     222.765us     222.765us       1.824us         1.47%       1.824us       1.824us             1  
                                    aten::empty_strided         2.29%      32.500us         2.29%      32.500us       5.417us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        18.50%     262.716us        18.50%     262.716us      43.786us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.70%      66.710us         6.07%      86.108us       3.588us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.37%      19.398us         1.37%      19.398us       0.808us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.99%     212.875us        14.99%     212.875us       4.435us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       5.010us         0.35%       5.010us       5.010us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.420ms
Self CUDA time total: 123.842us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     910.045us       513.35%     910.045us     910.045us             1  
                                            torch_eager         9.66%     280.213us        99.83%       2.894ms       2.894ms       0.000us         0.00%     180.188us     180.188us             1  
                                              aten::mul         5.18%     150.102us         9.00%     260.863us      10.869us      94.655us        53.39%      94.655us       3.944us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.655us        53.39%      94.655us       3.944us            24  
                                            aten::copy_         3.40%      98.673us        72.45%       2.101ms     116.706us      57.885us        32.65%      60.797us       3.378us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.799us        23.01%      40.799us       3.400us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.736us        13.95%      24.736us       2.061us            12  
                                            aten::clone         0.79%      22.860us        70.00%       2.030ms     338.262us       0.000us         0.00%      19.998us       3.333us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      17.086us         9.64%      17.086us       2.848us             6  
                                              aten::add         1.13%      32.880us         1.89%      54.761us       9.127us      12.416us         7.00%      12.416us       2.069us             6  
                                              aten::sub         1.18%      34.239us         1.98%      57.551us       9.592us      12.320us         6.95%      12.320us       2.053us             6  
                                Activity Buffer Request        58.76%       1.704ms        58.76%       1.704ms       1.704ms       2.912us         1.64%       2.912us       2.912us             1  
                                    aten::empty_strided         1.11%      32.150us         1.11%      32.150us       5.358us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         8.21%     238.144us         8.21%     238.144us      39.691us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.29%      66.481us         2.94%      85.213us       3.551us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.65%      18.732us         0.65%      18.732us       0.781us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.46%     216.224us         7.46%     216.224us       4.505us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.17%       5.070us         0.17%       5.070us       5.070us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.899ms
Self CUDA time total: 177.276us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S512_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     919.612us       310.44%     919.612us     919.612us             1  
                                            torch_eager        10.49%     286.464us        99.82%       2.726ms       2.726ms       0.000us         0.00%     313.057us     313.057us             1  
                                              aten::mul         5.34%     145.716us         9.29%     253.789us      10.575us     145.182us        49.01%     145.182us       6.049us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.182us        49.01%     145.182us       6.049us            24  
                                            aten::copy_         3.69%     100.696us        70.60%       1.928ms     107.115us     109.985us        37.13%     126.817us       7.045us            18  
                                            aten::clone         0.88%      23.951us        68.02%       1.858ms     309.597us       0.000us         0.00%      69.474us      11.579us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.343us        19.36%      57.343us       4.779us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      52.642us        17.77%      52.642us       8.774us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      41.058us        13.86%      41.058us       3.421us            12  
                                              aten::sub         1.33%      36.191us         2.18%      59.621us       9.937us      20.609us         6.96%      20.609us       3.435us             6  
                                              aten::add         1.14%      31.230us         1.95%      53.190us       8.865us      20.449us         6.90%      20.449us       3.408us             6  
                                Activity Buffer Request        56.07%       1.531ms        56.07%       1.531ms       1.531ms      16.832us         5.68%      16.832us      16.832us             1  
                                    aten::empty_strided         1.17%      32.070us         1.17%      32.070us       5.345us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync         8.59%     234.696us         8.59%     234.696us      39.116us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.53%      69.062us         3.26%      88.922us       3.705us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.73%      19.860us         0.73%      19.860us       0.827us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         7.86%     214.752us         7.86%     214.752us       4.474us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.18%       4.930us         0.18%       4.930us       4.930us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.731ms
Self CUDA time total: 296.225us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     889.394us       501.79%     889.394us     889.394us             1  
                                            torch_eager        17.97%     266.975us        99.65%       1.481ms       1.481ms       0.000us         0.00%     180.092us     180.092us             1  
                                              aten::mul         9.80%     145.611us        16.96%     251.937us      10.497us      94.974us        53.58%      94.974us       3.957us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      94.974us        53.58%      94.974us       3.957us            24  
                                            aten::copy_         6.75%     100.282us        47.98%     712.837us      39.602us      57.694us        32.55%      60.542us       3.363us            18  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      40.736us        22.98%      40.736us       3.395us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      24.576us        13.87%      24.576us       2.048us            12  
                                            aten::clone         1.38%      20.549us        43.06%     639.725us     106.621us       0.000us         0.00%      19.806us       3.301us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      16.958us         9.57%      16.958us       2.826us             6  
                                              aten::sub         2.49%      37.040us         4.14%      61.531us      10.255us      12.289us         6.93%      12.289us       2.048us             6  
                                              aten::add         2.11%      31.282us         3.59%      53.402us       8.900us      12.287us         6.93%      12.287us       2.048us             6  
                                Activity Buffer Request        19.87%     295.257us        19.87%     295.257us     295.257us       2.848us         1.61%       2.848us       2.848us             1  
                                    aten::empty_strided         2.04%      30.372us         2.04%      30.372us       5.062us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        17.34%     257.637us        17.34%     257.637us      42.940us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.31%      64.000us         5.58%      82.951us       3.456us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.28%      18.951us         1.28%      18.951us       0.790us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        14.31%     212.598us        14.31%     212.598us       4.429us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       5.130us         0.35%       5.130us       5.130us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.486ms
Self CUDA time total: 177.244us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H8_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     909.016us       306.24%     909.016us     909.016us             1  
                                            torch_eager        19.05%     269.264us        99.66%       1.409ms       1.409ms       0.000us         0.00%     314.684us     314.684us             1  
                                              aten::mul        10.56%     149.323us        19.02%     268.875us      11.203us     145.440us        49.00%     145.440us       6.060us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     145.440us        49.00%     145.440us       6.060us            24  
                                            aten::copy_         6.96%      98.305us        44.09%     623.125us      34.618us     110.751us        37.31%     128.606us       7.145us            18  
                                            aten::clone         1.45%      20.520us        38.80%     548.422us      91.404us       0.000us         0.00%      71.453us      11.909us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      57.153us        19.25%      57.153us       4.763us            12  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      53.598us        18.06%      53.598us       8.933us             6  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      40.638us        13.69%      40.638us       3.387us            12  
                                              aten::add         2.27%      32.070us         3.85%      54.390us       9.065us      20.352us         6.86%      20.352us       3.392us             6  
                                              aten::sub         2.35%      33.277us         4.05%      57.282us       9.547us      20.286us         6.83%      20.286us       3.381us             6  
                                Activity Buffer Request        15.96%     225.655us        15.96%     225.655us     225.655us      17.855us         6.02%      17.855us      17.855us             1  
                                    aten::empty_strided         2.15%      30.350us         2.15%      30.350us       5.058us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        16.79%     237.294us        16.79%     237.294us      39.549us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.69%      66.249us         6.00%      84.797us       3.533us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.31%      18.548us         1.31%      18.548us       0.773us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        16.11%     227.748us        16.11%     227.748us       4.745us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.34%       4.840us         0.34%       4.840us       4.840us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.413ms
Self CUDA time total: 296.829us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D64_R32
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us     916.757us       157.09%     916.757us     916.757us             1  
                                            torch_eager        19.46%     274.242us        99.65%       1.404ms       1.404ms       0.000us         0.00%     607.350us     607.350us             1  
                                            aten::copy_         7.01%      98.793us        43.42%     611.905us      33.995us     268.603us        46.03%     292.379us      16.243us            18  
                                              aten::mul        10.57%     148.926us        18.84%     265.480us      11.062us     249.086us        42.68%     249.086us      10.379us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     249.086us        42.68%     249.086us      10.379us            24  
                                            aten::clone         1.44%      20.340us        38.12%     537.253us      89.542us       0.000us         0.00%     202.173us      33.696us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     178.397us        30.57%     178.397us      29.733us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      90.206us        15.46%      90.206us       7.517us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      65.885us        11.29%      65.885us       5.490us            12  
                                              aten::sub         2.63%      37.022us         4.37%      61.602us      10.267us      33.151us         5.68%      33.151us       5.525us             6  
                                              aten::add         2.33%      32.810us         3.92%      55.180us       9.197us      32.734us         5.61%      32.734us       5.456us             6  
                                Activity Buffer Request        15.58%     219.605us        15.58%     219.605us     219.605us      23.776us         4.07%      23.776us      23.776us             1  
                                    aten::empty_strided         2.10%      29.631us         2.10%      29.631us       4.938us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        16.49%     232.396us        16.49%     232.396us      38.733us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         4.73%      66.612us         6.10%      85.953us       3.581us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         1.37%      19.341us         1.37%      19.341us       0.806us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel        15.94%     224.615us        15.94%     224.615us       4.679us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize         0.35%       4.910us         0.35%       4.910us       4.910us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 1.409ms
Self CUDA time total: 583.574us



======================================================================
PROFILE TRACE: torch_eager | cuda_B2_S2048_H32_D128_R64
======================================================================
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            torch_eager        12.10%     272.127us        61.47%       1.382ms       1.382ms       0.000us         0.00%       1.837ms       1.837ms             1  
                                            torch_eager         0.00%       0.000us         0.00%       0.000us       0.000us       1.810ms       102.21%       1.810ms       1.810ms             1  
                                            aten::copy_         4.74%     106.692us        27.02%     607.756us      33.764us     794.110us        44.84%     859.966us      47.776us            18  
                                              aten::mul         6.35%     142.895us        11.18%     251.386us      10.474us     829.085us        46.82%     829.085us      34.545us            24  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     829.085us        46.82%     829.085us      34.545us            24  
                                            aten::clone         0.94%      21.099us        23.42%     526.743us      87.790us       0.000us         0.00%     627.678us     104.613us             6  
                         Memcpy DtoD (Device -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us     561.822us        31.73%     561.822us      93.637us             6  
void at::native::elementwise_kernel<128, 4, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us     232.288us        13.12%     232.288us      19.357us            12  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us     147.650us         8.34%     147.650us      12.304us            12  
                                              aten::sub         1.58%      35.541us         2.61%      58.661us       9.777us      89.538us         5.06%      89.538us      14.923us             6  
                                Activity Buffer Request         9.29%     208.845us         9.29%     208.845us     208.845us      65.856us         3.72%      65.856us      65.856us             1  
                                              aten::add         1.43%      32.251us         2.42%      54.461us       9.077us      58.112us         3.28%      58.112us       9.685us             6  
                                    aten::empty_strided         1.39%      31.342us         1.39%      31.342us       5.224us       0.000us         0.00%       0.000us       0.000us             6  
                                        cudaMemcpyAsync        10.27%     230.957us        10.27%     230.957us      38.493us       0.000us         0.00%       0.000us       0.000us             6  
                                            aten::slice         2.99%      67.270us         3.80%      85.550us       3.565us       0.000us         0.00%       0.000us       0.000us            24  
                                       aten::as_strided         0.81%      18.280us         0.81%      18.280us       0.762us       0.000us         0.00%       0.000us       0.000us            24  
                                       cudaLaunchKernel         9.56%     215.083us         9.56%     215.083us       4.481us       0.000us         0.00%       0.000us       0.000us            48  
                                  cudaDeviceSynchronize        38.53%     866.589us        38.53%     866.589us     866.589us       0.000us         0.00%       0.000us       0.000us             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.249ms
Self CUDA time total: 1.771ms


impl                     wl                  p50(ms)  ok
torch_eager              cuda_B1_S128_H32_D128_R64     0.21  True
torch_eager              cuda_B1_S128_H32_D64_R32     0.22  True
torch_eager              cuda_B1_S128_H8_D128_R64     0.22  True
torch_eager              cuda_B1_S128_H8_D64_R32     0.17  True
torch_eager              cuda_B1_S2048_H32_D128_R64     0.22  True
torch_eager              cuda_B1_S2048_H32_D64_R32     0.21  True
torch_eager              cuda_B1_S2048_H8_D128_R64     0.21  True
torch_eager              cuda_B1_S2048_H8_D64_R32     0.21  True
torch_eager              cuda_B1_S512_H32_D128_R64     0.21  True
torch_eager              cuda_B1_S512_H32_D64_R32     0.21  True
torch_eager              cuda_B1_S512_H8_D128_R64     0.21  True
torch_eager              cuda_B1_S512_H8_D64_R32     0.22  True
torch_eager              cuda_B2_S128_H32_D128_R64     0.21  True
torch_eager              cuda_B2_S128_H32_D64_R32     0.21  True
torch_eager              cuda_B2_S128_H8_D128_R64     0.21  True
torch_eager              cuda_B2_S128_H8_D64_R32     0.21  True
torch_eager              cuda_B2_S2048_H32_D128_R64     0.64  True
torch_eager              cuda_B2_S2048_H32_D64_R32     0.23  True
torch_eager              cuda_B2_S2048_H8_D128_R64     0.22  True
torch_eager              cuda_B2_S2048_H8_D64_R32     0.21  True
torch_eager              cuda_B2_S512_H32_D128_R64     0.21  True
torch_eager              cuda_B2_S512_H32_D64_R32     0.21  True
torch_eager              cuda_B2_S512_H8_D128_R64     0.21  True
torch_eager              cuda_B2_S512_H8_D64_R32     0.21  True
▶ UV Install Logs

Artifacts:

rotary.jsonl