Spaces · Commit c88286f (parent 064a5f0) · Update app.py

app.py CHANGED
@@ -116,7 +116,7 @@ def print_kernel_execution(flop, nbytes):
     c2.write(str(THREAD_OVERHEAD))
 
 st.title("Inference time MHA vs MQA")
-st.write("This space approximates the inference time for Multi-Query Attention and Multi-Head Attention
+st.write("This space approximates the inference time for Multi-Query Attention and Multi-Head Attention transformers. You can change the hyperparameters in the sidebar.")
 
 mqa_total_time = 0.
 mha_total_time = 0.
@@ -187,63 +187,62 @@ st.latex("max(T_{math}, T_{mem})")
 
 st.markdown("We also have a minimum time for executing the operation due to [kernel launch overhead](https://forums.developer.nvidia.com/t/any-way-to-measure-the-latency-of-a-kernel-launch/221413/2)")
 
-st.subheader("
-flop, nbytes, exec_time = qkv_mha_exec(bs, h, n, d)
-print_kernel_execution(flop, nbytes)
[remaining removed lines of the previous revision are rendered blank in the source diff view]
+st.subheader("Inference time for Transformer operations")
+st.text("We can now estimate the execution time for each of the operations in the transformer model. I suggest you inspect the code for details on the calculations.")
+
+st.subheader('Attention layer')
+
+st.markdown('**QKV projection**')
+st.caption("Multi-Head Attention")
+flop, nbytes, exec_time = qkv_mha_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.caption("Multi-Query Attention")
+flop, nbytes, exec_time = qkv_mqa_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.markdown('**QK gemm**')
+st.write("Showing calculation for the maximum sequence length (n)")
+
+st.caption("Multi-Head Attention")
+flop, nbytes, exec_time = att1_mha_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.caption("Multi-Query Attention")
+flop, nbytes, exec_time = att1_mqa_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.markdown('**Attention-value gemm**')
+st.write("Showing calculation for the maximum sequence length (n)")
+st.caption("Multi-Head Attention")
+flop, nbytes, exec_time = att2_mha_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.caption("Multi-Query Attention")
+flop, nbytes, exec_time = att2_mqa_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.markdown('**Output projection**')
+flop, nbytes, exec_time = out_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.markdown('**Element-wise ops**')
+st.write("We also need to take into account the softmax, layer norm, and residual connection. We assume that these operations are memory bound.")
+
+st.caption("Softmax")
+flop, nbytes, exec_time = softmax_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.caption("Layer norm/residual connection")
+flop, nbytes, exec_time = ln_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.subheader('MLP layer')
+st.markdown('**First and second linear layers**')
+flop, nbytes, exec_time = mlp_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)
+
+st.markdown('**Element-wise ops**')
+st.write("We also need to take into account the GeLU, layer norm, and residual connection. We assume that these operations are memory bound.")
+flop, nbytes, exec_time = ln_exec(bs, h, n, d)
+print_kernel_execution(flop, nbytes)