Spaces:

harmdevries
/

transformer_inference

Runtime error

App Files Community

harmdevries committed on Oct 23, 2022

Commit

a6d7fbc

1 Parent(s): b31a1d5

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -89

app.py CHANGED Viewed

@@ -117,93 +117,92 @@ for i in range(n_start, n):
 st.write("Multi-Head Attention: " + str(mha_total_time))
 st.write("Multi-Query Attention: " + str(mqa_total_time))
-st.header('Attention layer')
-st.subheader('QKV projection')
-st.caption("Multi-Head Attention")
-mha_flop = 2*bs*1*d*3*d
-mha_bytes = 2*bs*1*d + 2*3*d*d + 2*bs*1*3*d
-c1, c2 = st.columns([2, 3])
-qkv_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
-st.caption("Multi-Query Attention")
-mqa_flop = 2*bs*1*d*(1+2/h)*d
-mqa_bytes = 2*bs*1*d + 2*(2/h)*d*d + 2*bs*1*(2/h)*d
-c1, c2 = st.columns([2, 3])
-qkv_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
-st.subheader('QK gemm')
-st.write("Note that calculation depends on sequence length (n)")
-st.caption("Multi-Head Attention")
-mha_flop = 2*bs*h*(d/h)*n
-mha_bytes = 2*bs*h*(d/h) + 2*bs*h*n*(d/h) + 2*bs*h*n
-c1, c2 = st.columns([2, 3])
-att1_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
-st.caption("Multi-Query Attention")
-mqa_flop = 2*bs*h*(d/h)*n
-mqa_bytes = 2*bs*h*(d/h) + 2*bs*n*(d/h) + 2*bs*h*n
-c1, c2 = st.columns([2, 3])
-att1_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
-st.subheader('Attention-value gemm')
-st.write("Calculation depends on sequence length. We show numbers for maximum sequence length n.")
-st.caption("Multi-Head Attention")
-mha_flop = 2*bs*h*n*(d/h)
-mha_bytes = 2*bs*h*n + 2*bs*h*n*(d/h) + 2*bs*h*(d/h)
-c1, c2 = st.columns([2, 3])
-att2_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
-st.caption("Multi-Query Attention")
-mqa_flop = 2*bs*h*n*(d/h)
-mqa_bytes = 2*bs*n*(d/h) + 2*bs*n*(d/h) + 2*bs*h*(d/h)
-c1, c2 = st.columns([2, 3])
-att2_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
-st.subheader('Output projection')
-out_flop = 2*bs*1*d*d
-out_bytes = 2*bs*1*d + 2*d*d + 2*bs*1*d
-c1, c2 = st.columns([2, 3])
-out_time = print_kernel_execution(c1, c2, out_flop, out_bytes)
-st.subheader('Element-wise ops')
-st.write("We also need to take into the softmax layer, layer norm, and residual connection. We assume that these operations are memory bound. ")
-st.caption("Softmax")
-softmax_bytes = 2*bs*h*n + 2*bs*h*n
-c1, c2 = st.columns([2, 3])
-softmax_time = print_kernel_execution(c1, c2, 0, softmax_bytes)
-st.caption("Layer norm/residual connection")
-ln_bytes = 2*bs*1*d
-ln_flop = 0
-ln_time = print_kernel_execution(c1, c2, 0, ln_bytes)
-st.header('MLP')
-st.subheader('First Linear')
-mlp1_flop = 2*bs*1*d*4*d
-mlp1_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
-c1, c2 = st.columns([2, 3])
-mlp1_time = print_kernel_execution(c1, c2, mlp1_flop, mlp1_bytes)
-st.subheader('Second Linear')
-mlp2_flop = 2*bs*1*d*4*d
-mlp2_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
-c1, c2 = st.columns([2, 3])
-mlp2_time = print_kernel_execution(c1, c2, mlp2_flop, mlp2_bytes)
-st.subheader('Element-wise ops')
-st.write("We also need to take into the GeLU, layer norm, and residual connection. We assume that these operations are memory bound. ")
-ln_bytes = 2*bs*1*d
-ln_flop = 0
-ln_time = print_kernel_execution(c1, c2, 0, ln_bytes)
-st.header("Adding it all up")
-shared_time = out_time + softmax_time + 2*ln_time + mlp1_time + mlp2_time + 3*ln_time
-mha_total_time = qkv_mha_time + att1_mha_time + att2_mha_time + shared_time
-mqa_total_time = qkv_mqa_time + att1_mqa_time + att2_mqa_time + shared_time
-st.write("MHA exec time (ms): " + str(mha_total_time))
-st.write("MQA exec time (ms): " + str(mqa_total_time))

 st.write("Multi-Head Attention: " + str(mha_total_time))
 st.write("Multi-Query Attention: " + str(mqa_total_time))
+st.write("Speed-up MQA over MHA: " + str(mha_total_time/mqa_total_time))
+st.header("Memory consumption")
+breakdown = st.checkbox("Show breakdown per layer")
+if breakdown:
+  st.header('Attention layer')
+  st.subheader('QKV projection')
+  st.caption("Multi-Head Attention")
+  mha_flop = 2*bs*1*d*3*d
+  mha_bytes = 2*bs*1*d + 2*3*d*d + 2*bs*1*3*d
+  c1, c2 = st.columns([2, 3])
+  qkv_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
+  st.caption("Multi-Query Attention")
+  mqa_flop = 2*bs*1*d*(1+2/h)*d
+  mqa_bytes = 2*bs*1*d + 2*(2/h)*d*d + 2*bs*1*(2/h)*d
+  c1, c2 = st.columns([2, 3])
+  qkv_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
+  st.subheader('QK gemm')
+  st.write("Note that calculation depends on sequence length (n)")
+  st.caption("Multi-Head Attention")
+  mha_flop = 2*bs*h*(d/h)*n
+  mha_bytes = 2*bs*h*(d/h) + 2*bs*h*n*(d/h) + 2*bs*h*n
+  c1, c2 = st.columns([2, 3])
+  att1_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
+  st.caption("Multi-Query Attention")
+  mqa_flop = 2*bs*h*(d/h)*n
+  mqa_bytes = 2*bs*h*(d/h) + 2*bs*n*(d/h) + 2*bs*h*n
+  c1, c2 = st.columns([2, 3])
+  att1_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
+  st.subheader('Attention-value gemm')
+  st.write("Calculation depends on sequence length. We show numbers for maximum sequence length n.")
+  st.caption("Multi-Head Attention")
+  mha_flop = 2*bs*h*n*(d/h)
+  mha_bytes = 2*bs*h*n + 2*bs*h*n*(d/h) + 2*bs*h*(d/h)
+  c1, c2 = st.columns([2, 3])
+  att2_mha_time = print_kernel_execution(c1, c2, mha_flop, mha_bytes)
+  st.caption("Multi-Query Attention")
+  mqa_flop = 2*bs*h*n*(d/h)
+  mqa_bytes = 2*bs*n*(d/h) + 2*bs*n*(d/h) + 2*bs*h*(d/h)
+  c1, c2 = st.columns([2, 3])
+  att2_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)
+  st.subheader('Output projection')
+  out_flop = 2*bs*1*d*d
+  out_bytes = 2*bs*1*d + 2*d*d + 2*bs*1*d
+  c1, c2 = st.columns([2, 3])
+  out_time = print_kernel_execution(c1, c2, out_flop, out_bytes)
+  st.subheader('Element-wise ops')
+  st.write("We also need to take into the softmax layer, layer norm, and residual connection. We assume that these operations are memory bound. ")
+  st.caption("Softmax")
+  softmax_bytes = 2*bs*h*n + 2*bs*h*n
+  c1, c2 = st.columns([2, 3])
+  softmax_time = print_kernel_execution(c1, c2, 0, softmax_bytes)
+  st.caption("Layer norm/residual connection")
+  ln_bytes = 2*bs*1*d
+  ln_flop = 0
+  ln_time = print_kernel_execution(c1, c2, 0, ln_bytes)
+  st.header('MLP')
+  st.subheader('First Linear')
+  mlp1_flop = 2*bs*1*d*4*d
+  mlp1_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
+  c1, c2 = st.columns([2, 3])
+  mlp1_time = print_kernel_execution(c1, c2, mlp1_flop, mlp1_bytes)
+  st.subheader('Second Linear')
+  mlp2_flop = 2*bs*1*d*4*d
+  mlp2_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
+  c1, c2 = st.columns([2, 3])
+  mlp2_time = print_kernel_execution(c1, c2, mlp2_flop, mlp2_bytes)
+  st.subheader('Element-wise ops')
+  st.write("We also need to take into the GeLU, layer norm, and residual connection. We assume that these operations are memory bound. ")
+  ln_bytes = 2*bs*1*d
+  ln_flop = 0
+  ln_time = print_kernel_execution(c1, c2, 0, ln_bytes)