Commit 992208b · Parent(s): a6d7fbc
Update app.py
app.py CHANGED

@@ -123,7 +123,7 @@ st.header("Memory consumption")



-breakdown = st.checkbox("Show breakdown per
+breakdown = st.checkbox("Show breakdown per operation")
 if breakdown:
     st.header('Attention layer')

@@ -141,7 +141,7 @@ if breakdown:
     qkv_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)

     st.subheader('QK gemm')
-    st.write("
+    st.write("Showing calculation for the maximum sequence length (n)")

     st.caption("Multi-Head Attention")
     mha_flop = 2*bs*h*(d/h)*n
@@ -156,7 +156,7 @@ if breakdown:
     att1_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)

     st.subheader('Attention-value gemm')
-    st.write("
+    st.write("Showing calculation for the maximum sequence length (n)")
     st.caption("Multi-Head Attention")
     mha_flop = 2*bs*h*n*(d/h)
     mha_bytes = 2*bs*h*n + 2*bs*h*n*(d/h) + 2*bs*h*(d/h)
@@ -188,21 +188,14 @@ if breakdown:
     ln_flop = 0
     ln_time = print_kernel_execution(c1, c2, 0, ln_bytes)

-    st.header('MLP')
-    st.subheader('First Linear')
-
-    mlp1_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
+    st.header('MLP layer')
+    st.subheader('First and Second Linear Layer')
+    flop, nbytes, exec_time = mlp_exec(bs, h, n, d)
     c1, c2 = st.columns([2, 3])
-
-
-    st.subheader('Second Linear')
-    mlp2_flop = 2*bs*1*d*4*d
-    mlp2_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
-    c1, c2 = st.columns([2, 3])
-    mlp2_time = print_kernel_execution(c1, c2, mlp2_flop, mlp2_bytes)
+    mlp2_time = print_kernel_execution(c1, c2, flop, nbytes)

     st.subheader('Element-wise ops')
     st.write("We also need to take into the GeLU, layer norm, and residual connection. We assume that these operations are memory bound. ")
-
-
-
+    flop, nbytes, exec_time = ln_exec(bs, h, n, d)
+    c1, c2 = st.columns([2, 3])
+    mlp2_time = print_kernel_execution(c1, c2, flop, nbytes)