Commit 992208b · Parent(s): a6d7fbc
Update app.py
app.py CHANGED

@@ -123,7 +123,7 @@ st.header("Memory consumption")



-breakdown = st.checkbox("Show breakdown per
+breakdown = st.checkbox("Show breakdown per operation")
 if breakdown:
     st.header('Attention layer')

@@ -141,7 +141,7 @@ if breakdown:
     qkv_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)

     st.subheader('QK gemm')
-    st.write("
+    st.write("Showing calculation for the maximum sequence length (n)")

     st.caption("Multi-Head Attention")
     mha_flop = 2*bs*h*(d/h)*n
@@ -156,7 +156,7 @@ if breakdown:
     att1_mqa_time = print_kernel_execution(c1, c2, mqa_flop, mqa_bytes)

     st.subheader('Attention-value gemm')
-    st.write("
+    st.write("Showing calculation for the maximum sequence length (n)")
     st.caption("Multi-Head Attention")
     mha_flop = 2*bs*h*n*(d/h)
     mha_bytes = 2*bs*h*n + 2*bs*h*n*(d/h) + 2*bs*h*(d/h)
@@ -188,21 +188,14 @@ if breakdown:
     ln_flop = 0
     ln_time = print_kernel_execution(c1, c2, 0, ln_bytes)

-    st.header('MLP')
-    st.subheader('First Linear')
-
-    mlp1_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
+    st.header('MLP layer')
+    st.subheader('First and Second Linear Layer')
+    flop, nbytes, exec_time = mlp_exec(bs, h, n, d)
     c1, c2 = st.columns([2, 3])
-
-
-    st.subheader('Second Linear')
-    mlp2_flop = 2*bs*1*d*4*d
-    mlp2_bytes = 2*bs*1*d + 2*d*4*d + 2*bs*1*4*d
-    c1, c2 = st.columns([2, 3])
-    mlp2_time = print_kernel_execution(c1, c2, mlp2_flop, mlp2_bytes)
+    mlp2_time = print_kernel_execution(c1, c2, flop, nbytes)

     st.subheader('Element-wise ops')
     st.write("We also need to take into the GeLU, layer norm, and residual connection. We assume that these operations are memory bound. ")
-
-
-
+    flop, nbytes, exec_time = ln_exec(bs, h, n, d)
+    c1, c2 = st.columns([2, 3])
+    mlp2_time = print_kernel_execution(c1, c2, flop, nbytes)