Spaces:

harmdevries
/

transformer_inference

Runtime error

harmdevries committed on Nov 3, 2022

Commit

064a5f0

1 Parent(s): 32aafee

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -160,7 +160,7 @@ c1.write("Cached keys and values (GB)")
 acts = round(2*bs*l*(d/h)*2*n/1e9, 2)
 c2.write(str(acts))
-st.subheader("Approximations")
 st.markdown("We use the [following crude approximation](https://docs.nvidia.com/deeplearning/performance/dl-performance-gpu-background/index.html#understand-perf) to estimate the execution time for each matrix multiplication.")
 st.latex("C = A \cdot B")
@@ -183,9 +183,13 @@ st.latex("T_{math}(A \cdot B) = 2*M*K*N / BW_{math}")
 st.markdown("where BW_math is the number of floating point operations per second (e.g. 312 TFLOPS for an A100 GPU)")
 st.markdown("If we assume we can *perfectly* overlap memory access with math operations, then the estimated execution time for the operation is:")
-st.latex("max(T_math, T_mem)")
-breakdown = st.checkbox("Show breakdown per operation")
 if breakdown:
   st.header('Attention layer')

 acts = round(2*bs*l*(d/h)*2*n/1e9, 2)
 c2.write(str(acts))
+st.subheader("Estimating execution time")
 st.markdown("We use the [following crude approximation](https://docs.nvidia.com/deeplearning/performance/dl-performance-gpu-background/index.html#understand-perf) to estimate the execution time for each matrix multiplication.")
 st.latex("C = A \cdot B")
 st.markdown("where BW_math is the number of floating point operations per second (e.g. 312 TFLOPS for an A100 GPU)")
 st.markdown("If we assume we can *perfectly* overlap memory access with math operations, then the estimated execution time for the operation is:")
+st.latex("max(T_{math}, T_{mem})")
+st.markdown("We also assume a minimum time for executing the operation due to [kernel launch overhead](https://forums.developer.nvidia.com/t/any-way-to-measure-the-latency-of-a-kernel-launch/221413/2)")
+st.subheader("Operations in MHA and MQA")
+breakdown = st.checkbox("Show inference time for each operation")
 if breakdown:
   st.header('Attention layer')