Update app.py
Browse files
app.py
CHANGED
|
@@ -60,6 +60,55 @@ def calc_mem(hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_par
|
|
| 60 |
|
| 61 |
return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# ---- Gradio Interface ---- #
|
| 64 |
with gr.Blocks() as demo:
|
| 65 |
|
|
@@ -266,4 +315,97 @@ with gr.Blocks() as demo:
|
|
| 266 |
inputs=[hf_model_name_or_path],
|
| 267 |
outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length])
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
demo.launch()
|
|
|
|
| 60 |
|
| 61 |
return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"
|
| 62 |
|
| 63 |
+
# ---- FLOP Calculation ---- #
def calc_flops(vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer):
    """Estimate training (or inference) FLOPs for a transformer, per component.

    Returns a dict mapping component names ('qkv_flops', 'attention_matrix_flops',
    'attention_over_values_flops', 'linear_projection_flops', 'ffn_flops',
    'embedding_flops', 'total_flops') to human-readable strings such as
    "1.28 GFLOPs".

    Notes:
    - An A_(m x k) @ B_(k x n) matmul costs 2*m*k*n FLOPs (multiplies + adds),
      hence the factors of 2 below.
    - `batch_size` is accepted for interface compatibility but not used in the
      arithmetic here; `tokens` already carries the total token count.
    """
    # Multiplier for fwd+bwd passes: 3 for training, +1 if activations are
    # recomputed (Megatron-style checkpointing); inference is forward-only.
    if infer:
        iter_factor = 1
    elif checkpoint_activations:
        iter_factor = 4
    else:
        iter_factor = 3

    # Per-component raw FLOP counts. kv_size_ratio scales the K/V projections
    # relative to Q (1.0 == standard multi-head attention).
    qkv = int(iter_factor * 2 * (1 + 2 * kv_size_ratio) * num_layers * tokens * hidden_size * hidden_size)
    attn_scores = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    attn_values = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    out_proj = iter_factor * 2 * num_layers * tokens * hidden_size * hidden_size
    ffn = int(iter_factor * 2 * ffn_expansion_factor) * num_layers * tokens * hidden_size * hidden_size
    embed = 6 * tokens * hidden_size * vocab_size

    # With top-k MoE routing, each routed token visits topk experts on the
    # MoE layers (one layer every `expert_interval`).
    if moe and topk > 1:
        ffn += ffn * topk / expert_interval

    total = qkv + attn_scores + attn_values + out_proj + ffn + embed
    if moe:
        # Router/gating cost for the MoE layers.
        total += 2 * num_experts * hidden_size / expert_interval

    def convert_flops(params):
        # Render a raw FLOP count with a metric suffix (KFLOPs, MFLOPs, ...).
        if params == 0:
            return "0"
        size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
        i = int(math.floor(math.log(params, 1000)))
        p = math.pow(1000, i)
        s = round(params / p, 2)
        return f"{s} {size_name[i]}"

    components = {
        'qkv_flops': qkv,
        'attention_matrix_flops': attn_scores,
        'attention_over_values_flops': attn_values,
        'linear_projection_flops': out_proj,
        'ffn_flops': ffn,
        'embedding_flops': embed,
        'total_flops': total,
    }
    return {name: convert_flops(value) for name, value in components.items()}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
# ---- Gradio Interface ---- #
|
| 113 |
with gr.Blocks() as demo:
|
| 114 |
|
|
|
|
| 315 |
inputs=[hf_model_name_or_path],
|
| 316 |
outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length])
|
| 317 |
|
| 318 |
+
# New FLOP Calculation Tab
|
| 319 |
+
with gr.TabItem("FLOP Calculation"):
|
| 320 |
+
gr.Markdown("""
|
| 321 |
+
## FLOP Calculation
|
| 322 |
+
|
| 323 |
+
FLOP Calculation estimates the number of floating point operations (FLOPs) for training or inference of a model.
|
| 324 |
+
Provide the necessary model hyperparameters and click 'Calculate FLOPs' to get a result.
|
| 325 |
+
""")
|
| 326 |
+
with gr.Row():
|
| 327 |
+
with gr.Column():
|
| 328 |
+
vocab_size = gr.Number(
|
| 329 |
+
label="Vocab Size",
|
| 330 |
+
value=51200,
|
| 331 |
+
info="How many tokens are in the embedding layer"
|
| 332 |
+
)
|
| 333 |
+
hidden_size = gr.Number(
|
| 334 |
+
label="Hidden Size",
|
| 335 |
+
value=6144,
|
| 336 |
+
info="Dimension of the model's hidden size"
|
| 337 |
+
)
|
| 338 |
+
sequence_length = gr.Number(
|
| 339 |
+
label="Sequence Length",
|
| 340 |
+
value=2048,
|
| 341 |
+
info="Sequence length used for training"
|
| 342 |
+
)
|
| 343 |
+
num_layers = gr.Number(
|
| 344 |
+
label="Number of Layers",
|
| 345 |
+
value=44,
|
| 346 |
+
info="Number of transformer layers used in the model"
|
| 347 |
+
)
|
| 348 |
+
kv_size_ratio = gr.Number(
|
| 349 |
+
label="KV Size Ratio",
|
| 350 |
+
value=1.0,
|
| 351 |
+
info="Ratio of kv heads to query heads used in model. 1.0 for MHA"
|
| 352 |
+
)
|
| 353 |
+
topk = gr.Number(
|
| 354 |
+
label="Top K Routing for MoE",
|
| 355 |
+
value=1,
|
| 356 |
+
info="Top k routing for Mixture of Experts (MoE)"
|
| 357 |
+
)
|
| 358 |
+
moe = gr.Checkbox(
|
| 359 |
+
label="Mixture of Experts (MoE)",
|
| 360 |
+
value=False,
|
| 361 |
+
info="Whether the model uses Mixture of Experts"
|
| 362 |
+
)
|
| 363 |
+
num_experts = gr.Number(
|
| 364 |
+
label="Number of Experts",
|
| 365 |
+
value=128,
|
| 366 |
+
info="Number of experts for Mixture of Experts (MoE)"
|
| 367 |
+
)
|
| 368 |
+
expert_interval = gr.Number(
|
| 369 |
+
label="Expert Interval",
|
| 370 |
+
value=2,
|
| 371 |
+
info="Expert interval for Mixture of Experts (MoE)"
|
| 372 |
+
)
|
| 373 |
+
batch_size = gr.Number(
|
| 374 |
+
label="Batch Size",
|
| 375 |
+
value=1,
|
| 376 |
+
info="Global batch size in units of samples"
|
| 377 |
+
)
|
| 378 |
+
tokens = gr.Number(
|
| 379 |
+
label="Number of Tokens",
|
| 380 |
+
value=300e9,
|
| 381 |
+
info="Total number of tokens for training"
|
| 382 |
+
)
|
| 383 |
+
checkpoint_activations = gr.Checkbox(
|
| 384 |
+
label="Checkpoint Activations",
|
| 385 |
+
value=True,
|
| 386 |
+
info="Whether Megatron-style activation checkpointing is being used"
|
| 387 |
+
)
|
| 388 |
+
ffn_expansion_factor = gr.Number(
|
| 389 |
+
label="FFN Expansion Factor",
|
| 390 |
+
value=4,
|
| 391 |
+
info="How much the MLP hidden size expands"
|
| 392 |
+
)
|
| 393 |
+
infer = gr.Checkbox(
|
| 394 |
+
label="Inference-Only",
|
| 395 |
+
value=False,
|
| 396 |
+
info="Whether the model is being used for inference-only"
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
calc_flops_button = gr.Button("Calculate FLOPs")
|
| 400 |
+
flops_result = gr.JSON(label="FLOP Calculation Result", interactive=False)
|
| 401 |
+
calc_flops_button.click(
|
| 402 |
+
calc_flops,
|
| 403 |
+
inputs=[vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer],
|
| 404 |
+
outputs=flops_result
|
| 405 |
+
)
|
| 406 |
+
|
| 407 |
+
hf_model_name_or_path = gr.Textbox(label="HuggingFace Model Name or Path", info="Name of the HuggingFace model or local path")
|
| 408 |
+
hf_model_name_or_path.change(fn=get_hf_model_args, inputs=[hf_model_name_or_path], outputs=[num_layers, hidden_size, vocab_size, sequence_length])
|
| 409 |
+
|
| 410 |
+
|
| 411 |
demo.launch()  # start the Gradio interface defined above (blocks until shut down)
|