Update app.py
Browse files
app.py
CHANGED
|
@@ -60,6 +60,55 @@ def calc_mem(hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_par
|
|
| 60 |
|
| 61 |
return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# ---- Gradio Interface ---- #
|
| 64 |
with gr.Blocks() as demo:
|
| 65 |
|
|
@@ -266,4 +315,97 @@ with gr.Blocks() as demo:
|
|
| 266 |
inputs=[hf_model_name_or_path],
|
| 267 |
outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length])
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
demo.launch()
|
|
|
|
| 60 |
|
| 61 |
return f"Per-GPU Memory Required for Training: {per_gpu_mem_gib:.2f} GiB"
|
| 62 |
|
| 63 |
+
# ---- FLOP Calculation ---- #
def calc_flops(vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer):
    """Estimate training (or inference) FLOPs for a transformer, per component.

    Returns a dict mapping component names ('qkv_flops', 'attention_matrix_flops',
    'attention_over_values_flops', 'linear_projection_flops', 'ffn_flops',
    'embedding_flops', 'total_flops') to human-readable strings such as
    "1.28 GFLOPs".

    Notes:
    - An A_(m x k) @ B_(k x n) matmul costs 2*m*k*n FLOPs (multiplies + adds),
      hence the factors of 2 below.
    - `batch_size` is accepted for interface compatibility but not used in the
      arithmetic here; `tokens` already carries the total token count.
    """
    # Multiplier for fwd+bwd passes: 3 for training, +1 if activations are
    # recomputed (Megatron-style checkpointing); inference is forward-only.
    if infer:
        iter_factor = 1
    elif checkpoint_activations:
        iter_factor = 4
    else:
        iter_factor = 3

    # Per-component raw FLOP counts. kv_size_ratio scales the K/V projections
    # relative to Q (1.0 == standard multi-head attention).
    qkv = int(iter_factor * 2 * (1 + 2 * kv_size_ratio) * num_layers * tokens * hidden_size * hidden_size)
    attn_scores = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    attn_values = iter_factor * 2 * num_layers * tokens * sequence_length * hidden_size
    out_proj = iter_factor * 2 * num_layers * tokens * hidden_size * hidden_size
    ffn = int(iter_factor * 2 * ffn_expansion_factor) * num_layers * tokens * hidden_size * hidden_size
    embed = 6 * tokens * hidden_size * vocab_size

    # With top-k MoE routing, each routed token visits topk experts on the
    # MoE layers (one layer every `expert_interval`).
    if moe and topk > 1:
        ffn += ffn * topk / expert_interval

    total = qkv + attn_scores + attn_values + out_proj + ffn + embed
    if moe:
        # Router/gating cost for the MoE layers.
        total += 2 * num_experts * hidden_size / expert_interval

    def convert_flops(params):
        # Render a raw FLOP count with a metric suffix (KFLOPs, MFLOPs, ...).
        if params == 0:
            return "0"
        size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs")
        i = int(math.floor(math.log(params, 1000)))
        p = math.pow(1000, i)
        s = round(params / p, 2)
        return f"{s} {size_name[i]}"

    components = {
        'qkv_flops': qkv,
        'attention_matrix_flops': attn_scores,
        'attention_over_values_flops': attn_values,
        'linear_projection_flops': out_proj,
        'ffn_flops': ffn,
        'embedding_flops': embed,
        'total_flops': total,
    }
    return {name: convert_flops(value) for name, value in components.items()}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
# ---- Gradio Interface ---- #
|
| 113 |
with gr.Blocks() as demo:
|
| 114 |
|
|
|
|
| 315 |
inputs=[hf_model_name_or_path],
|
| 316 |
outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length])
|
| 317 |
|
| 318 |
+
# New FLOP Calculation Tab
|
| 319 |
+
with gr.TabItem("FLOP Calculation"):
|
| 320 |
+
gr.Markdown("""
|
| 321 |
+
## FLOP Calculation
|
| 322 |
+
|
| 323 |
+
FLOP Calculation estimates the number of floating point operations (FLOPs) for training or inference of a model.
|
| 324 |
+
Provide the necessary model hyperparameters and click 'Calculate FLOPs' to get a result.
|
| 325 |
+
""")
|
| 326 |
+
with gr.Row():
|
| 327 |
+
with gr.Column():
|
| 328 |
+
vocab_size = gr.Number(
|
| 329 |
+
label="Vocab Size",
|
| 330 |
+
value=51200,
|
| 331 |
+
info="How many tokens are in the embedding layer"
|
| 332 |
+
)
|
| 333 |
+
hidden_size = gr.Number(
|
| 334 |
+
label="Hidden Size",
|
| 335 |
+
value=6144,
|
| 336 |
+
info="Dimension of the model's hidden size"
|
| 337 |
+
)
|
| 338 |
+
sequence_length = gr.Number(
|
| 339 |
+
label="Sequence Length",
|
| 340 |
+
value=2048,
|
| 341 |
+
info="Sequence length used for training"
|
| 342 |
+
)
|
| 343 |
+
num_layers = gr.Number(
|
| 344 |
+
label="Number of Layers",
|
| 345 |
+
value=44,
|
| 346 |
+
info="Number of transformer layers used in the model"
|
| 347 |
+
)
|
| 348 |
+
kv_size_ratio = gr.Number(
|
| 349 |
+
label="KV Size Ratio",
|
| 350 |
+
value=1.0,
|
| 351 |
+
info="Ratio of kv heads to query heads used in model. 1.0 for MHA"
|
| 352 |
+
)
|
| 353 |
+
topk = gr.Number(
|
| 354 |
+
label="Top K Routing for MoE",
|
| 355 |
+
value=1,
|
| 356 |
+
info="Top k routing for Mixture of Experts (MoE)"
|
| 357 |
+
)
|
| 358 |
+
moe = gr.Checkbox(
|
| 359 |
+
label="Mixture of Experts (MoE)",
|
| 360 |
+
value=False,
|
| 361 |
+
info="Whether the model uses Mixture of Experts"
|
| 362 |
+
)
|
| 363 |
+
num_experts = gr.Number(
|
| 364 |
+
label="Number of Experts",
|
| 365 |
+
value=128,
|
| 366 |
+
info="Number of experts for Mixture of Experts (MoE)"
|
| 367 |
+
)
|
| 368 |
+
expert_interval = gr.Number(
|
| 369 |
+
label="Expert Interval",
|
| 370 |
+
value=2,
|
| 371 |
+
info="Expert interval for Mixture of Experts (MoE)"
|
| 372 |
+
)
|
| 373 |
+
batch_size = gr.Number(
|
| 374 |
+
label="Batch Size",
|
| 375 |
+
value=1,
|
| 376 |
+
info="Global batch size in units of samples"
|
| 377 |
+
)
|
| 378 |
+
tokens = gr.Number(
|
| 379 |
+
label="Number of Tokens",
|
| 380 |
+
value=300e9,
|
| 381 |
+
info="Total number of tokens for training"
|
| 382 |
+
)
|
| 383 |
+
checkpoint_activations = gr.Checkbox(
|
| 384 |
+
label="Checkpoint Activations",
|
| 385 |
+
value=True,
|
| 386 |
+
info="Whether Megatron-style activation checkpointing is being used"
|
| 387 |
+
)
|
| 388 |
+
ffn_expansion_factor = gr.Number(
|
| 389 |
+
label="FFN Expansion Factor",
|
| 390 |
+
value=4,
|
| 391 |
+
info="How much the MLP hidden size expands"
|
| 392 |
+
)
|
| 393 |
+
infer = gr.Checkbox(
|
| 394 |
+
label="Inference-Only",
|
| 395 |
+
value=False,
|
| 396 |
+
info="Whether the model is being used for inference-only"
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
calc_flops_button = gr.Button("Calculate FLOPs")
|
| 400 |
+
flops_result = gr.JSON(label="FLOP Calculation Result", interactive=False)
|
| 401 |
+
calc_flops_button.click(
|
| 402 |
+
calc_flops,
|
| 403 |
+
inputs=[vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer],
|
| 404 |
+
outputs=flops_result
|
| 405 |
+
)
|
| 406 |
+
|
| 407 |
+
hf_model_name_or_path = gr.Textbox(label="HuggingFace Model Name or Path", info="Name of the HuggingFace model or local path")
|
| 408 |
+
hf_model_name_or_path.change(fn=get_hf_model_args, inputs=[hf_model_name_or_path], outputs=[num_layers, hidden_size, vocab_size, sequence_length])
|
| 409 |
+
|
| 410 |
+
|
| 411 |
demo.launch()  # start the Gradio interface defined above (blocks until shut down)
|