Spaces:

scratchtoscale
/

training-time-calculator

Running

App Files Files Community

muellerzr committed on Sep 19

Commit

32e471c

verified ·

1 Parent(s): a687dc1

Create training_time_calculator.py

Browse files

Files changed (1) hide show

training_time_calculator.py +272 -0

training_time_calculator.py ADDED Viewed

	@@ -0,0 +1,272 @@

+import gradio as gr
+import csv
+import os
+import numpy as np
def load_gpu_data():
    """Load per-GPU TFLOPs figures from the ``gpus.csv`` file next to this module.

    Every CSV row is expected to have a ``gpu_model`` column (underscores are
    replaced by spaces for display) and a ``sparce_tflops`` column holding a
    number.  Rows that lack a column or hold a non-numeric value are reported
    and skipped, so one bad row no longer discards the whole table.

    Returns:
        A dict mapping the display name of each GPU to its TFLOPs as a float.
        If the file cannot be read or parsed, or if rows were present but none
        of them was usable, the fallback ``{"Custom": 0}`` is returned.
    """
    fallback = {"Custom": 0}
    gpu_data = {}
    skipped_rows = 0
    csv_path = os.path.join(os.path.dirname(__file__), 'gpus.csv')
    try:
        # newline='' is what the csv module asks for; utf-8-sig also accepts
        # plain UTF-8 and strips a BOM that would otherwise corrupt the first
        # header name.
        with open(csv_path, 'r', encoding='utf-8-sig', newline='') as file:
            reader = csv.DictReader(file)
            for row in reader:
                try:
                    gpu_name = row['gpu_model'].replace('_', ' ')
                    tflops = float(row['sparce_tflops'])
                except (KeyError, TypeError, ValueError, AttributeError) as e:
                    # Short rows yield None values; bad numbers raise ValueError.
                    skipped_rows += 1
                    print(f"Skipping GPU data row at line {reader.line_num}: {e!r}")
                    continue
                gpu_data[gpu_name] = tflops
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error loading GPU data: {e}")
        return fallback
    if not gpu_data and skipped_rows:
        # Nothing usable was found: behave as if the file failed to load.
        return fallback
    return gpu_data
def calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage):
    """
    Estimate the wall-clock time needed to train a model.

    Formula:
    - Total FLOPs = 6 * num_params * num_tokens
    - Effective FLOPs per second = tflops_per_gpu * num_gpus * 10^12 * (MFU/100)
    - Training time = Total FLOPs / Effective FLOPs per second

    Args:
        model_size_billions: Model size in billions of parameters (>= 0)
        tflops_per_gpu: TFLOPs per GPU (> 0)
        num_gpus: Number of GPUs used (> 0)
        tokens_millions: Number of training tokens in millions (>= 0)
        mfu_percentage: Model FLOPs Utilization as a percentage (> 0)

    Returns:
        Training time in hours

    Raises:
        ValueError: If the workload is negative or any throughput factor is
            not strictly positive (which would otherwise divide by zero or
            produce a negative duration).
    """
    if model_size_billions < 0 or tokens_millions < 0:
        raise ValueError("model_size_billions and tokens_millions must be non-negative")
    if tflops_per_gpu <= 0 or num_gpus <= 0 or mfu_percentage <= 0:
        raise ValueError("tflops_per_gpu, num_gpus and mfu_percentage must be positive")

    # Convert inputs to base units
    num_params = model_size_billions * 1e9
    num_tokens = tokens_millions * 1e6

    # Total work: 6 FLOPs per parameter per token
    total_flops = 6 * num_params * num_tokens

    # Sustained throughput of the whole cluster in FLOPs per second;
    # tflops_per_gpu is expressed in units of 10^12 FLOPs per second
    flops_per_second = tflops_per_gpu * num_gpus * 1e12 * (mfu_percentage / 100)

    training_time_seconds = total_flops / flops_per_second
    return training_time_seconds / 3600
def format_output(hours):
    """Render a duration given in hours as hours, days, or 30-day months.

    Durations under 24 hours are shown in hours, durations under 30 days in
    days (with the hours in parentheses), and anything longer in months of
    30 days (with days and hours in parentheses).
    """
    if hours < 24:
        return f"{hours:.2f} hours"

    day_count = hours / 24
    if day_count < 30:
        return f"{day_count:.2f} days ({hours:.1f} hours)"

    month_count = day_count / 30
    return f"{month_count:.2f} months ({day_count:.1f} days, {hours:.0f} hours)"
def slider_to_model_size(value):
    """Map a slider position onto a model size in billions of parameters.

    The mapping is logarithmic: position 0 corresponds to 0.1B and position
    100 to 1000B, with the exponent interpolated linearly in between.
    """
    low_exp, high_exp = np.log10(0.1), np.log10(1000)  # -1 and 3
    exponent = low_exp + (high_exp - low_exp) * value / 100
    return 10 ** exponent
def model_size_to_slider(size_billions):
    """Map a model size in billions of parameters onto a slider position.

    Uses a logarithmic scale on which 0.1B maps to 0 and 1000B maps to 100.
    """
    lower_exp = np.log10(0.1)
    upper_exp = np.log10(1000)
    offset = np.log10(size_billions) - lower_exp
    return 100 * offset / (upper_exp - lower_exp)
def format_model_size(size_billions):
    """Format a model size given in billions of parameters for display.

    Sizes below 1B are shown in whole millions ("500M"), sizes below 1000B
    with one decimal in billions ("7.0B"), and larger sizes with one decimal
    in trillions ("1.5T").  When rounding would roll a value over to the next
    unit (for example 0.9999B would print as "1000M"), the next unit is used
    instead so the output never reads "1000M" or "1000.0B".

    Args:
        size_billions: Model size in billions of parameters.

    Returns:
        The formatted size string with an M, B, or T suffix.
    """
    # Render first, then check the rendered text: this matches the rounding
    # the format spec actually performs, with no separate threshold to tune.
    millions_text = f"{size_billions * 1000:.0f}"
    if size_billions < 1 and millions_text != "1000":
        return f"{millions_text}M"

    billions_text = f"{size_billions:.1f}"
    if size_billions < 1000 and billions_text != "1000.0":
        return f"{billions_text}B"

    return f"{size_billions / 1000:.1f}T"
def update_calculation(model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu_percentage):
    """Run the training-time estimate for the current inputs and format it as Markdown.

    Args:
        model_size_value: Model size expressed in ``model_size_unit``.
        model_size_unit: "B" for billions; any other value is treated as trillions.
        use_gpu_model: When truthy, look the TFLOPs up by ``gpu_model``.
        gpu_model: Name of the selected GPU, or "Custom".
        custom_tflops: TFLOPs per GPU used for "Custom" or for an unknown GPU name.
        num_gpus: Number of GPUs.
        tokens_value: Token count expressed in ``tokens_unit``.
        tokens_unit: "M" for millions, "B" for billions; any other value is
            treated as trillions.
        mfu_percentage: Model FLOPs Utilization as a percentage.

    Returns:
        A Markdown string with the calculation breakdown and the training
        time.  If an input is missing (``None``) or not positive, a short
        Markdown prompt is returned instead of raising.
    """
    invalid_input_message = (
        "\n### Training Time:\n"
        "**Enter positive values for model size, TFLOPs, GPU count, tokens, and MFU to see an estimate.**\n"
    )

    # An emptied numeric field reaches this callback as None, and a zero would
    # divide by zero further down; answer with a prompt instead of an error.
    required_values = (model_size_value, num_gpus, tokens_value, mfu_percentage)
    if any(value is None or value <= 0 for value in required_values):
        return invalid_input_message

    # Convert model size to billions ("B" is already billions, otherwise trillions)
    model_size_billions = model_size_value * {"B": 1}.get(model_size_unit, 1000)
    # Convert tokens to millions ("M" as-is, "B" x1000, otherwise trillions)
    tokens_millions = tokens_value * {"M": 1, "B": 1000}.get(tokens_unit, 1000000)

    # Determine TFLOPs value
    if use_gpu_model and gpu_model != "Custom":
        gpu_data = load_gpu_data()
        # Unknown GPU names fall back to the custom TFLOPs figure
        tflops_per_gpu = gpu_data.get(gpu_model, custom_tflops)
        gpu_info = f"{gpu_model} ({tflops_per_gpu} TFLOPs)"
    else:
        tflops_per_gpu = custom_tflops
        gpu_info = f"Custom ({tflops_per_gpu} TFLOPs)"

    if tflops_per_gpu is None or tflops_per_gpu <= 0:
        return invalid_input_message

    hours = calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage)

    # Recompute the intermediate figures shown in the breakdown
    total_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)
    effective_tflops = tflops_per_gpu * num_gpus * (mfu_percentage / 100)
    breakdown = f"""
### Calculation Breakdown:
- **GPU Selection**: {gpu_info}
- **Model Size**: {format_model_size(model_size_billions)} parameters ({model_size_billions:.2f}B)
- **Training Tokens**: {tokens_value}{tokens_unit} tokens ({tokens_millions:.0f}M)
- **Total FLOPs**: {total_flops:.2e} FLOPs
- **Formula**: 6 × {model_size_billions:.2f}B params × {tokens_millions:.0f}M tokens
- **Effective TFLOPs**: {effective_tflops:.2f} TFLOPs/s
- **Formula**: {tflops_per_gpu} TFLOPs/GPU × {num_gpus} GPUs × {mfu_percentage}% MFU
### Training Time:
**{format_output(hours)}**
"""
    return breakdown
# Load the GPU table once at import time to populate the dropdown.
gpu_data = load_gpu_data()
# "Custom" always leads the list.  Drop any "Custom" key coming from the data
# (the load_gpu_data fallback is {"Custom": 0}) so it is not listed twice.
gpu_choices = ["Custom"] + [name for name in gpu_data if name != "Custom"]
# Create the Gradio interface: inputs in the left column, Markdown results on the right.
with gr.Blocks(title="Model Training Time Calculator") as demo:
    gr.Markdown("# Model Training Time Calculator")
    gr.Markdown("Calculate the time required to train a model based on model size, hardware specs, and token count.")
    with gr.Row():
        with gr.Column():
            with gr.Row():
                model_size_value = gr.Number(
                    minimum=0.5,
                    maximum=1000,
                    value=7,
                    step=0.1,
                    label="Model Size",
                    info="Enter model size (0.5-1000)"
                )
                model_size_unit = gr.Radio(
                    choices=["B", "T"],
                    value="B",
                    label="Unit",
                    info="Model size unit"
                )
            # GPU Selection
            use_gpu_model = gr.Checkbox(
                value=True,
                label="Use GPU Model from List",
                info="Check to select a GPU model, uncheck to input custom TFLOPs"
            )
            # Prefer H100 when the table has it; otherwise start on the first choice.
            default_gpu = "H100" if "H100" in gpu_choices else gpu_choices[0]
            gpu_model = gr.Dropdown(
                choices=gpu_choices,
                value=default_gpu,
                label="GPU Model",
                info="Select a GPU model from the list",
                visible=True
            )
            custom_tflops = gr.Slider(
                minimum=10,
                maximum=2000,
                value=300,
                step=10,
                label="Custom BF16 TFLOPs per GPU",
                info="Effective (non-sparsity) TFLOPs per GPU",
                # Must be visible from the start when the default choice is
                # "Custom", otherwise the TFLOPs value cannot be edited.
                visible=default_gpu == "Custom"
            )
            num_gpus = gr.Slider(
                minimum=1,
                maximum=1024,
                value=8,
                step=1,
                label="Number of GPUs",
                info="Total number of GPUs for training"
            )
            with gr.Row():
                tokens_value = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                    label="Training Tokens",
                    info="Number of training tokens"
                )
                tokens_unit = gr.Radio(
                    choices=["M", "B", "T"],
                    value="B",
                    label="Unit",
                    info="Token count unit"
                )
            mfu = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Model FLOPs Utilization (MFU) %",
                info="Efficiency of hardware utilization (50% is typical for low-end estimate)"
            )
        with gr.Column():
            output = gr.Markdown(label="Results")

    # Toggle between GPU model and custom TFLOPs
    def toggle_gpu_input(use_gpu, gpu_model_value=None):
        """Return visibility updates for the GPU dropdown and the custom TFLOPs slider.

        The dropdown is shown only while the checkbox is ticked.  The slider is
        shown when the checkbox is unticked or the dropdown currently reads
        "Custom".  The current dropdown value is passed in as an event input;
        reading ``gpu_model.value`` here would only give the value the
        component was constructed with.
        """
        show_custom = (not use_gpu) or gpu_model_value == "Custom"
        return gr.update(visible=use_gpu), gr.update(visible=show_custom)
    use_gpu_model.change(
        fn=toggle_gpu_input,
        inputs=[use_gpu_model, gpu_model],
        outputs=[gpu_model, custom_tflops]
    )

    # Show custom TFLOPs when "Custom" is selected
    def check_custom_selected(gpu_model_value):
        """Return a visibility update that shows the slider only for "Custom"."""
        return gr.update(visible=gpu_model_value == "Custom")
    gpu_model.change(
        fn=check_custom_selected,
        inputs=[gpu_model],
        outputs=[custom_tflops]
    )

    # Set up live updating: a change to any input recomputes the result
    all_inputs = [model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu]
    for input_component in all_inputs:
        input_component.change(
            fn=update_calculation,
            inputs=all_inputs,
            outputs=output
        )

    # Initial calculation when the page loads
    demo.load(
        fn=update_calculation,
        inputs=all_inputs,
        outputs=output
    )

# Launch the app only when this file is run as a script
if __name__ == "__main__":
    demo.launch()