import gradio as gr
import csv
import os

# numpy is used by the logarithmic slider helpers below
import numpy as np


def load_gpu_data():
    """Load GPU data from the gpus.csv file."""
    gpu_data = {}
    csv_path = os.path.join(os.path.dirname(__file__), 'gpus.csv')
    try:
        with open(csv_path, 'r') as file:
            reader = csv.DictReader(file)
            for row in reader:
                gpu_name = row['gpu_model'].replace('_', ' ')
                # Column name as spelled in the gpus.csv header
                tflops = float(row['sparce_tflops'])
                gpu_data[gpu_name] = tflops
    except Exception as e:
        print(f"Error loading GPU data: {e}")
        # Fall back to an empty dict; "Custom" is always prepended to the
        # dropdown choices below, so it must not also appear here.
        gpu_data = {}
    return gpu_data


def calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus,
                            tokens_millions, mfu_percentage):
    """
    Calculate the time to train a model.

    Formula:
    - Total FLOPs = 6 * num_params * num_tokens
    - Effective FLOPs per second = tflops_per_gpu * num_gpus * 10^12 * (MFU / 100)
    - Training time = Total FLOPs / Effective FLOPs per second

    Args:
        model_size_billions: Model size in billions of parameters
        tflops_per_gpu: BF16 TFLOPs per GPU (effective, non-sparsity)
        num_gpus: Number of GPUs used
        tokens_millions: Number of tokens in millions
        mfu_percentage: Model FLOPs Utilization percentage

    Returns:
        Training time in hours
    """
    # Convert inputs to base units
    num_params = model_size_billions * 1e9
    num_tokens = tokens_millions * 1e6

    # Total FLOPs needed, using the standard 6ND approximation
    total_flops = 6 * num_params * num_tokens

    # Effective FLOPs per second; tflops_per_gpu is in 10^12 FLOPs per second
    flops_per_second = tflops_per_gpu * num_gpus * 1e12 * (mfu_percentage / 100)

    # Training time in seconds, then hours
    training_time_seconds = total_flops / flops_per_second
    training_time_hours = training_time_seconds / 3600
    return training_time_hours


def format_output(hours):
    """Format the output in a readable way."""
    if hours < 24:
        return f"{hours:.2f} hours"
    days = hours / 24
    if days < 30:
        return f"{days:.2f} days ({hours:.1f} hours)"
    months = days / 30
    return f"{months:.2f} months ({days:.1f} days, {hours:.0f} hours)"


def slider_to_model_size(value):
    """Convert a logarithmic slider value to a model size in billions."""
    # Map 0-100 to 0.1B-1000B logarithmically
    min_log = np.log10(0.1)   # -1
    max_log = np.log10(1000)  # 3
    log_value = min_log + (max_log - min_log) * value / 100
    return 10 ** log_value


def model_size_to_slider(size_billions):
    """Convert a model size in billions to a slider value."""
    min_log = np.log10(0.1)
    max_log = np.log10(1000)
    log_value = np.log10(size_billions)
    return 100 * (log_value - min_log) / (max_log - min_log)


def format_model_size(size_billions):
    """Format a model size for display."""
    if size_billions < 1:
        return f"{size_billions * 1000:.0f}M"
    elif size_billions < 1000:
        return f"{size_billions:.1f}B"
    else:
        return f"{size_billions / 1000:.1f}T"


def update_calculation(model_size_value, model_size_unit, use_gpu_model, gpu_model,
                       custom_tflops, num_gpus, tokens_value, tokens_unit,
                       mfu_percentage):
    """Update the calculation and return formatted results."""
    # Convert model size to billions
    if model_size_unit == "B":
        model_size_billions = model_size_value
    else:  # T
        model_size_billions = model_size_value * 1000

    # Convert tokens to millions
    if tokens_unit == "M":
        tokens_millions = tokens_value
    elif tokens_unit == "B":
        tokens_millions = tokens_value * 1000
    else:  # T
        tokens_millions = tokens_value * 1000000

    # Determine the TFLOPs value
    if use_gpu_model and gpu_model != "Custom":
        gpu_data = load_gpu_data()
        tflops_per_gpu = gpu_data.get(gpu_model, custom_tflops)
        gpu_info = f"{gpu_model} ({tflops_per_gpu} TFLOPs)"
    else:
        tflops_per_gpu = custom_tflops
        gpu_info = f"Custom ({tflops_per_gpu} TFLOPs)"
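    # Worked example of the math applied below (a hedged sanity check, not UI
    # logic): 7B params and 2T tokens give 6 * 7e9 * 2e12 = 8.4e22 total FLOPs;
    # 8 GPUs at 989 TFLOPs (an assumed H100-class dense BF16 figure) and 50% MFU
    # give 989e12 * 8 * 0.5 ≈ 3.96e15 FLOP/s, so 8.4e22 / 3.96e15 ≈ 2.12e7 s,
    # or roughly 5898 hours.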
    hours = calculate_training_time(model_size_billions, tflops_per_gpu,
                                    num_gpus, tokens_millions, mfu_percentage)

    # Create a detailed breakdown
    total_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)
    effective_tflops = tflops_per_gpu * num_gpus * (mfu_percentage / 100)

    breakdown = f"""
### Calculation Breakdown:
- **GPU Selection**: {gpu_info}
- **Model Size**: {format_model_size(model_size_billions)} parameters ({model_size_billions:.2f}B)
- **Training Tokens**: {tokens_value}{tokens_unit} tokens ({tokens_millions:.0f}M)
- **Total FLOPs**: {total_flops:.2e} FLOPs
  - **Formula**: 6 × {model_size_billions:.2f}B params × {tokens_millions:.0f}M tokens
- **Effective TFLOPs**: {effective_tflops:.2f} TFLOPs/s
  - **Formula**: {tflops_per_gpu} TFLOPs/GPU × {num_gpus} GPUs × {mfu_percentage}% MFU

### Training Time:
**{format_output(hours)}**
"""
    return breakdown


# Load GPU data
gpu_data = load_gpu_data()
gpu_choices = ["Custom"] + list(gpu_data.keys())

# Create the Gradio interface
with gr.Blocks(title="Model Training Time Calculator") as demo:
    gr.Markdown("# Model Training Time Calculator")
    gr.Markdown("Calculate the time required to train a model based on model size, "
                "hardware specs, and token count.")

    with gr.Row():
        with gr.Column():
            with gr.Row():
                model_size_value = gr.Number(
                    minimum=0.5,
                    maximum=1000,
                    value=7,
                    step=0.1,
                    label="Model Size",
                    info="Enter model size (0.5-1000)"
                )
                model_size_unit = gr.Radio(
                    choices=["B", "T"],
                    value="B",
                    label="Unit",
                    info="Model size unit"
                )

            # GPU selection
            use_gpu_model = gr.Checkbox(
                value=True,
                label="Use GPU Model from List",
                info="Check to select a GPU model, uncheck to input custom TFLOPs"
            )
            gpu_model = gr.Dropdown(
                choices=gpu_choices,
                value="H100" if "H100" in gpu_choices else gpu_choices[0],
                label="GPU Model",
                info="Select a GPU model from the list",
                visible=True
            )
            custom_tflops = gr.Slider(
                minimum=10,
                maximum=2000,
                value=300,
                step=10,
                label="Custom BF16 TFLOPs per GPU",
                info="Effective (non-sparsity) TFLOPs per GPU",
                visible=False
            )
            num_gpus = gr.Slider(
                minimum=1,
                maximum=1024,
                value=8,
                step=1,
                label="Number of GPUs",
                info="Total number of GPUs for training"
            )

            with gr.Row():
                tokens_value = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                    label="Training Tokens",
                    info="Number of training tokens"
                )
                tokens_unit = gr.Radio(
                    choices=["M", "B", "T"],
                    value="B",
                    label="Unit",
                    info="Token count unit"
                )

            mfu = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Model FLOPs Utilization (MFU) %",
                info="Efficiency of hardware utilization (~50% is a typical, conservative estimate)"
            )

        with gr.Column():
            output = gr.Markdown(label="Results")

    # Toggle between GPU model and custom TFLOPs. The dropdown is passed as an
    # input so its current value is read at event time (reading gpu_model.value
    # inside the callback would only ever see the initial value).
    def toggle_gpu_input(use_gpu, gpu_model_value):
        return (
            gr.update(visible=use_gpu),
            gr.update(visible=(not use_gpu) or gpu_model_value == "Custom"),
        )

    use_gpu_model.change(
        fn=toggle_gpu_input,
        inputs=[use_gpu_model, gpu_model],
        outputs=[gpu_model, custom_tflops]
    )

    # Show the custom TFLOPs slider when "Custom" is selected
    def check_custom_selected(gpu_model_value):
        return gr.update(visible=gpu_model_value == "Custom")

    gpu_model.change(
        fn=check_custom_selected,
        inputs=[gpu_model],
        outputs=[custom_tflops]
    )

    # Set up live updating
    all_inputs = [model_size_value, model_size_unit, use_gpu_model, gpu_model,
                  custom_tflops, num_gpus, tokens_value, tokens_unit, mfu]
    for input_component in all_inputs:
        input_component.change(
            fn=update_calculation,
            inputs=all_inputs,
            outputs=output
        )

    # Initial calculation
    demo.load(
        fn=update_calculation,
        inputs=all_inputs,
        outputs=output
    )
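# A minimal sanity check of the core formula (hedged: the 989 TFLOPs figure is
# an assumed H100-class dense BF16 rating, and the expected value is rounded):
#
#   >>> round(calculate_training_time(7, 989, 8, 2_000_000, 50))
#   5898
#
# i.e. roughly 246 days to push 2T tokens through a 7B model on 8 GPUs at 50% MFU.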
if __name__ == "__main__":
    demo.launch()
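# Note: demo.launch() serves on localhost by default; Gradio's launch() also
# accepts server_name="0.0.0.0" or share=True to expose the app more widely.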