muellerzr commited on
Commit
32e471c
·
verified ·
1 Parent(s): a687dc1

Create training_time_calculator.py

Browse files
Files changed (1) hide show
  1. training_time_calculator.py +272 -0
training_time_calculator.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import csv
3
+ import os
4
+ import numpy as np
5
+
6
def load_gpu_data():
    """Load GPU throughput presets from gpus.csv (located next to this file).

    Returns:
        dict mapping display GPU name (underscores replaced by spaces) to
        sparse BF16 TFLOPs as a float. Returns an empty dict when the CSV
        is missing or malformed so that callers which prepend a "Custom"
        choice do not end up with a duplicate "Custom" entry.
    """
    gpu_data = {}
    csv_path = os.path.join(os.path.dirname(__file__), 'gpus.csv')

    try:
        # newline='' is the documented way to open files for the csv module.
        with open(csv_path, 'r', newline='') as file:
            reader = csv.DictReader(file)
            for row in reader:
                # CSV stores names like "H100_SXM"; show them with spaces.
                gpu_name = row['gpu_model'].replace('_', ' ')
                # NOTE: the column really is spelled 'sparce_tflops' in the CSV.
                tflops = float(row['sparce_tflops'])
                gpu_data[gpu_name] = tflops
    except (OSError, KeyError, ValueError, csv.Error) as e:
        # Best-effort: log and fall back to "no presets" rather than crash.
        # (Previously returned {"Custom": 0}, which produced a duplicate
        # "Custom" entry in the dropdown built from this dict.)
        print(f"Error loading GPU data: {e}")
        gpu_data = {}

    return gpu_data
23
+
24
def calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage):
    """Estimate wall-clock training time for a dense transformer model.

    Uses the standard approximation that training costs 6 FLOPs per
    parameter per token (forward + backward), divided by the cluster's
    MFU-discounted throughput.

    Args:
        model_size_billions: Parameter count, in billions.
        tflops_per_gpu: Per-GPU BF16 throughput in TFLOPs (non-sparse).
        num_gpus: Number of GPUs in the cluster.
        tokens_millions: Training-token count, in millions.
        mfu_percentage: Model FLOPs Utilization, as a percentage (0-100).

    Returns:
        Estimated training time in hours (float).
    """
    SECONDS_PER_HOUR = 3600.0

    # Total compute required: 6 * params * tokens, in raw FLOPs.
    required_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)

    # Realized cluster throughput in FLOPs/s (TFLOPs -> FLOPs, scaled by MFU).
    achieved_flops_per_sec = tflops_per_gpu * num_gpus * 1e12 * (mfu_percentage / 100)

    return required_flops / achieved_flops_per_sec / SECONDS_PER_HOUR
61
+
62
def format_output(hours):
    """Render a duration (given in hours) as a human-friendly string.

    Scales the unit with magnitude: plain hours, then days from 24 hours,
    then months from 30 days (months are treated as exactly 30 days).
    """
    if hours >= 24:
        days = hours / 24
        if days >= 30:
            months = days / 30
            return f"{months:.2f} months ({days:.1f} days, {hours:.0f} hours)"
        return f"{days:.2f} days ({hours:.1f} hours)"
    return f"{hours:.2f} hours"
73
+
74
def slider_to_model_size(value):
    """Map a 0-100 slider position to a model size in billions (log scale).

    Position 0 maps to 0.1B and position 100 maps to 1000B, with
    logarithmic interpolation between them.
    """
    lo, hi = np.log10(0.1), np.log10(1000)  # -1 and 3
    exponent = lo + (hi - lo) * value / 100
    return 10 ** exponent
81
+
82
def model_size_to_slider(size_billions):
    """Inverse of slider_to_model_size: billions of params -> 0-100 slider position."""
    lo, hi = np.log10(0.1), np.log10(1000)  # -1 and 3
    return 100 * (np.log10(size_billions) - lo) / (hi - lo)
88
+
89
def format_model_size(size_billions):
    """Format a parameter count given in billions with an M/B/T suffix."""
    if size_billions >= 1000:
        return f"{size_billions / 1000:.1f}T"
    if size_billions >= 1:
        return f"{size_billions:.1f}B"
    return f"{size_billions * 1000:.0f}M"
97
+
98
def update_calculation(model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu_percentage):
    """Recompute the training-time estimate and return a markdown breakdown.

    Wired as the change handler for every input component: receives the
    current value of each control and returns the markdown rendered in
    the results panel.
    """
    # Normalize model size to billions of parameters ("B" or "T").
    model_size_billions = model_size_value * 1000 if model_size_unit == "T" else model_size_value

    # Normalize the token count to millions via a unit-scale table.
    token_scale = {"M": 1, "B": 1000, "T": 1000000}
    tokens_millions = tokens_value * token_scale.get(tokens_unit, 1000000)

    # Pick per-GPU throughput: a preset from the CSV, or the custom slider.
    if use_gpu_model and gpu_model != "Custom":
        tflops_per_gpu = load_gpu_data().get(gpu_model, custom_tflops)
        gpu_info = f"{gpu_model} ({tflops_per_gpu} TFLOPs)"
    else:
        tflops_per_gpu = custom_tflops
        gpu_info = f"Custom ({tflops_per_gpu} TFLOPs)"

    hours = calculate_training_time(model_size_billions, tflops_per_gpu, num_gpus, tokens_millions, mfu_percentage)

    # Recompute the intermediate quantities shown in the breakdown.
    total_flops = 6 * (model_size_billions * 1e9) * (tokens_millions * 1e6)
    effective_tflops = tflops_per_gpu * num_gpus * (mfu_percentage / 100)

    breakdown = f"""
### Calculation Breakdown:
- **GPU Selection**: {gpu_info}
- **Model Size**: {format_model_size(model_size_billions)} parameters ({model_size_billions:.2f}B)
- **Training Tokens**: {tokens_value}{tokens_unit} tokens ({tokens_millions:.0f}M)
- **Total FLOPs**: {total_flops:.2e} FLOPs
- **Formula**: 6 × {model_size_billions:.2f}B params × {tokens_millions:.0f}M tokens
- **Effective TFLOPs**: {effective_tflops:.2f} TFLOPs/s
- **Formula**: {tflops_per_gpu} TFLOPs/GPU × {num_gpus} GPUs × {mfu_percentage}% MFU

### Training Time:
**{format_output(hours)}**
"""

    return breakdown
144
+
145
# Load GPU data once at import time; the dropdown choices are the CSV
# presets with a leading "Custom" entry for manual TFLOPs input.
gpu_data = load_gpu_data()
gpu_choices = ["Custom"] + list(gpu_data.keys())

# Create the Gradio interface
with gr.Blocks(title="Model Training Time Calculator") as demo:
    gr.Markdown("# Model Training Time Calculator")
    gr.Markdown("Calculate the time required to train a model based on model size, hardware specs, and token count.")

    with gr.Row():
        with gr.Column():
            # Left column: all calculator inputs.
            with gr.Row():
                model_size_value = gr.Number(
                    minimum=0.5,
                    maximum=1000,
                    value=7,
                    step=0.1,
                    label="Model Size",
                    info="Enter model size (0.5-1000)"
                )
                model_size_unit = gr.Radio(
                    choices=["B", "T"],
                    value="B",
                    label="Unit",
                    info="Model size unit"
                )

            # GPU Selection
            use_gpu_model = gr.Checkbox(
                value=True,
                label="Use GPU Model from List",
                info="Check to select a GPU model, uncheck to input custom TFLOPs"
            )

            gpu_model = gr.Dropdown(
                choices=gpu_choices,
                # Default to H100 when the CSV provides it; otherwise the
                # first choice ("Custom", which is always present).
                value="H100" if "H100" in gpu_choices else gpu_choices[0],
                label="GPU Model",
                info="Select a GPU model from the list",
                visible=True
            )

            # Hidden until "Custom" is selected or the checkbox is cleared.
            custom_tflops = gr.Slider(
                minimum=10,
                maximum=2000,
                value=300,
                step=10,
                label="Custom BF16 TFLOPs per GPU",
                info="Effective (non-sparsity) TFLOPs per GPU",
                visible=False
            )

            num_gpus = gr.Slider(
                minimum=1,
                maximum=1024,
                value=8,
                step=1,
                label="Number of GPUs",
                info="Total number of GPUs for training"
            )

            with gr.Row():
                tokens_value = gr.Slider(
                    minimum=1,
                    maximum=1000,
                    value=100,
                    step=1,
                    label="Training Tokens",
                    info="Number of training tokens"
                )
                tokens_unit = gr.Radio(
                    choices=["M", "B", "T"],
                    value="B",
                    label="Unit",
                    info="Token count unit"
                )

            mfu = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Model FLOPs Utilization (MFU) %",
                info="Efficiency of hardware utilization (50% is typical for low-end estimate)"
            )

        with gr.Column():
            # Right column: markdown panel that displays the breakdown.
            output = gr.Markdown(label="Results")

    # Toggle between GPU model and custom TFLOPs
    def toggle_gpu_input(use_gpu):
        # NOTE(review): gpu_model.value here is the component's value at
        # construction time, not the live dropdown selection — TODO confirm
        # this matches the intended show/hide behavior for custom_tflops.
        return gr.update(visible=use_gpu), gr.update(visible=not use_gpu or use_gpu and gpu_model.value == "Custom")

    use_gpu_model.change(
        fn=toggle_gpu_input,
        inputs=[use_gpu_model],
        outputs=[gpu_model, custom_tflops]
    )

    # Show custom TFLOPs when "Custom" is selected
    def check_custom_selected(gpu_model_value):
        return gr.update(visible=gpu_model_value == "Custom")

    gpu_model.change(
        fn=check_custom_selected,
        inputs=[gpu_model],
        outputs=[custom_tflops]
    )

    # Set up live updating
    # Every control feeds the full input list into update_calculation so
    # any single change recomputes the whole estimate.
    all_inputs = [model_size_value, model_size_unit, use_gpu_model, gpu_model, custom_tflops, num_gpus, tokens_value, tokens_unit, mfu]

    for input_component in all_inputs:
        input_component.change(
            fn=update_calculation,
            inputs=all_inputs,
            outputs=output
        )

    # Initial calculation
    # Run once on page load so the results panel is never empty.
    demo.load(
        fn=update_calculation,
        inputs=all_inputs,
        outputs=output
    )

if __name__ == "__main__":
    demo.launch()