File size: 10,901 Bytes
84c9ea6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import gradio as gr
import math

def calculate_automatic_distribution(vram_list, total_layers, model_size_gb, start_layer):
    """Distribute layers across GPUs in list order, filling each up to its VRAM.

    Every layer is assumed to have the same size
    (``model_size_gb / total_layers``). GPUs are filled greedily in the order
    given; layers that fit on no GPU stay unassigned, so the returned counts
    may sum to less than ``total_layers``.

    Args:
        vram_list: VRAM of each GPU in GB, in GPU-index order.
        total_layers: Number of layers to distribute.
        model_size_gb: Total model size in GB.
        start_layer: Not used by the calculation; kept so existing callers
            keep working.

    Returns:
        A list with one layer count per entry of ``vram_list``, or ``[]``
        when ``vram_list`` is empty or ``total_layers`` / ``model_size_gb``
        is not positive.
    """
    if not vram_list or total_layers <= 0 or model_size_gb <= 0:
        return []

    layer_size_gb = model_size_gb / total_layers
    last_gpu = len(vram_list) - 1

    layers_per_gpu = []
    remaining_layers = total_layers
    for gpu_index, vram in enumerate(vram_list):
        if remaining_layers <= 0:
            # Everything is already placed; later GPUs get nothing.
            layers_per_gpu.append(0)
            continue

        # Clamp at zero: a zero or negative VRAM entry must never produce a
        # negative layer count, which would also inflate remaining_layers.
        capacity = max(0, int(vram / layer_size_gb))
        assigned = min(capacity, remaining_layers)

        # The last GPU always takes at least one layer while layers remain.
        if gpu_index == last_gpu:
            assigned = max(1, assigned)

        layers_per_gpu.append(assigned)
        remaining_layers -= assigned

    return layers_per_gpu

def generate_layer_assignment(gpu_count, layers_per_gpu, start_layer, pattern):
    """Build one llama.cpp ``-ot`` argument per GPU that receives layers.

    Layers are numbered consecutively from ``start_layer``. Each GPU with a
    positive layer count gets a regex alternation of its layer numbers; GPUs
    with no positive count are skipped and consume no layer numbers.

    Args:
        gpu_count: Number of GPUs to consider (indices ``0 .. gpu_count - 1``).
        layers_per_gpu: Layer count per GPU index; may be shorter than
            ``gpu_count``.
        start_layer: Number of the first layer to assign.
        pattern: Tensor-name regex suffix placed after the layer alternation.

    Returns:
        A list of ``-ot "..."`` argument strings, in GPU order.
    """
    commands = []
    next_layer = start_layer

    # zip() stops at the shorter sequence, so GPUs without an entry in
    # layers_per_gpu are ignored.
    for gpu_id, layer_count in zip(range(gpu_count), layers_per_gpu):
        if not layer_count > 0:
            continue

        layer_alternatives = "|".join(
            str(next_layer + offset) for offset in range(layer_count)
        )
        commands.append(
            f'-ot "blk\\.({layer_alternatives})\\.{pattern}=CUDA{gpu_id}"'
        )
        next_layer += layer_count

    return commands

def format_output(assignments):
    """Join the argument strings into one shell-style multi-line command.

    Each argument goes on its own line; lines are separated by a trailing
    backslash continuation. An empty or missing list yields ``""``.
    """
    line_continuation = " \\\n"
    return line_continuation.join(assignments) if assignments else ""

def generate_layer_config(num_gpus, vram_values, start_layer, total_layers, model_size_gb, mode, manual_layers, pattern):
    """Produce the formatted ``-ot`` argument text for the given settings.

    In ``"Automatic"`` mode the per-GPU layer counts are derived from
    ``vram_values``; in any other mode the first ``num_gpus`` entries of
    ``manual_layers`` are used as given.

    Returns:
        The formatted command arguments, or a string starting with
        ``"Error"`` when the inputs are invalid or generation fails.
    """
    try:
        # Reject non-positive GPU or layer counts up front.
        if num_gpus <= 0 or total_layers <= 0:
            return "Error: Invalid number of GPUs or layers"

        if mode == "Automatic":
            distribution = calculate_automatic_distribution(
                vram_values, total_layers, model_size_gb, start_layer
            )
        else:
            distribution = manual_layers[:num_gpus]

        return format_output(
            generate_layer_assignment(num_gpus, distribution, start_layer, pattern)
        )
    except Exception as exc:
        # Any failure is reported as text rather than raised to the caller.
        return f"Error generating configuration: {exc!s}"

# Create Gradio interface: a settings column on the left and the generated
# command text on the right.
with gr.Blocks(title="Llama.cpp Layer Assignment Tool") as app:
    gr.Markdown("# Llama.cpp GPU Layer Assignment Tool")
    gr.Markdown("Generate `-ot` arguments for distributing model layers across multiple GPUs")
    
    with gr.Row():
        with gr.Column(scale=1):
            # Basic configuration
            num_gpus = gr.Slider(1, 8, value=7, step=1, label="Number of GPUs")
            start_layer = gr.Number(value=3, label="Starting Layer Number", minimum=0)
            total_layers = gr.Number(value=30, label="Total Number of Layers", minimum=1)
            model_size_gb = gr.Number(value=70, label="Model Size (GB)", minimum=1)
            pattern = gr.Textbox(value="ffn_.*", label="Layer Pattern", placeholder="ffn_.*")
            
            # Mode selection
            mode = gr.Radio(["Automatic", "Manual"], value="Automatic", label="Distribution Mode")
            
            # VRAM inputs container (for automatic mode)
            with gr.Column() as vram_container:
                gr.Markdown("### GPU VRAM Configuration (Automatic Mode)")
                vram_inputs = []
                # All 8 inputs (the slider maximum) are created up front;
                # only their visibility changes later.
                for i in range(8):  # Create max inputs, show/hide as needed
                    vram_inputs.append(gr.Number(
                        label=f"GPU {i} VRAM (GB)", 
                        # Defaults: 96 GB for GPU 0, 32 GB for GPUs 1-2, 24 GB for the rest.
                        value=96 if i == 0 else (32 if i < 3 else 24), 
                        minimum=1, 
                        maximum=200,
                        visible=(i < 7)  # Show first 7 by default
                    ))
            
            # Manual layer inputs container (for manual mode); hidden at
            # start because the default mode is "Automatic".
            with gr.Column(visible=False) as manual_container:
                gr.Markdown("### Layer Assignment (Manual Mode)")
                gr.Markdown("Specify how many layers each GPU should handle:")
                manual_inputs = []
                for i in range(8):  # Create max inputs, show/hide as needed
                    manual_inputs.append(gr.Number(
                        label=f"GPU {i} - Number of Layers", 
                        # Defaults: 13 layers for GPU 0, 3 for GPUs 1-2, 2 for the rest.
                        value=13 if i == 0 else (3 if i < 3 else 2), 
                        minimum=0, 
                        maximum=100,
                        visible=(i < 7)  # Show first 7 by default
                    ))
            
        with gr.Column(scale=2):
            # Output: read-only text box holding the generated arguments.
            output_text = gr.Textbox(
                label="Generated Command Arguments",
                lines=15,
                max_lines=20,
                show_copy_button=True,
                interactive=False
            )
    
    def generate_config(*args):
        """Turn the raw UI values into the generated command text.

        ``args`` holds the component values in this order: GPU count, start
        layer, total layers, model size, pattern, mode, then eight VRAM
        values (``args[6:14]``) and eight manual layer counts
        (``args[14:22]``). Empty or zero fields fall back to defaults.
        Any failure is returned as an ``"Error: ..."`` string.
        """
        try:
            gpu_total = int(args[0])
            first_layer = int(args[1] or 0)
            layer_total = int(args[2] or 1)
            model_gb = float(args[3]) if args[3] else 1
            tensor_pattern = args[4] or "ffn_.*"
            selected_mode = args[5]

            # VRAM per active GPU; blank or zero fields default to 24 GB.
            vram_per_gpu = [
                float(args[6 + slot] or 24) for slot in range(gpu_total)
            ]

            # Manual counts are only read in manual mode; blank or zero
            # fields default to 4 layers.
            if selected_mode == "Manual":
                manual_counts = [
                    int(args[14 + slot] or 4) for slot in range(gpu_total)
                ]
            else:
                manual_counts = []

            return generate_layer_config(
                gpu_total,
                vram_per_gpu,
                first_layer,
                layer_total,
                model_gb,
                selected_mode,
                manual_counts,
                tensor_pattern,
            )
        except Exception as exc:
            return f"Error: {exc!s}"
    
    def sync_auto_to_manual(*args):
        """Compute the automatic distribution as values for the manual inputs.

        ``args`` uses the same ordering as the full input list: GPU count,
        start layer, total layers, model size, pattern, mode, then the eight
        VRAM values starting at ``args[6]``.

        Returns:
            Exactly eight integers: the automatic layer count for each GPU,
            padded with zeros. If anything fails, eight 4s are returned.
        """
        try:
            gpu_total = int(args[0])
            first_layer = int(args[1] or 0)
            layer_total = int(args[2] or 1)
            model_gb = float(args[3]) if args[3] else 1

            # VRAM per active GPU; blank or zero fields default to 24 GB.
            vram_per_gpu = [
                float(args[6 + slot] or 24) for slot in range(gpu_total)
            ]

            suggested = calculate_automatic_distribution(
                vram_per_gpu, layer_total, model_gb, first_layer
            )

            # One value per manual input: computed counts first, zeros after.
            counts = [int(layers) for layers in suggested[:8]]
            return counts + [0] * (8 - len(counts))
        except Exception:
            # Fall back to fixed defaults if the calculation fails.
            return [4] * 8
    
    # Every input component, in the positional order the handlers expect:
    # six basic settings, then the VRAM inputs, then the manual inputs.
    all_inputs = [
        num_gpus, start_layer, total_layers, model_size_gb, pattern, mode,
        *vram_inputs, *manual_inputs,
    ]

    def update_gpu_count(num_gpus_val, mode_val):
        """Return visibility updates for all VRAM and manual inputs.

        An input is shown only if its GPU index is below ``num_gpus_val``
        and its group matches ``mode_val``. The result lists the eight VRAM
        inputs first, then the eight manual inputs.
        """
        def visibility_for(group_mode):
            return [
                gr.Number(visible=(slot < num_gpus_val and mode_val == group_mode))
                for slot in range(8)
            ]

        return visibility_for("Automatic") + visibility_for("Manual")

    # Re-evaluate which per-GPU inputs are visible when the GPU count changes.
    num_gpus.change(
        fn=lambda n, m: update_gpu_count(n, m),
        inputs=[num_gpus, mode],
        outputs=[*vram_inputs, *manual_inputs],
    )
    
    # Switch between automatic and manual mode, seeding the manual inputs
    # from the automatic distribution when entering manual mode.
    def handle_mode_change(*args):
        """Build all component updates for a change of distribution mode.

        ``args`` follows the ``all_inputs`` ordering (GPU count at index 0,
        mode at index 5).

        Returns:
            A flat list of 26 items: two container visibility updates, eight
            VRAM-input visibility updates, eight manual-input visibility
            updates, then eight manual-input values (the automatic
            distribution in manual mode, zeros otherwise).
        """
        gpu_total = int(args[0])
        selected_mode = args[5]
        is_automatic = selected_mode == "Automatic"
        is_manual = selected_mode == "Manual"

        # vram_container first, manual_container second.
        containers = [
            gr.Column(visible=is_automatic),
            gr.Column(visible=is_manual),
        ]

        # Per-GPU inputs: VRAM group first, then the manual group.
        field_visibility = [
            gr.Number(visible=(slot < gpu_total and group_active))
            for group_active in (is_automatic, is_manual)
            for slot in range(8)
        ]

        layer_values = sync_auto_to_manual(*args) if is_manual else [0] * 8

        return containers + field_visibility + layer_values

    # manual_inputs is listed twice on purpose: the first pass receives the
    # visibility updates, the second pass receives the new values.
    mode.change(
        fn=handle_mode_change,
        inputs=all_inputs,
        outputs=[
            vram_container, manual_container,
            *vram_inputs, *manual_inputs, *manual_inputs,
        ],
    )
    
    # Regenerate the command text whenever any input component changes;
    # each listener passes the full input list to generate_config.
    for input_component in all_inputs:
        input_component.change(fn=generate_config, inputs=all_inputs, outputs=[output_text])

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    app.launch()