import gradio as gr
import math


def calculate_automatic_distribution(vram_list, total_layers, model_size_gb, start_layer):
    """Distribute *total_layers* across GPUs according to their VRAM capacity.

    Each layer is assumed to occupy ``model_size_gb / total_layers`` GB; every
    GPU is then greedily given as many whole layers as its VRAM can hold, in
    list order, until no layers remain.

    Args:
        vram_list: VRAM size in GB for each GPU, in GPU-id order.
        total_layers: Number of layers to distribute (must be > 0).
        model_size_gb: Total model size in GB (must be > 0).
        start_layer: Unused here; kept for signature compatibility with callers
            that pass the full configuration tuple.

    Returns:
        A list parallel to ``vram_list`` with the layer count per GPU, or an
        empty list when the inputs are invalid.
    """
    if not vram_list or total_layers <= 0 or model_size_gb <= 0:
        return []

    layer_size_gb = model_size_gb / total_layers

    layers_per_gpu = []
    remaining_layers = total_layers
    last_gpu = len(vram_list) - 1
    for i, vram in enumerate(vram_list):
        if remaining_layers <= 0:
            layers_per_gpu.append(0)
            continue
        # How many whole layers fit in this GPU's VRAM, capped by what's left.
        assigned_layers = min(int(vram / layer_size_gb), remaining_layers)
        # The last GPU always takes at least one layer so a tiny-VRAM tail GPU
        # doesn't silently drop the remainder.
        if i == last_gpu:
            assigned_layers = max(1, assigned_layers)
        layers_per_gpu.append(assigned_layers)
        remaining_layers -= assigned_layers

    return layers_per_gpu


def generate_layer_assignment(gpu_count, layers_per_gpu, start_layer, pattern):
    """Build the llama.cpp ``-ot`` override strings for each GPU.

    Layers are assigned contiguously starting at ``start_layer``; each GPU with
    a positive layer count produces one argument of the form
    ``-ot "blk\\.(a|b|c)\\.<pattern>=CUDA<id>"``.

    Args:
        gpu_count: Number of GPUs to emit assignments for.
        layers_per_gpu: Layer count per GPU id.
        start_layer: First layer index to assign.
        pattern: Tensor-name regex suffix (e.g. ``ffn_.*``).

    Returns:
        List of ``-ot`` argument strings, one per GPU that received layers.
    """
    assignments = []
    current_layer = start_layer
    for gpu_id in range(gpu_count):
        if gpu_id < len(layers_per_gpu) and layers_per_gpu[gpu_id] > 0:
            count = layers_per_gpu[gpu_id]
            # Alternation of the explicit layer indices handled by this GPU.
            layer_pattern = "|".join(str(current_layer + i) for i in range(count))
            assignments.append(
                f'-ot "blk\\.({layer_pattern})\\.{pattern}=CUDA{gpu_id}"'
            )
            current_layer += count
    return assignments


def format_output(assignments):
    """Join the ``-ot`` arguments into a shell-friendly multi-line command.

    Each argument ends with a backslash line-continuation so the result can be
    pasted directly into a shell invocation of llama.cpp.
    """
    if not assignments:
        return ""
    return " \\\n".join(assignments)


def generate_layer_config(num_gpus, vram_values, start_layer, total_layers,
                          model_size_gb, mode, manual_layers, pattern):
    """Produce the full ``-ot`` argument text for the chosen distribution mode.

    Args:
        num_gpus: Number of GPUs in use.
        vram_values: Per-GPU VRAM in GB (used in "Automatic" mode).
        start_layer: First layer index to assign.
        total_layers: Total number of model layers.
        model_size_gb: Model size in GB.
        mode: ``"Automatic"`` (VRAM-based split) or ``"Manual"``.
        manual_layers: Per-GPU layer counts (used in "Manual" mode).
        pattern: Tensor-name regex suffix.

    Returns:
        The formatted command-argument string, or an ``"Error: ..."`` message.
    """
    try:
        if num_gpus <= 0 or total_layers <= 0:
            return "Error: Invalid number of GPUs or layers"

        if mode == "Automatic":
            layers_per_gpu = calculate_automatic_distribution(
                vram_values, total_layers, model_size_gb, start_layer
            )
        else:
            layers_per_gpu = manual_layers[:num_gpus]

        assignments = generate_layer_assignment(
            num_gpus, layers_per_gpu, start_layer, pattern
        )
        return format_output(assignments)
    except Exception as e:
        # Surface the failure in the output textbox rather than crashing the UI.
        return f"Error generating configuration: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Llama.cpp Layer Assignment Tool") as app:
    gr.Markdown("# Llama.cpp GPU Layer Assignment Tool")
    gr.Markdown("Generate `-ot` arguments for distributing model layers across multiple GPUs")

    with gr.Row():
        with gr.Column(scale=1):
            # Basic configuration
            num_gpus = gr.Slider(1, 8, value=7, step=1, label="Number of GPUs")
            start_layer = gr.Number(value=3, label="Starting Layer Number", minimum=0)
            total_layers = gr.Number(value=30, label="Total Number of Layers", minimum=1)
            model_size_gb = gr.Number(value=70, label="Model Size (GB)", minimum=1)
            pattern = gr.Textbox(value="ffn_.*", label="Layer Pattern", placeholder="ffn_.*")

            # Mode selection
            mode = gr.Radio(["Automatic", "Manual"], value="Automatic",
                            label="Distribution Mode")

            # VRAM inputs container (for automatic mode)
            with gr.Column() as vram_container:
                gr.Markdown("### GPU VRAM Configuration (Automatic Mode)")
                vram_inputs = []
                # Create the maximum number of inputs up front; visibility is
                # toggled as the GPU count / mode changes.
                for i in range(8):
                    vram_inputs.append(gr.Number(
                        label=f"GPU {i} VRAM (GB)",
                        value=96 if i == 0 else (32 if i < 3 else 24),
                        minimum=1,
                        maximum=200,
                        visible=(i < 7)  # Show first 7 by default
                    ))

            # Manual layer inputs container (for manual mode)
            with gr.Column(visible=False) as manual_container:
                gr.Markdown("### Layer Assignment (Manual Mode)")
                gr.Markdown("Specify how many layers each GPU should handle:")
                manual_inputs = []
                for i in range(8):
                    manual_inputs.append(gr.Number(
                        label=f"GPU {i} - Number of Layers",
                        value=13 if i == 0 else (3 if i < 3 else 2),
                        minimum=0,
                        maximum=100,
                        visible=(i < 7)  # Show first 7 by default
                    ))

        with gr.Column(scale=2):
            # Output
            output_text = gr.Textbox(
                label="Generated Command Arguments",
                lines=15,
                max_lines=20,
                show_copy_button=True,
                interactive=False
            )

    def generate_config(*args):
        """Generate the layer configuration from the flat Gradio input tuple.

        Argument layout: [num_gpus, start_layer, total_layers, model_size_gb,
        pattern, mode] + 8 VRAM values + 8 manual layer counts.
        """
        try:
            num_gpus_val = int(args[0])
            start_layer_val = int(args[1]) if args[1] else 0
            total_layers_val = int(args[2]) if args[2] else 1
            model_size_gb_val = float(args[3]) if args[3] else 1
            pattern_val = args[4] if args[4] else "ffn_.*"
            mode_val = args[5]

            # Extract VRAM values (args[6:14]); default to 24 GB when empty.
            vram_values = []
            for i in range(num_gpus_val):
                vram_val = args[6 + i] if args[6 + i] else 24
                vram_values.append(float(vram_val))

            # Extract manual layer values (args[14:22]) only when needed.
            manual_values = []
            if mode_val == "Manual":
                for i in range(num_gpus_val):
                    manual_val = args[14 + i] if args[14 + i] else 4
                    manual_values.append(int(manual_val))

            return generate_layer_config(
                num_gpus_val, vram_values, start_layer_val, total_layers_val,
                model_size_gb_val, mode_val, manual_values, pattern_val
            )
        except Exception as e:
            return f"Error: {str(e)}"

    def sync_auto_to_manual(*args):
        """Compute the automatic distribution as per-GPU layer counts.

        Used to pre-fill the manual inputs when the user switches to Manual
        mode. Returns a list of 8 ints (0 for unused GPU slots).
        """
        try:
            num_gpus_val = int(args[0])
            start_layer_val = int(args[1]) if args[1] else 0
            total_layers_val = int(args[2]) if args[2] else 1
            model_size_gb_val = float(args[3]) if args[3] else 1

            vram_values = []
            for i in range(num_gpus_val):
                vram_val = args[6 + i] if args[6 + i] else 24
                vram_values.append(float(vram_val))

            auto_distribution = calculate_automatic_distribution(
                vram_values, total_layers_val, model_size_gb_val, start_layer_val
            )

            # Pad/truncate to exactly 8 entries to match the input widgets.
            return [
                int(auto_distribution[i]) if i < len(auto_distribution) else 0
                for i in range(8)
            ]
        except Exception:
            # Fall back to a harmless default if the calculation fails.
            return [4] * 8

    # Collect all inputs for the generation function
    all_inputs = ([num_gpus, start_layer, total_layers, model_size_gb, pattern, mode]
                  + vram_inputs + manual_inputs)

    def update_gpu_count(num_gpus_val, mode_val):
        """Show only the inputs relevant to the current GPU count and mode."""
        updates = []
        # VRAM inputs are visible in Automatic mode only.
        for i in range(8):
            updates.append(gr.Number(visible=(i < num_gpus_val and mode_val == "Automatic")))
        # Manual inputs are visible in Manual mode only.
        for i in range(8):
            updates.append(gr.Number(visible=(i < num_gpus_val and mode_val == "Manual")))
        return updates

    num_gpus.change(
        fn=update_gpu_count,
        inputs=[num_gpus, mode],
        outputs=vram_inputs + manual_inputs
    )

    def handle_mode_change(*args):
        """Swap the visible input column and, on entering Manual mode,
        pre-fill the manual counts from the automatic distribution.

        Returns one update per output component:
        [vram_container, manual_container] + 8 VRAM inputs + 8 manual inputs.
        """
        num_gpus_val = int(args[0])
        mode_val = args[5]

        container_updates = [
            gr.Column(visible=(mode_val == "Automatic")),  # vram_container
            gr.Column(visible=(mode_val == "Manual")),     # manual_container
        ]

        vram_updates = [
            gr.Number(visible=(i < num_gpus_val and mode_val == "Automatic"))
            for i in range(8)
        ]

        if mode_val == "Manual":
            # Seed the manual inputs with the automatic split in a single
            # update per component (value + visibility together).
            auto_values = sync_auto_to_manual(*args)
            manual_updates = [
                gr.Number(visible=(i < num_gpus_val), value=auto_values[i])
                for i in range(8)
            ]
        else:
            # Hide the manual inputs WITHOUT touching their stored values, so
            # switching back to Manual doesn't lose the user's numbers.
            manual_updates = [gr.Number(visible=False) for _ in range(8)]

        return container_updates + vram_updates + manual_updates

    # Each output component is listed exactly once; the handler returns a
    # matching update per component.
    mode.change(
        fn=handle_mode_change,
        inputs=all_inputs,
        outputs=[vram_container, manual_container] + vram_inputs + manual_inputs
    )

    # Regenerate the output whenever any input changes.
    for input_component in all_inputs:
        input_component.change(
            fn=generate_config,
            inputs=all_inputs,
            outputs=[output_text]
        )


if __name__ == "__main__":
    app.launch()