# Source: Hugging Face Space by bullerwins (commit 84c9ea6, "Add application file")
import gradio as gr
import math
def calculate_automatic_distribution(vram_list, total_layers, model_size_gb, start_layer):
    """Distribute model layers across GPUs according to their VRAM capacity.

    Each GPU is assigned as many whole layers as fit in its VRAM, where a
    layer is assumed to occupy model_size_gb / total_layers GB.  The last
    GPU absorbs any layers still unassigned so that every layer is placed
    somewhere (previously, leftover layers were silently dropped when the
    combined VRAM estimate fell short, producing an incomplete -ot command).

    Args:
        vram_list: per-GPU VRAM capacities in GB, in GPU-index order.
        total_layers: number of layers to distribute (must be > 0).
        model_size_gb: total model size in GB (must be > 0).
        start_layer: unused here; kept for interface compatibility with
            generate_layer_config (the layer-number offset is applied later
            when the -ot strings are built).

    Returns:
        A list parallel to vram_list with the layer count for each GPU,
        or [] when any input is empty/non-positive.
    """
    if not vram_list or total_layers <= 0 or model_size_gb <= 0:
        return []

    # Estimated size of a single layer in GB.
    layer_size_gb = model_size_gb / total_layers

    layers_per_gpu = []
    remaining_layers = total_layers
    last_index = len(vram_list) - 1
    for i, vram in enumerate(vram_list):
        if remaining_layers <= 0:
            # Everything already placed; later GPUs get nothing.
            layers_per_gpu.append(0)
            continue
        # Whole layers that fit in this GPU's VRAM, capped by what is left.
        assigned_layers = min(int(vram / layer_size_gb), remaining_layers)
        if i == last_index:
            # Last GPU takes all remaining layers so no layer is dropped,
            # even if the VRAM estimate says it won't fit.
            assigned_layers = remaining_layers
        layers_per_gpu.append(assigned_layers)
        remaining_layers -= assigned_layers
    return layers_per_gpu
def generate_layer_assignment(gpu_count, layers_per_gpu, start_layer, pattern):
    """Build the llama.cpp ``-ot`` tensor-override strings, one per GPU.

    Layer numbers are handed out consecutively starting at start_layer and
    grouped per GPU into a regex alternation, e.g.
    ``-ot "blk\\.(3|4)\\.ffn_.*=CUDA0"``.  GPUs with a zero (or missing)
    layer count contribute no entry and do not advance the layer counter.
    """
    result = []
    next_layer = start_layer
    for gpu_index in range(gpu_count):
        count = layers_per_gpu[gpu_index] if gpu_index < len(layers_per_gpu) else 0
        if count <= 0:
            continue
        # Alternation of the consecutive layer numbers owned by this GPU.
        numbers = "|".join(str(n) for n in range(next_layer, next_layer + count))
        result.append(f'-ot "blk\\.({numbers})\\.{pattern}=CUDA{gpu_index}"')
        next_layer += count
    return result
def format_output(assignments):
    """Join the ``-ot`` arguments into a shell-ready multi-line string.

    Each argument's line ends with a backslash continuation except the
    last; an empty list yields an empty string.
    """
    return " \\\n".join(assignments) if assignments else ""
def generate_layer_config(num_gpus, vram_values, start_layer, total_layers, model_size_gb, mode, manual_layers, pattern):
    """Top-level entry point: validate inputs, choose a layer distribution,
    and return the formatted ``-ot`` argument string (or an error message).

    In "Automatic" mode the distribution is derived from vram_values;
    otherwise the first num_gpus entries of manual_layers are used as-is.
    All failures are reported as a returned string rather than raised, so
    the UI textbox always receives something displayable.
    """
    try:
        # Guard clause for obviously invalid configurations.
        if num_gpus <= 0 or total_layers <= 0:
            return "Error: Invalid number of GPUs or layers"
        if mode == "Automatic":
            distribution = calculate_automatic_distribution(vram_values, total_layers, model_size_gb, start_layer)
        else:
            # Manual mode: trust the user-supplied per-GPU layer counts.
            distribution = manual_layers[:num_gpus]
        return format_output(generate_layer_assignment(num_gpus, distribution, start_layer, pattern))
    except Exception as e:
        # UI boundary: surface the failure as text instead of raising.
        return f"Error generating configuration: {str(e)}"
# Create Gradio interface
#
# Layout: one row with a 1:2 column split — controls on the left, the
# generated command text on the right.  Eight VRAM inputs and eight
# manual-layer inputs are always created up front; visibility toggling
# stands in for dynamic creation.
with gr.Blocks(title="Llama.cpp Layer Assignment Tool") as app:
    gr.Markdown("# Llama.cpp GPU Layer Assignment Tool")
    gr.Markdown("Generate `-ot` arguments for distributing model layers across multiple GPUs")
    with gr.Row():
        with gr.Column(scale=1):
            # Basic configuration
            num_gpus = gr.Slider(1, 8, value=7, step=1, label="Number of GPUs")
            start_layer = gr.Number(value=3, label="Starting Layer Number", minimum=0)
            total_layers = gr.Number(value=30, label="Total Number of Layers", minimum=1)
            model_size_gb = gr.Number(value=70, label="Model Size (GB)", minimum=1)
            pattern = gr.Textbox(value="ffn_.*", label="Layer Pattern", placeholder="ffn_.*")
            # Mode selection
            mode = gr.Radio(["Automatic", "Manual"], value="Automatic", label="Distribution Mode")
            # VRAM inputs container (for automatic mode)
            with gr.Column() as vram_container:
                gr.Markdown("### GPU VRAM Configuration (Automatic Mode)")
                vram_inputs = []
                for i in range(8): # Create max inputs, show/hide as needed
                    vram_inputs.append(gr.Number(
                        label=f"GPU {i} VRAM (GB)",
                        value=96 if i == 0 else (32 if i < 3 else 24),
                        minimum=1,
                        maximum=200,
                        visible=(i < 7) # Show first 7 by default
                    ))
            # Manual layer inputs container (for manual mode)
            with gr.Column(visible=False) as manual_container:
                gr.Markdown("### Layer Assignment (Manual Mode)")
                gr.Markdown("Specify how many layers each GPU should handle:")
                manual_inputs = []
                for i in range(8): # Create max inputs, show/hide as needed
                    manual_inputs.append(gr.Number(
                        label=f"GPU {i} - Number of Layers",
                        value=13 if i == 0 else (3 if i < 3 else 2),
                        minimum=0,
                        maximum=100,
                        visible=(i < 7) # Show first 7 by default
                    ))
        with gr.Column(scale=2):
            # Output
            output_text = gr.Textbox(
                label="Generated Command Arguments",
                lines=15,
                max_lines=20,
                show_copy_button=True,
                interactive=False
            )

    def generate_config(*args):
        """Generate layer configuration based on all inputs

        args is the flattened all_inputs list (defined below):
        [0]=num_gpus, [1]=start_layer, [2]=total_layers, [3]=model_size_gb,
        [4]=pattern, [5]=mode, [6:14]=vram_inputs, [14:22]=manual_inputs.
        """
        try:
            # Extract basic inputs (falsy values fall back to safe defaults)
            num_gpus_val = int(args[0])
            start_layer_val = int(args[1]) if args[1] else 0
            total_layers_val = int(args[2]) if args[2] else 1
            model_size_gb_val = float(args[3]) if args[3] else 1
            pattern_val = args[4] if args[4] else "ffn_.*"
            mode_val = args[5]
            # Extract VRAM values (args[6:14])
            vram_values = []
            for i in range(num_gpus_val):
                vram_val = args[6 + i] if args[6 + i] else 24
                vram_values.append(float(vram_val))
            # Extract manual layer values (args[14:22]); only read in Manual mode
            manual_values = []
            if mode_val == "Manual":
                for i in range(num_gpus_val):
                    manual_val = args[14 + i] if args[14 + i] else 4
                    manual_values.append(int(manual_val))
            return generate_layer_config(
                num_gpus_val, vram_values, start_layer_val, total_layers_val,
                model_size_gb_val, mode_val, manual_values, pattern_val
            )
        except Exception as e:
            # Surface any failure in the output textbox instead of raising
            return f"Error: {str(e)}"

    def sync_auto_to_manual(*args):
        """Sync automatic distribution to manual inputs when switching modes

        Returns 8 integers (one per possible GPU slot) computed from the
        current VRAM settings; slots beyond the GPU count get 0.  args has
        the same positional layout as in generate_config.
        """
        try:
            # Extract basic inputs
            num_gpus_val = int(args[0])
            start_layer_val = int(args[1]) if args[1] else 0
            total_layers_val = int(args[2]) if args[2] else 1
            model_size_gb_val = float(args[3]) if args[3] else 1
            # Extract VRAM values (args[6:14])
            vram_values = []
            for i in range(num_gpus_val):
                vram_val = args[6 + i] if args[6 + i] else 24
                vram_values.append(float(vram_val))
            # Calculate automatic distribution
            auto_distribution = calculate_automatic_distribution(vram_values, total_layers_val, model_size_gb_val, start_layer_val)
            # Update manual inputs with automatic distribution
            manual_updates = []
            for i in range(8):
                if i < len(auto_distribution):
                    manual_updates.append(int(auto_distribution[i]))
                else:
                    manual_updates.append(0)
            return manual_updates
        except Exception as e:
            # Return default values if calculation fails
            return [4] * 8

    # Collect all inputs for the generation function
    # NOTE: the ordering here defines the positional args layout that
    # generate_config / sync_auto_to_manual index into — keep in sync.
    all_inputs = [num_gpus, start_layer, total_layers, model_size_gb, pattern, mode] + vram_inputs + manual_inputs

    # Update UI visibility when GPU count changes
    def update_gpu_count(num_gpus_val, mode_val):
        """Update visibility when GPU count changes"""
        updates = []
        # Update VRAM inputs visibility (show in automatic mode)
        for i in range(8):
            updates.append(gr.Number(visible=(i < num_gpus_val and mode_val == "Automatic")))
        # Update manual inputs visibility (show in manual mode)
        for i in range(8):
            updates.append(gr.Number(visible=(i < num_gpus_val and mode_val == "Manual")))
        return updates

    num_gpus.change(
        fn=lambda n, m: update_gpu_count(n, m),
        inputs=[num_gpus, mode],
        outputs=vram_inputs + manual_inputs
    )

    # Handle mode change with sync from auto to manual
    def handle_mode_change(*args):
        """Handle mode change with sync from auto to manual

        Returns 26 updates matching the mode.change outputs list:
        2 container visibilities + 8 VRAM visibilities + 8 manual
        visibilities + 8 manual values.
        """
        num_gpus_val = int(args[0])
        mode_val = args[5]
        # Update container visibility
        container_updates = [
            gr.Column(visible=(mode_val == "Automatic")), # vram_container
            gr.Column(visible=(mode_val == "Manual")) # manual_container
        ]
        # Update input visibility
        input_updates = []
        for i in range(8):
            input_updates.append(gr.Number(visible=(i < num_gpus_val and mode_val == "Automatic")))
        for i in range(8):
            input_updates.append(gr.Number(visible=(i < num_gpus_val and mode_val == "Manual")))
        # If switching to manual mode, sync automatic distribution
        if mode_val == "Manual":
            manual_updates = sync_auto_to_manual(*args)
            return container_updates + input_updates + manual_updates
        else:
            # NOTE(review): returning [0] * 8 resets the manual layer values
            # whenever the user switches back to Automatic — confirm this is
            # intended rather than preserving the user's entries.
            return container_updates + input_updates + [0] * 8

    mode.change(
        fn=handle_mode_change,
        # NOTE(review): manual_inputs appears twice in the outputs list —
        # once targeted by the visibility updates, once by the value
        # updates — matching the 26 return values of handle_mode_change.
        # Verify the installed Gradio version accepts duplicate output
        # components; some versions reject them.
        inputs=all_inputs,
        outputs=[vram_container, manual_container] + vram_inputs + manual_inputs + manual_inputs
    )

    # Generate output on any input change
    for input_component in all_inputs:
        input_component.change(
            fn=generate_config,
            inputs=all_inputs,
            outputs=[output_text]
        )

# Launch the local web server when run as a script
if __name__ == "__main__":
    app.launch()