import gradio as gr


def calculate_automatic_distribution(vram_list, total_layers, model_size_gb, start_layer):
    """Distribute layers across GPUs according to each GPU's VRAM capacity.

    start_layer is accepted for symmetry with generate_layer_assignment but
    does not affect the split itself.
    """
    if not vram_list or total_layers <= 0 or model_size_gb <= 0:
        return []

    # Approximate per-layer footprint, assuming all layers are equally sized.
    layer_size_gb = model_size_gb / total_layers

    layers_per_gpu = []
    remaining_layers = total_layers

    for i, vram in enumerate(vram_list):
        if remaining_layers <= 0:
            layers_per_gpu.append(0)
        else:
            # Number of whole layers that fit in this GPU's VRAM.
            max_layers_for_gpu = int(vram / layer_size_gb)
            assigned_layers = min(max_layers_for_gpu, remaining_layers)

            # The last GPU absorbs any overflow so that no layers are
            # silently dropped, even if that exceeds its VRAM estimate.
            if i == len(vram_list) - 1:
                assigned_layers = remaining_layers

            layers_per_gpu.append(assigned_layers)
            remaining_layers -= assigned_layers

    return layers_per_gpu
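
# Example: three GPUs with [24, 24, 48] GB of VRAM and a 60 GB model with
# 30 layers (2 GB per layer) yields [12, 12, 6]: each GPU takes as many
# layers as fit, and the last GPU picks up whatever remains.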
|
|
|
|
|
def generate_layer_assignment(gpu_count, layers_per_gpu, start_layer, pattern):
    """Create the -ot tensor-override argument strings for llama.cpp."""
    assignments = []
    current_layer = start_layer

    for gpu_id in range(gpu_count):
        if gpu_id < len(layers_per_gpu) and layers_per_gpu[gpu_id] > 0:
            # Regex alternation of the consecutive layer indices for this GPU.
            layer_range = [str(current_layer + i) for i in range(layers_per_gpu[gpu_id])]
            layer_pattern = "|".join(layer_range)
            assignment = f'-ot "blk\\.({layer_pattern})\\.{pattern}=CUDA{gpu_id}"'
            assignments.append(assignment)
            current_layer += layers_per_gpu[gpu_id]

    return assignments
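
# Example: start_layer=3, layers_per_gpu=[2, 2], pattern="ffn_.*" produces
#   -ot "blk\.(3|4)\.ffn_.*=CUDA0"
#   -ot "blk\.(5|6)\.ffn_.*=CUDA1"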
|
|
|
|
|
def format_output(assignments):
    """Format assignments as multi-line shell command arguments."""
    if not assignments:
        return ""

    # Backslash-newline so the block can be pasted directly into a shell.
    return " \\\n".join(assignments)
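
# Example: joining the two assignments above yields a paste-ready block:
#   -ot "blk\.(3|4)\.ffn_.*=CUDA0" \
#   -ot "blk\.(5|6)\.ffn_.*=CUDA1"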
|
|
|
|
|
def generate_layer_config(num_gpus, vram_values, start_layer, total_layers, model_size_gb, mode, manual_layers, pattern):
    """Generate the complete set of -ot arguments for the chosen mode."""
    try:
        if num_gpus <= 0 or total_layers <= 0:
            return "Error: Invalid number of GPUs or layers"

        if mode == "Automatic":
            layers_per_gpu = calculate_automatic_distribution(vram_values, total_layers, model_size_gb, start_layer)
        else:
            layers_per_gpu = manual_layers[:num_gpus]

        assignments = generate_layer_assignment(num_gpus, layers_per_gpu, start_layer, pattern)
        return format_output(assignments)
    except Exception as e:
        return f"Error generating configuration: {str(e)}"
|
|
with gr.Blocks(title="Llama.cpp Layer Assignment Tool") as app:
    gr.Markdown("# Llama.cpp GPU Layer Assignment Tool")
    gr.Markdown("Generate `-ot` arguments for distributing model layers across multiple GPUs.")

    with gr.Row():
        with gr.Column(scale=1):
            num_gpus = gr.Slider(1, 8, value=7, step=1, label="Number of GPUs")
            start_layer = gr.Number(value=3, label="Starting Layer Number", minimum=0)
            total_layers = gr.Number(value=30, label="Total Number of Layers", minimum=1)
            model_size_gb = gr.Number(value=70, label="Model Size (GB)", minimum=1)
            pattern = gr.Textbox(value="ffn_.*", label="Layer Pattern", placeholder="ffn_.*")

            mode = gr.Radio(["Automatic", "Manual"], value="Automatic", label="Distribution Mode")
|
            with gr.Column() as vram_container:
                gr.Markdown("### GPU VRAM Configuration (Automatic Mode)")
                vram_inputs = []
                for i in range(8):
                    vram_inputs.append(gr.Number(
                        label=f"GPU {i} VRAM (GB)",
                        value=96 if i == 0 else (32 if i < 3 else 24),
                        minimum=1,
                        maximum=200,
                        visible=(i < 7)
                    ))
|
            with gr.Column(visible=False) as manual_container:
                gr.Markdown("### Layer Assignment (Manual Mode)")
                gr.Markdown("Specify how many layers each GPU should handle:")
                manual_inputs = []
                for i in range(8):
                    manual_inputs.append(gr.Number(
                        label=f"GPU {i} - Number of Layers",
                        value=13 if i == 0 else (3 if i < 3 else 2),
                        minimum=0,
                        maximum=100,
                        visible=(i < 7)
                    ))
|
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="Generated Command Arguments",
                lines=15,
                max_lines=20,
                show_copy_button=True,
                interactive=False
            )
|
    def generate_config(*args):
        """Generate the layer configuration from the flattened input list.

        args mirrors all_inputs: [num_gpus, start_layer, total_layers,
        model_size_gb, pattern, mode] at indices 0-5, the eight VRAM inputs
        at indices 6-13, and the eight manual-layer inputs at indices 14-21.
        """
        try:
            num_gpus_val = int(args[0])
            start_layer_val = int(args[1]) if args[1] else 0
            total_layers_val = int(args[2]) if args[2] else 1
            model_size_gb_val = float(args[3]) if args[3] else 1
            pattern_val = args[4] if args[4] else "ffn_.*"
            mode_val = args[5]

            # VRAM inputs occupy indices 6-13; default to 24 GB when unset.
            vram_values = []
            for i in range(num_gpus_val):
                vram_val = args[6 + i] if args[6 + i] else 24
                vram_values.append(float(vram_val))

            # Manual layer counts occupy indices 14-21; default to 4 layers.
            manual_values = []
            if mode_val == "Manual":
                for i in range(num_gpus_val):
                    manual_val = args[14 + i] if args[14 + i] else 4
                    manual_values.append(int(manual_val))

            return generate_layer_config(
                num_gpus_val, vram_values, start_layer_val, total_layers_val,
                model_size_gb_val, mode_val, manual_values, pattern_val
            )
        except Exception as e:
            return f"Error: {str(e)}"
|
    def sync_auto_to_manual(*args):
        """Compute the automatic distribution to seed the manual inputs."""
        try:
            num_gpus_val = int(args[0])
            start_layer_val = int(args[1]) if args[1] else 0
            total_layers_val = int(args[2]) if args[2] else 1
            model_size_gb_val = float(args[3]) if args[3] else 1

            vram_values = []
            for i in range(num_gpus_val):
                vram_val = args[6 + i] if args[6 + i] else 24
                vram_values.append(float(vram_val))

            auto_distribution = calculate_automatic_distribution(
                vram_values, total_layers_val, model_size_gb_val, start_layer_val
            )

            # Pad to eight entries so every manual input receives a value.
            manual_updates = []
            for i in range(8):
                if i < len(auto_distribution):
                    manual_updates.append(int(auto_distribution[i]))
                else:
                    manual_updates.append(0)

            return manual_updates
        except Exception:
            # Fall back to a neutral default if parsing fails.
            return [4] * 8
|
    all_inputs = [num_gpus, start_layer, total_layers, model_size_gb, pattern, mode] + vram_inputs + manual_inputs
|
    def update_gpu_count(num_gpus_val, mode_val):
        """Show only the inputs for the selected number of GPUs."""
        updates = []

        # VRAM inputs are shown in Automatic mode only.
        for i in range(8):
            updates.append(gr.Number(visible=(i < num_gpus_val and mode_val == "Automatic")))

        # Manual inputs are shown in Manual mode only.
        for i in range(8):
            updates.append(gr.Number(visible=(i < num_gpus_val and mode_val == "Manual")))

        return updates

    num_gpus.change(
        fn=update_gpu_count,
        inputs=[num_gpus, mode],
        outputs=vram_inputs + manual_inputs
    )
|
    def handle_mode_change(*args):
        """Toggle the input columns and, when switching to Manual, seed the
        manual inputs from the automatic distribution."""
        num_gpus_val = int(args[0])
        mode_val = args[5]

        container_updates = [
            gr.Column(visible=(mode_val == "Automatic")),
            gr.Column(visible=(mode_val == "Manual"))
        ]

        vram_updates = [
            gr.Number(visible=(i < num_gpus_val and mode_val == "Automatic"))
            for i in range(8)
        ]

        # Combine visibility and value into a single update per manual input
        # so each component appears only once in the outputs list.
        manual_values = sync_auto_to_manual(*args) if mode_val == "Manual" else [0] * 8
        manual_updates = [
            gr.Number(visible=(i < num_gpus_val and mode_val == "Manual"), value=manual_values[i])
            for i in range(8)
        ]

        return container_updates + vram_updates + manual_updates

    mode.change(
        fn=handle_mode_change,
        inputs=all_inputs,
        outputs=[vram_container, manual_container] + vram_inputs + manual_inputs
    )
|
    # Regenerate the output whenever any input changes.
    for input_component in all_inputs:
        input_component.change(
            fn=generate_config,
            inputs=all_inputs,
            outputs=[output_text]
        )
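
# The generated arguments are meant to be appended to a llama.cpp invocation.
# An illustrative sketch (binary name, model path, and other flags are
# placeholders for whatever your setup uses):
#   ./llama-server -m model.gguf -ngl 99 \
#     -ot "blk\.(3|4)\.ffn_.*=CUDA0" \
#     -ot "blk\.(5|6)\.ffn_.*=CUDA1"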
|
|
|
|
|
if __name__ == "__main__":
    app.launch()