optimize_llamacpp_ngl.py
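"""Find the largest -ngl (number of GPU-offloaded layers) with which llama.cpp
can still load and run a GGUF model on this machine, along with a matching
--tensor-split for multi-GPU systems.

Works by binary-searching over -ngl, test-launching llama-cli with a 1-token
generation for each candidate, and printing the recommended llama.cpp
arguments at the end.
"""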
import os
import struct
import subprocess
import sys


def try_llama_main(ngl, llama_main_path, model_path, desired_context,
                   split_mode, main_gpu, tensor_split):
    """Probe whether llama-cli can generate one token with the given -ngl."""
    print(f'trying -ngl {ngl}{tensor_split}')
    # the sentinel prompt is echoed to stdout only if the model actually
    # loaded and generation started, i.e. this -ngl fit in VRAM
    cmd = f'{llama_main_path} -m {model_path} -c {desired_context} --split-mode {split_mode}{tensor_split} --main-gpu {main_gpu} --flash-attn -t 4 -ngl {ngl} -n 1 --prompt "if this sentence is in stdout then our ngl was ok"'
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    stdout, stderr = process.communicate()
    return 'if this sentence is in stdout then our ngl was ok' in stdout.decode()


def best_tensor_split(vram_per_gpu, MiB_per_layer, ngl):
    """Greedily pack ngl layers one at a time onto the GPU with the most
    estimated free VRAM. Reads the module-level main_gpu_ind, which is
    charged extra overhead for context/scratch buffers."""
    vram_left = vram_per_gpu[:]
    remaining_layers = ngl
    # HACK these overhead estimates are very rough guesses, optimized for 8x22B.
    # too conservative for smaller models and GPUs? TODO use CUDA0etc buffer size from initial guess
    for i in range(len(vram_left)):
        vram_left[i] -= 500
    vram_left[main_gpu_ind] -= 1000
    layers = [0] * len(vram_left)
    while remaining_layers > 0:
        target_ind = vram_left.index(max(vram_left))
        layers[target_ind] += 1
        remaining_layers -= 1
        vram_left[target_ind] -= MiB_per_layer
    # if we think it easily fits, then just let llama.cpp do the default
    if all(x > 2 * MiB_per_layer for x in vram_left):
        return ''
    return ' --tensor-split ' + ','.join(map(str, layers))


def binary_search_most_ngl_possible(low, high, llama_main_path, model_path,
                                    desired_context, main_gpu, vram_per_gpu, MiB_per_layer):
    """Binary-search [low, high) for the largest -ngl that actually runs.
    Returns (best_ngl, whether it was verified to work, its tensor-split)."""
    works = {}
    ts_strings = {}
    while low < high:
        mid = (low + high) // 2
        ts_strings[mid] = best_tensor_split(vram_per_gpu, MiB_per_layer, mid)
        works[mid] = try_llama_main(mid, llama_main_path, model_path, desired_context,
                                    'layer', main_gpu, ts_strings[mid])
        if works[mid]:
            low = mid + 1
        else:
            high = mid
    # if our answer is below the range we evaluated, indicate not found
    return low - 1, works.get(low - 1, False), ts_strings.get(low - 1, '')


def nvidia_smi_query_quantity(to_query):
    """Return one integer per GPU for a numeric nvidia-smi --query-gpu field."""
    cmd = f'nvidia-smi --query-gpu={to_query} --format=csv,noheader'
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    stdout, stderr = process.communicate()
    # split on whitespace and keep only the numbers, dropping units like "MiB"
    output_lines = stdout.decode().split()
    output_ints = [int(s) for s in output_lines if s.isdigit()]
    return output_ints
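

# Command-line arguments: desired context size, path to llama-cli, model path,
# and optionally the total model size in MiB (for multi-part .gguf files).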
if len(sys.argv) < 4 or not sys.argv[1].isdigit():
    print(f'Usage: python3 {sys.argv[0]} desired_context /path/to/llama.cpp/llama-cli /path/to/model.gguf [model_size_in_MiB]')
    print(f'\ne.g.: python3 {sys.argv[0]} 8192 ~/llama.cpp/bin/llama-cli ~/models/Largestral-2407-Q4KS-00001-of-00002.gguf 66348')
    print('the optional model_size_in_MiB argument is needed when the model is split over multiple .gguf files.')
    sys.exit(1)
desired_context = int(sys.argv[1])
llama_main_path = sys.argv[2]
model_path = sys.argv[3]
if not os.path.isfile(llama_main_path):
    print(f'file does not exist: {llama_main_path}')
    sys.exit(1)
# sanity-check that the executable is a llama.cpp llama-cli by looking for
# known flags in its --help output
main_help_process = subprocess.Popen(f'{llama_main_path} --help', stdout=subprocess.PIPE, shell=True)
main_help_stdout, main_help_stderr = main_help_process.communicate()
mh = main_help_stdout.decode()
if not ('--split-mode' in mh and '--reverse-prompt' in mh):
    print(f'this file does not appear to be a working llama.cpp "llama-cli" executable: {llama_main_path}')
    sys.exit(1)
if not os.path.isfile(model_path):
    print(f'file does not exist: {model_path}')
    sys.exit(1)
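# Probe the hardware: per-GPU PCIe link width and total VRAM, plus GPU names
# for the bandwidth lookup below.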
pcie_lanes_per_gpu = nvidia_smi_query_quantity('pcie.link.width.current')
vram_per_gpu = nvidia_smi_query_quantity('memory.total') # in MiB
nvidia_smi_process = subprocess.Popen('nvidia-smi --query-gpu=name --format=csv,noheader',
                                      stdout=subprocess.PIPE, shell=True)
nv_stdout, nv_stderr = nvidia_smi_process.communicate()
gpu_names = [s for s in nv_stdout.decode().split('\n') if s != '']
# approximate peak VRAM bandwidth per GPU model, in GB/s
vram_bandwidth_table = { 'NVIDIA GeForce RTX 4090': 1008,
                         'NVIDIA GeForce RTX 4080': 737,
                         'NVIDIA GeForce RTX 4070 Ti': 672,
                         'NVIDIA GeForce RTX 4070': 504,
                         'NVIDIA GeForce RTX 4060 Ti': 288,
                         'NVIDIA GeForce RTX 4060': 272,
                         'NVIDIA GeForce RTX 3090': 936,
                         'NVIDIA GeForce RTX 3080 Ti': 912,
                         'NVIDIA GeForce RTX 3080': 912,
                         'NVIDIA GeForce RTX 3070 Ti': 608,
                         'NVIDIA GeForce RTX 3070': 448,
                         'NVIDIA GeForce RTX 3060 Ti': 448,
                         'NVIDIA GeForce RTX 3060': 360,
                         'NVIDIA GeForce RTX 2080 Ti': 616,
                         'NVIDIA GeForce RTX 2080': 448,
                         'NVIDIA RTX A6000': 768,
                         'NVIDIA RTX A4000': 448,
                         'Tesla P40': 346,
                         'Tesla P100': 732 }
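# GPUs not listed above fall back to a low 222 GB/s placeholder and trigger
# the warning below.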
gpu_bandwidths = [vram_bandwidth_table.get(name, 222) for name in gpu_names]
# check table membership rather than comparing against 222, so a GPU whose
# real bandwidth happens to be 222 is not falsely flagged
for name in gpu_names:
    if name not in vram_bandwidth_table:
        print(f'WARNING! Unrecognized GPU name {name}. Look up your GPU VRAM bandwidth and put it in the vram_bandwidth_table map in this script (and please open a pull request!), or you might get a suboptimal value for --main-gpu if you have heterogeneous GPUs.')
# Pick main GPU: pick highest VRAM bandwidth GPU, with PCIe lanes as tiebreaker
best_bw_inds = [i for i, x in enumerate(gpu_bandwidths) if x == max(gpu_bandwidths)]
main_gpu_ind = best_bw_inds[0]
best_pcie = pcie_lanes_per_gpu[main_gpu_ind]
for gpu_ind in best_bw_inds:
    if pcie_lanes_per_gpu[gpu_ind] > best_pcie:
        main_gpu_ind = gpu_ind
        best_pcie = pcie_lanes_per_gpu[gpu_ind]
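# Read the layer count (block_count) and whether the model is a MoE
# (expert_count) straight from the GGUF file header.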
model_layers = 0
model_is_MoE = False
with open(model_path, 'rb') as model_file:
    # GGUF stores metadata as key-value pairs: a length-prefixed key string,
    # a 4-byte value type id, then the value -- hence the fixed offsets past
    # the end of each matched key substring below
    data = model_file.read(4096)
    ind_block_count = data.find(b'.block_count')
    if ind_block_count == -1:
        print('\nfailed to determine model n_layer from model file block_count')
        sys.exit(1)
    # len('.block_count') == 12, plus 4 type-id bytes puts the value at +16
    model_layers = struct.unpack('<i', data[ind_block_count+16:ind_block_count+20])[0]
    ind_expert_count = data.find(b'.expert_count')
    if ind_expert_count > -1:
        # len('.expert_count') == 13, plus 4 type-id bytes puts the value at +17
        expert_count = struct.unpack('<i', data[ind_expert_count+17:ind_expert_count+21])[0]
        model_is_MoE = (expert_count > 1)
# initial guess, hopefully we can mostly skip binary search
model_size_MiB = 1 + os.path.getsize(model_path) // (1024 * 1024)
# HACK for case when model is split across multiple .ggufs
if len(sys.argv) > 4:
    model_size_MiB = int(sys.argv[4])
MiB_per_layer = 1 + model_size_MiB // model_layers
print(f'model has {model_layers} layers, file size {model_size_MiB}MiB, model_is_MoE: {model_is_MoE}')
# we can be pretty sure that if x * mem_per_layer > VRAM, we ain't fittin x layers in VRAM
ngl_upperlimit = min(model_layers, sum(vram_per_gpu) // MiB_per_layer)
# HACK very crude estimates based on observation
total_overhead_guess = 11000 if model_size_MiB > 60000 else (5000 if model_size_MiB > 30000 else 2000)
ngl_lowerlimit = min(model_layers, (sum(vram_per_gpu) - total_overhead_guess) // MiB_per_layer)
print(f'model layers: {model_layers}')
print(f'binary search bounds: low {ngl_lowerlimit} high {ngl_upperlimit+1}')
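# First search only the window estimated above; widen to [0, ngl_lowerlimit)
# below if nothing in the window works.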
max_ngl_possible, found, tensor_split = binary_search_most_ngl_possible(
    ngl_lowerlimit, ngl_upperlimit+1, llama_main_path,
    model_path, desired_context, main_gpu_ind, vram_per_gpu, MiB_per_layer)
if not found:
    print('good ngl not found within guessed binary search area, trying down to 0')
    max_ngl_possible, found, tensor_split = binary_search_most_ngl_possible(
        0, ngl_lowerlimit, llama_main_path, model_path, desired_context, main_gpu_ind,
        vram_per_gpu, MiB_per_layer)
if not found:
    print("Not even -ngl 0 works. It looks like you can't run this model. Sorry.")
    sys.exit(1)
split_mode = 'layer' if model_is_MoE or max_ngl_possible < model_layers else 'row'
# TODO do we need to confirm --split-mode row can run?
print(f'\nRun llama.cpp with these arguments:\n-m {model_path} -c {desired_context} --split-mode {split_mode}{tensor_split} --main-gpu {main_gpu_ind} --flash-attn -t 4 -ngl {max_ngl_possible}')
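# Hypothetical example run (actual numbers depend on your GPUs and model):
#   python3 optimize_llamacpp_ngl.py 8192 ~/llama.cpp/bin/llama-cli ~/models/model.gguf
# might end with something like:
#   -m ~/models/model.gguf -c 8192 --split-mode row --main-gpu 0 --flash-attn -t 4 -ngl 48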