llama.cpp: add 4-bit/8-bit kv cache options

oobabooga 2024-06-29 09:10:33 -07:00
parent 220c1797fc
commit 4ea260098f
3 changed files with 18 additions and 0 deletions


@@ -221,6 +221,13 @@ class LlamacppHF(PreTrainedModel):
             'flash_attn': shared.args.flash_attn
         }
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
         Llama = llama_cpp_lib().Llama
         model = Llama(**params)
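
For context on the magic numbers: 2 and 8 are values from ggml's tensor-type enum (Q4_0 and Q8_0 respectively), which llama-cpp-python exposes as module-level constants. A minimal sketch of a more self-documenting version of the same assignment, assuming the llama_cpp bindings' constant names:

import llama_cpp

# GGML_TYPE_Q4_0 == 2 and GGML_TYPE_Q8_0 == 8 in ggml's type enum,
# so these assignments are equivalent to the integer literals in the diff.
params = {}
params["type_k"] = llama_cpp.GGML_TYPE_Q4_0  # 4-bit quantized keys
params["type_v"] = llama_cpp.GGML_TYPE_Q4_0  # 4-bit quantized values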


@@ -100,6 +100,13 @@ class LlamaCppModel:
             'flash_attn': shared.args.flash_attn
         }
+        if shared.args.cache_4bit:
+            params["type_k"] = 2
+            params["type_v"] = 2
+        elif shared.args.cache_8bit:
+            params["type_k"] = 8
+            params["type_v"] = 8
         result.model = Llama(**params)
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
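
Standalone, the resulting constructor call looks roughly like the sketch below (model path hypothetical). Note that llama.cpp has required flash attention for a quantized V cache, which is why flash_attn is passed alongside type_v:

from llama_cpp import Llama

model = Llama(
    model_path="models/example.Q4_K_M.gguf",  # hypothetical local GGUF file
    n_ctx=4096,
    flash_attn=True,  # quantized V cache needs flash attention in llama.cpp
    type_k=2,  # GGML_TYPE_Q4_0: 4-bit keys
    type_v=2,  # GGML_TYPE_Q4_0: 4-bit values
)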


@@ -30,6 +30,8 @@ loaders_and_params = OrderedDict({
     'llama.cpp': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
@@ -51,6 +53,8 @@ loaders_and_params = OrderedDict({
     'llamacpp_HF': [
         'n_ctx',
         'n_gpu_layers',
+        'cache_8bit',
+        'cache_4bit',
         'tensor_split',
         'n_batch',
         'threads',
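
The new list entries assume matching cache_8bit and cache_4bit attributes on shared.args. A minimal sketch of the kind of argparse flags this relies on (flag names taken from the diff; action and help text are assumptions):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--cache_8bit', action='store_true', help='Use an 8-bit (Q8_0) llama.cpp KV cache.')
parser.add_argument('--cache_4bit', action='store_true', help='Use a 4-bit (Q4_0) llama.cpp KV cache.')
args = parser.parse_args(['--cache_4bit'])  # e.g. enables the 4-bit branch above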