From 87dab03dc02eb48b8fd7c8b9a2acb8281678798e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 3 Aug 2023 11:00:36 -0300
Subject: [PATCH] Add the --cpu option for llama.cpp to prevent CUDA from being
 used (#3432)

---
 README.md                 |  5 +++--
 modules/llamacpp_hf.py    | 17 ++++++++++++++---
 modules/llamacpp_model.py | 22 +++++++++++++++++++---
 modules/loaders.py        |  2 ++
 modules/shared.py         |  4 ++--
 5 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 073a841d..6ec84ba2 100644
--- a/README.md
+++ b/README.md
@@ -249,8 +249,9 @@ Optionally, you can use the following command-line flags:
 | `--n-gpu-layers N_GPU_LAYERS` | Number of layers to offload to the GPU. Only works if llama-cpp-python was compiled with BLAS. Set this to 1000000000 to offload all layers to the GPU. |
 | `--n_ctx N_CTX` | Size of the prompt context. |
 | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |
-| `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama2 70b. |
-| `--rms_norm_eps RMS_NORM_EPS` | Must be 1e-5 for llama2 70b. |
+| `--n_gqa N_GQA` | grouped-query attention. Must be 8 for llama-2 70b. |
+| `--rms_norm_eps RMS_NORM_EPS` | 5e-6 is a good value for llama-2 models. |
+| `--cpu` | Use the CPU version of llama-cpp-python instead of the GPU-accelerated version. |
 
 #### AutoGPTQ
 
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index 349a5782..e9f4ade6 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -10,13 +10,22 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from modules import shared
 from modules.logging_colors import logger
 
+import llama_cpp
+
 if torch.cuda.is_available() and not torch.version.hip:
     try:
-        from llama_cpp_cuda import Llama
+        import llama_cpp_cuda
     except:
-        from llama_cpp import Llama
+        llama_cpp_cuda = None
 else:
-    from llama_cpp import Llama
+    llama_cpp_cuda = None
+
+
+def llama_cpp_lib():
+    if shared.args.cpu or llama_cpp_cuda is None:
+        return llama_cpp
+    else:
+        return llama_cpp_cuda
 
 
 class LlamacppHF(PreTrainedModel):
@@ -111,5 +120,7 @@ class LlamacppHF(PreTrainedModel):
             'logits_all': True,
         }
 
+        Llama = llama_cpp_lib().Llama
         model = Llama(**params)
+
         return LlamacppHF(model)
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 0f9c3470..53177f4f 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -7,13 +7,22 @@ from modules import shared
 from modules.callbacks import Iteratorize
 from modules.logging_colors import logger
 
+import llama_cpp
+
 if torch.cuda.is_available() and not torch.version.hip:
     try:
-        from llama_cpp_cuda import Llama, LlamaCache, LogitsProcessorList
+        import llama_cpp_cuda
     except:
-        from llama_cpp import Llama, LlamaCache, LogitsProcessorList
+        llama_cpp_cuda = None
 else:
-    from llama_cpp import Llama, LlamaCache, LogitsProcessorList
+    llama_cpp_cuda = None
+
+
+def llama_cpp_lib():
+    if shared.args.cpu or llama_cpp_cuda is None:
+        return llama_cpp
+    else:
+        return llama_cpp_cuda
 
 
 def ban_eos_logits_processor(eos_token, input_ids, logits):
@@ -30,6 +39,10 @@ class LlamaCppModel:
 
     @classmethod
     def from_pretrained(self, path):
+
+        Llama = llama_cpp_lib().Llama
+        LlamaCache = llama_cpp_lib().LlamaCache
+
         result = self()
         cache_capacity = 0
         if shared.args.cache_capacity is not None:
@@ -74,6 +87,9 @@ class LlamaCppModel:
         return self.model.detokenize(tokens)
 
     def generate(self, prompt, state, callback=None):
+
+        LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
+
         prompt = prompt if type(prompt) is str else prompt.decode()
         completion_chunks = self.model.create_completion(
             prompt=prompt,
diff --git a/modules/loaders.py b/modules/loaders.py
index 68b48204..aa1afcb8 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -41,6 +41,7 @@ loaders_and_params = {
         'llama_cpp_seed',
         'compress_pos_emb',
         'alpha_value',
+        'cpu',
     ],
     'llamacpp_HF': [
         'n_ctx',
@@ -55,6 +56,7 @@ loaders_and_params = {
         'llama_cpp_seed',
         'compress_pos_emb',
         'alpha_value',
+        'cpu',
         'llamacpp_HF_info',
     ],
     'Transformers': [
diff --git a/modules/shared.py b/modules/shared.py
index bac3fa8c..fc9ba3cf 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -132,8 +132,8 @@ parser.add_argument('--cache-capacity', type=str, help='Maximum cache capacity.
 parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
 parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
 parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default 0 (random)')
-parser.add_argument('--n_gqa', type=int, default=0, help='grouped-query attention. Must be 8 for llama2 70b.')
-parser.add_argument('--rms_norm_eps', type=float, default=0, help='Must be 1e-5 for llama2 70b.')
+parser.add_argument('--n_gqa', type=int, default=0, help='grouped-query attention. Must be 8 for llama-2 70b.')
+parser.add_argument('--rms_norm_eps', type=float, default=0, help='5e-6 is a good value for llama-2 models.')
 
 # GPTQ
 parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
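
For context, a minimal standalone sketch of the selection logic this patch introduces is shown below. It is not part of the patch itself: it assumes only that torch and the CPU build of llama-cpp-python are installed, the local `argparse` namespace stands in for `modules.shared.args`, and the optional `llama_cpp_cuda` package may be absent.

```python
# Sketch only: mirrors the llama_cpp_lib() fallback added by the patch.
import argparse

import torch

import llama_cpp  # CPU build of llama-cpp-python (assumed installed)

# Prefer the CUDA build only when a CUDA (non-HIP) device is present and the
# optional llama_cpp_cuda package is actually installed; otherwise fall back.
if torch.cuda.is_available() and not torch.version.hip:
    try:
        import llama_cpp_cuda
    except ImportError:
        llama_cpp_cuda = None
else:
    llama_cpp_cuda = None

# Stand-in for modules.shared.args in the webui.
parser = argparse.ArgumentParser()
parser.add_argument('--cpu', action='store_true',
                    help='Use the CPU version of llama-cpp-python instead of the GPU-accelerated version.')
args = parser.parse_args()


def llama_cpp_lib():
    # --cpu forces the plain llama_cpp module even when llama_cpp_cuda is available.
    if args.cpu or llama_cpp_cuda is None:
        return llama_cpp
    return llama_cpp_cuda


if __name__ == '__main__':
    Llama = llama_cpp_lib().Llama  # classes are resolved lazily, at model-load time
    print('using', llama_cpp_lib().__name__)
```

Resolving `Llama`, `LlamaCache`, and `LogitsProcessorList` through `llama_cpp_lib()` at load/generation time, rather than importing them at module import time, is what lets the same code honor `--cpu` without re-importing anything.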