From dfdb6fee22830a8a49ac167b5b2bf8796dae12cf Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 26 Apr 2024 09:39:27 -0700
Subject: [PATCH] Set llm_int8_enable_fp32_cpu_offload=True for --load-in-4bit

To allow for 32-bit CPU offloading (it's very slow).
---
 modules/models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/models.py b/modules/models.py
index 64cbffe1..c89bcae4 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -205,6 +205,7 @@ def huggingface_loader(model_name):
             'bnb_4bit_compute_dtype': eval("torch.{}".format(shared.args.compute_dtype)) if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
             'bnb_4bit_quant_type': shared.args.quant_type,
             'bnb_4bit_use_double_quant': shared.args.use_double_quant,
+            'llm_int8_enable_fp32_cpu_offload': True
         }
 
         params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
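
For reviewers, a minimal standalone sketch of what this flag enables when loading through transformers directly. This is not part of the patch; the model id and device_map below are illustrative assumptions:

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # With llm_int8_enable_fp32_cpu_offload=True, any modules that the
    # device_map places on "cpu" are kept in float32 instead of being
    # quantized, so a model that does not fit entirely in VRAM can still
    # load in 4-bit (the CPU-resident layers run slowly, as the commit
    # message notes).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_enable_fp32_cpu_offload=True,  # the flag this patch hardcodes
    )

    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-1.3b",             # hypothetical model, for illustration
        quantization_config=bnb_config,
        device_map="auto",               # lets accelerate spill layers to CPU
    )

Without the flag, the same load fails when any layer lands on CPU, because bitsandbytes cannot quantize CPU-resident weights; hardcoding it makes --load-in-4bit degrade to slow fp32 offloading instead of erroring out.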