From 3d17c80954efeb9e24b09824c975cd9ffdca4864 Mon Sep 17 00:00:00 2001
From: randoentity <137087500+randoentity@users.noreply.github.com>
Date: Sat, 27 Jul 2024 17:25:43 +0200
Subject: [PATCH] Add q-cache 6 and 8 support for Exllamav2

---
 modules/exllamav2.py     |  8 +++++++-
 modules/exllamav2_hf.py  | 14 ++++++++++++--
 modules/loaders.py       |  8 ++++++--
 modules/shared.py        |  7 +++++--
 modules/ui.py            |  3 +++
 modules/ui_model_menu.py |  7 +++++--
 6 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/modules/exllamav2.py b/modules/exllamav2.py
index a770e342..7db176c4 100644
--- a/modules/exllamav2.py
+++ b/modules/exllamav2.py
@@ -7,6 +7,8 @@ from exllamav2 import (
     ExLlamaV2Cache,
     ExLlamaV2Cache_8bit,
     ExLlamaV2Cache_Q4,
+    ExLlamaV2Cache_Q6,
+    ExLlamaV2Cache_Q8,
     ExLlamaV2Config,
     ExLlamaV2Tokenizer
 )
@@ -63,8 +65,12 @@ class Exllamav2Model:
 
         if shared.args.cache_8bit:
             cache = ExLlamaV2Cache_8bit(model, lazy=shared.args.autosplit)
-        elif shared.args.cache_4bit:
+        elif shared.args.cache_q4:
             cache = ExLlamaV2Cache_Q4(model, lazy=shared.args.autosplit)
+        elif shared.args.cache_q6:
+            cache = ExLlamaV2Cache_Q6(model, lazy=shared.args.autosplit)
+        elif shared.args.cache_q8:
+            cache = ExLlamaV2Cache_Q8(model, lazy=shared.args.autosplit)
         else:
             cache = ExLlamaV2Cache(model, lazy=shared.args.autosplit)
 
diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py
index 53143d9a..9b85f41c 100644
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -9,6 +9,8 @@ from exllamav2 import (
     ExLlamaV2Cache,
     ExLlamaV2Cache_8bit,
     ExLlamaV2Cache_Q4,
+    ExLlamaV2Cache_Q6,
+    ExLlamaV2Cache_Q8,
     ExLlamaV2Config
 )
 from torch.nn import CrossEntropyLoss
@@ -51,8 +53,12 @@ class Exllamav2HF(PreTrainedModel):
 
         if shared.args.cache_8bit:
             self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model, lazy=shared.args.autosplit)
-        elif shared.args.cache_4bit:
+        elif shared.args.cache_q4:
             self.ex_cache = ExLlamaV2Cache_Q4(self.ex_model, lazy=shared.args.autosplit)
+        elif shared.args.cache_q6:
+            self.ex_cache = ExLlamaV2Cache_Q6(self.ex_model, lazy=shared.args.autosplit)
+        elif shared.args.cache_q8:
+            self.ex_cache = ExLlamaV2Cache_Q8(self.ex_model, lazy=shared.args.autosplit)
         else:
             self.ex_cache = ExLlamaV2Cache(self.ex_model, lazy=shared.args.autosplit)
 
@@ -63,8 +69,12 @@ class Exllamav2HF(PreTrainedModel):
         if shared.args.cfg_cache:
             if shared.args.cache_8bit:
                 self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model)
-            elif shared.args.cache_4bit:
+            elif shared.args.cache_q4:
                 self.ex_cache_negative = ExLlamaV2Cache_Q4(self.ex_model)
+            elif shared.args.cache_q6:
+                self.ex_cache_negative = ExLlamaV2Cache_Q6(self.ex_model)
+            elif shared.args.cache_q8:
+                self.ex_cache_negative = ExLlamaV2Cache_Q8(self.ex_model)
             else:
                 self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
 
diff --git a/modules/loaders.py b/modules/loaders.py
index 549de5fb..55f48730 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -88,7 +88,9 @@ loaders_and_params = OrderedDict({
         'no_sdpa',
         'num_experts_per_token',
         'cache_8bit',
-        'cache_4bit',
+        'cache_q4',
+        'cache_q6',
+        'cache_q8',
         'autosplit',
         'alpha_value',
         'compress_pos_emb',
@@ -103,7 +105,9 @@ loaders_and_params = OrderedDict({
         'no_sdpa',
         'num_experts_per_token',
         'cache_8bit',
-        'cache_4bit',
+        'cache_q4',
+        'cache_q6',
+        'cache_q8',
         'autosplit',
         'alpha_value',
         'compress_pos_emb',
diff --git a/modules/shared.py b/modules/shared.py
index c27657ff..6171327b 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -142,8 +142,11 @@ group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Creat
 group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
 group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
 group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
-group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.')
-group.add_argument('--cache_4bit', action='store_true', help='Use Q4 cache to save VRAM.')
+group.add_argument('--cache_4bit', action='store_true', help='Use 4-bit cache to save VRAM (llama.cpp).')
+group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit (FP8) cache to save VRAM.')
+group.add_argument('--cache_q4', action='store_true', help='Use Q4 cache to save VRAM.')
+group.add_argument('--cache_q6', action='store_true', help='Use Q6 cache to save VRAM.')
+group.add_argument('--cache_q8', action='store_true', help='Use Q8 cache to save VRAM.')
 group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
 
 # AutoGPTQ
diff --git a/modules/ui.py b/modules/ui.py
index 47f92cf0..41b8655b 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -89,6 +89,9 @@ def list_model_elements():
         'num_experts_per_token',
         'cache_8bit',
         'cache_4bit',
+        'cache_q4',
+        'cache_q6',
+        'cache_q8',
         'autosplit',
         'threads',
         'threads_batch',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 1883fdca..bf28b848 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -118,8 +118,11 @@ def create_ui():
                     shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
                     shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
                     shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.')
-                    shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
-                    shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.')
+                    shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use 4-bit (FP4) cache to save VRAM.')
+                    shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit (FP8) cache to save VRAM.')
+                    shared.gradio['cache_q4'] = gr.Checkbox(label="cache_q4", value=shared.args.cache_q4, info='Use Q4 cache to save VRAM.')
+                    shared.gradio['cache_q6'] = gr.Checkbox(label="cache_q6", value=shared.args.cache_q6, info='Use Q6 cache to save VRAM.')
+                    shared.gradio['cache_q8'] = gr.Checkbox(label="cache_q8", value=shared.args.cache_q8, info='Use Q8 cache to save VRAM.')
                     shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                     shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
                     shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
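
For reference, a minimal sketch of the flag-to-cache mapping that the hunks above implement in both exllamav2.py and exllamav2_hf.py; make_cache is a hypothetical helper name, not part of the patch, and it assumes an exllamav2 build that exports ExLlamaV2Cache_Q6 and ExLlamaV2Cache_Q8.

# Hypothetical helper mirroring the if/elif chains added by this patch.
from exllamav2 import (
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Cache_Q4,
    ExLlamaV2Cache_Q6,
    ExLlamaV2Cache_Q8,
)

def make_cache(model, args, lazy=False):
    """Return the KV cache matching the selected --cache_* flag."""
    if args.cache_8bit:
        return ExLlamaV2Cache_8bit(model, lazy=lazy)   # FP8 cache
    if args.cache_q4:
        return ExLlamaV2Cache_Q4(model, lazy=lazy)     # quantized Q4 cache
    if args.cache_q6:
        return ExLlamaV2Cache_Q6(model, lazy=lazy)     # quantized Q6 cache
    if args.cache_q8:
        return ExLlamaV2Cache_Q8(model, lazy=lazy)     # quantized Q8 cache
    return ExLlamaV2Cache(model, lazy=lazy)            # default FP16 cache

Each flag maps to exactly one cache class, and when no flag is set the unquantized ExLlamaV2Cache is used, matching the else branches in the hunks above.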