From e6181e834ab0b32baa19a55773f369dc9a64802d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 23 Jul 2024 15:26:02 -0700
Subject: [PATCH] Remove AutoAWQ as a standalone loader (it works better
 through transformers)

---
 modules/LoRA.py            |  2 --
 modules/loaders.py         | 10 ----------
 modules/models.py          | 19 -------------------
 modules/models_settings.py |  2 --
 modules/shared.py          |  9 ++-------
 modules/ui.py              |  1 -
 modules/ui_model_menu.py   |  1 -
 7 files changed, 2 insertions(+), 42 deletions(-)

diff --git a/modules/LoRA.py b/modules/LoRA.py
index eda5e406..117022cf 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -72,8 +72,6 @@ def add_lora_autogptq(lora_names):
     else:
         if len(lora_names) > 1:
             logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
-        if not shared.args.no_inject_fused_attention:
-            logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.')
 
         peft_config = GPTQLoraConfig(
             inference_mode=True,
diff --git a/modules/loaders.py b/modules/loaders.py
index 75ed897b..549de5fb 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -127,15 +127,6 @@ loaders_and_params = OrderedDict({
         'no_use_fast',
         'autogptq_info',
     ],
-    'AutoAWQ': [
-        'cpu_memory',
-        'gpu_memory',
-        'auto_devices',
-        'max_seq_len',
-        'no_inject_fused_attention',
-        'trust_remote_code',
-        'no_use_fast',
-    ],
     'HQQ': [
         'hqq_backend',
         'trust_remote_code',
@@ -200,7 +191,6 @@ def transformers_samplers():
 loaders_samplers = {
     'Transformers': transformers_samplers(),
     'AutoGPTQ': transformers_samplers(),
-    'AutoAWQ': transformers_samplers(),
     'HQQ': transformers_samplers(),
     'ExLlamav2': {
         'temperature',
diff --git a/modules/models.py b/modules/models.py
index 07c14308..ea046e9b 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -75,7 +75,6 @@ def load_model(model_name, loader=None):
         'llamacpp_HF': llamacpp_HF_loader,
         'ExLlamav2': ExLlamav2_loader,
         'ExLlamav2_HF': ExLlamav2_HF_loader,
-        'AutoAWQ': AutoAWQ_loader,
         'HQQ': HQQ_loader,
         'TensorRT-LLM': TensorRT_LLM_loader,
     }
@@ -292,24 +291,6 @@ def llamacpp_HF_loader(model_name):
     return model
 
 
-def AutoAWQ_loader(model_name):
-    from awq import AutoAWQForCausalLM
-
-    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
-
-    model = AutoAWQForCausalLM.from_quantized(
-        quant_path=model_dir,
-        max_new_tokens=shared.args.max_seq_len,
-        trust_remote_code=shared.args.trust_remote_code,
-        fuse_layers=not shared.args.no_inject_fused_attention,
-        max_memory=get_max_memory_dict(),
-        batch_size=1,
-        safetensors=any(model_dir.glob('*.safetensors')),
-    )
-
-    return model
-
-
 def AutoGPTQ_loader(model_name):
     import modules.AutoGPTQ_loader
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 7ae68125..1bb00ceb 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -180,8 +180,6 @@ def infer_loader(model_name, model_settings):
         loader = None
     elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
         loader = 'ExLlamav2_HF'
-    elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
-        loader = 'AutoAWQ'
     elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
         loader = 'llamacpp_HF'
     elif len(list(path_to_model.glob('*.gguf'))) > 0:
diff --git a/modules/shared.py b/modules/shared.py
index dec427dd..fe09a165 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -89,7 +89,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
 
 # Model loader
 group = parser.add_argument_group('Model loader')
-group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ.')
+group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
@@ -160,10 +160,6 @@ group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExL
 group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
 group.add_argument('--groupsize', type=int, default=-1, help='Group size.')
 
-# AutoAWQ
-group = parser.add_argument_group('AutoAWQ')
-group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')
-
 # HQQ
 group = parser.add_argument_group('HQQ')
 group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
@@ -217,6 +213,7 @@ group.add_argument('--model_type', type=str, help='DEPRECATED')
 group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED')
 group.add_argument('--checkpoint', type=str, help='DEPRECATED')
 group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED')
+group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED')
 
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@@ -267,8 +264,6 @@ def fix_loader_name(name):
         return 'ExLlamav2'
     elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
         return 'ExLlamav2_HF'
-    elif name in ['autoawq', 'awq', 'auto-awq']:
-        return 'AutoAWQ'
     elif name in ['hqq']:
         return 'HQQ'
     elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
diff --git a/modules/ui.py b/modules/ui.py
index cfe709fa..47f92cf0 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -78,7 +78,6 @@ def list_model_elements():
         'groupsize',
         'triton',
         'desc_act',
-        'no_inject_fused_attention',
         'no_inject_fused_mlp',
         'no_use_cuda_fp16',
         'disable_exllama',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 54ac9b12..2938c120 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -127,7 +127,6 @@ def create_ui():
                     shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
                     shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
                     shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
-                    shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
                     shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
                     shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
                     shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
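
Note on the commit subject's parenthetical, as an illustration rather than part of the patch: Transformers can load AWQ-quantized checkpoints directly when the autoawq package is installed, which is why a dedicated AutoAWQ loader is no longer needed. A minimal sketch of that path, assuming a GPU setup and using a placeholder model id:

    # Minimal sketch: loading an AWQ-quantized model through plain Transformers.
    # Assumes `pip install autoawq accelerate`; the repo id below is only an example.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"  # example AWQ checkpoint (assumption)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # The AWQ quantization config in the checkpoint is picked up automatically.
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

    inputs = tokenizer("Hello", return_tensors="pt").to(model.device)
    print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))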