AutoAWQ: initial support (#3999)

Author: cal066 (committed via GitHub), 2023-10-05 16:19:18 +00:00
Parent: 3f56151f03
Commit: cc632c3f33
8 changed files with 75 additions and 3 deletions


@@ -174,3 +174,5 @@
   instruction_template: 'Llama-v2'
 .*mistral.*instruct:
   instruction_template: 'Mistral'
+.*AWQ:
+  n_batch: 1
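The new `.*AWQ` key rides on the same mechanism as the other entries in this settings file: each key is a regular expression tested against the model name, and matching entries contribute default settings, so any model whose name contains "AWQ" picks up `n_batch: 1`. A minimal sketch of that lookup, assuming case-insensitive matching like the surrounding keys suggest; the model name and the `settings_for` helper are hypothetical:

```python
import re

# Trimmed copy of the patterns above; keys are regexes, values are defaults.
model_config = {
    '.*mistral.*instruct': {'instruction_template': 'Mistral'},
    '.*AWQ': {'n_batch': 1},
}


def settings_for(model_name):
    # Hypothetical helper: collect defaults from every pattern that matches.
    settings = {}
    for pattern, values in model_config.items():
        if re.match(pattern.lower(), model_name.lower()):  # assumed case-insensitive
            settings.update(values)
    return settings


print(settings_for('TheBloke_Mistral-7B-Instruct-v0.1-AWQ'))
# {'instruction_template': 'Mistral', 'n_batch': 1}
```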


@@ -129,6 +129,16 @@ loaders_and_params = OrderedDict({
         'model_type',
         'no_mmap',
         'mlock'
+    ],
+    'AutoAWQ': [
+        'cpu_memory',
+        'gpu_memory',
+        'auto_devices',
+        'max_seq_len',
+        'n_batch',
+        'no_inject_fused_attention',
+        'trust_remote_code',
+        'use_fast',
     ]
 })
@@ -365,7 +375,40 @@ loaders_samplers = {
         'top_k',
         'repetition_penalty',
         'repetition_penalty_range',
-    }
+    },
+    'AutoAWQ': {
+        'temperature',
+        'top_p',
+        'top_k',
+        'typical_p',
+        'epsilon_cutoff',
+        'eta_cutoff',
+        'tfs',
+        'top_a',
+        'repetition_penalty',
+        'repetition_penalty_range',
+        'encoder_repetition_penalty',
+        'no_repeat_ngram_size',
+        'min_length',
+        'seed',
+        'do_sample',
+        'penalty_alpha',
+        'num_beams',
+        'length_penalty',
+        'early_stopping',
+        'mirostat_mode',
+        'mirostat_tau',
+        'mirostat_eta',
+        'grammar_file_row',
+        'grammar_string',
+        'guidance_scale',
+        'negative_prompt',
+        'ban_eos_token',
+        'custom_token_bans',
+        'add_bos_token',
+        'skip_special_tokens',
+        'auto_max_new_tokens',
+    },
 }
 loaders_model_types = {
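The two tables extended above gate what the UI offers: `loaders_and_params` lists the load-time options shown for each loader, and `loaders_samplers` lists the generation parameters the loader honours. A rough sketch of how such tables could be consumed, with trimmed entries and hypothetical helper names; the webui's own wiring differs:

```python
from collections import OrderedDict

# Trimmed copies of the tables above, for illustration only.
loaders_and_params = OrderedDict({
    'AutoAWQ': ['cpu_memory', 'gpu_memory', 'max_seq_len', 'n_batch'],
})
loaders_samplers = {
    'AutoAWQ': {'temperature', 'top_p', 'top_k', 'repetition_penalty'},
}


def visible_load_options(loader):
    # Load-time options the model tab would surface for this loader.
    return loaders_and_params.get(loader, [])


def usable_sampler_params(loader, params):
    # Keep only the generation parameters this loader is declared to honour.
    allowed = loaders_samplers.get(loader, set())
    return {k: v for k, v in params.items() if k in allowed}


print(visible_load_options('AutoAWQ'))
print(usable_sampler_params('AutoAWQ', {'temperature': 0.7, 'top_k': 40, 'unknown_param': 1}))
# ['cpu_memory', 'gpu_memory', 'max_seq_len', 'n_batch']
# {'temperature': 0.7, 'top_k': 40}
```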


@@ -63,6 +63,7 @@ def load_model(model_name, loader=None):
         'ExLlamav2': ExLlamav2_loader,
         'ExLlamav2_HF': ExLlamav2_HF_loader,
         'ctransformers': ctransformers_loader,
+        'AutoAWQ': AutoAWQ_loader,
     }
     if loader is None:
@@ -276,6 +277,24 @@ def ctransformers_loader(model_name):
     model, tokenizer = ctrans.from_pretrained(model_file)
     return model, tokenizer
+def AutoAWQ_loader(model_name):
+    from awq import AutoAWQForCausalLM
+    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
+    if shared.args.deepspeed:
+        logger.warn("AutoAWQ is incompatible with deepspeed")
+    model = AutoAWQForCausalLM.from_quantized(
+        quant_path=model_dir,
+        max_new_tokens=shared.args.max_seq_len,
+        trust_remote_code=shared.args.trust_remote_code,
+        fuse_layers=not shared.args.no_inject_fused_attention,
+        max_memory=get_max_memory_dict(),
+        batch_size=shared.args.n_batch,
+        safetensors=not shared.args.trust_remote_code)
+    return model
 def GPTQ_loader(model_name):
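For reference, a rough standalone sketch of what `AutoAWQ_loader` sets up outside the webui, assuming `autoawq==0.1.2` (as pinned below), a CUDA GPU, and that the returned model exposes the usual `generate()` passthrough; the model path is hypothetical and the tokenizer here comes straight from `transformers`:

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = 'models/Llama-2-7B-AWQ'  # hypothetical local AWQ model folder

# Mirrors the from_quantized() call above, with the webui's defaults spelled out.
model = AutoAWQForCausalLM.from_quantized(
    quant_path=quant_path,
    max_new_tokens=4096,   # what max_seq_len would pass in
    fuse_layers=True,      # turned off by --no_inject_fused_attention in the webui
    batch_size=1,          # the n_batch: 1 default applied to .*AWQ models
    safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(quant_path)

tokens = tokenizer("Hello, my name is", return_tensors='pt').input_ids.cuda()
output = model.generate(tokens, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```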


@@ -107,10 +107,14 @@ def infer_loader(model_name, model_settings):
         loader = None
     elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
         loader = 'AutoGPTQ'
+    elif (path_to_model / 'quant_config.json').exists():
+        loader = 'AutoAWQ'
     elif len(list(path_to_model.glob('*.gguf'))) > 0:
         loader = 'llama.cpp'
     elif re.match(r'.*\.gguf', model_name.lower()):
         loader = 'llama.cpp'
+    elif re.match(r'.*-awq', model_name.lower()):
+        loader = 'AutoAWQ'
     elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
         loader = 'RWKV'
     elif re.match(r'.*exl2', model_name.lower()):
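The two new branches give AWQ models an auto-detection path: a `quant_config.json` file inside the model folder (the metadata file AutoAWQ stores alongside quantized weights at this version) or an `-awq` suffix in the model name. A small self-contained sketch replaying just those checks, with hypothetical model names; note that in `infer_loader` the GPTQ check above still wins if both config files are present:

```python
import re
from pathlib import Path


def looks_like_awq(model_name, model_dir='models'):
    # Hypothetical helper replaying only the two new checks from infer_loader.
    path_to_model = Path(model_dir) / model_name
    if (path_to_model / 'quant_config.json').exists():
        return True  # AWQ metadata file found next to the weights
    if re.match(r'.*-awq', model_name.lower()):
        return True  # common naming convention for published AWQ repos
    return False


print(looks_like_awq('TheBloke_Llama-2-7B-AWQ'))  # True: name matches .*-awq
print(looks_like_awq('Llama-2-7B-GGUF'))          # False: no marker file, no -awq suffix
```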


@@ -232,6 +232,8 @@ def fix_loader_name(name):
         return 'ExLlamav2_HF'
     elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
         return 'ctransformers'
+    elif name in ['autoawq', 'awq', 'auto-awq']:
+        return 'AutoAWQ'
 def add_extension(name):
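With these aliases, `--loader awq`, `--loader auto-awq`, and `--loader autoawq` all resolve to the AutoAWQ code path. A trimmed re-implementation for illustration, assuming the real function lower-cases its input before comparing, as the lowercase alias lists suggest:

```python
def fix_loader_name(name):
    # Trimmed, hypothetical re-implementation of the alias normalization above.
    if not name:
        return name
    name = name.lower()
    if name in ['autoawq', 'awq', 'auto-awq']:
        return 'AutoAWQ'
    return name


for alias in ['AWQ', 'awq', 'auto-awq', 'AutoAWQ']:
    print(alias, '->', fix_loader_name(alias))
# every spelling resolves to 'AutoAWQ'
```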


@@ -99,7 +99,7 @@ def create_ui():
                 with gr.Column():
                     shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
-                    shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.')
+                    shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
                     shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.')
                     shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
                     shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')


@@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl
+autoawq==0.1.2


@@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl
+autoawq==0.1.2