diff --git a/models/config.yaml b/models/config.yaml index 077d196c..d19d09c6 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -173,4 +173,6 @@ .*codellama.*instruct: instruction_template: 'Llama-v2' .*mistral.*instruct: - instruction_template: 'Mistral' \ No newline at end of file + instruction_template: 'Mistral' +.*AWQ: + n_batch: 1 diff --git a/modules/loaders.py b/modules/loaders.py index 964fb00a..a9b30bb6 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -129,6 +129,16 @@ loaders_and_params = OrderedDict({ 'model_type', 'no_mmap', 'mlock' + ], + 'AutoAWQ': [ + 'cpu_memory', + 'gpu_memory', + 'auto_devices', + 'max_seq_len', + 'n_batch', + 'no_inject_fused_attention', + 'trust_remote_code', + 'use_fast', ] }) @@ -365,7 +375,40 @@ loaders_samplers = { 'top_k', 'repetition_penalty', 'repetition_penalty_range', - } + }, + 'AutoAWQ': { + 'temperature', + 'top_p', + 'top_k', + 'typical_p', + 'epsilon_cutoff', + 'eta_cutoff', + 'tfs', + 'top_a', + 'repetition_penalty', + 'repetition_penalty_range', + 'encoder_repetition_penalty', + 'no_repeat_ngram_size', + 'min_length', + 'seed', + 'do_sample', + 'penalty_alpha', + 'num_beams', + 'length_penalty', + 'early_stopping', + 'mirostat_mode', + 'mirostat_tau', + 'mirostat_eta', + 'grammar_file_row', + 'grammar_string', + 'guidance_scale', + 'negative_prompt', + 'ban_eos_token', + 'custom_token_bans', + 'add_bos_token', + 'skip_special_tokens', + 'auto_max_new_tokens', + }, } loaders_model_types = { diff --git a/modules/models.py b/modules/models.py index 253c998b..db515636 100644 --- a/modules/models.py +++ b/modules/models.py @@ -63,6 +63,7 @@ def load_model(model_name, loader=None): 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ctransformers': ctransformers_loader, + 'AutoAWQ': AutoAWQ_loader, } if loader is None: @@ -276,6 +277,24 @@ def ctransformers_loader(model_name): model, tokenizer = ctrans.from_pretrained(model_file) return model, tokenizer +def 
AutoAWQ_loader(model_name): + from awq import AutoAWQForCausalLM + + model_dir = Path(f'{shared.args.model_dir}/{model_name}') + + if shared.args.deepspeed: + logger.warning("AutoAWQ is incompatible with deepspeed") + + model = AutoAWQForCausalLM.from_quantized( + quant_path=model_dir, + max_new_tokens=shared.args.max_seq_len, + trust_remote_code=shared.args.trust_remote_code, + fuse_layers=not shared.args.no_inject_fused_attention, + max_memory=get_max_memory_dict(), + batch_size=shared.args.n_batch, + safetensors=any(model_dir.glob('*.safetensors'))) + + return model def GPTQ_loader(model_name): diff --git a/modules/models_settings.py b/modules/models_settings.py index ec60b3a7..3f168767 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -107,10 +107,14 @@ def infer_loader(model_name, model_settings): loader = None elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0): loader = 'AutoGPTQ' + elif (path_to_model / 'quant_config.json').exists(): + loader = 'AutoAWQ' elif len(list(path_to_model.glob('*.gguf'))) > 0: loader = 'llama.cpp' elif re.match(r'.*\.gguf', model_name.lower()): loader = 'llama.cpp' + elif re.match(r'.*-awq', model_name.lower()): + loader = 'AutoAWQ' elif re.match(r'.*rwkv.*\.pth', model_name.lower()): loader = 'RWKV' elif re.match(r'.*exl2', model_name.lower()): diff --git a/modules/shared.py b/modules/shared.py index 6e4965bb..427d9230 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -232,6 +232,8 @@ def fix_loader_name(name): return 'ExLlamav2_HF' elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']: return 'ctransformers' + elif name in ['autoawq', 'awq', 'auto-awq']: + return 'AutoAWQ' def add_extension(name): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 49c5a611..bfa95c07 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -99,7 +99,7 @@ def create_ui():
with gr.Column(): shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) - shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.') + shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.') shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') diff --git a/requirements.txt b/requirements.txt index 4d3b75f7..19881eff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl +autoawq==0.1.2 diff --git 
a/requirements_noavx2.txt b/requirements_noavx2.txt index fb5a9619..bc2e2451 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl +autoawq==0.1.2