diff --git a/models/config.yaml b/models/config.yaml index 077d196c..d19d09c6 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -173,4 +173,6 @@ .*codellama.*instruct: instruction_template: 'Llama-v2' .*mistral.*instruct: - instruction_template: 'Mistral' \ No newline at end of file + instruction_template: 'Mistral' +.*AWQ: + n_batch: 1 diff --git a/modules/loaders.py b/modules/loaders.py index 964fb00a..a9b30bb6 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -129,6 +129,16 @@ loaders_and_params = OrderedDict({ 'model_type', 'no_mmap', 'mlock' + ], + 'AutoAWQ': [ + 'cpu_memory', + 'gpu_memory', + 'auto_devices', + 'max_seq_len', + 'n_batch', + 'no_inject_fused_attention', + 'trust_remote_code', + 'use_fast', ] }) @@ -365,7 +375,40 @@ loaders_samplers = { 'top_k', 'repetition_penalty', 'repetition_penalty_range', - } + }, + 'AutoAWQ': { + 'temperature', + 'top_p', + 'top_k', + 'typical_p', + 'epsilon_cutoff', + 'eta_cutoff', + 'tfs', + 'top_a', + 'repetition_penalty', + 'repetition_penalty_range', + 'encoder_repetition_penalty', + 'no_repeat_ngram_size', + 'min_length', + 'seed', + 'do_sample', + 'penalty_alpha', + 'num_beams', + 'length_penalty', + 'early_stopping', + 'mirostat_mode', + 'mirostat_tau', + 'mirostat_eta', + 'grammar_file_row', + 'grammar_string', + 'guidance_scale', + 'negative_prompt', + 'ban_eos_token', + 'custom_token_bans', + 'add_bos_token', + 'skip_special_tokens', + 'auto_max_new_tokens', + }, } loaders_model_types = { diff --git a/modules/models.py b/modules/models.py index 253c998b..db515636 100644 --- a/modules/models.py +++ b/modules/models.py @@ -63,6 +63,7 @@ def load_model(model_name, loader=None): 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ctransformers': ctransformers_loader, + 'AutoAWQ': AutoAWQ_loader, } if loader is None: @@ -276,6 +277,24 @@ def ctransformers_loader(model_name): model, tokenizer = ctrans.from_pretrained(model_file) return model, tokenizer +def 
AutoAWQ_loader(model_name): + from awq import AutoAWQForCausalLM + + model_dir = Path(f'{shared.args.model_dir}/{model_name}') + + if shared.args.deepspeed: + logger.warning("AutoAWQ is incompatible with deepspeed") + + model = AutoAWQForCausalLM.from_quantized( + quant_path=model_dir, + max_new_tokens=shared.args.max_seq_len, + trust_remote_code=shared.args.trust_remote_code, + fuse_layers=not shared.args.no_inject_fused_attention, + max_memory=get_max_memory_dict(), + batch_size=shared.args.n_batch, + safetensors=any(model_dir.glob('*.safetensors'))) + + return model def GPTQ_loader(model_name): diff --git a/modules/models_settings.py b/modules/models_settings.py index ec60b3a7..3f168767 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -107,10 +107,14 @@ def infer_loader(model_name, model_settings): loader = None elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0): loader = 'AutoGPTQ' + elif (path_to_model / 'quant_config.json').exists(): + loader = 'AutoAWQ' elif len(list(path_to_model.glob('*.gguf'))) > 0: loader = 'llama.cpp' elif re.match(r'.*\.gguf', model_name.lower()): loader = 'llama.cpp' + elif re.match(r'.*-awq', model_name.lower()): + loader = 'AutoAWQ' elif re.match(r'.*rwkv.*\.pth', model_name.lower()): loader = 'RWKV' elif re.match(r'.*exl2', model_name.lower()): diff --git a/modules/shared.py b/modules/shared.py index 6e4965bb..427d9230 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -232,6 +232,8 @@ def fix_loader_name(name): return 'ExLlamav2_HF' elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']: return 'ctransformers' + elif name in ['autoawq', 'awq', 'auto-awq']: + return 'AutoAWQ' def add_extension(name): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 49c5a611..bfa95c07 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -99,7 +99,7 @@ def create_ui():
with gr.Column(): shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) - shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Disable if running low on VRAM.') + shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.') shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') diff --git a/requirements.txt b/requirements.txt index 4d3b75f7..19881eff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl +autoawq==0.1.2 diff --git 
a/requirements_noavx2.txt b/requirements_noavx2.txt index fb5a9619..bc2e2451 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -48,3 +48,4 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu117-py3-none-any.whl +autoawq==0.1.2