From b57ffc2ec9bc32ff0cc82ba97edf4986ecf426fb Mon Sep 17 00:00:00 2001
From: sgsdxzy
Date: Mon, 17 Apr 2023 12:11:18 +0800
Subject: [PATCH] Update to support GPTQ triton commit c90adef (#1229)

---
 README.md              |  2 ++
 modules/GPTQ_loader.py | 57 +++++++++++++++++++++++++-----------------
 modules/shared.py      |  2 ++
 3 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 4479c55e..abda3fa4 100644
--- a/README.md
+++ b/README.md
@@ -236,7 +236,9 @@ Optionally, you can use the following command-line flags:
 | `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. |
 | `--groupsize GROUPSIZE` | GPTQ: Group size. |
 | `--pre_layer PRE_LAYER` | GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. |
+| `--no-quant_attn` | GPTQ: Disable quant attention for triton. If you encounter incoherent results, try disabling this. |
 | `--no-warmup_autotune` | GPTQ: Disable warmup autotune for triton. |
+| `--no-fused_mlp` | GPTQ: Disable fused MLP for triton. If you encounter "Unexpected mma -> mma layout conversion", try disabling this. |
 | `--monkey-patch` | GPTQ: Apply the monkey patch for using LoRAs with quantized models. |
 
 #### FlexGen
diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index 344e34dd..fc70e5e3 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -13,12 +13,18 @@ import modules.shared as shared
 sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
 import llama_inference_offload
 from modelutils import find_layers
-from quant import make_quant
+
+try:
+    from quant import make_quant
+    is_triton = False
+except ImportError:
+    import quant
+    is_triton = True
 
 
 # This function is a replacement for the load_quant function in the
 # GPTQ-for_LLaMa repository. It supports more models and branches.
-def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
+def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128, eval=True):
 
     def noop(*args, **kwargs):
         pass
@@ -33,27 +39,31 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
     torch.set_default_dtype(torch.half)
     model = AutoModelForCausalLM.from_config(config)
     torch.set_default_dtype(torch.float)
-    model = model.eval()
+    if eval:
+        model = model.eval()
     layers = find_layers(model)
     for name in exclude_layers:
         if name in layers:
             del layers[name]
 
-    gptq_args = inspect.getfullargspec(make_quant).args
+    if not is_triton:
+        gptq_args = inspect.getfullargspec(make_quant).args
 
-    make_quant_kwargs = {
-        'module': model,
-        'names': layers,
-        'bits': wbits,
-    }
-    if 'groupsize' in gptq_args:
-        make_quant_kwargs['groupsize'] = groupsize
-    if 'faster' in gptq_args:
-        make_quant_kwargs['faster'] = faster_kernel
-    if 'kernel_switch_threshold' in gptq_args:
-        make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
+        make_quant_kwargs = {
+            'module': model,
+            'names': layers,
+            'bits': wbits,
+        }
+        if 'groupsize' in gptq_args:
+            make_quant_kwargs['groupsize'] = groupsize
+        if 'faster' in gptq_args:
+            make_quant_kwargs['faster'] = faster_kernel
+        if 'kernel_switch_threshold' in gptq_args:
+            make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
 
-    make_quant(**make_quant_kwargs)
+        make_quant(**make_quant_kwargs)
+    else:
+        quant.make_quant_linear(model, layers, wbits, groupsize)
 
     del layers
 
@@ -64,15 +74,16 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
     else:
         model.load_state_dict(torch.load(checkpoint), strict=False)
 
-    try:
-        from quant import autotune_warmup, make_quant_attn
+    if is_triton:
+        if not shared.args.no_quant_attn:
+            quant.make_quant_attn(model)
+        if eval and not shared.args.no_fused_mlp:
+            quant.make_fused_mlp(model)
 
-        # triton branch
-        make_quant_attn(model)
         if not shared.args.no_warmup_autotune:
-            autotune_warmup(model)
-    except ImportError: # not triton branch
-        pass
+            quant.autotune_warmup_linear(model, transpose=not eval)
+            if eval and not shared.args.no_fused_mlp:
+                quant.autotune_warmup_fused(model)
 
     model.seqlen = 2048
     print('Done.')
diff --git a/modules/shared.py b/modules/shared.py
index 92ac1dd2..ea55973a 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -123,7 +123,9 @@ parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quant
 parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
 parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.')
 parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models.')
+parser.add_argument('--no-quant_attn', action='store_true', help='GPTQ: Disable quant attention for triton. If you encounter incoherent results, try disabling this.')
 parser.add_argument('--no-warmup_autotune', action='store_true', help='GPTQ: Disable warmup autotune for triton.')
+parser.add_argument('--no-fused_mlp', action='store_true', help='GPTQ: Disable fused MLP for triton. If you encounter "Unexpected mma -> mma layout conversion", try disabling this.')
 parser.add_argument('--monkey-patch', action='store_true', help='GPTQ: Apply the monkey patch for using LoRAs with quantized models.')
 
 # FlexGen