From b57ffc2ec9bc32ff0cc82ba97edf4986ecf426fb Mon Sep 17 00:00:00 2001
From: sgsdxzy
Date: Mon, 17 Apr 2023 12:11:18 +0800
Subject: [PATCH] Update to support GPTQ triton commit c90adef (#1229)

---
 README.md              |  2 ++
 modules/GPTQ_loader.py | 57 +++++++++++++++++++++++++-----------------
 modules/shared.py      |  2 ++
 3 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 4479c55e..abda3fa4 100644
--- a/README.md
+++ b/README.md
@@ -236,7 +236,9 @@ Optionally, you can use the following command-line flags:
 | `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. |
 | `--groupsize GROUPSIZE` | GPTQ: Group size. |
 | `--pre_layer PRE_LAYER` | GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. |
+| `--no-quant_attn` | GPTQ: Disable quant attention for triton. If you encounter incoherent results, try disabling this. |
 | `--no-warmup_autotune` | GPTQ: Disable warmup autotune for triton. |
+| `--no-fused_mlp` | GPTQ: Disable fused MLP for triton. If you encounter "Unexpected mma -> mma layout conversion", try disabling this. |
 | `--monkey-patch` | GPTQ: Apply the monkey patch for using LoRAs with quantized models. |
 
 #### FlexGen
diff --git a/modules/GPTQ_loader.py b/modules/GPTQ_loader.py
index 344e34dd..fc70e5e3 100644
--- a/modules/GPTQ_loader.py
+++ b/modules/GPTQ_loader.py
@@ -13,12 +13,18 @@ import modules.shared as shared
 sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa")))
 import llama_inference_offload
 from modelutils import find_layers
-from quant import make_quant
+
+try:
+    from quant import make_quant
+    is_triton = False
+except ImportError:
+    import quant
+    is_triton = True
 
 
 # This function is a replacement for the load_quant function in the
 # GPTQ-for_LLaMa repository. It supports more models and branches.
-def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
+def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128, eval=True):
 
     def noop(*args, **kwargs):
         pass
@@ -33,27 +39,31 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
     torch.set_default_dtype(torch.half)
     model = AutoModelForCausalLM.from_config(config)
     torch.set_default_dtype(torch.float)
-    model = model.eval()
+    if eval:
+        model = model.eval()
     layers = find_layers(model)
     for name in exclude_layers:
         if name in layers:
             del layers[name]
 
-    gptq_args = inspect.getfullargspec(make_quant).args
+    if not is_triton:
+        gptq_args = inspect.getfullargspec(make_quant).args
 
-    make_quant_kwargs = {
-        'module': model,
-        'names': layers,
-        'bits': wbits,
-    }
-    if 'groupsize' in gptq_args:
-        make_quant_kwargs['groupsize'] = groupsize
-    if 'faster' in gptq_args:
-        make_quant_kwargs['faster'] = faster_kernel
-    if 'kernel_switch_threshold' in gptq_args:
-        make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
+        make_quant_kwargs = {
+            'module': model,
+            'names': layers,
+            'bits': wbits,
+        }
+        if 'groupsize' in gptq_args:
+            make_quant_kwargs['groupsize'] = groupsize
+        if 'faster' in gptq_args:
+            make_quant_kwargs['faster'] = faster_kernel
+        if 'kernel_switch_threshold' in gptq_args:
+            make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold
 
-    make_quant(**make_quant_kwargs)
+        make_quant(**make_quant_kwargs)
+    else:
+        quant.make_quant_linear(model, layers, wbits, groupsize)
 
     del layers
 
@@ -64,15 +74,16 @@ def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exc
     else:
         model.load_state_dict(torch.load(checkpoint), strict=False)
 
-    try:
-        from quant import autotune_warmup, make_quant_attn
+    if is_triton:
+        if not shared.args.no_quant_attn:
+            quant.make_quant_attn(model)
+        if eval and not shared.args.no_fused_mlp:
+            quant.make_fused_mlp(model)
 
-        # triton branch
-        make_quant_attn(model)
         if not shared.args.no_warmup_autotune:
-            autotune_warmup(model)
-    except ImportError: # not triton branch
-        pass
+            quant.autotune_warmup_linear(model, transpose=not eval)
+            if eval and not shared.args.no_fused_mlp:
+                quant.autotune_warmup_fused(model)
 
     model.seqlen = 2048
     print('Done.')
diff --git a/modules/shared.py b/modules/shared.py
index 92ac1dd2..ea55973a 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -123,7 +123,9 @@ parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quant
 parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.')
 parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.')
 parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models.')
+parser.add_argument('--no-quant_attn', action='store_true', help='GPTQ: Disable quant attention for triton. If you encounter incoherent results, try disabling this.')
 parser.add_argument('--no-warmup_autotune', action='store_true', help='GPTQ: Disable warmup autotune for triton.')
+parser.add_argument('--no-fused_mlp', action='store_true', help='GPTQ: Disable fused MLP for triton. If you encounter "Unexpected mma -> mma layout conversion", try disabling this.')
 parser.add_argument('--monkey-patch', action='store_true', help='GPTQ: Apply the monkey patch for using LoRAs with quantized models.')
 
 # FlexGen