diff --git a/Colab-TextGen-GPU.ipynb b/Colab-TextGen-GPU.ipynb index 3bd1bd69..66953bda 100644 --- a/Colab-TextGen-GPU.ipynb +++ b/Colab-TextGen-GPU.ipynb @@ -89,7 +89,7 @@ "# Parameters\n", "model_url = \"https://huggingface.co/turboderp/Mistral-7B-instruct-exl2\" #@param {type:\"string\"}\n", "branch = \"4.0bpw\" #@param {type:\"string\"}\n", - "command_line_flags = \"--n-gpu-layers 128 --mul_mat_q --load-in-4bit --use_double_quant\" #@param {type:\"string\"}\n", + "command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant\" #@param {type:\"string\"}\n", "api = False #@param {type:\"boolean\"}\n", "\n", "if api:\n", diff --git a/README.md b/README.md index 2ecc301e..b4f4455d 100644 --- a/README.md +++ b/README.md @@ -319,7 +319,7 @@ Optionally, you can use the following command-line flags: | `--n_ctx N_CTX` | Size of the prompt context. | | `--threads` | Number of threads to use. | | `--threads-batch THREADS_BATCH` | Number of threads to use for batches/prompt processing. | -| `--mul_mat_q` | Activate new mulmat kernels. | +| `--no_mul_mat_q` | Disable the mulmat kernels. | | `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. | | `--no-mmap` | Prevent mmap from being used. | | `--mlock` | Force the system to keep the model in RAM. | diff --git a/docs/04 ‐ Model Tab.md b/docs/04 ‐ Model Tab.md index 155b6167..2e8c63d2 100644 --- a/docs/04 ‐ Model Tab.md +++ b/docs/04 ‐ Model Tab.md @@ -90,7 +90,7 @@ Example: https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF * **threads**: Number of threads. Recommended value: your number of physical cores. * **threads_batch**: Number of threads for batch processing. Recommended value: your total number of cores (physical + virtual). * **n_batch**: Batch size for prompt processing. Higher values are supposed to make generation faster, but I have never obtained any benefit from changing this value. -* **mul_mat_q**: Use the mul_mat_q kernel. 
This usually improves generation speed significantly. +* **no_mul_mat_q**: Disable the mul_mat_q kernel. This kernel usually improves generation speed significantly. This option to disable it is included in case it doesn't work on some systems. * **no-mmap**: Loads the model into memory at once, possibly preventing I/O operations later on at the cost of a longer load time. * **mlock**: Force the system to keep the model in RAM rather than swapping or compressing (no idea what this means, never used it). * **numa**: May improve performance on certain multi-cpu systems. diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py index 37f86e08..53bc861d 100644 --- a/modules/llamacpp_hf.py +++ b/modules/llamacpp_hf.py @@ -198,7 +198,7 @@ class LlamacppHF(PreTrainedModel): 'n_batch': shared.args.n_batch, 'use_mmap': not shared.args.no_mmap, 'use_mlock': shared.args.mlock, - 'mul_mat_q': shared.args.mul_mat_q, + 'mul_mat_q': not shared.args.no_mul_mat_q, 'numa': shared.args.numa, 'n_gpu_layers': shared.args.n_gpu_layers, 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 554da2b5..d692cc03 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -80,7 +80,7 @@ class LlamaCppModel: 'n_batch': shared.args.n_batch, 'use_mmap': not shared.args.no_mmap, 'use_mlock': shared.args.mlock, - 'mul_mat_q': shared.args.mul_mat_q, + 'mul_mat_q': not shared.args.no_mul_mat_q, 'numa': shared.args.numa, 'n_gpu_layers': shared.args.n_gpu_layers, 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), diff --git a/modules/loaders.py b/modules/loaders.py index bd3f04af..b76c85df 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -94,7 +94,7 @@ loaders_and_params = OrderedDict({ 'threads_batch', 'no_mmap', 'mlock', - 'mul_mat_q', + 'no_mul_mat_q', 'llama_cpp_seed', 'alpha_value', 'rope_freq_base', @@ -111,7 
+111,7 @@ loaders_and_params = OrderedDict({ 'threads_batch', 'no_mmap', 'mlock', - 'mul_mat_q', + 'no_mul_mat_q', 'alpha_value', 'rope_freq_base', 'compress_pos_emb', diff --git a/modules/shared.py b/modules/shared.py index 7cd49bfa..3744d551 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -103,7 +103,7 @@ parser.add_argument('--quant_type', type=str, default='nf4', help='quant_type fo parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.') parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.') parser.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') -parser.add_argument('--mul_mat_q', action='store_true', help='Activate new mulmat kernels.') +parser.add_argument('--no_mul_mat_q', action='store_true', help='Disable the mulmat kernels.') parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.') parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') @@ -173,6 +173,7 @@ parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The m parser.add_argument('--notebook', action='store_true', help='DEPRECATED') parser.add_argument('--chat', action='store_true', help='DEPRECATED') parser.add_argument('--no-stream', action='store_true', help='DEPRECATED') +parser.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -183,7 +184,7 @@ for arg in sys.argv[1:]: provided_arguments.append(arg) # Deprecation warnings -for k in ['chat', 'notebook', 'no_stream']: +for k in ['chat', 'notebook', 'no_stream', 'mul_mat_q']: if getattr(args, k): logger.warning(f'The --{k} flag has been deprecated and will be removed soon. 
Please remove that flag.') diff --git a/modules/ui.py b/modules/ui.py index 875e4e24..ce92464d 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -73,7 +73,7 @@ def list_model_elements(): 'n_batch', 'no_mmap', 'mlock', - 'mul_mat_q', + 'no_mul_mat_q', 'n_gpu_layers', 'tensor_split', 'n_ctx', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d9fe63d1..0f6c7f85 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -103,7 +103,7 @@ def create_ui(): shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') - shared.gradio['mul_mat_q'] = gr.Checkbox(label="mul_mat_q", value=shared.args.mul_mat_q, info='Recommended in most cases. Improves generation speed by 10-20%.') + shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.') shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)