diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md
index 2587eedc..fcc10ef0 100644
--- a/docs/04 - Model Tab.md	
+++ b/docs/04 - Model Tab.md	
@@ -28,7 +28,7 @@ Options:
 * **disk**: Enable disk offloading for layers that don't fit into the GPU and CPU combined.
 * **load-in-4bit**: Load the model in 4-bit precision using bitsandbytes.
 * **trust-remote-code**: Some models use custom Python code to load the model or the tokenizer. For such models, this option needs to be set. It doesn't download any remote content: all it does is execute the .py files that get downloaded with the model. Those files can potentially include malicious code; I have never seen it happen, but it is in principle possible.
-* **use_fast**: Use the "fast" version of the tokenizer. Especially useful for Llama models, which originally had a "slow" tokenizer that received an update. If your local files are in the old "slow" format, checking this option may trigger a conversion that takes several minutes. The fast tokenizer is mostly useful if you are generating 50+ tokens/second using ExLlama_HF or if you are tokenizing a huge dataset for training.
+* **no_use_fast**: Do not use the "fast" version of the tokenizer. Can usually be left unchecked; only check it if the tokenizer for your model fails to load otherwise.
 * **use_flash_attention_2**: Set use_flash_attention_2=True while loading the model. Possibly useful for training.
 * **disable_exllama**: Only applies when you are loading a GPTQ model through the transformers loader. It needs to be checked if you intend to train LoRAs with the model.
 
diff --git a/modules/loaders.py b/modules/loaders.py
index 607a63d3..2f1648c7 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -19,7 +19,7 @@ loaders_and_params = OrderedDict({
         'quant_type',
         'compute_dtype',
         'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
         'use_flash_attention_2',
         'alpha_value',
         'rope_freq_base',
@@ -34,7 +34,7 @@ loaders_and_params = OrderedDict({
         'rope_freq_base',
         'compress_pos_emb',
         'cfg_cache',
-        'use_fast',
+        'no_use_fast',
         'exllama_HF_info',
     ],
     'ExLlamav2_HF': [
@@ -45,7 +45,7 @@ loaders_and_params = OrderedDict({
         'cache_8bit',
         'alpha_value',
         'compress_pos_emb',
-        'use_fast',
+        'no_use_fast',
     ],
     'ExLlama': [
         'gpu_split',
@@ -78,7 +78,7 @@ loaders_and_params = OrderedDict({
         'disk',
         'auto_devices',
         'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
         'autogptq_info',
     ],
     'GPTQ-for-LLaMa': [
@@ -86,7 +86,7 @@ loaders_and_params = OrderedDict({
         'groupsize',
         'model_type',
         'pre_layer',
-        'use_fast',
+        'no_use_fast',
         'gptq_for_llama_info',
     ],
     'llama.cpp': [
@@ -119,7 +119,7 @@ loaders_and_params = OrderedDict({
         'compress_pos_emb',
         'numa',
         'cfg_cache',
-        'use_fast',
+        'no_use_fast',
         'logits_all',
         'llamacpp_HF_info',
     ],
@@ -139,7 +139,7 @@ loaders_and_params = OrderedDict({
         'max_seq_len',
         'no_inject_fused_attention',
         'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
     ]
 })
 
diff --git a/modules/models.py b/modules/models.py
index e4c3ddaa..19c0d903 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -114,13 +114,13 @@ def load_tokenizer(model_name, model):
     if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
         tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
     elif path_to_model.exists():
-        if shared.args.use_fast:
-            logger.info('Loading the tokenizer with use_fast=True.')
+        if shared.args.no_use_fast:
+            logger.info('Loading the tokenizer with use_fast=False.')
 
         tokenizer = AutoTokenizer.from_pretrained(
             path_to_model,
             trust_remote_code=shared.args.trust_remote_code,
-            use_fast=shared.args.use_fast
+            use_fast=not shared.args.no_use_fast
         )
 
     return tokenizer
@@ -262,13 +262,13 @@ def llamacpp_HF_loader(model_name):
         logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
         return None, None
 
-    if shared.args.use_fast:
-        logger.info('Loading the tokenizer with use_fast=True.')
+    if shared.args.no_use_fast:
+        logger.info('Loading the tokenizer with use_fast=False.')
 
     tokenizer = AutoTokenizer.from_pretrained(
         path,
         trust_remote_code=shared.args.trust_remote_code,
-        use_fast=shared.args.use_fast
+        use_fast=not shared.args.no_use_fast
     )
 
     model = LlamacppHF.from_pretrained(model_name)
diff --git a/modules/shared.py b/modules/shared.py
index b4750b26..09cf006a 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -93,7 +93,7 @@ parser.add_argument('--xformers', action='store_true', help='Use xformer\'s memo
 parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
 parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
 parser.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
-parser.add_argument('--use_fast', action='store_true', help='Set use_fast=True while loading the tokenizer.')
+parser.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Set this if you have any problems related to use_fast.')
 parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
 
 # Accelerate 4-bit
@@ -182,6 +182,7 @@ parser.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED')
 parser.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED')
 parser.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED')
 parser.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED')
+parser.add_argument('--use_fast', action='store_true', help='DEPRECATED')
 
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@@ -192,7 +193,7 @@ for arg in sys.argv[1:]:
         provided_arguments.append(arg)
 
 # Deprecation warnings
-for k in ['chat', 'notebook', 'no_stream', 'mul_mat_q']:
+for k in ['notebook', 'chat', 'no_stream', 'mul_mat_q', 'use_fast']:
     if getattr(args, k):
         logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.')
 
diff --git a/modules/ui.py b/modules/ui.py
index 5984a588..383bc66f 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -52,7 +52,7 @@ def list_model_elements():
         'bf16',
         'load_in_8bit',
         'trust_remote_code',
-        'use_fast',
+        'no_use_fast',
         'use_flash_attention_2',
         'load_in_4bit',
         'compute_dtype',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 3e4c6f8e..12edeed9 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -109,7 +109,6 @@ def create_ui():
                             shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
                             shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
                             shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
-                            shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
                             shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
                             shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
                             shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
@@ -122,12 +121,13 @@ def create_ui():
                             shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
                             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
                             shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
-                            shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.')
+                            shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
                             shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
                             shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
                             shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
                             shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
                             shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
+                            shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
                             shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
                             shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).')
                             shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')