Set use_fast=True by default, create --no_use_fast flag

This increases tokens/second for HF loaders.
This commit is contained in:
oobabooga 2023-11-16 19:45:05 -08:00
parent b2ce8dc7ee
commit 8b66d83aa9
6 changed files with 20 additions and 19 deletions

View file

@ -28,7 +28,7 @@ Options:
* **disk**: Enable disk offloading for layers that don't fit into the GPU and CPU combined. * **disk**: Enable disk offloading for layers that don't fit into the GPU and CPU combined.
* **load-in-4bit**: Load the model in 4-bit precision using bitsandbytes. * **load-in-4bit**: Load the model in 4-bit precision using bitsandbytes.
* **trust-remote-code**: Some models use custom Python code to load the model or the tokenizer. For such models, this option needs to be set. It doesn't download any remote content: all it does is execute the .py files that get downloaded with the model. Those files can potentially include malicious code; I have never seen it happen, but it is in principle possible. * **trust-remote-code**: Some models use custom Python code to load the model or the tokenizer. For such models, this option needs to be set. It doesn't download any remote content: all it does is execute the .py files that get downloaded with the model. Those files can potentially include malicious code; I have never seen it happen, but it is in principle possible.
* **use_fast**: Use the "fast" version of the tokenizer. Especially useful for Llama models, which originally had a "slow" tokenizer that received an update. If your local files are in the old "slow" format, checking this option may trigger a conversion that takes several minutes. The fast tokenizer is mostly useful if you are generating 50+ tokens/second using ExLlama_HF or if you are tokenizing a huge dataset for training. * **no_use_fast**: Do not use the "fast" version of the tokenizer. Can usually be ignored; only check this if you can't load the tokenizer for your model otherwise.
* **use_flash_attention_2**: Set use_flash_attention_2=True while loading the model. Possibly useful for training. * **use_flash_attention_2**: Set use_flash_attention_2=True while loading the model. Possibly useful for training.
* **disable_exllama**: Only applies when you are loading a GPTQ model through the transformers loader. It needs to be checked if you intend to train LoRAs with the model. * **disable_exllama**: Only applies when you are loading a GPTQ model through the transformers loader. It needs to be checked if you intend to train LoRAs with the model.

View file

@ -19,7 +19,7 @@ loaders_and_params = OrderedDict({
'quant_type', 'quant_type',
'compute_dtype', 'compute_dtype',
'trust_remote_code', 'trust_remote_code',
'use_fast', 'no_use_fast',
'use_flash_attention_2', 'use_flash_attention_2',
'alpha_value', 'alpha_value',
'rope_freq_base', 'rope_freq_base',
@ -34,7 +34,7 @@ loaders_and_params = OrderedDict({
'rope_freq_base', 'rope_freq_base',
'compress_pos_emb', 'compress_pos_emb',
'cfg_cache', 'cfg_cache',
'use_fast', 'no_use_fast',
'exllama_HF_info', 'exllama_HF_info',
], ],
'ExLlamav2_HF': [ 'ExLlamav2_HF': [
@ -45,7 +45,7 @@ loaders_and_params = OrderedDict({
'cache_8bit', 'cache_8bit',
'alpha_value', 'alpha_value',
'compress_pos_emb', 'compress_pos_emb',
'use_fast', 'no_use_fast',
], ],
'ExLlama': [ 'ExLlama': [
'gpu_split', 'gpu_split',
@ -78,7 +78,7 @@ loaders_and_params = OrderedDict({
'disk', 'disk',
'auto_devices', 'auto_devices',
'trust_remote_code', 'trust_remote_code',
'use_fast', 'no_use_fast',
'autogptq_info', 'autogptq_info',
], ],
'GPTQ-for-LLaMa': [ 'GPTQ-for-LLaMa': [
@ -86,7 +86,7 @@ loaders_and_params = OrderedDict({
'groupsize', 'groupsize',
'model_type', 'model_type',
'pre_layer', 'pre_layer',
'use_fast', 'no_use_fast',
'gptq_for_llama_info', 'gptq_for_llama_info',
], ],
'llama.cpp': [ 'llama.cpp': [
@ -119,7 +119,7 @@ loaders_and_params = OrderedDict({
'compress_pos_emb', 'compress_pos_emb',
'numa', 'numa',
'cfg_cache', 'cfg_cache',
'use_fast', 'no_use_fast',
'logits_all', 'logits_all',
'llamacpp_HF_info', 'llamacpp_HF_info',
], ],
@ -139,7 +139,7 @@ loaders_and_params = OrderedDict({
'max_seq_len', 'max_seq_len',
'no_inject_fused_attention', 'no_inject_fused_attention',
'trust_remote_code', 'trust_remote_code',
'use_fast', 'no_use_fast',
] ]
}) })

View file

@ -114,13 +114,13 @@ def load_tokenizer(model_name, model):
if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
elif path_to_model.exists(): elif path_to_model.exists():
if shared.args.use_fast: if shared.args.no_use_fast:
logger.info('Loading the tokenizer with use_fast=True.') logger.info('Loading the tokenizer with use_fast=False.')
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
path_to_model, path_to_model,
trust_remote_code=shared.args.trust_remote_code, trust_remote_code=shared.args.trust_remote_code,
use_fast=shared.args.use_fast use_fast=not shared.args.no_use_fast
) )
return tokenizer return tokenizer
@ -262,13 +262,13 @@ def llamacpp_HF_loader(model_name):
logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.") logger.error("Could not load the model because a tokenizer in transformers format was not found. Please download oobabooga/llama-tokenizer.")
return None, None return None, None
if shared.args.use_fast: if shared.args.no_use_fast:
logger.info('Loading the tokenizer with use_fast=True.') logger.info('Loading the tokenizer with use_fast=False.')
tokenizer = AutoTokenizer.from_pretrained( tokenizer = AutoTokenizer.from_pretrained(
path, path,
trust_remote_code=shared.args.trust_remote_code, trust_remote_code=shared.args.trust_remote_code,
use_fast=shared.args.use_fast use_fast=not shared.args.no_use_fast
) )
model = LlamacppHF.from_pretrained(model_name) model = LlamacppHF.from_pretrained(model_name)

View file

@ -93,7 +93,7 @@ parser.add_argument('--xformers', action='store_true', help='Use xformer\'s memo
parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.') parser.add_argument('--sdp-attention', action='store_true', help='Use PyTorch 2.0\'s SDP attention. Same as above.')
parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.') parser.add_argument('--trust-remote-code', action='store_true', help='Set trust_remote_code=True while loading the model. Necessary for some models.')
parser.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.') parser.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.')
parser.add_argument('--use_fast', action='store_true', help='Set use_fast=True while loading the tokenizer.') parser.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). Set this if you have any problems related to use_fast.')
parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.') parser.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.')
# Accelerate 4-bit # Accelerate 4-bit
@ -182,6 +182,7 @@ parser.add_argument('--mul_mat_q', action='store_true', help='DEPRECATED')
parser.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED') parser.add_argument('--api-blocking-port', type=int, default=5000, help='DEPRECATED')
parser.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED') parser.add_argument('--api-streaming-port', type=int, default=5005, help='DEPRECATED')
parser.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED') parser.add_argument('--llama_cpp_seed', type=int, default=0, help='DEPRECATED')
parser.add_argument('--use_fast', action='store_true', help='DEPRECATED')
args = parser.parse_args() args = parser.parse_args()
args_defaults = parser.parse_args([]) args_defaults = parser.parse_args([])
@ -192,7 +193,7 @@ for arg in sys.argv[1:]:
provided_arguments.append(arg) provided_arguments.append(arg)
# Deprecation warnings # Deprecation warnings
for k in ['chat', 'notebook', 'no_stream', 'mul_mat_q']: for k in ['notebook', 'chat', 'no_stream', 'mul_mat_q', 'use_fast']:
if getattr(args, k): if getattr(args, k):
logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.') logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.')

View file

@ -52,7 +52,7 @@ def list_model_elements():
'bf16', 'bf16',
'load_in_8bit', 'load_in_8bit',
'trust_remote_code', 'trust_remote_code',
'use_fast', 'no_use_fast',
'use_flash_attention_2', 'use_flash_attention_2',
'load_in_4bit', 'load_in_4bit',
'compute_dtype', 'compute_dtype',

View file

@ -109,7 +109,6 @@ def create_ui():
shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.')
shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.')
shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.') shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
@ -122,12 +121,13 @@ def create_ui():
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant) shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17')
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code) shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.', interactive=shared.args.trust_remote_code)
shared.gradio['use_fast'] = gr.Checkbox(label="use_fast", value=shared.args.use_fast, info='Set use_fast=True while loading the tokenizer. May trigger a conversion that takes several minutes.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.') shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.') shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel.')
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.')
shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa support is currently only kept for compatibility with older GPUs. AutoGPTQ or ExLlama is preferred when compatible. GPTQ-for-LLaMa is installed by default with the webui on supported systems. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).')
shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).') shared.gradio['exllama_info'] = gr.Markdown('For more information, consult the [docs](https://github.com/oobabooga/text-generation-webui/wiki/04-%E2%80%90-Model-Tab#exllama_hf).')
shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.') shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s a bit slower than the regular ExLlama.')