diff --git a/Colab-TextGen-GPU.ipynb b/Colab-TextGen-GPU.ipynb index 82e6c18e..8e305e1d 100644 --- a/Colab-TextGen-GPU.ipynb +++ b/Colab-TextGen-GPU.ipynb @@ -22,7 +22,7 @@ "source": [ "# oobabooga/text-generation-webui\n", "\n", - "After running both cells, a public gradio URL will appear at the bottom in a few minutes. You can optionally generate an API link.\n", + "After running both cells, a public gradio URL will appear at the bottom in around 10 minutes. You can optionally generate an API link.\n", "\n", "* Project page: https://github.com/oobabooga/text-generation-webui\n", "* Gradio server status: https://status.gradio.app/" @@ -53,44 +53,28 @@ "\n", "#@markdown If unsure about the branch, write \"main\" or leave it blank.\n", "\n", - "import torch\n", + "import os\n", "from pathlib import Path\n", "\n", + "os.environ.pop('PYTHONPATH', None)\n", + "\n", "if Path.cwd().name != 'text-generation-webui':\n", - " print(\"Installing the webui...\")\n", + " print(\"\\033[1;32;1m\\n --> Installing the web UI. This will take a while, but after the initial setup, you can download and test as many models as you like.\\033[0;37;0m\\n\")\n", "\n", " !git clone https://github.com/oobabooga/text-generation-webui\n", " %cd text-generation-webui\n", "\n", - " torver = torch.__version__\n", - " print(f\"TORCH: {torver}\")\n", - " is_cuda118 = '+cu118' in torver # 2.1.0+cu118\n", - "\n", - " if is_cuda118:\n", - " !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118\n", - " else:\n", - " !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121\n", - "\n", - " textgen_requirements = open('requirements.txt').read().splitlines()\n", - " if is_cuda118:\n", - " textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements]\n", - " with open('temp_requirements.txt', 'w') as file:\n", - " file.write('\\n'.join(textgen_requirements))\n", - "\n", - " !pip install -r temp_requirements.txt --upgrade\n", - "\n", - " print(\"\\033[1;32;1m\\n --> If you see a warning about \\\"previously imported packages\\\", just ignore it.\\033[0;37;0m\")\n", - " print(\"\\033[1;32;1m\\n --> There is no need to restart the runtime.\\n\\033[0;37;0m\")\n", - "\n", - " try:\n", - " import flash_attn\n", - " except:\n", - " !pip uninstall -y flash_attn\n", + " # Install the project in an isolated environment\n", + " !GPU_CHOICE=A \\\n", + " USE_CUDA118=FALSE \\\n", + " LAUNCH_AFTER_INSTALL=FALSE \\\n", + " INSTALL_EXTENSIONS=FALSE \\\n", + " ./start_linux.sh\n", "\n", "# Parameters\n", - "model_url = \"https://huggingface.co/TheBloke/MythoMax-L2-13B-GPTQ\" #@param {type:\"string\"}\n", - "branch = \"gptq-4bit-32g-actorder_True\" #@param {type:\"string\"}\n", - "command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant\" #@param {type:\"string\"}\n", + "model_url = \"https://huggingface.co/turboderp/gemma-2-9b-it-exl2\" #@param {type:\"string\"}\n", + "branch = \"8.0bpw\" #@param {type:\"string\"}\n", + "command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant --no_flash_attn\" #@param {type:\"string\"}\n", "api = False #@param {type:\"boolean\"}\n", "\n", "if api:\n", @@ -116,11 +100,10 @@ " output_folder = \"\"\n", "\n", "# Start the web UI\n", - "cmd = f\"python server.py --share\"\n", + "cmd = f\"./start_linux.sh {command_line_flags} --share\"\n", "if output_folder != \"\":\n", " cmd += f\" --model {output_folder}\"\n", - "cmd += f\" {command_line_flags}\"\n", - "print(cmd)\n", + "\n", "!$cmd" ], "metadata": { diff --git a/cmd_linux.sh b/cmd_linux.sh index 1685050a..576dbf02 100755 --- a/cmd_linux.sh +++ b/cmd_linux.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash cd "$(dirname "${BASH_SOURCE[0]}")" diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 8a31d6e2..50b9402f 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -39,14 +39,6 @@ margin-bottom: 0 !important; } -.dark .message-body p em { - color: rgb(198 202 214) !important; -} - -.message-body p em { - color: rgb(110 110 110) !important; -} - .gradio-container .chat .assistant-message { padding: 20px; background: #f4f4f4; diff --git a/css/main.css b/css/main.css index d8e12e59..3ecf0044 100644 --- a/css/main.css +++ b/css/main.css @@ -406,6 +406,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { color: var(--body-text-color); } +.dark .message q { + color: #f5b031; +} + +.message q::before, .message q::after { + content: ""; +} + .message-body li { list-style-position: outside; } diff --git a/js/main.js b/js/main.js index bdbb7cef..3b8b13e4 100644 --- a/js/main.js +++ b/js/main.js @@ -213,12 +213,10 @@ function doSyntaxHighlighting() { renderMathInElement(element, { delimiters: [ { left: "$$", right: "$$", display: true }, - { left: "$", right: "$", display: false }, { left: "\\(", right: "\\)", display: false }, { left: "\\[", right: "\\]", display: true }, ], }); - }); observer.observe(targetElement, config); diff --git a/modules/LoRA.py b/modules/LoRA.py index eda5e406..117022cf 100644 --- a/modules/LoRA.py +++ b/modules/LoRA.py @@ -72,8 +72,6 @@ def add_lora_autogptq(lora_names): else: if len(lora_names) > 1: logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.') - if not shared.args.no_inject_fused_attention: - logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.') peft_config = GPTQLoraConfig( inference_mode=True, diff --git a/modules/chat.py b/modules/chat.py index c95673ce..c744defc 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -17,7 +17,11 @@ from PIL import Image import modules.shared as shared from modules import utils from modules.extensions import apply_extensions -from modules.html_generator import chat_html_wrapper, make_thumbnail +from modules.html_generator import ( + chat_html_wrapper, + convert_to_markdown, + make_thumbnail +) from modules.logging_colors import logger from modules.text_generation import ( generate_reply, @@ -368,7 +372,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess def impersonate_wrapper(text, state): - static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) prompt = generate_chat_prompt('', state, impersonate=True) @@ -488,7 +491,7 @@ def start_new_chat(state): greeting = replace_character_names(state['greeting'], state['name1'], state['name2']) if greeting != '': history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]] - history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]] + history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]] unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, unique_id, state['character_menu'], state['mode']) @@ -1044,6 +1047,8 @@ def handle_unique_id_select(state): history = load_history(state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + convert_to_markdown.cache_clear() + return [history, html] @@ -1052,6 +1057,8 @@ def handle_start_new_chat_click(state): histories = find_all_histories_with_first_prompts(state) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + convert_to_markdown.cache_clear() + return [history, html, gr.update(choices=histories, value=histories[0][1])] @@ -1061,6 +1068,8 @@ def handle_delete_chat_confirm_click(state): history, unique_id = load_history_after_deletion(state, index) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + convert_to_markdown.cache_clear() + return [ history, html, @@ -1099,6 +1108,8 @@ def handle_upload_chat_history(load_chat_history, state): html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + convert_to_markdown.cache_clear() + return [ history, html, @@ -1119,6 +1130,8 @@ def handle_character_menu_change(state): histories = find_all_histories_with_first_prompts(state) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + convert_to_markdown.cache_clear() + return [ history, html, @@ -1136,6 +1149,8 @@ def handle_mode_change(state): histories = find_all_histories_with_first_prompts(state) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + convert_to_markdown.cache_clear() + return [ history, html, diff --git a/modules/html_generator.py b/modules/html_generator.py index 657133bd..c5eba5a8 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -42,13 +42,39 @@ def fix_newlines(string): return string +def replace_quotes(text): + + # Define a list of quote pairs (opening and closing), using HTML entities + quote_pairs = [ + ('"', '"'), # Double quotes + ('“', '”'), # Unicode left and right double quotation marks + ('‘', '’'), # Unicode left and right single quotation marks + ('«', '»'), # French quotes + ('„', '“'), # German quotes + ('‘', '’'), # Alternative single quotes + ('“', '”'), # Unicode quotes (numeric entities) + ('“', '”'), # Unicode quotes (hex entities) + ] + + # Create a regex pattern that matches any of the quote pairs, including newlines + pattern = '|'.join(f'({re.escape(open_q)})(.*?)({re.escape(close_q)})' for open_q, close_q in quote_pairs) + + # Replace matched patterns with tags, keeping original quotes + replaced_text = re.sub(pattern, lambda m: f'{m.group(1)}{m.group(2)}{m.group(3)}', text, flags=re.DOTALL) + + return replaced_text + + def replace_blockquote(m): return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '') -@functools.lru_cache(maxsize=4096) +@functools.lru_cache(maxsize=None) def convert_to_markdown(string): + # Quote to + string = replace_quotes(string) + # Blockquote string = re.sub(r'(^|[\n])>', r'\1>', string) pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL) @@ -124,6 +150,7 @@ def convert_to_markdown_wrapped(string, use_cache=True): def generate_basic_html(string): + convert_to_markdown.cache_clear() string = convert_to_markdown(string) string = f'
{string}
' return string diff --git a/modules/loaders.py b/modules/loaders.py index 75ed897b..549de5fb 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -127,15 +127,6 @@ loaders_and_params = OrderedDict({ 'no_use_fast', 'autogptq_info', ], - 'AutoAWQ': [ - 'cpu_memory', - 'gpu_memory', - 'auto_devices', - 'max_seq_len', - 'no_inject_fused_attention', - 'trust_remote_code', - 'no_use_fast', - ], 'HQQ': [ 'hqq_backend', 'trust_remote_code', @@ -200,7 +191,6 @@ def transformers_samplers(): loaders_samplers = { 'Transformers': transformers_samplers(), 'AutoGPTQ': transformers_samplers(), - 'AutoAWQ': transformers_samplers(), 'HQQ': transformers_samplers(), 'ExLlamav2': { 'temperature', diff --git a/modules/models.py b/modules/models.py index 07c14308..ea046e9b 100644 --- a/modules/models.py +++ b/modules/models.py @@ -75,7 +75,6 @@ def load_model(model_name, loader=None): 'llamacpp_HF': llamacpp_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, - 'AutoAWQ': AutoAWQ_loader, 'HQQ': HQQ_loader, 'TensorRT-LLM': TensorRT_LLM_loader, } @@ -292,24 +291,6 @@ def llamacpp_HF_loader(model_name): return model -def AutoAWQ_loader(model_name): - from awq import AutoAWQForCausalLM - - model_dir = Path(f'{shared.args.model_dir}/{model_name}') - - model = AutoAWQForCausalLM.from_quantized( - quant_path=model_dir, - max_new_tokens=shared.args.max_seq_len, - trust_remote_code=shared.args.trust_remote_code, - fuse_layers=not shared.args.no_inject_fused_attention, - max_memory=get_max_memory_dict(), - batch_size=1, - safetensors=any(model_dir.glob('*.safetensors')), - ) - - return model - - def AutoGPTQ_loader(model_name): import modules.AutoGPTQ_loader diff --git a/modules/models_settings.py b/modules/models_settings.py index 7ae68125..1bb00ceb 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -180,8 +180,6 @@ def infer_loader(model_name, model_settings): loader = None elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0): loader = 'ExLlamav2_HF' - elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()): - loader = 'AutoAWQ' elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists(): loader = 'llamacpp_HF' elif len(list(path_to_model.glob('*.gguf'))) > 0: diff --git a/modules/shared.py b/modules/shared.py index dec427dd..fe09a165 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -89,7 +89,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -160,10 +160,6 @@ group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExL group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') group.add_argument('--groupsize', type=int, default=-1, help='Group size.') -# AutoAWQ -group = parser.add_argument_group('AutoAWQ') -group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.') - # HQQ group = parser.add_argument_group('HQQ') group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') @@ -217,6 +213,7 @@ group.add_argument('--model_type', type=str, help='DEPRECATED') group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED') group.add_argument('--checkpoint', type=str, help='DEPRECATED') group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED') +group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -267,8 +264,6 @@ def fix_loader_name(name): return 'ExLlamav2' elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: return 'ExLlamav2_HF' - elif name in ['autoawq', 'awq', 'auto-awq']: - return 'AutoAWQ' elif name in ['hqq']: return 'HQQ' elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']: diff --git a/modules/ui.py b/modules/ui.py index cfe709fa..47f92cf0 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -78,7 +78,6 @@ def list_model_elements(): 'groupsize', 'triton', 'desc_act', - 'no_inject_fused_attention', 'no_inject_fused_mlp', 'no_use_cuda_fp16', 'disable_exllama', diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 7085f5cd..7ef8df4d 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -84,13 +84,13 @@ def create_ui(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') + shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') with gr.Row(): shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') with gr.Row(): - shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar']) + shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) def create_chat_settings_ui(): diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 54ac9b12..2938c120 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -127,7 +127,6 @@ def create_ui(): shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.') shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton) - shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.') shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. Disable if running low on VRAM.') shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') diff --git a/one_click.py b/one_click.py index e94b6d44..0a0412ba 100644 --- a/one_click.py +++ b/one_click.py @@ -388,7 +388,12 @@ def update_requirements(initial_installation=False, pull=True): # Prepare the requirements file textgen_requirements = open(requirements_file).read().splitlines() if is_cuda118: - textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements if "auto-gptq" not in req] + textgen_requirements = [ + req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') + for req in textgen_requirements + if "auto-gptq" not in req.lower() and "autoawq" not in req.lower() + ] + if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11 textgen_requirements = [req for req in textgen_requirements if 'oobabooga/flash-attention' not in req] diff --git a/requirements.txt b/requirements.txt index 1f952872..d6e755a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.42.* +transformers==4.43.* tqdm wandb @@ -53,12 +53,20 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.83+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file diff --git a/requirements_amd.txt b/requirements_amd.txt index e85e3e0c..9c34c95d 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.42.* +transformers==4.43.* tqdm wandb @@ -40,6 +40,10 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp # AMD wheels https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.83+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.83+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index 35c9a485..5df8f2ba 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.42.* +transformers==4.43.* tqdm wandb @@ -38,6 +38,10 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.83+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index a020387f..123b6d9b 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.42.* +transformers==4.43.* tqdm wandb @@ -36,4 +36,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index 9f59a487..08509b05 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.42.* +transformers==4.43.* tqdm wandb @@ -38,4 +38,4 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/me https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.83-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 6110eab6..d4913ac8 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.42.* +transformers==4.43.* tqdm wandb diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index d4591919..b468adaf 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.42.* +transformers==4.43.* tqdm wandb diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 2249b0a8..1bbd4091 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -24,7 +24,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.42.* +transformers==4.43.* tqdm wandb @@ -53,12 +53,20 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/te https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.83+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" \ No newline at end of file diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 14e3aa88..bc8a59aa 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -21,7 +21,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.42.* +transformers==4.43.* tqdm wandb diff --git a/server.py b/server.py index 57e26be8..d6069d5e 100644 --- a/server.py +++ b/server.py @@ -90,7 +90,7 @@ def create_interface(): # Force some events to be triggered on page load shared.persistent_interface_state.update({ 'loader': shared.args.loader or 'Transformers', - 'mode': shared.settings['mode'], + 'mode': shared.settings['mode'] if shared.settings['mode'] == 'instruct' else gr.update(), 'character_menu': shared.args.character or shared.settings['character'], 'instruction_template_str': shared.settings['instruction_template_str'], 'prompt_menu-default': shared.settings['prompt-default'], diff --git a/start_linux.sh b/start_linux.sh index 5620c831..792daca8 100755 --- a/start_linux.sh +++ b/start_linux.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash cd "$(dirname "${BASH_SOURCE[0]}")" diff --git a/update_wizard_linux.sh b/update_wizard_linux.sh index c5add61e..3ada9a1e 100755 --- a/update_wizard_linux.sh +++ b/update_wizard_linux.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash cd "$(dirname "${BASH_SOURCE[0]}")"