From 4148a9201fd2b89a9c0e89d4ae164af8b136762d Mon Sep 17 00:00:00 2001
From: InvectorGator
Date: Fri, 12 Jul 2024 23:04:19 -0400
Subject: [PATCH 1/2] Fix for MacOS users encountering model load errors
 (#6227)

---------

Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
Co-authored-by: Invectorgator
---
 modules/llama_cpp_python_hijack.py | 76 ++++++++++++------------------
 1 file changed, 29 insertions(+), 47 deletions(-)

diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py
index 6ec15626..64280dc9 100644
--- a/modules/llama_cpp_python_hijack.py
+++ b/modules/llama_cpp_python_hijack.py
@@ -1,4 +1,5 @@
 import importlib
+import platform
 from typing import Sequence
 
 from tqdm import tqdm
@@ -13,58 +14,39 @@ imported_module = None
 def llama_cpp_lib():
     global imported_module
 
-    def module_to_purpose(module_name):
-        if module_name == 'llama_cpp':
-            return 'CPU'
-        elif module_name == 'llama_cpp_cuda_tensorcores':
-            return 'tensorcores'
-        elif module_name == 'llama_cpp_cuda':
-            return 'default'
+    # Determine the platform
+    is_macos = platform.system() == 'Darwin'
 
-        return 'unknown'
+    # Define the library names based on the platform
+    if is_macos:
+        lib_names = [
+            (None, 'llama_cpp')
+        ]
+    else:
+        lib_names = [
+            ('cpu', 'llama_cpp'),
+            ('tensorcores', 'llama_cpp_cuda_tensorcores'),
+            (None, 'llama_cpp_cuda'),
+            (None, 'llama_cpp')
+        ]
 
-    return_lib = None
+    for arg, lib_name in lib_names:
+        should_import = (arg is None or getattr(shared.args, arg))
 
-    if shared.args.cpu:
-        if imported_module and imported_module != 'llama_cpp':
-            raise Exception(f"The {module_to_purpose(imported_module)} version of llama-cpp-python is already loaded. Switching to the CPU version currently requires a server restart.")
-        try:
-            return_lib = importlib.import_module('llama_cpp')
-            imported_module = 'llama_cpp'
-        except:
-            pass
+        if should_import:
+            if imported_module and imported_module != lib_name:
+                # Conflict detected, raise an exception
+                raise Exception(f"Cannot import `{lib_name}` because `{imported_module}` is already imported. Switching to a different version of llama-cpp-python currently requires a server restart.")
 
-    if shared.args.tensorcores and return_lib is None:
-        if imported_module and imported_module != 'llama_cpp_cuda_tensorcores':
-            raise Exception(f"The {module_to_purpose(imported_module)} version of llama-cpp-python is already loaded. Switching to the tensorcores version currently requires a server restart.")
-        try:
-            return_lib = importlib.import_module('llama_cpp_cuda_tensorcores')
-            imported_module = 'llama_cpp_cuda_tensorcores'
-        except:
-            pass
+            try:
+                return_lib = importlib.import_module(lib_name)
+                imported_module = lib_name
+                monkey_patch_llama_cpp_python(return_lib)
+                return return_lib
+            except ImportError:
+                continue
 
-    if return_lib is None:
-        if imported_module and imported_module != 'llama_cpp_cuda':
-            raise Exception(f"The {module_to_purpose(imported_module)} version of llama-cpp-python is already loaded. Switching to the default version currently requires a server restart.")
-        try:
-            return_lib = importlib.import_module('llama_cpp_cuda')
-            imported_module = 'llama_cpp_cuda'
-        except:
-            pass
-
-    if return_lib is None and not shared.args.cpu:
-        if imported_module and imported_module != 'llama_cpp':
-            raise Exception(f"The {module_to_purpose(imported_module)} version of llama-cpp-python is already loaded. Switching to the CPU version currently requires a server restart.")
-        try:
-            return_lib = importlib.import_module('llama_cpp')
-            imported_module = 'llama_cpp'
-        except:
-            pass
-
-    if return_lib is not None:
-        monkey_patch_llama_cpp_python(return_lib)
-
-    return return_lib
+    return None
 
 
 def eval_with_progress(self, tokens: Sequence[int]):

From b19d239a603316fc93fc523428687c8417a53f7f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 12 Jul 2024 20:16:11 -0700
Subject: [PATCH 2/2] Bump flash-attention to 2.6.1

---
 requirements.txt        | 8 ++++----
 requirements_noavx2.txt | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 57b95381..ff2f3161 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -58,8 +58,8 @@ https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index c027d9d2..15b45a05 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -58,8 +58,8 @@ https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 https://github.com/oobabooga/exllamav2/releases/download/v0.1.7/exllamav2-0.1.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.9.post1/flash_attn-2.5.9.post1+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 autoawq==0.2.5; platform_system == "Linux" or platform_system == "Windows"
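
For reference, a minimal standalone sketch of the library-selection order that the patched llama_cpp_lib() follows. The helper names (candidate_libs, pick_lib) and the SimpleNamespace stub standing in for the real shared.args are illustrative only; the already-imported conflict check and the monkey_patch_llama_cpp_python() call from the patch are omitted.

# Illustrative sketch, not part of the patch series above.
import importlib
import platform
from types import SimpleNamespace

# Stub for shared.args; only the two flags the selection loop consults.
args = SimpleNamespace(cpu=False, tensorcores=False)


def candidate_libs():
    # macOS gets only the CPU build; other platforms try the CUDA builds first.
    if platform.system() == 'Darwin':
        return [(None, 'llama_cpp')]

    return [
        ('cpu', 'llama_cpp'),
        ('tensorcores', 'llama_cpp_cuda_tensorcores'),
        (None, 'llama_cpp_cuda'),
        (None, 'llama_cpp'),
    ]


def pick_lib():
    for arg, lib_name in candidate_libs():
        if arg is not None and not getattr(args, arg):
            continue  # gating flag not set, skip this candidate

        try:
            return importlib.import_module(lib_name)
        except ImportError:
            continue  # this build is not installed, try the next one

    return None  # nothing importable, matching the patched function's fallback


if __name__ == '__main__':
    lib = pick_lib()
    print('selected:', getattr(lib, '__name__', None))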