Add ExLlama+LoRA support (#2756)

2024-09-20 10:35:10 +02:00 · 2023-06-19 12:31:24 -03:00 · 2023-06-19 12:31:24 -03:00 · eb30f4441f
commit eb30f4441f
parent a1cac88c19
3 changed files with 119 additions and 73 deletions
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@ -7,85 +7,117 @@ import modules.shared as shared
 from modules.logging_colors import logger
 from modules.models import reload_model
 try:
    from auto_gptq import get_gptq_peft_model
    from auto_gptq.utils.peft_utils import GPTQLoraConfig
    has_auto_gptq_peft = True
 except:
    has_auto_gptq_peft = False
 def add_lora_to_model(lora_names):
    """Apply the requested LoRAs with the loader that matches the loaded model.

    The loader is selected from the class name of ``shared.model``: a name
    containing ``GPTQForCausalLM`` uses ``add_lora_autogptq``, the exact name
    ``ExllamaModel`` uses ``add_lora_exllama``, and anything else uses
    ``add_lora_transformers``.
    """
    model_class_name = shared.model.__class__.__name__
    if 'GPTQForCausalLM' in model_class_name:
        apply_loras = add_lora_autogptq
    elif model_class_name == 'ExllamaModel':
        apply_loras = add_lora_exllama
    else:
        apply_loras = add_lora_transformers
    apply_loras(lora_names)
 def add_lora_exllama(lora_names):
    """Attach one LoRA to the loaded ExLlama model, or detach the current one.

    Args:
        lora_names: names of LoRA folders inside ``shared.args.lora_dir``.
            An empty list clears the generator's LoRA. Only the first name
            is used; a warning is logged when more than one is given.

    Returns:
        None. On success ``shared.model.generator.lora`` and
        ``shared.lora_names`` are updated. If the ExLlama LoRA module cannot
        be imported, an error is logged and nothing is changed.
    """
    # Imported lazily so that this module can be loaded without exllama.
    # Only ImportError is handled: a bare except would also swallow
    # KeyboardInterrupt/SystemExit.
    try:
        from repositories.exllama.lora import ExLlamaLora
    except ImportError:
        logger.error("Could not find the file repositories/exllama/lora.py. Make sure that exllama is cloned inside repositories/ and is up to date.")
        return

    # Nothing requested: detach whatever LoRA the generator currently holds.
    if not lora_names:
        shared.model.generator.lora = None
        shared.lora_names = []
        return

    if len(lora_names) > 1:
        logger.warning('ExLlama can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')

    lora_name = lora_names[0]
    lora_path = Path(f"{shared.args.lora_dir}/{lora_name}")
    lora_config_path = lora_path / "adapter_config.json"
    lora_adapter_path = lora_path / "adapter_model.bin"

    logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, lora_name))
    shared.model.generator.lora = ExLlamaLora(shared.model.model, str(lora_config_path), str(lora_adapter_path))
    shared.lora_names = [lora_name]
 # Adapted from https://github.com/Ph0rk0z/text-generation-webui-testing
 def add_lora_autogptq(lora_names):
    """Apply one LoRA to the loaded AutoGPTQ model, or drop the current one.

    Args:
        lora_names: names of LoRA folders inside ``shared.args.lora_dir``.
            An empty list reloads the model if a LoRA was previously
            applied. Only the first name is used; a warning is logged when
            more than one is given.

    Returns:
        None. On success ``shared.model`` and ``shared.lora_names`` are
        updated. If the installed AutoGPTQ lacks the PEFT helpers, an error
        is logged and nothing is changed.
    """
    # Only ImportError is handled: a bare except would also swallow
    # KeyboardInterrupt/SystemExit.
    try:
        from auto_gptq import get_gptq_peft_model
        from auto_gptq.utils.peft_utils import GPTQLoraConfig
    except ImportError:
        logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.")
        return

    # Nothing requested: reload the model only if a LoRA had been applied.
    if not lora_names:
        if len(shared.lora_names) > 0:
            reload_model()

        shared.lora_names = []
        return

    if len(lora_names) > 1:
        logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')

    lora_name = lora_names[0]
    peft_config = GPTQLoraConfig(inference_mode=True)
    lora_path = Path(f"{shared.args.lora_dir}/{lora_name}")

    logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, lora_name))
    # NOTE(review): when shared.lora_names is already non-empty, this wraps the
    # current shared.model again without calling reload_model() first --
    # confirm that stacking on an already-wrapped model is intended.
    shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path)
    shared.lora_names = [lora_name]
 def add_lora_transformers(lora_names):
    # Compares the requested LoRA names with shared.lora_names to work out
    # which adapters are newly added and which were removed, then records the
    # requested list in shared.lora_names before acting on the difference.
    # NOTE(review): in this hunk removed ('-') and added ('+') lines alternate
    # one by one, and some unmarked lines (e.g. the AutoGPTQ warning and the
    # reload_model() call) appear to belong to the removed code -- verify
    # against the actual file before relying on the statement order shown here.
    prior_set = set(shared.lora_names)
    added_set = set(lora_names) - prior_set
    removed_set = prior_set - set(lora_names)
    shared.lora_names = list(lora_names)
-    is_autogptq = 'GPTQForCausalLM' in shared.model.__class__.__name__
+    # If no LoRA needs to be added or removed, exit
    if len(added_set) == 0 and len(removed_set) == 0:
        return
-    # AutoGPTQ case. It doesn't use the peft functions.
+    # Add a LoRA when another LoRA is already present
-    # Copied from https://github.com/Ph0rk0z/text-generation-webui-testing
+    if len(removed_set) == 0 and len(prior_set) > 0:
-    if is_autogptq:
+        logger.info(f"Adding the LoRA(s) named {added_set} to the model...")
-        if not has_auto_gptq_peft:
+        for lora in added_set:
-            logger.error("This version of AutoGPTQ does not support LoRA. You need to install from source or wait for a new release.")
+            shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
            return
-        if len(prior_set) > 0:
+        return
            reload_model()
-        if len(shared.lora_names) == 0:
+    # If any LoRA needs to be removed, start over
-            return
+    if len(removed_set) > 0:
-        else:
+        shared.model.disable_adapter()
-            if len(shared.lora_names) > 1:
+        shared.model = shared.model.base_model.model
                logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded')
-            peft_config = GPTQLoraConfig(
+    if len(lora_names) > 0:
-                inference_mode=True,
+        params = {}
-            )
+        if not shared.args.cpu:
            params['dtype'] = shared.model.dtype
            if hasattr(shared.model, "hf_device_map"):
                params['device_map'] = {"base_model.model." + k: v for k, v in shared.model.hf_device_map.items()}
            elif shared.args.load_in_8bit:
                params['device_map'] = {'': 0}
-            lora_path = Path(f"{shared.args.lora_dir}/{shared.lora_names[0]}")
+        logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join(lora_names)))
-            logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
+        shared.model = PeftModel.from_pretrained(shared.model, Path(f"{shared.args.lora_dir}/{lora_names[0]}"), adapter_name=lora_names[0], **params)
-            shared.model = get_gptq_peft_model(shared.model, peft_config, lora_path)
+        for lora in lora_names[1:]:
-            return
+            shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
-    # Transformers case
+        shared.lora_names = lora_names
    else:
        # If no LoRA needs to be added or removed, exit
        if len(added_set) == 0 and len(removed_set) == 0:
            return
-        # Add a LoRA when another LoRA is already present
+        if not shared.args.load_in_8bit and not shared.args.cpu:
-        if len(removed_set) == 0 and len(prior_set) > 0:
+            shared.model.half()
-            logger.info(f"Adding the LoRA(s) named {added_set} to the model...")
+            if not hasattr(shared.model, "hf_device_map"):
-            for lora in added_set:
+                if torch.has_mps:
-                shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
+                    device = torch.device('mps')
-
+                    shared.model = shared.model.to(device)
-            return
+                else:
-
+                    shared.model = shared.model.cuda()
        # If any LoRA needs to be removed, start over
        if len(removed_set) > 0:
            shared.model.disable_adapter()
            shared.model = shared.model.base_model.model
        if len(lora_names) > 0:
            logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join(lora_names)))
            params = {}
            if not shared.args.cpu:
                params['dtype'] = shared.model.dtype
                if hasattr(shared.model, "hf_device_map"):
                    params['device_map'] = {"base_model.model." + k: v for k, v in shared.model.hf_device_map.items()}
                elif shared.args.load_in_8bit:
                    params['device_map'] = {'': 0}
            shared.model = PeftModel.from_pretrained(shared.model, Path(f"{shared.args.lora_dir}/{lora_names[0]}"), adapter_name=lora_names[0], **params)
            for lora in lora_names[1:]:
                shared.model.load_adapter(Path(f"{shared.args.lora_dir}/{lora}"), lora)
            if not shared.args.load_in_8bit and not shared.args.cpu:
                shared.model.half()
                if not hasattr(shared.model, "hf_device_map"):
                    if torch.has_mps:
                        device = torch.device('mps')
                        shared.model = shared.model.to(device)
                    else:
                        shared.model = shared.model.cuda()
--- a/modules/exllama.py
+++ b/modules/exllama.py
@ -3,11 +3,12 @@ from pathlib import Path
 from modules import shared
 from modules.logging_colors import logger
 from modules.relative_imports import RelativeImport
-sys.path.insert(0, str(Path("repositories/exllama")))
+with RelativeImport("repositories/exllama"):
-from repositories.exllama.generator import ExLlamaGenerator
+    from generator import ExLlamaGenerator
-from repositories.exllama.model import ExLlama, ExLlamaCache, ExLlamaConfig
+    from model import ExLlama, ExLlamaCache, ExLlamaConfig
-from repositories.exllama.tokenizer import ExLlamaTokenizer
+    from tokenizer import ExLlamaTokenizer
 class ExllamaModel:
--- a/modules/relative_imports.py
+++ b/modules/relative_imports.py
@ -0,0 +1,13 @@
 import sys
 from pathlib import Path
 class RelativeImport:
    """Context manager that temporarily puts a directory first on ``sys.path``.

    While the ``with`` block is active, ``path`` is the first entry of
    ``sys.path``; on exit that entry is removed again.

    Args:
        path: directory to expose for imports (string or path-like).
    """

    def __init__(self, path):
        self.import_path = Path(path)

    def __enter__(self):
        sys.path.insert(0, str(self.import_path))
        # Returning self allows ``with RelativeImport(...) as ctx``.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Code run inside the block may already have removed the entry; a
        # ValueError raised here would mask any exception from the with-body.
        try:
            sys.path.remove(str(self.import_path))
        except ValueError:
            pass