From d423021a48397c64e6da7890d0ba85d4d7922d7d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 4 Apr 2024 20:23:58 -0300 Subject: [PATCH] Remove CTransformers support (#5807) --- README.md | 10 +---- docs/04 - Model Tab.md | 6 --- docs/What Works.md | 1 - modules/ctransformers_model.py | 79 ---------------------------------- modules/loaders.py | 29 ------------- modules/models.py | 30 +------------ modules/models_settings.py | 4 +- modules/shared.py | 4 +- modules/text_generation.py | 6 +-- modules/ui_model_menu.py | 2 +- requirements.txt | 1 - requirements_noavx2.txt | 1 - 12 files changed, 10 insertions(+), 163 deletions(-) delete mode 100644 modules/ctransformers_model.py diff --git a/README.md b/README.md index 6b92448c..9f3e81bd 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features * 3 interface modes: default (two columns), notebook, and chat. -* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [CTransformers](https://github.com/marella/ctransformers), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp). +* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp). * Dropdown menu for quickly switching between different models. * Large number of extensions (built-in and user-contributed), including Coqui TTS for realistic voice outputs, Whisper STT for voice inputs, translation, [multimodal pipelines](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal), vector databases, Stable Diffusion integration, and a lot more. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. * [Chat with custom characters](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab#character). @@ -221,7 +221,7 @@ List of command-line flags | Flag | Description | |--------------------------------------------|-------------| -| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#. | +| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#. | #### Accelerate/transformers @@ -308,12 +308,6 @@ List of command-line flags | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. 
| | `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models. | -#### ctransformers - -| Flag | Description | -|-------------|-------------| -| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gptneox, falcon, llama, mpt, starcoder (gptbigcode), dollyv2, and replit are supported. | - #### HQQ | Flag | Description | diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md index 05b85b48..7c168e89 100644 --- a/docs/04 - Model Tab.md +++ b/docs/04 - Model Tab.md @@ -105,12 +105,6 @@ It has an additional parameter: * **logits_all**: Needs to be checked if you want to evaluate the perplexity of the llama.cpp model using the "Training" > "Perplexity evaluation" tab. Otherwise, leave it unchecked, as it makes prompt processing slower. -### ctransformers - -Loads: GGUF/GGML models. - -Similar to llama.cpp but it works for certain GGUF/GGML models not originally supported by llama.cpp like Falcon, StarCoder, StarChat, and GPT-J. - ### AutoAWQ Loads: AWQ models. diff --git a/docs/What Works.md b/docs/What Works.md index 354da1dd..6c0d4c84 100644 --- a/docs/What Works.md +++ b/docs/What Works.md @@ -10,7 +10,6 @@ | AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ | | AutoAWQ | ? | ❌ | ? | ? | ✅ | | GPTQ-for-LLaMa | ✅\*\* | ✅\*\*\* | ✅ | ✅ | ✅ | -| ctransformers | ❌ | ❌ | ❌ | ❌ | ❌ | | QuIP# | ? | ? | ? | ? | ✅ | | HQQ | ? | ? | ? | ? | ✅ | diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py deleted file mode 100644 index 70ce92f5..00000000 --- a/modules/ctransformers_model.py +++ /dev/null @@ -1,79 +0,0 @@ -from ctransformers import AutoConfig, AutoModelForCausalLM - -from modules import shared -from modules.callbacks import Iteratorize -from modules.logging_colors import logger - - -class CtransformersModel: - def __init__(self): - pass - - @classmethod - def from_pretrained(cls, path): - result = cls() - - config = AutoConfig.from_pretrained( - str(path), - threads=shared.args.threads if shared.args.threads != 0 else -1, - gpu_layers=shared.args.n_gpu_layers, - batch_size=shared.args.n_batch, - context_length=shared.args.n_ctx, - stream=True, - mmap=not shared.args.no_mmap, - mlock=shared.args.mlock - ) - - result.model = AutoModelForCausalLM.from_pretrained( - str(result.model_dir(path) if result.model_type_is_auto() else path), - model_type=(None if result.model_type_is_auto() else shared.args.model_type), - config=config - ) - - logger.info(f'Using ctransformers model_type: {result.model.model_type} for {result.model.model_path}') - return result, result - - def model_type_is_auto(self): - return shared.args.model_type is None or shared.args.model_type == "Auto" or shared.args.model_type == "None" - - def model_dir(self, path): - if path.is_file(): - return path.parent - - return path - - def encode(self, string, **kwargs): - return self.model.tokenize(string) - - def decode(self, ids): - return self.model.detokenize(ids) - - def generate(self, prompt, state, callback=None): - prompt = prompt if type(prompt) is str else prompt.decode() - # ctransformers uses -1 for random seed - generator = self.model( - prompt=prompt, - max_new_tokens=state['max_new_tokens'], - temperature=state['temperature'], - top_p=state['top_p'], - top_k=state['top_k'], - repetition_penalty=state['repetition_penalty'], - last_n_tokens=state['repetition_penalty_range'], - seed=int(state['seed']) - ) - - output = "" - for token in generator: - if callback: - callback(token) - - output += token - - return output - - def generate_with_streaming(self, *args, 
**kwargs): - with Iteratorize(self.generate, args, kwargs, callback=None) as generator: - reply = '' - for token in generator: - reply += token - yield reply diff --git a/modules/loaders.py b/modules/loaders.py index 60fe8aa6..23477339 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -138,15 +138,6 @@ loaders_and_params = OrderedDict({ 'no_use_fast', 'gptq_for_llama_info', ], - 'ctransformers': [ - 'n_ctx', - 'n_gpu_layers', - 'n_batch', - 'threads', - 'model_type', - 'no_mmap', - 'mlock' - ], 'QuIP#': [ 'trust_remote_code', 'no_use_fast', @@ -332,13 +323,6 @@ loaders_samplers = { 'skip_special_tokens', 'auto_max_new_tokens', }, - 'ctransformers': { - 'temperature', - 'top_p', - 'top_k', - 'repetition_penalty', - 'repetition_penalty_range', - }, } loaders_model_types = { @@ -348,19 +332,6 @@ loaders_model_types = { "opt", "gptj" ], - 'ctransformers': [ - "None", - "gpt2", - "gptj", - "gptneox", - "llama", - "mpt", - "dollyv2", - "replit", - "starcoder", - "gptbigcode", - "falcon" - ], } diff --git a/modules/models.py b/modules/models.py index 541c6301..98349705 100644 --- a/modules/models.py +++ b/modules/models.py @@ -67,7 +67,6 @@ def load_model(model_name, loader=None): 'llamacpp_HF': llamacpp_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, - 'ctransformers': ctransformers_loader, 'AutoAWQ': AutoAWQ_loader, 'QuIP#': QuipSharp_loader, 'HQQ': HQQ_loader, @@ -97,7 +96,7 @@ def load_model(model_name, loader=None): shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings}) if loader.lower().startswith('exllama'): shared.settings['truncation_length'] = shared.args.max_seq_len - elif loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']: + elif loader in ['llama.cpp', 'llamacpp_HF']: shared.settings['truncation_length'] = shared.args.n_ctx logger.info(f"LOADER: \"{loader}\"") @@ -265,33 +264,6 @@ def llamacpp_HF_loader(model_name): return model -def ctransformers_loader(model_name): - from modules.ctransformers_model import CtransformersModel - - path = Path(f'{shared.args.model_dir}/{model_name}') - ctrans = CtransformersModel() - if ctrans.model_type_is_auto(): - model_file = path - else: - if path.is_file(): - model_file = path - else: - entries = Path(f'{shared.args.model_dir}/{model_name}') - gguf = list(entries.glob('*.gguf')) - bin = list(entries.glob('*.bin')) - if len(gguf) > 0: - model_file = gguf[0] - elif len(bin) > 0: - model_file = bin[0] - else: - logger.error("Could not find a model for ctransformers.") - return None, None - - logger.info(f'ctransformers weights detected: \"{model_file}\"') - model, tokenizer = ctrans.from_pretrained(model_file) - return model, tokenizer - - def AutoAWQ_loader(model_name): from awq import AutoAWQForCausalLM diff --git a/modules/models_settings.py b/modules/models_settings.py index 76effa87..12a2db82 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -48,7 +48,7 @@ def get_model_metadata(model): model_settings['loader'] = loader # GGUF metadata - if model_settings['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']: + if model_settings['loader'] in ['llama.cpp', 'llamacpp_HF']: path = Path(f'{shared.args.model_dir}/{model}') if path.is_file(): model_file = path @@ -231,7 +231,7 @@ def apply_model_settings_to_state(model, state): loader = model_settings.pop('loader') # If the user is using an alternative loader for the same model type, let them keep using it - if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 
'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['ctransformers']): + if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']): state['loader'] = loader for k in model_settings: diff --git a/modules/shared.py b/modules/shared.py index ecfdb3be..a48b281c 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -88,7 +88,7 @@ group.add_argument('--chat-buttons', action='store_true', help='Show buttons on # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -259,8 +259,6 @@ def fix_loader_name(name): return 'ExLlamav2' elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: return 'ExLlamav2_HF' - elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']: - return 'ctransformers' elif name in ['autoawq', 'awq', 'auto-awq']: return 'AutoAWQ' elif name in ['quip#', 'quip-sharp', 'quipsharp', 'quip_sharp']: diff --git a/modules/text_generation.py b/modules/text_generation.py index 724bb0f0..f99c605e 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -46,7 +46,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap yield '' return - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']: generate_func = generate_reply_custom else: generate_func = generate_reply_HF @@ -114,7 +114,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.tokenizer is None: raise ValueError('No tokenizer is loaded') - if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']: input_ids = shared.tokenizer.encode(str(prompt)) if shared.model.__class__.__name__ not in ['Exllamav2Model']: input_ids = np.array(input_ids).reshape(1, len(input_ids)) @@ -128,7 +128,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model'] or shared.args.cpu: return input_ids elif shared.args.deepspeed: return input_ids.to(device=local_rank) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 9f2729e2..8d6122d2 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -330,7 +330,7 @@ def update_truncation_length(current_length, state): if 'loader' in state: if state['loader'].lower().startswith('exllama'): return state['max_seq_len'] - elif state['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']: + elif 
state['loader'] in ['llama.cpp', 'llamacpp_HF']: return state['n_ctx'] return current_length diff --git a/requirements.txt b/requirements.txt index 3835b954..25efdf53 100644 --- a/requirements.txt +++ b/requirements.txt @@ -68,5 +68,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu121-py3-none-any.whl autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 9fe6e0a0..60cd75d5 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -68,5 +68,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX/ctransformers-0.2.27+cu121-py3-none-any.whl autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows"
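
Note (not part of the patch): models that were previously loaded through the removed ctransformers path are GGUF files, and the llama.cpp backend kept in the loader list (via llama-cpp-python) covers the same format. The sketch below is a hypothetical migration example, not code from this commit; the file path is a placeholder, and the mapping of the deleted CtransformersModel parameters onto llama-cpp-python's Llama API is an assumption based on the flags that class consumed.

# Hypothetical sketch: loading a GGUF model with llama-cpp-python instead of
# the removed ctransformers loader. Parameter names on the right-hand comments
# refer to the options the deleted CtransformersModel passed to AutoConfig.
from llama_cpp import Llama

llm = Llama(
    model_path="models/your-model.gguf",  # placeholder path, adjust to your model
    n_ctx=2048,                           # was context_length (--n_ctx)
    n_gpu_layers=0,                       # was gpu_layers (--n_gpu_layers)
    n_batch=512,                          # was batch_size (--n_batch)
    use_mmap=True,                        # was mmap (inverse of --no_mmap)
    use_mlock=False,                      # was mlock (--mlock)
)

# Streaming generation, roughly analogous to the removed
# CtransformersModel.generate_with_streaming loop.
reply = ""
for chunk in llm(
    "Write a haiku about GGUF files.",
    max_tokens=200,          # was max_new_tokens
    temperature=0.7,
    top_p=0.9,
    top_k=40,
    repeat_penalty=1.1,      # was repetition_penalty
    stream=True,
):
    reply += chunk["choices"][0]["text"]

print(reply)

Inside the web UI itself no such manual step is needed: selecting the llama.cpp or llamacpp_HF loader exposes the same n_ctx, n_gpu_layers, n_batch, threads, no_mmap, and mlock options that the removed ctransformers entry listed in modules/loaders.py.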