diff --git a/README.md b/README.md
index 5df4d6bc..93bbf0e3 100644
--- a/README.md
+++ b/README.md
@@ -312,13 +312,6 @@ List of command-line flags
 | `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. |
 | `--local_rank LOCAL_RANK` | DeepSpeed: Optional argument for distributed setups. |

-#### RWKV
-
-| Flag                            | Description |
-|---------------------------------|-------------|
-| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
-| `--rwkv-cuda-on`                | RWKV: Compile the CUDA kernel for better performance. |
-
 #### RoPE (for llama.cpp, ExLlamaV2, and transformers)

 | Flag | Description |
diff --git a/modules/RWKV.py b/modules/RWKV.py
deleted file mode 100644
index 8a15e540..00000000
--- a/modules/RWKV.py
+++ /dev/null
@@ -1,154 +0,0 @@
-'''
-This loader is not currently maintained as RWKV can now be loaded
-through the transformers library.
-'''
-
-import copy
-import os
-from pathlib import Path
-
-import numpy as np
-from tokenizers import Tokenizer
-from transformers import is_torch_xpu_available
-
-import modules.shared as shared
-from modules.callbacks import Iteratorize
-
-np.set_printoptions(precision=4, suppress=True, linewidth=200)
-
-os.environ['RWKV_JIT_ON'] = '1'
-os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0'  # use CUDA kernel for seq mode (much faster)
-
-from rwkv.model import RWKV
-from rwkv.utils import PIPELINE, PIPELINE_ARGS
-
-
-class RWKVModel:
-    def __init__(self):
-        pass
-
-    @classmethod
-    def from_pretrained(self, path, dtype="bf16" if is_torch_xpu_available() else "fp16", device="xpu" if is_torch_xpu_available() else "cuda"):
-        tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json")
-        if shared.args.rwkv_strategy is None:
-            model = RWKV(model=str(path), strategy=f'{device} {dtype}')
-        else:
-            model = RWKV(model=str(path), strategy=shared.args.rwkv_strategy)
-
-        pipeline = PIPELINE(model, str(tokenizer_path))
-        result = self()
-        result.pipeline = pipeline
-        result.model = model
-        result.cached_context = ""
-        result.cached_model_state = None
-        result.cached_output_logits = None
-        return result
-
-    def generate(self, prompt, state, callback=None):
-        args = PIPELINE_ARGS(
-            temperature=state['temperature'],
-            top_p=state['top_p'],
-            top_k=state['top_k'],
-            alpha_frequency=0.1,  # Frequency Penalty (as in GPT-3)
-            alpha_presence=0.1,  # Presence Penalty (as in GPT-3)
-            token_ban=[0],  # ban the generation of some tokens
-            token_stop=[]
-        )
-
-        if self.cached_context != "":
-            if prompt.startswith(self.cached_context):
-                prompt = prompt[len(self.cached_context):]
-            else:
-                self.cached_context = ""
-                self.cached_model_state = None
-                self.cached_output_logits = None
-
-        # out = self.pipeline.generate(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)
-        out = self.generate_from_cached_state(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)
-        return out
-
-    def generate_with_streaming(self, *args, **kwargs):
-        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
-            reply = ''
-            for token in generator:
-                reply += token
-                yield reply
-
-    # Similar to the PIPELINE.generate, but lets us maintain the cached_model_state
-    def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback=None):
-        all_tokens = []
-        out_str = ''
-        occurrence = {}
-        state = copy.deepcopy(self.cached_model_state) if self.cached_model_state is not None else None
-
-        # if we ended up with an empty context, just reuse the cached logits
-        # this can happen if a user undoes a message and then sends the exact message again
-        # in that case the full context ends up being the same as the cached_context, so the remaining context is empty.
-        if ctx == "":
-            out = self.cached_output_logits
-
-        token = None
-        for i in range(token_count):
-            # forward
-            tokens = self.pipeline.encode(ctx) if i == 0 else [token]
-            while len(tokens) > 0:
-                out, state = self.model.forward(tokens[:args.chunk_len], state)
-                tokens = tokens[args.chunk_len:]
-            if i == 0:
-                begin_token = len(all_tokens)
-                last_token_posi = begin_token
-            # cache the model state after scanning the context
-            # we don't cache the state after processing our own generated tokens because
-            # the output string might be post-processed arbitrarily. Therefore, what's fed into the model
-            # on the next round of chat might be slightly different what what it output on the previous round
-            if i == 0:
-                self.cached_context += ctx
-                self.cached_model_state = copy.deepcopy(state)
-                self.cached_output_logits = copy.deepcopy(out)
-
-            # adjust probabilities
-            for n in args.token_ban:
-                out[n] = -float('inf')
-
-            for n in occurrence:
-                out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
-
-            # sampler
-            token = self.pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k)
-            if token in args.token_stop:
-                break
-
-            all_tokens += [token]
-            if token not in occurrence:
-                occurrence[token] = 1
-            else:
-                occurrence[token] += 1
-
-            # output
-            tmp = self.pipeline.decode(all_tokens[last_token_posi:])
-            if '\ufffd' not in tmp:  # is valid utf-8 string?
-                if callback:
-                    callback(tmp)
-
-                out_str += tmp
-                last_token_posi = begin_token + i + 1
-        return out_str
-
-
-class RWKVTokenizer:
-    def __init__(self):
-        pass
-
-    @classmethod
-    def from_pretrained(self, path):
-        tokenizer_path = path / "20B_tokenizer.json"
-        tokenizer = Tokenizer.from_file(str(tokenizer_path))
-        result = self()
-        result.tokenizer = tokenizer
-        return result
-
-    def encode(self, prompt):
-        return self.tokenizer.encode(prompt).ids
-
-    def decode(self, ids):
-        return self.tokenizer.decode(ids)
diff --git a/modules/models.py b/modules/models.py
index ed6f6b52..7f338712 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -65,7 +65,6 @@ def load_model(model_name, loader=None):
         'GPTQ-for-LLaMa': GPTQ_loader,
         'llama.cpp': llamacpp_loader,
         'llamacpp_HF': llamacpp_HF_loader,
-        'RWKV': RWKV_loader,
         'ExLlamav2': ExLlamav2_loader,
         'ExLlamav2_HF': ExLlamav2_HF_loader,
         'ctransformers': ctransformers_loader,
@@ -405,23 +404,6 @@ def HQQ_loader(model_name):
     return model


-def RWKV_loader(model_name):
-    '''
-    This loader is not currently maintained as RWKV can now be loaded
-    through the transformers library.
-    '''
-    from modules.RWKV import RWKVModel, RWKVTokenizer
-
-    model = RWKVModel.from_pretrained(
-        Path(f'{shared.args.model_dir}/{model_name}'),
-        dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16",
-        device="cpu" if shared.args.cpu else "xpu" if is_xpu_available() else "cuda"
-    )
-
-    tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir))
-    return model, tokenizer
-
-
 def get_max_memory_dict():
     max_memory = {}
     max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
diff --git a/modules/models_settings.py b/modules/models_settings.py
index d508227a..9acc7efa 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -157,8 +157,6 @@ def infer_loader(model_name, model_settings):
         loader = 'llama.cpp'
     elif re.match(r'.*\.gguf', model_name.lower()):
         loader = 'llama.cpp'
-    elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
-        loader = 'RWKV'
     elif re.match(r'.*exl2', model_name.lower()):
         loader = 'ExLlamav2_HF'
     elif re.match(r'.*-hqq', model_name.lower()):
diff --git a/modules/shared.py b/modules/shared.py
index 60b3f8f4..785d5509 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -165,11 +165,6 @@ group.add_argument('--deepspeed', action='store_true', help='Enable the use of D
 group.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
 group.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')

-# RWKV
-group = parser.add_argument_group('RWKV')
-group.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
-group.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')
-
 # RoPE
 group = parser.add_argument_group('RoPE')
 group.add_argument('--alpha_value', type=float, default=1, help='Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.')
diff --git a/modules/text_generation.py b/modules/text_generation.py
index b39a037f..d4380188 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -44,7 +44,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
         yield ''
         return

-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel']:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']:
         generate_func = generate_reply_custom
     else:
         generate_func = generate_reply_HF
@@ -118,7 +118,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if shared.tokenizer is None:
         raise ValueError('No tokenizer is loaded')

-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel', 'Exllamav2Model']:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']:
         input_ids = shared.tokenizer.encode(str(prompt))
         if shared.model.__class__.__name__ not in ['Exllamav2Model']:
             input_ids = np.array(input_ids).reshape(1, len(input_ids))
@@ -132,7 +132,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]

-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
         return input_ids
     elif shared.args.deepspeed:
         return input_ids.to(device=local_rank)
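Note (not part of the patch): the docstring of the deleted `modules/RWKV.py` points to loading RWKV through the transformers library as the replacement path. The snippet below is a minimal sketch of that path under the assumption that a transformers-format RWKV checkpoint is available on the Hugging Face Hub; the repo id is only an illustrative example, not something this patch prescribes.

```python
# Illustrative sketch: load an RWKV model via transformers instead of the removed custom loader.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "RWKV/rwkv-4-169m-pile"  # example transformers-format RWKV checkpoint (assumption)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# Generate a short continuation to confirm the model loads and runs.
inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

With this path, RWKV checkpoints are handled by the standard transformers loader in the web UI, which is why the dedicated `--rwkv-strategy` and `--rwkv-cuda-on` flags are dropped along with the custom loader.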