From ebc64a408c1c36df98a4487eefc10a3876500e6c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 27 Feb 2023 23:03:35 -0300 Subject: [PATCH 01/14] RWKV support prototype --- modules/models.py | 28 +++++++++++++++++++++++++++- modules/shared.py | 1 + modules/text_generation.py | 14 ++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 1264a58c..9ce94f6b 100644 --- a/modules/models.py +++ b/modules/models.py @@ -38,8 +38,10 @@ def load_model(model_name): print(f"Loading {model_name}...") t0 = time.time() + shared.is_RWKV = model_name.lower().startswith('rwkv-') + # Default settings - if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen): + if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -75,6 +77,30 @@ def load_model(model_name): model.module.eval() # Inference print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}") + # RMKV model (not on HuggingFace) + elif shared.is_RWKV: + import types + np.set_printoptions(precision=4, suppress=True, linewidth=200) + + os.environ['RWKV_JIT_ON'] = '1' + os.environ["RWKV_CUDA_ON"] = '0' # '1' : use CUDA kernel for seq mode (much faster) + + from rwkv.model import RWKV + from rwkv.utils import PIPELINE, PIPELINE_ARGS + + model = RWKV(model='models/RWKV-4-Pile-169M-20220807-8023.pth', strategy='cuda fp16') + + out, state = model.forward([187, 510, 1563, 310, 247], None) # use 20B_tokenizer.json + print(out.detach().cpu().numpy()) # get logits + out, state = model.forward([187, 510], None) + out, state = model.forward([1563], state) # RNN has state (use deepcopy if you want to clone it) + out, state = model.forward([310, 247], state) + print(out.detach().cpu().numpy()) # same result as above + + pipeline = PIPELINE(model, "20B_tokenizer.json") + + return pipeline, None + # Custom else: command = "AutoModelForCausalLM.from_pretrained" diff --git a/modules/shared.py b/modules/shared.py index d59cee99..b28f8c5f 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -5,6 +5,7 @@ tokenizer = None model_name = "" soft_prompt_tensor = None soft_prompt = False +is_RWKV = False # Chat variables history = {'internal': [], 'visible': []} diff --git a/modules/text_generation.py b/modules/text_generation.py index 9c8674d2..ebe6ed35 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -6,6 +6,7 @@ import numpy as np import torch import transformers from tqdm import tqdm +from rwkv.utils import PIPELINE, PIPELINE_ARGS import modules.shared as shared from modules.extensions import apply_extensions @@ -80,6 +81,19 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if not shared.args.cpu: torch.cuda.empty_cache() + if shared.is_RWKV: + def my_print(s): + print(s, end='', flush=True) + args = PIPELINE_ARGS(temperature = temperature, top_p = top_p, + alpha_frequency = 0.25, # Frequency Penalty (as in GPT-3) + alpha_presence = 0.25, 
# Presence Penalty (as in GPT-3) + token_ban = [0], # ban the generation of some tokens + token_stop = []) # stop generation whenever you see any token here + reply = question + shared.model.generate(question, token_count=max_new_tokens, args=args, callback=None) + print(formatted_outputs(reply, None)) + yield formatted_outputs(reply, None) + return formatted_outputs(reply, None) + original_question = question if not (shared.args.chat or shared.args.cai_chat): question = apply_extensions(question, "input") From 70e522732c12f441718e2c5ea3e7cde33df366f9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 27 Feb 2023 23:50:16 -0300 Subject: [PATCH 02/14] Move RWKV loader into a separate file --- modules/RWKV.py | 26 ++++++++++++++++++++++++++ modules/models.py | 22 ++-------------------- modules/text_generation.py | 5 +---- 3 files changed, 29 insertions(+), 24 deletions(-) create mode 100644 modules/RWKV.py diff --git a/modules/RWKV.py b/modules/RWKV.py new file mode 100644 index 00000000..a4a406ee --- /dev/null +++ b/modules/RWKV.py @@ -0,0 +1,26 @@ +import os, time, types, torch +from pathlib import Path +import numpy as np +np.set_printoptions(precision=4, suppress=True, linewidth=200) + +os.environ['RWKV_JIT_ON'] = '1' +os.environ["RWKV_CUDA_ON"] = '0' # '1' : use CUDA kernel for seq mode (much faster) + +import repositories.ChatRWKV.v2.rwkv as rwkv +from rwkv.model import RWKV +from rwkv.utils import PIPELINE, PIPELINE_ARGS + +def load_RWKV_model(path): + os.system("ls") + model = RWKV(model=path.as_posix(), strategy='cuda fp16') + + out, state = model.forward([187, 510, 1563, 310, 247], None) # use 20B_tokenizer.json + print(out.detach().cpu().numpy()) # get logits + out, state = model.forward([187, 510], None) + out, state = model.forward([1563], state) # RNN has state (use deepcopy if you want to clone it) + out, state = model.forward([310, 247], state) + print(out.detach().cpu().numpy()) # same result as above + + pipeline = PIPELINE(model, Path("repositories/ChatRWKV/20B_tokenizer.json").as_posix()) + + return pipeline diff --git a/modules/models.py b/modules/models.py index 9ce94f6b..0ba584a5 100644 --- a/modules/models.py +++ b/modules/models.py @@ -79,27 +79,9 @@ def load_model(model_name): # RMKV model (not on HuggingFace) elif shared.is_RWKV: - import types - np.set_printoptions(precision=4, suppress=True, linewidth=200) + from modules.RWKV import load_RWKV_model - os.environ['RWKV_JIT_ON'] = '1' - os.environ["RWKV_CUDA_ON"] = '0' # '1' : use CUDA kernel for seq mode (much faster) - - from rwkv.model import RWKV - from rwkv.utils import PIPELINE, PIPELINE_ARGS - - model = RWKV(model='models/RWKV-4-Pile-169M-20220807-8023.pth', strategy='cuda fp16') - - out, state = model.forward([187, 510, 1563, 310, 247], None) # use 20B_tokenizer.json - print(out.detach().cpu().numpy()) # get logits - out, state = model.forward([187, 510], None) - out, state = model.forward([1563], state) # RNN has state (use deepcopy if you want to clone it) - out, state = model.forward([310, 247], state) - print(out.detach().cpu().numpy()) # same result as above - - pipeline = PIPELINE(model, "20B_tokenizer.json") - - return pipeline, None + return load_RWKV_model(Path('models/RWKV-4-Pile-169M-20220807-8023.pth')), None # Custom else: diff --git a/modules/text_generation.py b/modules/text_generation.py index ebe6ed35..d879e14e 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -82,17 +82,14 @@ def generate_reply(question, 
max_new_tokens, do_sample, temperature, top_p, typi torch.cuda.empty_cache() if shared.is_RWKV: - def my_print(s): - print(s, end='', flush=True) args = PIPELINE_ARGS(temperature = temperature, top_p = top_p, alpha_frequency = 0.25, # Frequency Penalty (as in GPT-3) alpha_presence = 0.25, # Presence Penalty (as in GPT-3) token_ban = [0], # ban the generation of some tokens token_stop = []) # stop generation whenever you see any token here reply = question + shared.model.generate(question, token_count=max_new_tokens, args=args, callback=None) - print(formatted_outputs(reply, None)) yield formatted_outputs(reply, None) - return formatted_outputs(reply, None) + return formatted_outputs(reply, None) original_question = question if not (shared.args.chat or shared.args.cai_chat): From ebd698905c7b38aa9de387bd5bc89636bc6a2560 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 28 Feb 2023 00:04:04 -0300 Subject: [PATCH 03/14] Add streaming to RWKV --- modules/text_generation.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index d879e14e..4e0056c6 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -87,9 +87,17 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi alpha_presence = 0.25, # Presence Penalty (as in GPT-3) token_ban = [0], # ban the generation of some tokens token_stop = []) # stop generation whenever you see any token here - reply = question + shared.model.generate(question, token_count=max_new_tokens, args=args, callback=None) - yield formatted_outputs(reply, None) - return formatted_outputs(reply, None) + + if shared.args.no_stream: + reply = question + shared.model.generate(question, token_count=max_new_tokens, args=args, callback=None) + yield formatted_outputs(reply, None) + return formatted_outputs(reply, None) + else: + for i in range(max_new_tokens//8): + reply = question + shared.model.generate(question, token_count=8, args=args, callback=None) + yield formatted_outputs(reply, None) + question = reply + return formatted_outputs(reply, None) original_question = question if not (shared.args.chat or shared.args.cai_chat): From 67ee7bead7eb06b7be823d779a01cc17e7dcffdd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 28 Feb 2023 00:09:11 -0300 Subject: [PATCH 04/14] Add cpu, bf16 options --- modules/RWKV.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index a4a406ee..3d6a2be5 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -1,6 +1,13 @@ -import os, time, types, torch +import os +import time +import types from pathlib import Path + import numpy as np +import torch + +import modules.shared as shared + np.set_printoptions(precision=4, suppress=True, linewidth=200) os.environ['RWKV_JIT_ON'] = '1' @@ -10,17 +17,11 @@ import repositories.ChatRWKV.v2.rwkv as rwkv from rwkv.model import RWKV from rwkv.utils import PIPELINE, PIPELINE_ARGS + def load_RWKV_model(path): - os.system("ls") - model = RWKV(model=path.as_posix(), strategy='cuda fp16') - - out, state = model.forward([187, 510, 1563, 310, 247], None) # use 20B_tokenizer.json - print(out.detach().cpu().numpy()) # get logits - out, state = model.forward([187, 510], None) - out, state = model.forward([1563], state) # RNN has state (use deepcopy if you want to clone it) - out, state = model.forward([310, 247], 
state) - print(out.detach().cpu().numpy()) # same result as above + print(f'strategy={"cpu" if shared.args.cpu else "cuda"} {"fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16"}') + model = RWKV(model=path.as_posix(), strategy=f'{"cpu" if shared.args.cpu else "cuda"} {"fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16"}') pipeline = PIPELINE(model, Path("repositories/ChatRWKV/20B_tokenizer.json").as_posix()) return pipeline From f871971de1e274d4ce298ae0d19e27e3de5539a8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 28 Feb 2023 00:25:30 -0300 Subject: [PATCH 05/14] Trying to get the chat to work --- modules/text_generation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/text_generation.py b/modules/text_generation.py index 4e0056c6..ba4b7d79 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -22,6 +22,9 @@ def get_max_prompt_length(tokens): return max_length def encode(prompt, tokens_to_generate=0, add_special_tokens=True): + if shared.is_RWKV: + return prompt + input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', truncation=True, max_length=get_max_prompt_length(tokens_to_generate), add_special_tokens=add_special_tokens) if shared.args.cpu: return input_ids From 6837d4d72a759c88696db87f11e1b6ae82a6db6b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 28 Feb 2023 02:52:29 -0300 Subject: [PATCH 06/14] Load the model by name --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 0ba584a5..b3e4b8e0 100644 --- a/modules/models.py +++ b/modules/models.py @@ -81,7 +81,7 @@ def load_model(model_name): elif shared.is_RWKV: from modules.RWKV import load_RWKV_model - return load_RWKV_model(Path('models/RWKV-4-Pile-169M-20220807-8023.pth')), None + return load_RWKV_model(Path(f'models/{model_name}')), None # Custom else: From 9c86a1cd4ae3ab9a4f1e3f8ee093a5f6964815c2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Mar 2023 11:42:49 -0300 Subject: [PATCH 07/14] Add RWKV pip package --- modules/RWKV.py | 3 +-- requirements.txt | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index 3d6a2be5..b7388ea7 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -13,7 +13,6 @@ np.set_printoptions(precision=4, suppress=True, linewidth=200) os.environ['RWKV_JIT_ON'] = '1' os.environ["RWKV_CUDA_ON"] = '0' # '1' : use CUDA kernel for seq mode (much faster) -import repositories.ChatRWKV.v2.rwkv as rwkv from rwkv.model import RWKV from rwkv.utils import PIPELINE, PIPELINE_ARGS @@ -22,6 +21,6 @@ def load_RWKV_model(path): print(f'strategy={"cpu" if shared.args.cpu else "cuda"} {"fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16"}') model = RWKV(model=path.as_posix(), strategy=f'{"cpu" if shared.args.cpu else "cuda"} {"fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16"}') - pipeline = PIPELINE(model, Path("repositories/ChatRWKV/20B_tokenizer.json").as_posix()) + pipeline = PIPELINE(model, Path("models/20B_tokenizer.json").as_posix()) return pipeline diff --git a/requirements.txt b/requirements.txt index b333ffba..7dcd720a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,6 @@ bitsandbytes==0.37.0 flexgen==0.1.6 gradio==3.18.0 numpy +rwkv==0.0.5 safetensors==0.2.8 
git+https://github.com/huggingface/transformers From 659bb76722fb0dec8932839e339070e80ae2c987 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Mar 2023 12:08:55 -0300 Subject: [PATCH 08/14] Add RWKVModel class --- modules/RWKV.py | 19 ++++++++++++++----- modules/models.py | 6 ++++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index b7388ea7..c4481043 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -16,11 +16,20 @@ os.environ["RWKV_CUDA_ON"] = '0' # '1' : use CUDA kernel for seq mode (much fas from rwkv.model import RWKV from rwkv.utils import PIPELINE, PIPELINE_ARGS +class RWKVModel: + def __init__(self): + pass -def load_RWKV_model(path): - print(f'strategy={"cpu" if shared.args.cpu else "cuda"} {"fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16"}') + @classmethod + def from_pretrained(self, path, dtype="fp16", device="cuda"): + tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json") - model = RWKV(model=path.as_posix(), strategy=f'{"cpu" if shared.args.cpu else "cuda"} {"fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16"}') - pipeline = PIPELINE(model, Path("models/20B_tokenizer.json").as_posix()) + model = RWKV(model=path.as_posix(), strategy=f'{device} {dtype}') + pipeline = PIPELINE(model, tokenizer_path.as_posix()) - return pipeline + result = self() + result.model = pipeline + return result + + def generate(self, context, **kwargs): + return self.model.generate(context, **kwargs) diff --git a/modules/models.py b/modules/models.py index b3e4b8e0..955ade0b 100644 --- a/modules/models.py +++ b/modules/models.py @@ -79,9 +79,11 @@ def load_model(model_name): # RMKV model (not on HuggingFace) elif shared.is_RWKV: - from modules.RWKV import load_RWKV_model + from modules.RWKV import RWKVModel - return load_RWKV_model(Path(f'models/{model_name}')), None + model = RWKVModel.from_pretrained(Path(f'models/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda") + + return model, None # Custom else: From e735806c510887710d8da9d78db542a60e41b234 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Mar 2023 12:16:11 -0300 Subject: [PATCH 09/14] Add a generate() function for RWKV --- modules/RWKV.py | 13 +++++++++++-- modules/text_generation.py | 10 ++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index c4481043..9f348ad7 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -31,5 +31,14 @@ class RWKVModel: result.model = pipeline return result - def generate(self, context, **kwargs): - return self.model.generate(context, **kwargs) + def generate(self, context, token_count=20, temperature=1, top_p=1, alpha_frequency=0.25, alpha_presence=0.25, token_ban=[0], token_stop=[], callback=None): + args = PIPELINE_ARGS( + temperature = temperature, + top_p = top_p, + alpha_frequency = 0.25, # Frequency Penalty (as in GPT-3) + alpha_presence = 0.25, # Presence Penalty (as in GPT-3) + token_ban = [0], # ban the generation of some tokens + token_stop = [] + ) + + return self.model.generate(context, token_count=token_count, args=args, callback=callback) diff --git a/modules/text_generation.py b/modules/text_generation.py index ba4b7d79..1bd84cd2 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -85,19 +85,13 @@ def generate_reply(question, 
max_new_tokens, do_sample, temperature, top_p, typi torch.cuda.empty_cache() if shared.is_RWKV: - args = PIPELINE_ARGS(temperature = temperature, top_p = top_p, - alpha_frequency = 0.25, # Frequency Penalty (as in GPT-3) - alpha_presence = 0.25, # Presence Penalty (as in GPT-3) - token_ban = [0], # ban the generation of some tokens - token_stop = []) # stop generation whenever you see any token here - if shared.args.no_stream: - reply = question + shared.model.generate(question, token_count=max_new_tokens, args=args, callback=None) + reply = question + shared.model.generate(question, token_count=max_new_tokens, temperature=temperature) yield formatted_outputs(reply, None) return formatted_outputs(reply, None) else: for i in range(max_new_tokens//8): - reply = question + shared.model.generate(question, token_count=8, args=args, callback=None) + reply = question + shared.model.generate(question, token_count=8, temperature=temperature) yield formatted_outputs(reply, None) question = reply return formatted_outputs(reply, None) From 0f6708c471d3e3daa507067b214df8556b799a52 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Mar 2023 12:18:17 -0300 Subject: [PATCH 10/14] Sort the imports --- modules/RWKV.py | 1 + modules/text_generation.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index 9f348ad7..c5424d8f 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -16,6 +16,7 @@ os.environ["RWKV_CUDA_ON"] = '0' # '1' : use CUDA kernel for seq mode (much fas from rwkv.model import RWKV from rwkv.utils import PIPELINE, PIPELINE_ARGS + class RWKVModel: def __init__(self): pass diff --git a/modules/text_generation.py b/modules/text_generation.py index 1bd84cd2..4c9d1f0e 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -5,8 +5,8 @@ import time import numpy as np import torch import transformers -from tqdm import tqdm from rwkv.utils import PIPELINE, PIPELINE_ARGS +from tqdm import tqdm import modules.shared as shared from modules.extensions import apply_extensions From 9e9cfc4b314545f930482afa35ef9c9790a74f5f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Mar 2023 12:19:37 -0300 Subject: [PATCH 11/14] Parameters --- modules/RWKV.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index c5424d8f..5e701bf0 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -36,10 +36,10 @@ class RWKVModel: args = PIPELINE_ARGS( temperature = temperature, top_p = top_p, - alpha_frequency = 0.25, # Frequency Penalty (as in GPT-3) - alpha_presence = 0.25, # Presence Penalty (as in GPT-3) - token_ban = [0], # ban the generation of some tokens - token_stop = [] + alpha_frequency = alpha_frequency, # Frequency Penalty (as in GPT-3) + alpha_presence = alpha_presence, # Presence Penalty (as in GPT-3) + token_ban = token_ban, # ban the generation of some tokens + token_stop = token_stop ) return self.model.generate(context, token_count=token_count, args=args, callback=callback) From 2f16ce309ad4b5a1f40b4d0ab38c2e5fd7887318 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Mar 2023 12:33:09 -0300 Subject: [PATCH 12/14] Rename a variable --- modules/RWKV.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index 5e701bf0..98b11847 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ 
-29,7 +29,7 @@ class RWKVModel: pipeline = PIPELINE(model, tokenizer_path.as_posix()) result = self() - result.model = pipeline + result.pipeline = pipeline return result def generate(self, context, token_count=20, temperature=1, top_p=1, alpha_frequency=0.25, alpha_presence=0.25, token_ban=[0], token_stop=[], callback=None): @@ -42,4 +42,4 @@ class RWKVModel: token_stop = token_stop ) - return self.model.generate(context, token_count=token_count, args=args, callback=callback) + return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback) From 7c4d5ca8cca25d5e43a4423ac8d69f4583ec933e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Mar 2023 16:40:25 -0300 Subject: [PATCH 13/14] Improve the text generation call a bit --- modules/RWKV.py | 2 +- modules/text_generation.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index 98b11847..88f1ec23 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -42,4 +42,4 @@ class RWKVModel: token_stop = token_stop ) - return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback) + return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback) diff --git a/modules/text_generation.py b/modules/text_generation.py index 4c9d1f0e..cc8b62d4 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -86,15 +86,14 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if shared.is_RWKV: if shared.args.no_stream: - reply = question + shared.model.generate(question, token_count=max_new_tokens, temperature=temperature) + reply = shared.model.generate(question, token_count=max_new_tokens, temperature=temperature) yield formatted_outputs(reply, None) - return formatted_outputs(reply, None) else: for i in range(max_new_tokens//8): - reply = question + shared.model.generate(question, token_count=8, temperature=temperature) + reply = shared.model.generate(question, token_count=8, temperature=temperature) yield formatted_outputs(reply, None) question = reply - return formatted_outputs(reply, None) + return formatted_outputs(reply, None) original_question = question if not (shared.args.chat or shared.args.cai_chat): From 831ac7ed3f8829b65e9e7f7c6ef76e5662faad43 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 1 Mar 2023 16:45:48 -0300 Subject: [PATCH 14/14] Add top_p --- modules/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index cc8b62d4..1324c8b8 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -86,11 +86,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if shared.is_RWKV: if shared.args.no_stream: - reply = shared.model.generate(question, token_count=max_new_tokens, temperature=temperature) + reply = shared.model.generate(question, token_count=max_new_tokens, temperature=temperature, top_p=top_p) yield formatted_outputs(reply, None) else: for i in range(max_new_tokens//8): - reply = shared.model.generate(question, token_count=8, temperature=temperature) + reply = shared.model.generate(question, token_count=8, temperature=temperature, top_p=top_p) yield formatted_outputs(reply, None) question = reply return formatted_outputs(reply, None)
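
Note (not part of the patch series): after patch 14, modules/RWKV.py exposes RWKVModel.from_pretrained() and generate(), and modules/text_generation.py streams RWKV output in 8-token chunks. Below is a minimal sketch of driving the class directly, assuming rwkv==0.0.5 is installed, that the checkpoint and 20B_tokenizer.json both sit under models/ (the checkpoint name is the one used in the patches), and that it is run from the repository root so modules.shared resolves. The prompt text is arbitrary.

    from pathlib import Path

    from modules.RWKV import RWKVModel  # class added in patches 08-14

    # from_pretrained() expects 20B_tokenizer.json next to the checkpoint.
    # models.py passes dtype="fp32" for --cpu, "bf16" for --bf16, else "fp16".
    model = RWKVModel.from_pretrained(
        Path('models/RWKV-4-Pile-169M-20220807-8023.pth'),
        dtype='fp16',
        device='cuda',  # or 'cpu'
    )

    prompt = '\nIn a shocking finding,'

    # After patch 13, generate() returns prompt + completion as one string.
    print(model.generate(prompt, token_count=50, temperature=0.9, top_p=0.9))

    # Rough equivalent of the streaming branch in text_generation.py:
    # generate 8 tokens at a time, feeding the growing text back as context.
    reply = prompt
    for _ in range(50 // 8):
        reply = model.generate(reply, token_count=8, temperature=0.9, top_p=0.9)
        print(reply)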