From ab50f80542788dd7fa21b20ac91fddc8c9766c23 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Mar 2023 02:46:35 -0300 Subject: [PATCH 01/69] New text streaming method (much faster) --- modules/callbacks.py | 75 ++++++++++++++++++++++++++++++++++++ modules/stopping_criteria.py | 32 --------------- modules/text_generation.py | 66 +++++++++++++++++++++++-------- server.py | 3 -- 4 files changed, 124 insertions(+), 52 deletions(-) create mode 100644 modules/callbacks.py delete mode 100644 modules/stopping_criteria.py diff --git a/modules/callbacks.py b/modules/callbacks.py new file mode 100644 index 00000000..15674b8a --- /dev/null +++ b/modules/callbacks.py @@ -0,0 +1,75 @@ +from queue import Queue +from threading import Thread + +import torch +import transformers + +import modules.shared as shared + + +# Copied from https://github.com/PygmalionAI/gradio-ui/ +class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria): + + def __init__(self, sentinel_token_ids: torch.LongTensor, + starting_idx: int): + transformers.StoppingCriteria.__init__(self) + self.sentinel_token_ids = sentinel_token_ids + self.starting_idx = starting_idx + + def __call__(self, input_ids: torch.LongTensor, + _scores: torch.FloatTensor) -> bool: + for sample in input_ids: + trimmed_sample = sample[self.starting_idx:] + # Can't unfold, output is still too tiny. Skip. + if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]: + continue + + for window in trimmed_sample.unfold( + 0, self.sentinel_token_ids.shape[-1], 1): + if torch.all(torch.eq(self.sentinel_token_ids, window)): + return True + return False + +class Stream(transformers.StoppingCriteria): + def __init__(self, callback_func=None): + self.callback_func = callback_func + + def __call__(self, input_ids, scores) -> bool: + if self.callback_func is not None: + self.callback_func(input_ids[0]) + return False + +class Iteratorize: + + """ + Transforms a function that takes a callback + into a lazy iterator (generator). + """ + + def __init__(self, func, kwargs={}, callback=None): + self.mfunc=func + self.c_callback=callback + self.q = Queue(maxsize=1) + self.sentinel = object() + self.kwargs = kwargs + + def _callback(val): + self.q.put(val) + + def gentask(): + ret = self.mfunc(callback=_callback, **self.kwargs) + self.q.put(self.sentinel) + if self.c_callback: + self.c_callback(ret) + + Thread(target=gentask).start() + + def __iter__(self): + return self + + def __next__(self): + obj = self.q.get(True,None) + if obj is self.sentinel: + raise StopIteration + else: + return obj diff --git a/modules/stopping_criteria.py b/modules/stopping_criteria.py deleted file mode 100644 index 44a631b3..00000000 --- a/modules/stopping_criteria.py +++ /dev/null @@ -1,32 +0,0 @@ -''' -This code was copied from - -https://github.com/PygmalionAI/gradio-ui/ - -''' - -import torch -import transformers - - -class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria): - - def __init__(self, sentinel_token_ids: torch.LongTensor, - starting_idx: int): - transformers.StoppingCriteria.__init__(self) - self.sentinel_token_ids = sentinel_token_ids - self.starting_idx = starting_idx - - def __call__(self, input_ids: torch.LongTensor, - _scores: torch.FloatTensor) -> bool: - for sample in input_ids: - trimmed_sample = sample[self.starting_idx:] - # Can't unfold, output is still too tiny. Skip. 
- if trimmed_sample.shape[-1] < self.sentinel_token_ids.shape[-1]: - continue - - for window in trimmed_sample.unfold( - 0, self.sentinel_token_ids.shape[-1], 1): - if torch.all(torch.eq(self.sentinel_token_ids, window)): - return True - return False diff --git a/modules/text_generation.py b/modules/text_generation.py index 4af53273..436afbeb 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -5,13 +5,13 @@ import time import numpy as np import torch import transformers -from tqdm import tqdm import modules.shared as shared +from modules.callbacks import (Iteratorize, Stream, + _SentinelTokenStoppingCriteria) from modules.extensions import apply_extensions from modules.html_generator import generate_4chan_html, generate_basic_html from modules.models import local_rank -from modules.stopping_criteria import _SentinelTokenStoppingCriteria def get_max_prompt_length(tokens): @@ -103,7 +103,9 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi yield formatted_outputs(reply, shared.model_name) t1 = time.time() - print(f"Output generated in {(t1-t0):.2f} seconds.") + output = encode(reply)[0] + input_ids = encode(question) + print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") return original_question = question @@ -113,6 +115,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi print(f"\n\n{question}\n--------------------\n") input_ids = encode(question, max_new_tokens) + original_input_ids = input_ids cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()" n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1]) if stopping_string is not None: @@ -126,10 +129,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi ) ]) else: - stopping_criteria_list = None + stopping_criteria_list = [] if not shared.args.flexgen: generate_params = [ + f"max_new_tokens=max_new_tokens", f"eos_token_id={n}", f"stopping_criteria=stopping_criteria_list", f"do_sample={do_sample}", @@ -147,24 +151,21 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi ] else: generate_params = [ + f"max_new_tokens={max_new_tokens if shared.args.no_stream else 8}", f"do_sample={do_sample}", f"temperature={temperature}", f"stop={n}", ] if shared.args.deepspeed: generate_params.append("synced_gpus=True") - if shared.args.no_stream: - generate_params.append("max_new_tokens=max_new_tokens") - else: - generate_params.append("max_new_tokens=8") if shared.soft_prompt: inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids) generate_params.insert(0, "inputs_embeds=inputs_embeds") - generate_params.insert(0, "filler_input_ids") + generate_params.insert(0, "inputs=filler_input_ids") else: - generate_params.insert(0, "input_ids") + generate_params.insert(0, "inputs=input_ids") - # Generate the entire reply at once + # Generate the entire reply at once. 
if shared.args.no_stream: with torch.no_grad(): output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0] @@ -175,18 +176,45 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if not (shared.args.chat or shared.args.cai_chat): reply = original_question + apply_extensions(reply[len(question):], "output") - t1 = time.time() - print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0)/8:.2f} it/s, {len(output)-len(input_ids[0])} tokens)") yield formatted_outputs(reply, shared.model_name) - # Generate the reply 8 tokens at a time - else: + # Stream the reply 1 token at a time. + # This is based on the trick of using 'stopping_criteria' to create an iterator. + elif not shared.args.flexgen: + + def generate_with_callback(callback=None, **kwargs): + if 'stopping_criteria' not in kwargs: + kwargs['stopping_criteria'] = [] + kwargs['stopping_criteria'].append(Stream(callback_func=callback)) + shared.model.generate(**kwargs)[0] + + def generate_with_streaming(**kwargs): + return Iteratorize(generate_with_callback, kwargs, callback=None) + yield formatted_outputs(original_question, shared.model_name) - for i in tqdm(range(max_new_tokens//8+1)): + for output in eval(f"generate_with_streaming({', '.join(generate_params)})"): + if shared.soft_prompt: + output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) + + reply = decode(output) + if not (shared.args.chat or shared.args.cai_chat): + reply = original_question + apply_extensions(reply[len(question):], "output") + yield formatted_outputs(reply, shared.model_name) + + if not shared.args.flexgen: + if output[-1] == n: + break + else: + if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n): + break + + # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria' + else: + for i in range(max_new_tokens//8+1): clear_torch_cache() with torch.no_grad(): - output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0] + output = eval(f"shared.model.generate({', '.join(generate_params)})")[0] if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) @@ -206,3 +234,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if shared.soft_prompt: inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids) + + t1 = time.time() + print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)") + return diff --git a/server.py b/server.py index 9f584ba3..42897b0b 100644 --- a/server.py +++ b/server.py @@ -18,9 +18,6 @@ from modules.html_generator import generate_chat_html from modules.models import load_model, load_soft_prompt from modules.text_generation import generate_reply -if (shared.args.chat or shared.args.cai_chat) and not shared.args.no_stream: - print('Warning: chat mode currently becomes somewhat slower with text streaming on.\nConsider starting the web UI with the --no-stream option.\n') - # Loading custom settings settings_file = None if shared.args.settings is not None and Path(shared.args.settings).exists(): From 0e16c0bacb88ad0f5420fd2aa2c6cfadf38e2579 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Mar 2023 02:50:49 -0300 Subject: [PATCH 02/69] Remove redeclaration of a function --- modules/RWKV.py | 36 +----------------------------------- 1 file changed, 1 
insertion(+), 35 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index b226a195..70deab28 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -7,6 +7,7 @@ import numpy as np from tokenizers import Tokenizer import modules.shared as shared +from modules.callbacks import Iteratorize np.set_printoptions(precision=4, suppress=True, linewidth=200) @@ -73,38 +74,3 @@ class RWKVTokenizer: def decode(self, ids): return self.tokenizer.decode(ids) - -class Iteratorize: - - """ - Transforms a function that takes a callback - into a lazy iterator (generator). - """ - - def __init__(self, func, kwargs={}, callback=None): - self.mfunc=func - self.c_callback=callback - self.q = Queue(maxsize=1) - self.sentinel = object() - self.kwargs = kwargs - - def _callback(val): - self.q.put(val) - - def gentask(): - ret = self.mfunc(callback=_callback, **self.kwargs) - self.q.put(self.sentinel) - if self.c_callback: - self.c_callback(ret) - - Thread(target=gentask).start() - - def __iter__(self): - return self - - def __next__(self): - obj = self.q.get(True,None) - if obj is self.sentinel: - raise StopIteration - else: - return obj From 72d539dbff6f946fbbd1d8806361dccbc241f8ec Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Mar 2023 02:54:47 -0300 Subject: [PATCH 03/69] Better separate the FlexGen case --- modules/text_generation.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 436afbeb..a8157a76 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -201,12 +201,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi reply = original_question + apply_extensions(reply[len(question):], "output") yield formatted_outputs(reply, shared.model_name) - if not shared.args.flexgen: - if output[-1] == n: - break - else: - if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n): - break + if output[-1] == n: + break # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria' else: @@ -223,14 +219,9 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi reply = original_question + apply_extensions(reply[len(question):], "output") yield formatted_outputs(reply, shared.model_name) - if not shared.args.flexgen: - if output[-1] == n: - break - input_ids = torch.reshape(output, (1, output.shape[0])) - else: - if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n): - break - input_ids = np.reshape(output, (1, output.shape[0])) + if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n): + break + input_ids = np.reshape(output, (1, output.shape[0])) if shared.soft_prompt: inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids) From ad2970374adeb58aec1d7748b02a8c82cc524c0a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Mar 2023 03:00:06 -0300 Subject: [PATCH 04/69] Readability improvements --- modules/text_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index a8157a76..9477fe41 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -195,8 +195,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi for output in eval(f"generate_with_streaming({', '.join(generate_params)})"): if shared.soft_prompt: output 
= torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) - reply = decode(output) + if not (shared.args.chat or shared.args.cai_chat): reply = original_question + apply_extensions(reply[len(question):], "output") yield formatted_outputs(reply, shared.model_name) @@ -213,16 +213,16 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi output = eval(f"shared.model.generate({', '.join(generate_params)})")[0] if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) - reply = decode(output) + if not (shared.args.chat or shared.args.cai_chat): reply = original_question + apply_extensions(reply[len(question):], "output") yield formatted_outputs(reply, shared.model_name) if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n): break - input_ids = np.reshape(output, (1, output.shape[0])) + input_ids = np.reshape(output, (1, output.shape[0])) if shared.soft_prompt: inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids) From 33fb6aed74ebfd50f12373fcbe2f7c0d285022d3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Mar 2023 03:08:16 -0300 Subject: [PATCH 05/69] Minor bug fix --- modules/text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 9477fe41..35617314 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -115,7 +115,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi print(f"\n\n{question}\n--------------------\n") input_ids = encode(question, max_new_tokens) - original_input_ids = input_ids + original_input_ids = output = input_ids cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()" n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1]) if stopping_string is not None: From ad6b699503eeabcad141efb6172ff43dc1976522 Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Wed, 8 Mar 2023 22:02:17 +1100 Subject: [PATCH 06/69] Better TTS with autoplay - Adds "still_streaming" to shared module for extensions to know if generation is complete - Changed TTS extension with new options: - Show text under the audio widget - Automatically play the audio once text generation finishes - manage the generated wav files (only keep files for finished generations, optional max file limit) - [wip] ability to change voice pitch and speed - added 'tensorboard' to requirements, since python sent "tensorboard not found" errors after a fresh installation. 
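The new shared.still_streaming flag is what lets an extension tell a partial,
mid-stream reply apart from the finished one. A minimal sketch of how an
extension could rely on it, assuming this repository's extension API
(output_modifier, modules.shared) and with process_finished_reply as a
hypothetical stand-in for whatever expensive work should only run once per
reply:

    import modules.shared as shared

    def process_finished_reply(text):
        # Hypothetical placeholder for work that should only happen once per
        # reply, e.g. synthesizing audio or rotating old wav files.
        print(f"reply finished, {len(text)} characters")

    def output_modifier(string):
        # Called for every partial reply while streaming; defer the heavy work
        # until the backend marks the stream as finished.
        if shared.still_streaming:
            return string
        process_finished_reply(string)
        return string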
--- extensions/silero_tts/requirements.txt | 1 + extensions/silero_tts/script.py | 60 +++++++++++++++++++++++--- modules/shared.py | 1 + modules/text_generation.py | 11 ++++- requirements.txt | 1 + 5 files changed, 67 insertions(+), 7 deletions(-) diff --git a/extensions/silero_tts/requirements.txt b/extensions/silero_tts/requirements.txt index f2f0bff5..b4444306 100644 --- a/extensions/silero_tts/requirements.txt +++ b/extensions/silero_tts/requirements.txt @@ -4,3 +4,4 @@ pydub PyYAML torch torchaudio +simpleaudio diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index f697d0e2..03319dbf 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -4,20 +4,36 @@ from pathlib import Path import gradio as gr import torch +import modules.shared as shared +import simpleaudio as sa + torch._C._jit_set_profiling_mode(False) params = { 'activate': True, - 'speaker': 'en_56', + 'speaker': 'en_5', 'language': 'en', 'model_id': 'v3_en', 'sample_rate': 48000, 'device': 'cpu', + 'max_wavs': 20, + 'play_audio': True, + 'show_text': True, } current_params = params.copy() voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115'] wav_idx = 0 +table = str.maketrans({ + "<": "<", + ">": ">", + "&": "&", + "'": "'", + '"': """, +}) +def xmlesc(txt): + return txt.translate(table) + def load_model(): model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_tts', language=params['language'], speaker=params['model_id']) model.to(params['device']) @@ -58,20 +74,45 @@ def output_modifier(string): if params['activate'] == False: return string + orig_string = string string = remove_surrounded_chars(string) string = string.replace('"', '') string = string.replace('“', '') string = string.replace('\n', ' ') string = string.strip() + auto_playable=True if string == '': - string = 'empty reply, try regenerating' + string = 'empty reply, try regenerating' + auto_playable=False + + #x-slow, slow, medium, fast, x-fast + #x-low, low, medium, high, x-high + #prosody='' + prosody='' + string =''+prosody+xmlesc(string)+'' + output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav') - audio = model.save_wav(text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) - + audio = model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) string = f'' - wav_idx += 1 + + #reset if too many wavs. 
set max to -1 for unlimited. + if wav_idx < params['max_wavs'] and params['max_wavs'] > 0: + #only increment if starting a new stream, else replace during streaming. Does not update duration on webui sometimes? + if not shared.still_streaming: + wav_idx += 1 + else: + wav_idx = 0 + + if params['show_text']: + string+='\n\n'+orig_string + + #if params['play_audio'] == True and auto_playable and shared.stop_everything: + if params['play_audio'] == True and auto_playable and not shared.still_streaming: + stop_autoplay() + wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix()) + wave_obj.play() return string @@ -84,11 +125,20 @@ def bot_prefix_modifier(string): return string +def stop_autoplay(): + sa.stop_all() + def ui(): # Gradio elements activate = gr.Checkbox(value=params['activate'], label='Activate TTS') + show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') + play_audio = gr.Checkbox(value=params['play_audio'], label='Play TTS automatically') + stop_audio = gr.Button("Stop Auto-Play") voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') # Event functions to update the parameters in the backend activate.change(lambda x: params.update({"activate": x}), activate, None) + play_audio.change(lambda x: params.update({"play_audio": x}), play_audio, None) + show_text.change(lambda x: params.update({"show_text": x}), show_text, None) + stop_audio.click(stop_autoplay) voice.change(lambda x: params.update({"speaker": x}), voice, None) diff --git a/modules/shared.py b/modules/shared.py index e9dfdaa2..90adb320 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -12,6 +12,7 @@ is_LLaMA = False history = {'internal': [], 'visible': []} character = 'None' stop_everything = False +still_streaming = False # UI elements (buttons, sliders, HTML, etc) gradio = {} diff --git a/modules/text_generation.py b/modules/text_generation.py index f9082a31..c9f4fc6a 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -182,6 +182,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi # Generate the reply 8 tokens at a time else: yield formatted_outputs(original_question, shared.model_name) + shared.still_streaming = True for i in tqdm(range(max_new_tokens//8+1)): with torch.no_grad(): output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0] @@ -191,8 +192,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi reply = decode(output) if not (shared.args.chat or shared.args.cai_chat): reply = original_question + apply_extensions(reply[len(question):], "output") - yield formatted_outputs(reply, shared.model_name) - + if not shared.args.flexgen: if output[-1] == n: break @@ -201,6 +201,13 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n): break input_ids = np.reshape(output, (1, output.shape[0])) + + #Mid-stream yield, ran if no breaks + yield formatted_outputs(reply, shared.model_name) if shared.soft_prompt: inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids) + + #Stream finished from max tokens or break. Do final yield. 
+ shared.still_streaming = False + yield formatted_outputs(reply, shared.model_name) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 55aeb8fd..48ca1e4e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ numpy rwkv==0.0.6 safetensors==0.2.8 git+https://github.com/huggingface/transformers +tensorboard From 738be6dd59a6f9c2ee215093675f2d55111d89ca Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Wed, 8 Mar 2023 22:25:55 +1100 Subject: [PATCH 07/69] Fix merge errors and unlimited wav bug --- extensions/silero_tts/script.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index 53bd554c..eaf56159 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -93,11 +93,11 @@ def output_modifier(string): string =''+prosody+xmlesc(string)+'' output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav') - model.save_wav(text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) + model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) string = f'' #reset if too many wavs. set max to -1 for unlimited. - if wav_idx < params['max_wavs'] and params['max_wavs'] > 0: + if wav_idx < params['max_wavs'] or params['max_wavs'] < 0: #only increment if starting a new stream, else replace during streaming. Does not update duration on webui sometimes? if not shared.still_streaming: wav_idx += 1 From add9330e5e90e33f3f8bbe0ea42290475deb9998 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Mar 2023 11:26:29 -0300 Subject: [PATCH 08/69] Bug fixes --- modules/text_generation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 35617314..8f5ea798 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -115,7 +115,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi print(f"\n\n{question}\n--------------------\n") input_ids = encode(question, max_new_tokens) - original_input_ids = output = input_ids + original_input_ids = input_ids + output = input_ids[0] cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()" n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1]) if stopping_string is not None: @@ -186,7 +187,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if 'stopping_criteria' not in kwargs: kwargs['stopping_criteria'] = [] kwargs['stopping_criteria'].append(Stream(callback_func=callback)) - shared.model.generate(**kwargs)[0] + clear_torch_cache() + shared.model.generate(**kwargs) def generate_with_streaming(**kwargs): return Iteratorize(generate_with_callback, kwargs, callback=None) @@ -208,7 +210,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi else: for i in range(max_new_tokens//8+1): clear_torch_cache() - with torch.no_grad(): output = eval(f"shared.model.generate({', '.join(generate_params)})")[0] if shared.soft_prompt: From 59b5f7a4b731c528f0fa53d70eb3318d3a1727df Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 8 Mar 2023 12:13:40 -0300 Subject: [PATCH 09/69] Improve usage of stopping_criteria --- 
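The streaming method being refined here rests on one trick: generate() consults
every StoppingCriteria once per new token, so a criterion that always returns
False doubles as a per-token callback, and a background thread plus a queue
turn that callback into a generator. A self-contained sketch of the idea,
outside this repository's Stream/Iteratorize wrappers and with model, tokenizer
and prompt left as placeholders:

    from queue import Queue
    from threading import Thread

    import torch
    import transformers

    class Stream(transformers.StoppingCriteria):
        def __init__(self, callback):
            self.callback = callback

        def __call__(self, input_ids, scores, **kwargs) -> bool:
            self.callback(input_ids[0])  # fires once per generated token
            return False                 # never actually stops generation

    def stream_generate(model, tokenizer, prompt, **generate_kwargs):
        q, sentinel = Queue(), object()
        criteria = transformers.StoppingCriteriaList([Stream(lambda ids: q.put(ids))])
        input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(model.device)

        def task():
            with torch.no_grad():
                model.generate(input_ids, stopping_criteria=criteria, **generate_kwargs)
            q.put(sentinel)

        Thread(target=task).start()
        while (ids := q.get()) is not sentinel:
            yield tokenizer.decode(ids)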
modules/text_generation.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 8f5ea798..6a59f9a7 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -119,18 +119,11 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi output = input_ids[0] cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()" n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1]) + stopping_criteria_list = transformers.StoppingCriteriaList() if stopping_string is not None: - # The stopping_criteria code below was copied from - # https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py + # Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py t = encode(stopping_string, 0, add_special_tokens=False) - stopping_criteria_list = transformers.StoppingCriteriaList([ - _SentinelTokenStoppingCriteria( - sentinel_token_ids=t, - starting_idx=len(input_ids[0]) - ) - ]) - else: - stopping_criteria_list = [] + stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0]))) if not shared.args.flexgen: generate_params = [ @@ -184,17 +177,17 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi elif not shared.args.flexgen: def generate_with_callback(callback=None, **kwargs): - if 'stopping_criteria' not in kwargs: - kwargs['stopping_criteria'] = [] kwargs['stopping_criteria'].append(Stream(callback_func=callback)) clear_torch_cache() - shared.model.generate(**kwargs) + with torch.no_grad(): + shared.model.generate(**kwargs) def generate_with_streaming(**kwargs): return Iteratorize(generate_with_callback, kwargs, callback=None) yield formatted_outputs(original_question, shared.model_name) for output in eval(f"generate_with_streaming({', '.join(generate_params)})"): + print(print('Used vram in gib:', torch.cuda.memory_allocated() / 1024**3)) if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) reply = decode(output) From a2b5383398adc6da5c46811179bfadaefa5e23f7 Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Thu, 9 Mar 2023 10:48:44 +1100 Subject: [PATCH 10/69] Merge in audio generation only on text stream finish., postpone audioblock autoplay - Keeping simpleaudio until audio block "autoplay" doesn't play previous messages - Only generate audio for finished messages - Better name for autoplay, clean up comments - set default to unlimited wav files. 
Still a few bugs when wav id resets Co-Authored-By: Christoph Hess <9931495+ChristophHess@users.noreply.github.com> --- extensions/silero_tts/script.py | 34 +++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index eaf56159..334b02b9 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -15,14 +15,15 @@ params = { 'model_id': 'v3_en', 'sample_rate': 48000, 'device': 'cpu', - 'max_wavs': 20, - 'play_audio': True, + 'max_wavs': -1, + 'autoplay': True, 'show_text': True, } current_params = params.copy() voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115'] wav_idx = 0 +#Used for making text xml compatible, needed for voice pitch and speed control table = str.maketrans({ "<": "<", ">": ">", @@ -88,27 +89,32 @@ def output_modifier(string): #x-slow, slow, medium, fast, x-fast #x-low, low, medium, high, x-high - #prosody='' - prosody='' + prosody='' string =''+prosody+xmlesc(string)+'' output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav') - model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) - string = f'' + autoplay_str = '' + if not shared.still_streaming: + model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) + #diabled until autoplay doesn't run on previous messages + #autoplay = 'autoplay' if (params['autoplay'] and auto_playable) else '' + string = f'\n\n' + else: + #placeholder so text doesnt shift around so much + string =f'\n\n' #reset if too many wavs. set max to -1 for unlimited. if wav_idx < params['max_wavs'] or params['max_wavs'] < 0: - #only increment if starting a new stream, else replace during streaming. Does not update duration on webui sometimes? + #only increment if starting a new stream, else replace during streaming. 
if not shared.still_streaming: wav_idx += 1 else: wav_idx = 0 - + if params['show_text']: - string+='\n\n'+orig_string - - #if params['play_audio'] == True and auto_playable and shared.stop_everything: - if params['play_audio'] == True and auto_playable and not shared.still_streaming: + string+=orig_string + + if params['autoplay'] == True and auto_playable and not shared.still_streaming: stop_autoplay() wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix()) wave_obj.play() @@ -131,13 +137,13 @@ def ui(): # Gradio elements activate = gr.Checkbox(value=params['activate'], label='Activate TTS') show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') - play_audio = gr.Checkbox(value=params['play_audio'], label='Play TTS automatically') + autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') stop_audio = gr.Button("Stop Auto-Play") voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') # Event functions to update the parameters in the backend activate.change(lambda x: params.update({"activate": x}), activate, None) - play_audio.change(lambda x: params.update({"play_audio": x}), play_audio, None) + autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None) show_text.change(lambda x: params.update({"show_text": x}), show_text, None) stop_audio.click(stop_autoplay) voice.change(lambda x: params.update({"speaker": x}), voice, None) From 4dd14dcab4778b2d4e031db9cdfa94a2e1fe13e6 Mon Sep 17 00:00:00 2001 From: Chimdumebi Nebolisa <78305519+MichealC0@users.noreply.github.com> Date: Thu, 9 Mar 2023 10:22:09 +0100 Subject: [PATCH 11/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9efacb7c..23d53604 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ The third line assumes that you have an NVIDIA GPU. 
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 ``` -* If you are running in CPU mode, replace the third command with this one: +* If you are running it in CPU mode, replace the third command with this one: ``` conda install pytorch torchvision torchaudio git -c pytorch From 828a524f9a957f56c1985d71f941715727fd1db4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Mar 2023 15:50:26 -0300 Subject: [PATCH 12/69] Add LLaMA 4-bit support --- modules/models.py | 22 +++++++++++++++++++++- modules/shared.py | 1 + requirements.txt | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/modules/models.py b/modules/models.py index 16ce6eb1..04235b52 100644 --- a/modules/models.py +++ b/modules/models.py @@ -1,5 +1,6 @@ import json import os +import sys import time import zipfile from pathlib import Path @@ -41,7 +42,7 @@ def load_model(model_name): shared.is_RWKV = model_name.lower().startswith('rwkv-') # Default settings - if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV): + if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.load_in_4bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -86,6 +87,24 @@ def load_model(model_name): return model, tokenizer + # 4-bit LLaMA + elif shared.args.load_in_4bit: + sys.path.append(os.path.abspath(Path("repositories/GPTQ-for-LLaMa"))) + + from llama import load_quant + + path_to_model = Path(f'models/{model_name}') + pt_model = '' + if path_to_model.name.lower().startswith('llama-7b'): + pt_model = 'llama-7b-4bit.pt' + if path_to_model.name.lower().startswith('llama-13b'): + pt_model = 'llama-13b-4bit.pt' + if path_to_model.name.lower().startswith('llama-30b'): + pt_model = 'llama-30b-4bit.pt' + + model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4) + model = model.to(torch.device('cuda:0')) + # Custom else: command = "AutoModelForCausalLM.from_pretrained" @@ -159,3 +178,4 @@ def load_soft_prompt(name): shared.soft_prompt_tensor = tensor return name + diff --git a/modules/shared.py b/modules/shared.py index b609045c..4c062fe9 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -68,6 +68,7 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.') parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') +parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. 
Currently only works with LLaMA.') parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') diff --git a/requirements.txt b/requirements.txt index 47c56a45..6133f394 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ numpy rwkv==0.1.0 safetensors==0.2.8 sentencepiece -git+https://github.com/oobabooga/transformers@llama_push +git+https://github.com/zphang/transformers@llama_push From fd540b89309a138a17147955ecf8ea2049af4ca2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Mar 2023 17:59:15 -0300 Subject: [PATCH 13/69] Use new LLaMA implementation (this will break stuff. I am sorry) https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 47c56a45..6133f394 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ numpy rwkv==0.1.0 safetensors==0.2.8 sentencepiece -git+https://github.com/oobabooga/transformers@llama_push +git+https://github.com/zphang/transformers@llama_push From d41e3c233b4b4bccf6b0b36ff3f1db8701e52d5c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Mar 2023 18:02:44 -0300 Subject: [PATCH 14/69] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 65596321..9fe454c2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Text generation web UI -A gradio web UI for running Large Language Models like GPT-J 6B, OPT, GALACTICA, GPT-Neo, and Pygmalion. +A gradio web UI for running Large Language Models like GPT-J 6B, OPT, GALACTICA, LLaMA, and Pygmalion. Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation. @@ -27,6 +27,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. * [FlexGen offload](https://github.com/oobabooga/text-generation-webui/wiki/FlexGen). * [DeepSpeed ZeRO-3 offload](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed). * Get responses via API, [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-streaming.py) or [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming. +* [Supports the LLaMA model](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model). * [Supports the RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model). * Supports softprompts. * [Supports extensions](https://github.com/oobabooga/text-generation-webui/wiki/Extensions). 
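Condensed, the 4-bit path added in the patches above does three things: put the
GPTQ-for-LLaMa checkout on the import path, pair the Hugging Face model folder
with its pre-quantized .pt checkpoint, and move the result to the GPU. A rough
sketch under those assumptions (load_quant is provided by the
repositories/GPTQ-for-LLaMa checkout, and the generic {model_name}-4bit.pt
naming stands in for the per-size file names the patch special-cases):

    import os
    import sys
    from pathlib import Path

    import torch

    # Make the GPTQ-for-LLaMa checkout importable, as the patch expects.
    sys.path.insert(0, os.path.abspath(Path('repositories/GPTQ-for-LLaMa')))
    from llama import load_quant

    def load_llama_4bit(model_name):
        path_to_model = Path(f'models/{model_name}')
        pt_path = Path(f'models/{model_name}-4bit.pt')
        if not pt_path.exists():
            raise FileNotFoundError(f'Could not find {pt_path}')
        # load_quant rebuilds the model skeleton from the HF folder and fills
        # it with the 4-bit weights from the .pt checkpoint.
        model = load_quant(path_to_model, pt_path, 4)
        return model.to(torch.device('cuda:0'))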
From 2965aa1625a1186fcf36a559235881d1382f2366 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Mar 2023 20:48:51 -0300 Subject: [PATCH 15/69] Check if the .pt file exists --- modules/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 04235b52..e10668cf 100644 --- a/modules/models.py +++ b/modules/models.py @@ -102,6 +102,10 @@ def load_model(model_name): if path_to_model.name.lower().startswith('llama-30b'): pt_model = 'llama-30b-4bit.pt' + if not Path(f"models/{pt_model}").exists(): + print(f"Could not find models/{pt_model}, exiting...") + exit() + model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4) model = model.to(torch.device('cuda:0')) @@ -178,4 +182,3 @@ def load_soft_prompt(name): shared.soft_prompt_tensor = tensor return name - From 74102d5ee48fcf68939ff4fc3ca7e34e6623bcb7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Mar 2023 20:51:22 -0300 Subject: [PATCH 16/69] Insert to the path instead of appending --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index e10668cf..0ad4c198 100644 --- a/modules/models.py +++ b/modules/models.py @@ -89,7 +89,7 @@ def load_model(model_name): # 4-bit LLaMA elif shared.args.load_in_4bit: - sys.path.append(os.path.abspath(Path("repositories/GPTQ-for-LLaMa"))) + sys.path.insert(0, os.path.abspath(Path("repositories/GPTQ-for-LLaMa"))) from llama import load_quant From eb0cb9b6df58c397bda377deefeb14a2c0b0e0f9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Mar 2023 20:53:52 -0300 Subject: [PATCH 17/69] Update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 65596321..de498fb6 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,7 @@ Optionally, you can use the following command-line flags: | `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. | | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| +| `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.| | `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. 
| From 9849aac0f1284c5fa02509f1e197cc248e2c4700 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 9 Mar 2023 21:54:50 -0300 Subject: [PATCH 18/69] Don't show .pt models in the list --- modules/models.py | 3 +++ server.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 0ad4c198..3e6cea18 100644 --- a/modules/models.py +++ b/modules/models.py @@ -105,6 +105,9 @@ def load_model(model_name): if not Path(f"models/{pt_model}").exists(): print(f"Could not find models/{pt_model}, exiting...") exit() + elif pt_model == '': + print(f"Could not find the .pt model for {model_name}, exiting...") + exit() model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4) model = model.to(torch.device('cuda:0')) diff --git a/server.py b/server.py index 7d8792b7..c2977f41 100644 --- a/server.py +++ b/server.py @@ -37,7 +37,7 @@ def get_available_models(): if shared.args.flexgen: return sorted([re.sub('-np$', '', item.name) for item in list(Path('models/').glob('*')) if item.name.endswith('-np')], key=str.lower) else: - return sorted([item.name for item in list(Path('models/').glob('*')) if not item.name.endswith(('.txt', '-np'))], key=str.lower) + return sorted([item.name for item in list(Path('models/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt'))], key=str.lower) def get_available_presets(): return sorted(set(map(lambda x : '.'.join(str(x.name).split('.')[:-1]), Path('presets').glob('*.txt'))), key=str.lower) From 826e297b0ec40299318f1002f9165e7ac9c9c257 Mon Sep 17 00:00:00 2001 From: rohvani <3782201+rohvani@users.noreply.github.com> Date: Thu, 9 Mar 2023 18:31:32 -0800 Subject: [PATCH 19/69] add llama-65b-4bit support & multiple pt paths --- modules/models.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/modules/models.py b/modules/models.py index 3e6cea18..062ccb1f 100644 --- a/modules/models.py +++ b/modules/models.py @@ -97,19 +97,27 @@ def load_model(model_name): pt_model = '' if path_to_model.name.lower().startswith('llama-7b'): pt_model = 'llama-7b-4bit.pt' - if path_to_model.name.lower().startswith('llama-13b'): + elif path_to_model.name.lower().startswith('llama-13b'): pt_model = 'llama-13b-4bit.pt' - if path_to_model.name.lower().startswith('llama-30b'): + elif path_to_model.name.lower().startswith('llama-30b'): pt_model = 'llama-30b-4bit.pt' - - if not Path(f"models/{pt_model}").exists(): - print(f"Could not find models/{pt_model}, exiting...") - exit() - elif pt_model == '': + elif path_to_model.name.lower().startswith('llama-65b'): + pt_model = 'llama-65b-4bit.pt' + else: print(f"Could not find the .pt model for {model_name}, exiting...") exit() - model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4) + # check root of models folder, and model path root + paths = [ f"{path_to_model}/{pt_model}", f"models/{pt_model}" ] + for path in [ Path(p) for p in paths ]: + if path.exists(): + pt_path = path + + if not pt_path: + print(f"Could not find {pt_model}, exiting...") + exit() + + model = load_quant(path_to_model, pt_path, 4) model = model.to(torch.device('cuda:0')) # Custom From 5ee376c580e8c2cf2e3b34e1822c43e6754b2649 Mon Sep 17 00:00:00 2001 From: rohvani <3782201+rohvani@users.noreply.github.com> Date: Thu, 9 Mar 2023 18:31:41 -0800 Subject: [PATCH 20/69] add LLaMA preset --- presets/LLaMA-Default.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 presets/LLaMA-Default.txt diff --git 
a/presets/LLaMA-Default.txt b/presets/LLaMA-Default.txt new file mode 100644 index 00000000..3df8209a --- /dev/null +++ b/presets/LLaMA-Default.txt @@ -0,0 +1,12 @@ +do_sample=False +temperature=0.7 +top_p=0 +typical_p=1 +repetition_penalty=1.15 +top_k=40 +num_beams=1 +penalty_alpha=0 +min_length=0 +length_penalty=1 +no_repeat_ngram_size=0 +early_stopping=True From ec3de0495c52a6d81495ac0553f4a7a886e4e0c8 Mon Sep 17 00:00:00 2001 From: Ber Zoidberg Date: Thu, 9 Mar 2023 19:08:09 -0800 Subject: [PATCH 21/69] download tokenizer when present --- download-model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/download-model.py b/download-model.py index 599418fc..27fbffda 100644 --- a/download-model.py +++ b/download-model.py @@ -107,9 +107,10 @@ def get_download_links_from_huggingface(model, branch): is_pytorch = re.match("pytorch_model.*\.bin", fname) is_safetensors = re.match("model.*\.safetensors", fname) + is_tokenizer = re.match("tokenizer.*\.model", fname) is_text = re.match(".*\.(txt|json)", fname) - if is_text or is_safetensors or is_pytorch: + if any((is_pytorch, is_safetensors, is_text, is_tokenizer)): if is_text: links.append(f"https://huggingface.co/{model}/resolve/{branch}/{fname}") classifications.append('text') From 249c268176114e72da3e82d7e2c652481060f44f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 10 Mar 2023 00:41:10 -0300 Subject: [PATCH 22/69] Fix the download script for long lists of files on HF --- download-model.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/download-model.py b/download-model.py index 599418fc..98b57bb0 100644 --- a/download-model.py +++ b/download-model.py @@ -5,7 +5,9 @@ Example: python download-model.py facebook/opt-1.3b ''' + import argparse +import base64 import json import multiprocessing import re @@ -93,14 +95,18 @@ facebook/opt-1.3b def get_download_links_from_huggingface(model, branch): base = "https://huggingface.co" page = f"/api/models/{model}/tree/{branch}?cursor=" + cursor = b"" links = [] classifications = [] has_pytorch = False has_safetensors = False - while page is not None: - content = requests.get(f"{base}{page}").content + while True: + content = requests.get(f"{base}{page}{cursor.decode()}").content + dict = json.loads(content) + if len(dict) == 0: + break for i in range(len(dict)): fname = dict[i]['path'] @@ -123,8 +129,9 @@ def get_download_links_from_huggingface(model, branch): has_pytorch = True classifications.append('pytorch') - #page = dict['nextUrl'] - page = None + cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50' + cursor = base64.b64encode(cursor) + cursor = cursor.replace(b'=', b'%3D') # If both pytorch and safetensors are available, download safetensors only if has_pytorch and has_safetensors: From 875847bf88c52166c4e9a0cc35f7e6c535b88d97 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 10 Mar 2023 00:45:28 -0300 Subject: [PATCH 23/69] Consider tokenizer a type of text --- download-model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/download-model.py b/download-model.py index 27fbffda..bf94be7c 100644 --- a/download-model.py +++ b/download-model.py @@ -108,7 +108,7 @@ def get_download_links_from_huggingface(model, branch): is_pytorch = re.match("pytorch_model.*\.bin", fname) is_safetensors = re.match("model.*\.safetensors", fname) is_tokenizer = re.match("tokenizer.*\.model", fname) - is_text = 
re.match(".*\.(txt|json)", fname) + is_text = re.match(".*\.(txt|json)", fname) or is_tokenizer if any((is_pytorch, is_safetensors, is_text, is_tokenizer)): if is_text: From 2ac29137470396733e95e7efa77e091d5e8a5ef5 Mon Sep 17 00:00:00 2001 From: rohvani <3782201+rohvani@users.noreply.github.com> Date: Thu, 9 Mar 2023 20:13:23 -0800 Subject: [PATCH 24/69] fix reference issue --- modules/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/models.py b/modules/models.py index 062ccb1f..a2256b98 100644 --- a/modules/models.py +++ b/modules/models.py @@ -109,6 +109,7 @@ def load_model(model_name): # check root of models folder, and model path root paths = [ f"{path_to_model}/{pt_model}", f"models/{pt_model}" ] + pt_path = None for path in [ Path(p) for p in paths ]: if path.exists(): pt_path = path From ab470444591e425290db72db9ebc3127f5520449 Mon Sep 17 00:00:00 2001 From: deepdiffuser Date: Fri, 10 Mar 2023 04:29:09 -0800 Subject: [PATCH 25/69] add multi-gpu support for 4bit gptq LLaMA --- modules/models.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 3e6cea18..14443c89 100644 --- a/modules/models.py +++ b/modules/models.py @@ -110,7 +110,18 @@ def load_model(model_name): exit() model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4) - model = model.to(torch.device('cuda:0')) + + if shared.args.gpu_memory: + max_memory = {} + for i in range(len(shared.args.gpu_memory)): + max_memory[i] = f"{shared.args.gpu_memory[i]}GiB" + max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB" + + import accelerate + device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory) + model = accelerate.dispatch_model(model, device_map=device_map) + else: + model = model.to(torch.device('cuda:0')) # Custom else: From 9fbd60bf22c6a2e9cef0cade23a4933547df9114 Mon Sep 17 00:00:00 2001 From: deepdiffuser Date: Fri, 10 Mar 2023 05:30:47 -0800 Subject: [PATCH 26/69] add no_split_module_classes to prevent tensor split error --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 14443c89..986cd73a 100644 --- a/modules/models.py +++ b/modules/models.py @@ -118,7 +118,7 @@ def load_model(model_name): max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB" import accelerate - device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory) + device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LLaMADecoderLayer"]) model = accelerate.dispatch_model(model, device_map=device_map) else: model = model.to(torch.device('cuda:0')) From e461c0b7a0769c4df3aa96505803b004a1071c2e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 10 Mar 2023 10:51:12 -0300 Subject: [PATCH 27/69] Move the import to the top --- modules/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 986cd73a..f4c1071d 100644 --- a/modules/models.py +++ b/modules/models.py @@ -112,12 +112,13 @@ def load_model(model_name): model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4) if shared.args.gpu_memory: + import accelerate + max_memory = {} for i in range(len(shared.args.gpu_memory)): max_memory[i] = f"{shared.args.gpu_memory[i]}GiB" max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB" - import accelerate device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, 
no_split_module_classes=["LLaMADecoderLayer"]) model = accelerate.dispatch_model(model, device_map=device_map) else: From de7dd8b6aa3aa00ba629c9ba6ce1bc32bd213d2f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 10 Mar 2023 10:54:08 -0300 Subject: [PATCH 28/69] Add comments --- modules/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/models.py b/modules/models.py index f4c1071d..a5ec59d1 100644 --- a/modules/models.py +++ b/modules/models.py @@ -111,6 +111,7 @@ def load_model(model_name): model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4) + # Multi-GPU setup if shared.args.gpu_memory: import accelerate @@ -121,6 +122,8 @@ def load_model(model_name): device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LLaMADecoderLayer"]) model = accelerate.dispatch_model(model, device_map=device_map) + + # Single GPU else: model = model.to(torch.device('cuda:0')) From 706a03b2cb5bf3c0667d8c13b3a47f1a6e33cc81 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 10 Mar 2023 11:02:25 -0300 Subject: [PATCH 29/69] Minor changes --- modules/models.py | 8 +++----- presets/LLaMA-Default.txt | 12 ------------ 2 files changed, 3 insertions(+), 17 deletions(-) delete mode 100644 presets/LLaMA-Default.txt diff --git a/modules/models.py b/modules/models.py index a2256b98..a23f1fa9 100644 --- a/modules/models.py +++ b/modules/models.py @@ -104,13 +104,11 @@ def load_model(model_name): elif path_to_model.name.lower().startswith('llama-65b'): pt_model = 'llama-65b-4bit.pt' else: - print(f"Could not find the .pt model for {model_name}, exiting...") - exit() + pt_model = f'{model_name}-4bit.pt' - # check root of models folder, and model path root - paths = [ f"{path_to_model}/{pt_model}", f"models/{pt_model}" ] + # Try to find the .pt both in models/ and in the subfolder pt_path = None - for path in [ Path(p) for p in paths ]: + for path in [Path(p) for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]: if path.exists(): pt_path = path diff --git a/presets/LLaMA-Default.txt b/presets/LLaMA-Default.txt deleted file mode 100644 index 3df8209a..00000000 --- a/presets/LLaMA-Default.txt +++ /dev/null @@ -1,12 +0,0 @@ -do_sample=False -temperature=0.7 -top_p=0 -typical_p=1 -repetition_penalty=1.15 -top_k=40 -num_beams=1 -penalty_alpha=0 -min_length=0 -length_penalty=1 -no_repeat_ngram_size=0 -early_stopping=True From 026d60bd3424b5426c5ef80632aa6b71fe12d4c5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 10 Mar 2023 14:01:02 -0300 Subject: [PATCH 30/69] Remove default preset that didn't do anything --- modules/shared.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index 4c062fe9..2acb047f 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -42,7 +42,6 @@ settings = { 'default': 'NovelAI-Sphinx Moth', 'pygmalion-*': 'Pygmalion', 'RWKV-*': 'Naive', - '(rosey|chip|joi)_.*_instruct.*': 'Instruct Joi (Contrastive Search)' }, 'prompts': { 'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:', From e6c631aea4dd4596606b0f058173de223909d372 Mon Sep 17 00:00:00 2001 From: draff Date: Fri, 10 Mar 2023 21:36:45 +0000 Subject: [PATCH 31/69] Replace --load-in-4bit with --llama-bits Replaces --load-in-4bit with a more flexible --llama-bits arg to allow for 2 and 3 bit models as well. 
This commit also fixes a loading issue with .pt files which are not in the root of the models folder --- README.md | 2 +- modules/models.py | 17 +++++++++-------- modules/shared.py | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index c329913d..5c560172 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ Optionally, you can use the following command-line flags: | `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. | | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| -| `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA. | +| `--llama-bits` | Load LLaMA models with specified precision. 2, 3 and 4 bit are supported, use standard `--load-in-8bit` for 8bit precision. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.| | `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. | diff --git a/modules/models.py b/modules/models.py index f31d8b0d..467ffbee 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,7 +42,7 @@ def load_model(model_name): shared.is_RWKV = model_name.lower().startswith('rwkv-') # Default settings - if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.load_in_4bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV): + if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.llama_bits>0 or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -88,23 +88,24 @@ def load_model(model_name): return model, tokenizer # 4-bit LLaMA - elif shared.args.load_in_4bit: + elif shared.args.llama_bits>0: sys.path.insert(0, os.path.abspath(Path("repositories/GPTQ-for-LLaMa"))) + bits = shared.args.llama_bits from llama import load_quant path_to_model = Path(f'models/{model_name}') pt_model = '' if path_to_model.name.lower().startswith('llama-7b'): - pt_model = 'llama-7b-4bit.pt' + pt_model = f'llama-7b-{bits}bit.pt' elif path_to_model.name.lower().startswith('llama-13b'): - pt_model = 'llama-13b-4bit.pt' + pt_model = f'llama-13b-{bits}bit.pt' elif path_to_model.name.lower().startswith('llama-30b'): - pt_model = 'llama-30b-4bit.pt' + pt_model = f'llama-30b-{bits}bit.pt' elif path_to_model.name.lower().startswith('llama-65b'): - pt_model = 'llama-65b-4bit.pt' + pt_model = f'llama-65b-{bits}bit.pt' else: - pt_model = f'{model_name}-4bit.pt' + pt_model = f'{model_name}-{bits}bit.pt' # Try to find the .pt both in models/ and in the subfolder pt_path = None @@ -116,7 +117,7 @@ def load_model(model_name): print(f"Could not find {pt_model}, exiting...") exit() - model = load_quant(path_to_model, Path(f"models/{pt_model}"), 4) + model = 
load_quant(path_to_model, Path(f"{pt_path}"), bits) # Multi-GPU setup if shared.args.gpu_memory: diff --git a/modules/shared.py b/modules/shared.py index 2acb047f..61d5a768 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -67,7 +67,7 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.') parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') -parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.') +parser.add_argument('--llama-bits', type=int, default=0, help='Load LLaMA models with specified precision. 2, 3 and 4 bit are supported, use standard `--load-in-8bit` for 8bit precision.') parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') From 9ba8156a70b7d8d2cd79cac939aba22e080d8730 Mon Sep 17 00:00:00 2001 From: ItsLogic <38233332+ItsLogic@users.noreply.github.com> Date: Fri, 10 Mar 2023 22:33:58 +0000 Subject: [PATCH 32/69] remove unnecessary Path() --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 467ffbee..3ec68f17 100644 --- a/modules/models.py +++ b/modules/models.py @@ -117,7 +117,7 @@ def load_model(model_name): print(f"Could not find {pt_model}, exiting...") exit() - model = load_quant(path_to_model, Path(f"{pt_path}"), bits) + model = load_quant(path_to_model, pt_path, bits) # Multi-GPU setup if shared.args.gpu_memory: From 804486214b5a1b07fc4c57255053593bb980d349 Mon Sep 17 00:00:00 2001 From: draff Date: Fri, 10 Mar 2023 23:21:01 +0000 Subject: [PATCH 33/69] Re-implement --load-in-4bit and update --llama-bits arg description --- README.md | 3 ++- modules/models.py | 8 ++++++-- modules/shared.py | 3 ++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5c560172..76774c0b 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,8 @@ Optionally, you can use the following command-line flags: | `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. | | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| -| `--llama-bits` | Load LLaMA models with specified precision. 2, 3 and 4 bit are supported, use standard `--load-in-8bit` for 8bit precision. | +| `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA.| +| `--llama-bits` | Load pre-quantized models with specified precision. 
2, 3, 4 and 8bit are supported. Currently only works with LLaMA. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.| | `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. | diff --git a/modules/models.py b/modules/models.py index 3ec68f17..6c423a25 100644 --- a/modules/models.py +++ b/modules/models.py @@ -88,9 +88,13 @@ def load_model(model_name): return model, tokenizer # 4-bit LLaMA - elif shared.args.llama_bits>0: + elif shared.args.llama_bits>0 or shared.args.load_in_4bit: sys.path.insert(0, os.path.abspath(Path("repositories/GPTQ-for-LLaMa"))) - bits = shared.args.llama_bits + if shared.args.load_in_4bit: + bits = 4 + else: + bits = shared.args.llama_bits + from llama import load_quant diff --git a/modules/shared.py b/modules/shared.py index 61d5a768..f3f46329 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -67,7 +67,8 @@ parser.add_argument('--chat', action='store_true', help='Launch the web UI in ch parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI in chat mode with a style similar to Character.AI\'s. If the file img_bot.png or img_bot.jpg exists in the same folder as server.py, this image will be used as the bot\'s profile picture. Similarly, img_me.png or img_me.jpg will be used as your profile picture.') parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') -parser.add_argument('--llama-bits', type=int, default=0, help='Load LLaMA models with specified precision. 2, 3 and 4 bit are supported, use standard `--load-in-8bit` for 8bit precision.') +parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.') +parser.add_argument('--llama-bits', type=int, default=0, help='Load pre-quantized models with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.') parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. 
Requires NVIDIA Ampere GPU.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') From 001e638b47331f24ac967dd982f8ce4781775f7d Mon Sep 17 00:00:00 2001 From: draff Date: Fri, 10 Mar 2023 23:28:19 +0000 Subject: [PATCH 34/69] Make it actually work --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index 6c423a25..8e7caa8d 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,7 +42,7 @@ def load_model(model_name): shared.is_RWKV = model_name.lower().startswith('rwkv-') # Default settings - if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.llama_bits>0 or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV): + if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.llama_bits>0 or shared.args.load_in_4bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: From 28fd4fc9702c9fa3a52e2ca60ca034f01cbe3be9 Mon Sep 17 00:00:00 2001 From: draff Date: Fri, 10 Mar 2023 23:34:13 +0000 Subject: [PATCH 35/69] Change wording to be consistent with other args --- README.md | 2 +- modules/shared.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76774c0b..50d07cd6 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ Optionally, you can use the following command-line flags: | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| | `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA.| -| `--llama-bits` | Load pre-quantized models with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. | +| `--llama-bits` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.| | `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. | diff --git a/modules/shared.py b/modules/shared.py index f3f46329..3ea4ef41 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -68,7 +68,7 @@ parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI i parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.') -parser.add_argument('--llama-bits', type=int, default=0, help='Load pre-quantized models with specified precision. 2, 3, 4 and 8bit are supported. 
Currently only works with LLaMA.') +parser.add_argument('--llama-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.') parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') From 0dfac4b777009d415d848c2f0bc718ec1bbac7e5 Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Sat, 11 Mar 2023 16:34:59 +1100 Subject: [PATCH 36/69] Working html autoplay, clean up, improve wav naming - New autoplay using html tag, removed from old message when new input provided - Add voice pitch and speed control - Group settings together - Use name + conversation history to match wavs to messages, minimize problems when changing characters Current minor bugs: - Gradio seems to cache the audio files, so using "clear history" and generating new messages will play the old audio (the new messages are saving correctly). Gradio will clear cache and use correct audio after a few messages or after a page refresh. - Switching characters does not immediately update the message ID used for the audio. ID is updated after the first new message, but that message will use the wrong ID --- extensions/silero_tts/requirements.txt | 1 - extensions/silero_tts/script.py | 79 +++++++++++++------------- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/extensions/silero_tts/requirements.txt b/extensions/silero_tts/requirements.txt index b4444306..f2f0bff5 100644 --- a/extensions/silero_tts/requirements.txt +++ b/extensions/silero_tts/requirements.txt @@ -4,4 +4,3 @@ pydub PyYAML torch torchaudio -simpleaudio diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index 334b02b9..b66963e2 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -4,7 +4,6 @@ import gradio as gr import torch import modules.shared as shared -import simpleaudio as sa torch._C._jit_set_profiling_mode(False) @@ -15,13 +14,16 @@ params = { 'model_id': 'v3_en', 'sample_rate': 48000, 'device': 'cpu', - 'max_wavs': -1, - 'autoplay': True, 'show_text': True, + 'autoplay': True, + 'voice_pitch': 'medium', + 'voice_speed': 'medium', } current_params = params.copy() voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 
'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115'] -wav_idx = 0 +voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high'] +voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast'] +last_msg_id = 0 #Used for making text xml compatible, needed for voice pitch and speed control table = str.maketrans({ @@ -55,6 +57,14 @@ def input_modifier(string): This function is applied to your text inputs before they are fed into the model. """ + #remove autoplay from previous + if len(shared.history['internal'])>0: + [text, reply] = shared.history['internal'][-1] + [visible_text, visible_reply] = shared.history['visible'][-1] + rep_clean = reply.replace('controls autoplay>','controls>') + vis_rep_clean = visible_reply.replace('controls autoplay>','controls>') + shared.history['internal'][-1] = [text, rep_clean] + shared.history['visible'][-1] = [visible_text, vis_rep_clean] return string @@ -63,7 +73,7 @@ def output_modifier(string): This function is applied to the model outputs. """ - global wav_idx, model, current_params + global model, current_params for i in params: if params[i] != current_params[i]: @@ -81,44 +91,31 @@ def output_modifier(string): string = string.replace('\n', ' ') string = string.strip() - auto_playable=True + silent_string = False #Used to prevent unnecessary audio file generation if string == '': string = 'empty reply, try regenerating' - auto_playable=False - + silent_string = True #x-slow, slow, medium, fast, x-fast #x-low, low, medium, high, x-high - prosody='' + pitch = params['voice_pitch'] + speed = params['voice_speed'] + prosody=f'' string =''+prosody+xmlesc(string)+'' - - output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav') - autoplay_str = '' - if not shared.still_streaming: + + current_msg_id=len(shared.history['visible'])#check length here, since output_modifier can run many times on the same message + output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav') + if not shared.still_streaming and not silent_string: model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) - #diabled until autoplay doesn't run on previous messages - #autoplay = 'autoplay' if (params['autoplay'] and auto_playable) else '' - string = f'\n\n' + string = f'\n\n' else: - #placeholder so text doesnt shift around so much - string =f'\n\n' - - #reset if too many wavs. set max to -1 for unlimited. - if wav_idx < params['max_wavs'] or params['max_wavs'] < 0: - #only increment if starting a new stream, else replace during streaming. - if not shared.still_streaming: - wav_idx += 1 - else: - wav_idx = 0 + #placeholder so text doesn't shift around so much + string ='\n\n' if params['show_text']: + #string+=f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly. 
string+=orig_string - if params['autoplay'] == True and auto_playable and not shared.still_streaming: - stop_autoplay() - wave_obj = sa.WaveObject.from_wave_file(output_file.as_posix()) - wave_obj.play() - return string def bot_prefix_modifier(string): @@ -130,20 +127,20 @@ def bot_prefix_modifier(string): return string -def stop_autoplay(): - sa.stop_all() - def ui(): # Gradio elements - activate = gr.Checkbox(value=params['activate'], label='Activate TTS') - show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') - autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') - stop_audio = gr.Button("Stop Auto-Play") - voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') + with gr.Accordion("Silero TTS"): + activate = gr.Checkbox(value=params['activate'], label='Activate TTS') + show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') + autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') + voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') + v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch') + v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed') # Event functions to update the parameters in the backend activate.change(lambda x: params.update({"activate": x}), activate, None) - autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None) show_text.change(lambda x: params.update({"show_text": x}), show_text, None) - stop_audio.click(stop_autoplay) + autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None) voice.change(lambda x: params.update({"speaker": x}), voice, None) + v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None) + v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None) From b8f7d34c1df5b12e60491e4c8a6494d5e6aec20e Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Sat, 11 Mar 2023 17:05:09 +1100 Subject: [PATCH 37/69] Undo changes to requirements needing to manually install tensorboard might be a windows-only problem. Can be easily solved manually. --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a8a6eada..47c56a45 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,5 @@ gradio==3.18.0 numpy rwkv==0.1.0 safetensors==0.2.8 -tensorboard sentencepiece git+https://github.com/oobabooga/transformers@llama_push From 96c51973f9e551055dac2e135e9c4229cbf40ad0 Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Sat, 11 Mar 2023 22:50:59 +1100 Subject: [PATCH 38/69] --auto-launch and "Is typing..." - Added `--auto-launch` arg to open web UI in the default browser when ready. - Changed chat.py to display user input immediately and "*Is typing...*" as a temporary reply while generating text. Most noticeable when using `--no-stream`. 
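
The chat wrappers are generators, so the temporary reply is just an extra
first value yielded before real generation begins. A self-contained toy
sketch of the pattern (not the project code; streamed generation is stood
in for by a fixed token list):

    def chat_stream(user_text):
        history = []
        # Show the user's message immediately, paired with a provisional reply
        yield history + [[user_text, '*Is typing...*']]
        # Then overwrite the placeholder as the real reply streams in
        reply = ''
        for token in ['Hel', 'lo', '!']:   # stand-in for generate_reply()
            reply += token
            yield history + [[user_text, reply]]

    for snapshot in chat_stream('Hi'):
        print(snapshot)
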
--- modules/chat.py | 3 +++ modules/shared.py | 1 + server.py | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index f40f8299..0f029fe2 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -120,6 +120,9 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical else: prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size) + #display user input and "*is typing...*" imediately + yield shared.history['visible']+[[visible_text, '*Is typing...*']] + # Generate reply = '' for i in range(chat_generation_attempts): diff --git a/modules/shared.py b/modules/shared.py index 2acb047f..c42ba7ed 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -90,4 +90,5 @@ parser.add_argument('--listen', action='store_true', help='Make the web UI reach parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.') parser.add_argument('--share', action='store_true', help='Create a public URL. This is useful for running the web UI on Google Colab or similar.') parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') +parser.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch') args = parser.parse_args() diff --git a/server.py b/server.py index c2977f41..ad483eb5 100644 --- a/server.py +++ b/server.py @@ -372,9 +372,9 @@ else: shared.gradio['interface'].queue() if shared.args.listen: - shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_name='0.0.0.0', server_port=shared.args.listen_port) + shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_name='0.0.0.0', server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch) else: - shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_port=shared.args.listen_port) + shared.gradio['interface'].launch(prevent_thread_lock=True, share=shared.args.share, server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch) # I think that I will need this later while True: From 2743dd736a431e54a7220ef7e29ad5d31797611c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 11 Mar 2023 10:50:18 -0300 Subject: [PATCH 39/69] Add *Is typing...* to impersonate as well --- modules/chat.py | 5 ++++- server.py | 9 +++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 0f029fe2..5bf96b1a 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -120,7 +120,7 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical else: prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size) - #display user input and "*is typing...*" imediately + # Display user input and "*is typing...*" imediately yield shared.history['visible']+[[visible_text, '*Is typing...*']] # Generate @@ -161,6 +161,9 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True) + # Display "*is typing...*" imediately + yield '*Is typing...*' + reply = '' for i in range(chat_generation_attempts): for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, 
no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"): diff --git a/server.py b/server.py index ad483eb5..c65443ec 100644 --- a/server.py +++ b/server.py @@ -272,10 +272,10 @@ if shared.args.chat or shared.args.cai_chat: function_call = 'chat.cai_chatbot_wrapper' if shared.args.cai_chat else 'chat.chatbot_wrapper' - gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream, api_name='textgen')) - gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) - gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) - gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=shared.args.no_stream)) + gen_events.append(shared.gradio['Generate'].click(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=False, api_name='textgen')) + gen_events.append(shared.gradio['textbox'].submit(eval(function_call), shared.input_params, shared.gradio['display'], show_progress=False)) + gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=False)) + gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=False)) shared.gradio['Stop'].click(chat.stop_everything_event, [], [], cancels=gen_events) shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, [], shared.gradio['textbox'], show_progress=shared.args.no_stream) @@ -309,6 +309,7 @@ if shared.args.chat or shared.args.cai_chat: reload_inputs = [shared.gradio['name1'], shared.gradio['name2']] if shared.args.cai_chat else [] shared.gradio['upload_chat_history'].upload(reload_func, reload_inputs, [shared.gradio['display']]) shared.gradio['upload_img_me'].upload(reload_func, reload_inputs, [shared.gradio['display']]) + shared.gradio['Stop'].click(reload_func, reload_inputs, [shared.gradio['display']]) shared.gradio['interface'].load(lambda : chat.load_default_history(shared.settings[f'name1{suffix}'], shared.settings[f'name2{suffix}']), None, None) shared.gradio['interface'].load(reload_func, reload_inputs, [shared.gradio['display']], show_progress=True) From 8f8da6707d7e71c2eef01c2d33ca6623cebf080c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 11 Mar 2023 11:17:13 -0300 Subject: [PATCH 40/69] Minor style changes to silero_tts --- extensions/silero_tts/script.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index b66963e2..7e63d8b7 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -14,18 +14,19 @@ params = { 'model_id': 'v3_en', 'sample_rate': 48000, 'device': 'cpu', - 'show_text': True, + 'show_text': False, 'autoplay': True, 'voice_pitch': 'medium', 'voice_speed': 'medium', } + current_params = params.copy() voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 
'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115'] voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high'] voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast'] last_msg_id = 0 -#Used for making text xml compatible, needed for voice pitch and speed control +# Used for making text xml compatible, needed for voice pitch and speed control table = str.maketrans({ "<": "<", ">": ">", @@ -33,6 +34,7 @@ table = str.maketrans({ "'": "'", '"': """, }) + def xmlesc(txt): return txt.translate(table) @@ -57,7 +59,8 @@ def input_modifier(string): This function is applied to your text inputs before they are fed into the model. """ - #remove autoplay from previous + + # Remove autoplay from previous if len(shared.history['internal'])>0: [text, reply] = shared.history['internal'][-1] [visible_text, visible_reply] = shared.history['visible'][-1] @@ -91,30 +94,30 @@ def output_modifier(string): string = string.replace('\n', ' ') string = string.strip() - silent_string = False #Used to prevent unnecessary audio file generation + silent_string = False # Used to prevent unnecessary audio file generation if string == '': - string = 'empty reply, try regenerating' - silent_string = True + string = 'empty reply, try regenerating' + silent_string = True - #x-slow, slow, medium, fast, x-fast - #x-low, low, medium, high, x-high + # x-slow, slow, medium, fast, x-fast + # x-low, low, medium, high, x-high pitch = params['voice_pitch'] speed = params['voice_speed'] prosody=f'' - string =''+prosody+xmlesc(string)+'' + string = ''+prosody+xmlesc(string)+'' - current_msg_id=len(shared.history['visible'])#check length here, since output_modifier can run many times on the same message + current_msg_id = len(shared.history['visible']) # Check length here, since output_modifier can run many times on the same message output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav') if not shared.still_streaming and not silent_string: model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) string = f'\n\n' else: - #placeholder so text doesn't shift around so much - string ='\n\n' + # Placeholder so text doesn't shift around so much + string = '\n\n' if params['show_text']: - #string+=f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly. 
- string+=orig_string + #string += f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly. + string += orig_string return string From 501afbc23408df04d2d545b2100bde55b6611598 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 11 Mar 2023 14:47:30 -0300 Subject: [PATCH 41/69] Add requests to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 6133f394..a7df93bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ bitsandbytes==0.37.0 flexgen==0.1.7 gradio==3.18.0 numpy +requests rwkv==0.1.0 safetensors==0.2.8 sentencepiece From 195e99d0b6d116105c0adc0978a4ec4dbb0d847c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 11 Mar 2023 16:11:15 -0300 Subject: [PATCH 42/69] Add llama_prompts extension --- extensions/llama_prompts/script.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 extensions/llama_prompts/script.py diff --git a/extensions/llama_prompts/script.py b/extensions/llama_prompts/script.py new file mode 100644 index 00000000..e45cd445 --- /dev/null +++ b/extensions/llama_prompts/script.py @@ -0,0 +1,18 @@ +import gradio as gr +import modules.shared as shared +import pandas as pd + +df = pd.read_csv("https://raw.githubusercontent.com/devbrones/llama-prompts/main/prompts/prompts.csv") + +def get_prompt_by_name(name): + if name == 'None': + return '' + else: + return df[df['Prompt name'] == name].iloc[0]['Prompt'].replace('\\n', '\n') + +def ui(): + if not shared.args.chat or share.args.cai_chat: + choices = ['None'] + list(df['Prompt name']) + + prompts_menu = gr.Dropdown(value=choices[0], choices=choices, label='Prompt') + prompts_menu.change(get_prompt_by_name, prompts_menu, shared.gradio['textbox']) From def97f658c016d4c50fe9d682265841154eb5336 Mon Sep 17 00:00:00 2001 From: HideLord Date: Sun, 12 Mar 2023 02:54:22 +0200 Subject: [PATCH 43/69] Small patch to fix loading of character jsons. Now it correctly reads non-ascii characters on Windows. 
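
Background: open() without an explicit encoding falls back to the platform
default (commonly cp1252 on Windows), so a UTF-8 character card containing
non-ASCII text can be misread or raise a decode error. Minimal sketch of
the fixed pattern, using a made-up file name:

    import json
    from pathlib import Path

    # Forcing UTF-8 matches how the character files are written;
    # 'characters/Example.json' is illustrative only.
    with open(Path('characters/Example.json'), 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
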
--- modules/chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index f40f8299..4a7fb873 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -332,7 +332,7 @@ def load_character(_character, name1, name2): shared.history['visible'] = [] if _character != 'None': shared.character = _character - data = json.loads(open(Path(f'characters/{_character}.json'), 'r').read()) + data = json.loads(open(Path(f'characters/{_character}.json'), 'r', encoding='utf-8').read()) name2 = data['char_name'] if 'char_persona' in data and data['char_persona'] != '': context += f"{data['char_name']}'s Persona: {data['char_persona']}\n" From 37f0166b2d6b0f2938a5a4c1762479829de1c5be Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 11 Mar 2023 23:14:49 -0300 Subject: [PATCH 44/69] Fix memory leak in new streaming (second attempt) --- modules/callbacks.py | 5 ++++- modules/text_generation.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/callbacks.py b/modules/callbacks.py index 15674b8a..05e8fafa 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -49,7 +49,7 @@ class Iteratorize: def __init__(self, func, kwargs={}, callback=None): self.mfunc=func self.c_callback=callback - self.q = Queue(maxsize=1) + self.q = Queue() self.sentinel = object() self.kwargs = kwargs @@ -73,3 +73,6 @@ class Iteratorize: raise StopIteration else: return obj + + def __del__(self): + pass diff --git a/modules/text_generation.py b/modules/text_generation.py index 6a59f9a7..5d01c8cb 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -187,7 +187,6 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi yield formatted_outputs(original_question, shared.model_name) for output in eval(f"generate_with_streaming({', '.join(generate_params)})"): - print(print('Used vram in gib:', torch.cuda.memory_allocated() / 1024**3)) if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) reply = decode(output) From 0bd54309887f6e7adc7e59d4f8675ed6f322bb81 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 02:04:28 -0300 Subject: [PATCH 45/69] Use 'with' statement to better handle streaming memory --- modules/RWKV.py | 10 +++++----- modules/callbacks.py | 27 +++++++++++++++++++++++---- modules/text_generation.py | 19 ++++++++++--------- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index 70deab28..836d31dc 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -50,11 +50,11 @@ class RWKVModel: return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback) def generate_with_streaming(self, **kwargs): - iterable = Iteratorize(self.generate, kwargs, callback=None) - reply = kwargs['context'] - for token in iterable: - reply += token - yield reply + with Iteratorize(self.generate, kwargs, callback=None) as generator: + reply = kwargs['context'] + for token in generator: + reply += token + yield reply class RWKVTokenizer: def __init__(self): diff --git a/modules/callbacks.py b/modules/callbacks.py index 05e8fafa..e0d1c988 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -1,3 +1,4 @@ +import gc from queue import Queue from threading import Thread @@ -6,7 +7,6 @@ import transformers import modules.shared as shared - # Copied from https://github.com/PygmalionAI/gradio-ui/ 
class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria): @@ -52,17 +52,24 @@ class Iteratorize: self.q = Queue() self.sentinel = object() self.kwargs = kwargs + self.stop_now = False def _callback(val): + if self.stop_now: + raise ValueError self.q.put(val) def gentask(): - ret = self.mfunc(callback=_callback, **self.kwargs) + try: + ret = self.mfunc(callback=_callback, **self.kwargs) + except ValueError: + pass self.q.put(self.sentinel) if self.c_callback: self.c_callback(ret) - Thread(target=gentask).start() + self.thread = Thread(target=gentask) + self.thread.start() def __iter__(self): return self @@ -75,4 +82,16 @@ class Iteratorize: return obj def __del__(self): - pass + clear_torch_cache() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop_now = True + clear_torch_cache() + +def clear_torch_cache(): + gc.collect() + if not shared.args.cpu: + torch.cuda.empty_cache() diff --git a/modules/text_generation.py b/modules/text_generation.py index 5d01c8cb..7f5aad5e 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -186,17 +186,18 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi return Iteratorize(generate_with_callback, kwargs, callback=None) yield formatted_outputs(original_question, shared.model_name) - for output in eval(f"generate_with_streaming({', '.join(generate_params)})"): - if shared.soft_prompt: - output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) - reply = decode(output) + with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator: + for output in generator: + if shared.soft_prompt: + output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) + reply = decode(output) - if not (shared.args.chat or shared.args.cai_chat): - reply = original_question + apply_extensions(reply[len(question):], "output") - yield formatted_outputs(reply, shared.model_name) + if not (shared.args.chat or shared.args.cai_chat): + reply = original_question + apply_extensions(reply[len(question):], "output") + yield formatted_outputs(reply, shared.model_name) - if output[-1] == n: - break + if output[-1] == n: + break # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria' else: From 433f6350bc794e0a904e1a34abaffe49a106a484 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 11 Mar 2023 21:21:30 -0800 Subject: [PATCH 46/69] Load and save character files in UTF-8 --- modules/chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index f40f8299..a0cae949 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -332,7 +332,7 @@ def load_character(_character, name1, name2): shared.history['visible'] = [] if _character != 'None': shared.character = _character - data = json.loads(open(Path(f'characters/{_character}.json'), 'r').read()) + data = json.loads(open(Path(f'characters/{_character}.json'), 'r', encoding='utf-8').read()) name2 = data['char_name'] if 'char_persona' in data and data['char_persona'] != '': context += f"{data['char_name']}'s Persona: {data['char_persona']}\n" @@ -372,7 +372,7 @@ def upload_character(json_file, img, tavern=False): i += 1 if tavern: outfile_name = f'TavernAI-{outfile_name}' - with open(Path(f'characters/{outfile_name}.json'), 'w') as f: + with open(Path(f'characters/{outfile_name}.json'), 'w', encoding='utf-8') as f: f.write(json_file) if img is not None: img = Image.open(io.BytesIO(img)) From 
b0e8cb8c889cdadd9779517ba8055114b39357cd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 02:31:45 -0300 Subject: [PATCH 47/69] Various fixes in chat mode --- modules/chat.py | 16 +++--- modules/text_generation.py | 102 +++++++++++++++++++------------------ 2 files changed, 62 insertions(+), 56 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index f40f8299..69d81e94 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -115,14 +115,18 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical visible_text = visible_text.replace('\n', '
') text = apply_extensions(text, "input") - if custom_generate_chat_prompt is None: - prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size) - else: - prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size) - # Generate reply = '' for i in range(chat_generation_attempts): + + # The prompt needs to be generated here because, as the reply + # grows, it may become necessary to remove more old messages to + # fit into the 2048 tokens window. + if custom_generate_chat_prompt is None: + prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0])) + else: + prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0])) + for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"): # Extracting the reply @@ -156,10 +160,10 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ if 'pygmalion' in shared.model_name.lower(): name1 = "You" - prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True) reply = '' for i in range(chat_generation_attempts): + prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0]), impersonate=True) for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"): reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True) if not substring_found: diff --git a/modules/text_generation.py b/modules/text_generation.py index 7f5aad5e..2460df4f 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -159,35 +159,53 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi else: generate_params.insert(0, "inputs=input_ids") - # Generate the entire reply at once. - if shared.args.no_stream: - with torch.no_grad(): - output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0] - if shared.soft_prompt: - output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) - - reply = decode(output) - if not (shared.args.chat or shared.args.cai_chat): - reply = original_question + apply_extensions(reply[len(question):], "output") - - yield formatted_outputs(reply, shared.model_name) - - # Stream the reply 1 token at a time. - # This is based on the trick of using 'stopping_criteria' to create an iterator. - elif not shared.args.flexgen: - - def generate_with_callback(callback=None, **kwargs): - kwargs['stopping_criteria'].append(Stream(callback_func=callback)) - clear_torch_cache() + try: + # Generate the entire reply at once. 
+ if shared.args.no_stream: with torch.no_grad(): - shared.model.generate(**kwargs) + output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0] + if shared.soft_prompt: + output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) - def generate_with_streaming(**kwargs): - return Iteratorize(generate_with_callback, kwargs, callback=None) + reply = decode(output) + if not (shared.args.chat or shared.args.cai_chat): + reply = original_question + apply_extensions(reply[len(question):], "output") - yield formatted_outputs(original_question, shared.model_name) - with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator: - for output in generator: + yield formatted_outputs(reply, shared.model_name) + + # Stream the reply 1 token at a time. + # This is based on the trick of using 'stopping_criteria' to create an iterator. + elif not shared.args.flexgen: + + def generate_with_callback(callback=None, **kwargs): + kwargs['stopping_criteria'].append(Stream(callback_func=callback)) + clear_torch_cache() + with torch.no_grad(): + shared.model.generate(**kwargs) + + def generate_with_streaming(**kwargs): + return Iteratorize(generate_with_callback, kwargs, callback=None) + + yield formatted_outputs(original_question, shared.model_name) + with eval(f"generate_with_streaming({', '.join(generate_params)})") as generator: + for output in generator: + if shared.soft_prompt: + output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) + reply = decode(output) + + if not (shared.args.chat or shared.args.cai_chat): + reply = original_question + apply_extensions(reply[len(question):], "output") + yield formatted_outputs(reply, shared.model_name) + + if output[-1] == n: + break + + # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria' + else: + for i in range(max_new_tokens//8+1): + clear_torch_cache() + with torch.no_grad(): + output = eval(f"shared.model.generate({', '.join(generate_params)})")[0] if shared.soft_prompt: output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) reply = decode(output) @@ -196,30 +214,14 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi reply = original_question + apply_extensions(reply[len(question):], "output") yield formatted_outputs(reply, shared.model_name) - if output[-1] == n: + if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n): break - # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria' - else: - for i in range(max_new_tokens//8+1): - clear_torch_cache() - with torch.no_grad(): - output = eval(f"shared.model.generate({', '.join(generate_params)})")[0] - if shared.soft_prompt: - output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:])) - reply = decode(output) + input_ids = np.reshape(output, (1, output.shape[0])) + if shared.soft_prompt: + inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids) - if not (shared.args.chat or shared.args.cai_chat): - reply = original_question + apply_extensions(reply[len(question):], "output") - yield formatted_outputs(reply, shared.model_name) - - if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n): - break - - input_ids = np.reshape(output, (1, output.shape[0])) - if shared.soft_prompt: - inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids) - - t1 = time.time() - print(f"Output generated in {(t1-t0):.2f} seconds 
({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)") - return + finally: + t1 = time.time() + print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(original_input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(original_input_ids[0])} tokens)") + return From 3baf5fc700c603182456e7b4c3ac4c0f5e9748e8 Mon Sep 17 00:00:00 2001 From: Aleksey Smolenchuk Date: Sat, 11 Mar 2023 21:40:01 -0800 Subject: [PATCH 48/69] Load and save chat history in utf-8 --- modules/chat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index a0cae949..8a221526 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -291,7 +291,7 @@ def save_history(timestamp=True): fname = f"{prefix}persistent.json" if not Path('logs').exists(): Path('logs').mkdir() - with open(Path(f'logs/{fname}'), 'w') as f: + with open(Path(f'logs/{fname}'), 'w', encoding='utf-8') as f: f.write(json.dumps({'data': shared.history['internal'], 'data_visible': shared.history['visible']}, indent=2)) return Path(f'logs/{fname}') @@ -321,7 +321,7 @@ def load_history(file, name1, name2): def load_default_history(name1, name2): if Path('logs/persistent.json').exists(): - load_history(open(Path('logs/persistent.json'), 'rb').read(), name1, name2) + load_history(open(Path('logs/persistent.json'), 'rb', encoding='utf-8').read(), name1, name2) else: shared.history['internal'] = [] shared.history['visible'] = [] @@ -355,7 +355,7 @@ def load_character(_character, name1, name2): name2 = shared.settings['name2_pygmalion'] if Path(f'logs/{shared.character}_persistent.json').exists(): - load_history(open(Path(f'logs/{shared.character}_persistent.json'), 'rb').read(), name1, name2) + load_history(open(Path(f'logs/{shared.character}_persistent.json'), 'rb', encoding='utf-8').read(), name1, name2) if shared.args.cai_chat: return name2, context, generate_chat_html(shared.history['visible'], name1, name2, shared.character) From 341e13503634a0debb684105f055e09772d16c6e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 02:53:08 -0300 Subject: [PATCH 49/69] Various fixes in chat mode --- modules/callbacks.py | 1 + modules/chat.py | 16 ++++++---------- modules/text_generation.py | 29 +++++++++++++++-------------- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/modules/callbacks.py b/modules/callbacks.py index e0d1c988..faa4a5e9 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -64,6 +64,7 @@ class Iteratorize: ret = self.mfunc(callback=_callback, **self.kwargs) except ValueError: pass + clear_torch_cache() self.q.put(self.sentinel) if self.c_callback: self.c_callback(ret) diff --git a/modules/chat.py b/modules/chat.py index 69d81e94..f40f8299 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -115,18 +115,14 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical visible_text = visible_text.replace('\n', '
') text = apply_extensions(text, "input") + if custom_generate_chat_prompt is None: + prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size) + else: + prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size) + # Generate reply = '' for i in range(chat_generation_attempts): - - # The prompt needs to be generated here because, as the reply - # grows, it may become necessary to remove more old messages to - # fit into the 2048 tokens window. - if custom_generate_chat_prompt is None: - prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0])) - else: - prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0])) - for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"): # Extracting the reply @@ -160,10 +156,10 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ if 'pygmalion' in shared.model_name.lower(): name1 = "You" + prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size, impersonate=True) reply = '' for i in range(chat_generation_attempts): - prompt = generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size-len(encode(' '+reply)[0]), impersonate=True) for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"): reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True) if not substring_found: diff --git a/modules/text_generation.py b/modules/text_generation.py index 2460df4f..7966e126 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -92,21 +92,22 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi # These models are not part of Hugging Face, so we handle them # separately and terminate the function call earlier if shared.is_RWKV: - if shared.args.no_stream: - reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k) - yield formatted_outputs(reply, shared.model_name) - else: - yield formatted_outputs(question, shared.model_name) - # RWKV has proper streaming, which is very nice. - # No need to generate 8 tokens at a time. - for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k): + try: + if shared.args.no_stream: + reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k) yield formatted_outputs(reply, shared.model_name) - - t1 = time.time() - output = encode(reply)[0] - input_ids = encode(question) - print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") - return + else: + yield formatted_outputs(question, shared.model_name) + # RWKV has proper streaming, which is very nice. 
+ # No need to generate 8 tokens at a time. + for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k): + yield formatted_outputs(reply, shared.model_name) + finally: + t1 = time.time() + output = encode(reply)[0] + input_ids = encode(question) + print(f"Output generated in {(t1-t0):.2f} seconds ({(len(output)-len(input_ids[0]))/(t1-t0):.2f} tokens/s, {len(output)-len(input_ids[0])} tokens)") + return original_question = question if not (shared.args.chat or shared.args.cai_chat): From 3f7c3d6559a51a3b95667b3ff74d048ffb722484 Mon Sep 17 00:00:00 2001 From: Aleksey Smolenchuk Date: Sat, 11 Mar 2023 22:10:57 -0800 Subject: [PATCH 50/69] No need to set encoding on binary read --- modules/chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 8a221526..ab5dbc2d 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -321,7 +321,7 @@ def load_history(file, name1, name2): def load_default_history(name1, name2): if Path('logs/persistent.json').exists(): - load_history(open(Path('logs/persistent.json'), 'rb', encoding='utf-8').read(), name1, name2) + load_history(open(Path('logs/persistent.json'), 'rb').read(), name1, name2) else: shared.history['internal'] = [] shared.history['visible'] = [] @@ -355,7 +355,7 @@ def load_character(_character, name1, name2): name2 = shared.settings['name2_pygmalion'] if Path(f'logs/{shared.character}_persistent.json').exists(): - load_history(open(Path(f'logs/{shared.character}_persistent.json'), 'rb', encoding='utf-8').read(), name1, name2) + load_history(open(Path(f'logs/{shared.character}_persistent.json'), 'rb').read(), name1, name2) if shared.args.cai_chat: return name2, context, generate_chat_html(shared.history['visible'], name1, name2, shared.character) From e2da6b9685a40825fa9c299d676aaae1c3d21dcc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 03:25:56 -0300 Subject: [PATCH 51/69] Fix You You You appearing in chat mode --- modules/chat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/chat.py b/modules/chat.py index 5bf96b1a..a6167d35 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -84,6 +84,7 @@ def extract_message_from_reply(question, reply, name1, name2, check, impersonate tmp = f"\n{asker}:" for j in range(1, len(tmp)): if reply[-j:] == tmp[:j]: + reply = reply[:-j] substring_found = True return reply, next_character_found, substring_found From ad14f0e49929d426560413c0b9de19986cbeac9e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 03:42:29 -0300 Subject: [PATCH 52/69] Fix regenerate (provisory way) --- modules/chat.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index ae089ca5..2048e2c5 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -92,7 +92,7 @@ def extract_message_from_reply(question, reply, name1, name2, check, impersonate def stop_everything_event(): shared.stop_everything = True -def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1): +def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, 
no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1, regenerate=False): shared.stop_everything = False just_started = True eos_token = '\n' if check else None @@ -121,8 +121,9 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical else: prompt = custom_generate_chat_prompt(text, max_new_tokens, name1, name2, context, chat_prompt_size) - # Display user input and "*is typing...*" imediately - yield shared.history['visible']+[[visible_text, '*Is typing...*']] + if not regenerate: + # Display user input and "*is typing...*" imediately + yield shared.history['visible']+[[visible_text, '*Is typing...*']] # Generate reply = '' @@ -189,7 +190,7 @@ def regenerate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typi last_visible = shared.history['visible'].pop() last_internal = shared.history['internal'].pop() - for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts): + for _history in chatbot_wrapper(last_internal[0], max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts, regenerate=True): if shared.args.cai_chat: shared.history['visible'][-1] = [last_visible[0], _history[-1][1]] yield generate_chat_html(shared.history['visible'], name1, name2, shared.character) From d4afed4e44a748c22d9fa97edb3f818ae8af191f Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Sun, 12 Mar 2023 17:56:57 +1100 Subject: [PATCH 53/69] Fixes and polish - Change wav naming to be completely unique using timestamp instead of message ID, stops browser using cached audio when new audio is made with the same file name (eg after regenerate or clear history). - Make the autoplay setting actually disable autoplay. - Make Settings panel a bit more compact. - Hide html errors when audio file of chat history is missing. - Add button to permanently convert TTS history to normal text messages - Changed the "show message text" toggle to affect the chat history. --- extensions/silero_tts/script.py | 89 ++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 17 deletions(-) diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index 7e63d8b7..1a60c901 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -2,8 +2,10 @@ from pathlib import Path import gradio as gr import torch - +import time +import re import modules.shared as shared +import modules.chat as chat torch._C._jit_set_profiling_mode(False) @@ -54,19 +56,57 @@ def remove_surrounded_chars(string): new_string += char return new_string +def remove_tts_from_history(): + suffix = '_pygmalion' if 'pygmalion' in shared.model_name.lower() else '' + for i, entry in enumerate(shared.history['internal']): + reply = entry[1] + reply = re.sub("(||{{user}})", shared.settings[f'name1{suffix}'], reply) + if shared.args.chat: + reply = reply.replace('\n', '
') + shared.history['visible'][i][1] = reply + + if shared.args.cai_chat: + return chat.generate_chat_html(shared.history['visible'], shared.settings[f'name1{suffix}'], shared.settings[f'name1{suffix}'], shared.character) + else: + return shared.history['visible'] + +def toggle_text_in_history(): + suffix = '_pygmalion' if 'pygmalion' in shared.model_name.lower() else '' + audio_str='\n\n' # The '\n\n' used after + if shared.args.chat: + audio_str='

' + + if params['show_text']==True: + #for i, entry in enumerate(shared.history['internal']): + for i, entry in enumerate(shared.history['visible']): + vis_reply = entry[1] + if vis_reply.startswith('||{{user}})", shared.settings[f'name1{suffix}'], reply) + if shared.args.chat: + reply = reply.replace('\n', '
') + shared.history['visible'][i][1] = vis_reply.split(audio_str,1)[0]+audio_str+reply + else: + for i, entry in enumerate(shared.history['visible']): + vis_reply = entry[1] + if vis_reply.startswith('0: - [text, reply] = shared.history['internal'][-1] + # Remove autoplay from previous chat history + if (shared.args.chat or shared.args.cai_chat)and len(shared.history['internal'])>0: [visible_text, visible_reply] = shared.history['visible'][-1] - rep_clean = reply.replace('controls autoplay>','controls>') vis_rep_clean = visible_reply.replace('controls autoplay>','controls>') - shared.history['internal'][-1] = [text, rep_clean] shared.history['visible'][-1] = [visible_text, vis_rep_clean] return string @@ -99,24 +139,21 @@ def output_modifier(string): string = 'empty reply, try regenerating' silent_string = True - # x-slow, slow, medium, fast, x-fast - # x-low, low, medium, high, x-high pitch = params['voice_pitch'] speed = params['voice_speed'] prosody=f'' string = ''+prosody+xmlesc(string)+'' - current_msg_id = len(shared.history['visible']) # Check length here, since output_modifier can run many times on the same message - output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{current_msg_id:06d}.wav') if not shared.still_streaming and not silent_string: + output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{int(time.time())}.wav') model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) - string = f'\n\n' + autoplay_str = ' autoplay' if params['autoplay'] else '' + string = f'\n\n' else: # Placeholder so text doesn't shift around so much string = '\n\n' if params['show_text']: - #string += f'*[{current_msg_id}]:*'+orig_string #Debug, looks like there is a delay in "current_msg_id" being updated when switching characters (updates after new message sent). Can't find the source. "shared.character" is updating properly. 
string += orig_string return string @@ -133,16 +170,34 @@ def bot_prefix_modifier(string): def ui(): # Gradio elements with gr.Accordion("Silero TTS"): - activate = gr.Checkbox(value=params['activate'], label='Activate TTS') + with gr.Row(): + activate = gr.Checkbox(value=params['activate'], label='Activate TTS') + autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') - autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') - v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch') - v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed') + with gr.Row(): + v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch') + v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed') + with gr.Row(): + convert = gr.Button('Permanently replace chat history audio with message text') + convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False) + convert_cancel = gr.Button('Cancel', visible=False) + + # Convert history with confirmation + convert_arr = [convert_confirm, convert, convert_cancel] + convert.click(lambda :[gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) + convert_confirm.click(lambda :[gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) + convert_confirm.click(remove_tts_from_history, [], shared.gradio['display']) + convert_confirm.click(lambda : chat.save_history(timestamp=False), [], [], show_progress=False) + convert_cancel.click(lambda :[gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) + + # Toggle message text in history + show_text.change(lambda x: params.update({"show_text": x}), show_text, None) + show_text.change(toggle_text_in_history, [], shared.gradio['display']) + show_text.change(lambda : chat.save_history(timestamp=False), [], [], show_progress=False) # Event functions to update the parameters in the backend activate.change(lambda x: params.update({"activate": x}), activate, None) - show_text.change(lambda x: params.update({"show_text": x}), show_text, None) autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None) voice.change(lambda x: params.update({"speaker": x}), voice, None) v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None) From 9276af3561df4d6b25cadc85dd9e51fe167fe807 Mon Sep 17 00:00:00 2001 From: Xan <70198941+xanthousm@users.noreply.github.com> Date: Sun, 12 Mar 2023 19:06:24 +1100 Subject: [PATCH 54/69] clean up --- .idea/workspace.xml | 64 --------------------------------------------- 1 file changed, 64 deletions(-) delete mode 100644 .idea/workspace.xml diff --git a/.idea/workspace.xml b/.idea/workspace.xml deleted file mode 100644 index 404920a8..00000000 --- a/.idea/workspace.xml +++ /dev/null @@ -1,64 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1678590722207 - - - - \ No newline at end of file From 781c09235ced081fc5fca5228bcf1234882083ae Mon Sep 17 00:00:00 2001 From: Phuoc-Hieu Le Date: Sun, 12 Mar 2023 15:21:50 +0700 Subject: [PATCH 55/69] Fix typo error in script.py --- extensions/llama_prompts/script.py | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/extensions/llama_prompts/script.py b/extensions/llama_prompts/script.py index e45cd445..22c96f7c 100644 --- a/extensions/llama_prompts/script.py +++ b/extensions/llama_prompts/script.py @@ -11,7 +11,7 @@ def get_prompt_by_name(name): return df[df['Prompt name'] == name].iloc[0]['Prompt'].replace('\\n', '\n') def ui(): - if not shared.args.chat or share.args.cai_chat: + if not shared.args.chat or shared.args.cai_chat: choices = ['None'] + list(df['Prompt name']) prompts_menu = gr.Dropdown(value=choices[0], choices=choices, label='Prompt') From 3c25557ef0e1c727dfdadf5a5f9a53c533a82fe0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 08:48:16 -0300 Subject: [PATCH 56/69] Add tqdm to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a7df93bb..ceaa0b70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,5 @@ requests rwkv==0.1.0 safetensors==0.2.8 sentencepiece +tqdm git+https://github.com/zphang/transformers@llama_push From 78901d522be3aa9dd260ea751085b1230f2a8d45 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 08:59:05 -0300 Subject: [PATCH 57/69] Remove unused imports --- modules/RWKV.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/RWKV.py b/modules/RWKV.py index 836d31dc..d97c1706 100644 --- a/modules/RWKV.py +++ b/modules/RWKV.py @@ -1,7 +1,5 @@ import os from pathlib import Path -from queue import Queue -from threading import Thread import numpy as np from tokenizers import Tokenizer From 0ac562bdba381ed61042802cf56ee3d4e3d674bf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 10:46:16 -0300 Subject: [PATCH 58/69] Add a default prompt for OpenAssistant oasst-sft-1-pythia-12b #253 --- modules/shared.py | 3 ++- settings-template.json | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index c42ba7ed..de621710 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -46,7 +46,8 @@ settings = { 'prompts': { 'default': 'Common sense questions and answers\n\nQuestion: \nFactual answer:', '^(gpt4chan|gpt-4chan|4chan)': '-----\n--- 865467536\nInput text\n--- 865467537\n', - '(rosey|chip|joi)_.*_instruct.*': 'User: \n' + '(rosey|chip|joi)_.*_instruct.*': 'User: \n', + 'oasst-*': '<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>' } } diff --git a/settings-template.json b/settings-template.json index 6585f313..9da43970 100644 --- a/settings-template.json +++ b/settings-template.json @@ -29,6 +29,7 @@ "prompts": { "default": "Common sense questions and answers\n\nQuestion: \nFactual answer:", "^(gpt4chan|gpt-4chan|4chan)": "-----\n--- 865467536\nInput text\n--- 865467537\n", - "(rosey|chip|joi)_.*_instruct.*": "User: \n" + "(rosey|chip|joi)_.*_instruct.*": "User: \n", + "oasst-*": "<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|>" } } From fed3617f0727ba04d150781c698dd7df9965dc22 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 11:12:34 -0300 Subject: [PATCH 59/69] Move LLaMA 4-bit into a separate file --- modules/models.py | 54 +++------------------------------- modules/quantized_LLaMA.py | 60 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 50 deletions(-) 
create mode 100644 modules/quantized_LLaMA.py diff --git a/modules/models.py b/modules/models.py index 8e7caa8d..249c05ee 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,7 +42,7 @@ def load_model(model_name): shared.is_RWKV = model_name.lower().startswith('rwkv-') # Default settings - if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.llama_bits>0 or shared.args.load_in_4bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.llama_bits > 0, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -88,56 +88,10 @@ def load_model(model_name): return model, tokenizer # 4-bit LLaMA - elif shared.args.llama_bits>0 or shared.args.load_in_4bit: - sys.path.insert(0, os.path.abspath(Path("repositories/GPTQ-for-LLaMa"))) - if shared.args.load_in_4bit: - bits = 4 - else: - bits = shared.args.llama_bits - + elif shared.args.llama_bits > 0 or shared.args.load_in_4bit: + from modules.quantized_LLaMA import load_quantized_LLaMA - from llama import load_quant - - path_to_model = Path(f'models/{model_name}') - pt_model = '' - if path_to_model.name.lower().startswith('llama-7b'): - pt_model = f'llama-7b-{bits}bit.pt' - elif path_to_model.name.lower().startswith('llama-13b'): - pt_model = f'llama-13b-{bits}bit.pt' - elif path_to_model.name.lower().startswith('llama-30b'): - pt_model = f'llama-30b-{bits}bit.pt' - elif path_to_model.name.lower().startswith('llama-65b'): - pt_model = f'llama-65b-{bits}bit.pt' - else: - pt_model = f'{model_name}-{bits}bit.pt' - - # Try to find the .pt both in models/ and in the subfolder - pt_path = None - for path in [Path(p) for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]: - if path.exists(): - pt_path = path - - if not pt_path: - print(f"Could not find {pt_model}, exiting...") - exit() - - model = load_quant(path_to_model, pt_path, bits) - - # Multi-GPU setup - if shared.args.gpu_memory: - import accelerate - - max_memory = {} - for i in range(len(shared.args.gpu_memory)): - max_memory[i] = f"{shared.args.gpu_memory[i]}GiB" - max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB" - - device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LLaMADecoderLayer"]) - model = accelerate.dispatch_model(model, device_map=device_map) - - # Single GPU - else: - model = model.to(torch.device('cuda:0')) + model = load_quantized_LLaMA(model_name) # Custom else: diff --git a/modules/quantized_LLaMA.py b/modules/quantized_LLaMA.py new file mode 100644 index 00000000..e4817da5 --- /dev/null +++ b/modules/quantized_LLaMA.py @@ -0,0 +1,60 @@ +import os +import sys +from pathlib import Path + +import accelerate +import torch + +import modules.shared as shared + +sys.path.insert(0, os.path.abspath(Path("repositories/GPTQ-for-LLaMa"))) +from llama import load_quant + + +# 4-bit LLaMA +def load_quantized_LLaMA(model_name): + if shared.args.load_in_4bit: + bits = 4 + else: + bits = shared.args.llama_bits + + path_to_model = Path(f'models/{model_name}') 
+ pt_model = '' + if path_to_model.name.lower().startswith('llama-7b'): + pt_model = f'llama-7b-{bits}bit.pt' + elif path_to_model.name.lower().startswith('llama-13b'): + pt_model = f'llama-13b-{bits}bit.pt' + elif path_to_model.name.lower().startswith('llama-30b'): + pt_model = f'llama-30b-{bits}bit.pt' + elif path_to_model.name.lower().startswith('llama-65b'): + pt_model = f'llama-65b-{bits}bit.pt' + else: + pt_model = f'{model_name}-{bits}bit.pt' + + # Try to find the .pt both in models/ and in the subfolder + pt_path = None + for path in [Path(p) for p in [f"models/{pt_model}", f"{path_to_model}/{pt_model}"]]: + if path.exists(): + pt_path = path + + if not pt_path: + print(f"Could not find {pt_model}, exiting...") + exit() + + model = load_quant(path_to_model, pt_path, bits) + + # Multi-GPU setup + if shared.args.gpu_memory: + max_memory = {} + for i in range(len(shared.args.gpu_memory)): + max_memory[i] = f"{shared.args.gpu_memory[i]}GiB" + max_memory['cpu'] = f"{shared.args.cpu_memory or '99'}GiB" + + device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LLaMADecoderLayer"]) + model = accelerate.dispatch_model(model, device_map=device_map) + + # Single GPU + else: + model = model.to(torch.device('cuda:0')) + + return model From 65dda28c9dc71a2ad6bf6c627822ea2ae9d7878a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 11:19:07 -0300 Subject: [PATCH 60/69] Rename --llama-bits to --gptq-bits --- modules/models.py | 4 ++-- modules/quantized_LLaMA.py | 2 +- modules/shared.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/models.py b/modules/models.py index 249c05ee..7d094ed5 100644 --- a/modules/models.py +++ b/modules/models.py @@ -42,7 +42,7 @@ def load_model(model_name): shared.is_RWKV = model_name.lower().startswith('rwkv-') # Default settings - if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.llama_bits > 0, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.gptq_bits > 0, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV]): if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True) else: @@ -88,7 +88,7 @@ def load_model(model_name): return model, tokenizer # 4-bit LLaMA - elif shared.args.llama_bits > 0 or shared.args.load_in_4bit: + elif shared.args.gptq_bits > 0 or shared.args.load_in_4bit: from modules.quantized_LLaMA import load_quantized_LLaMA model = load_quantized_LLaMA(model_name) diff --git a/modules/quantized_LLaMA.py b/modules/quantized_LLaMA.py index e4817da5..ca4eebf2 100644 --- a/modules/quantized_LLaMA.py +++ b/modules/quantized_LLaMA.py @@ -16,7 +16,7 @@ def load_quantized_LLaMA(model_name): if shared.args.load_in_4bit: bits = 4 else: - bits = shared.args.llama_bits + bits = shared.args.gptq_bits path_to_model = Path(f'models/{model_name}') pt_model = '' diff --git a/modules/shared.py b/modules/shared.py index 3ea4ef41..627fd205 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -68,7 +68,7 @@ 
parser.add_argument('--cai-chat', action='store_true', help='Launch the web UI i parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') parser.add_argument('--load-in-4bit', action='store_true', help='Load the model with 4-bit precision. Currently only works with LLaMA.') -parser.add_argument('--llama-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.') +parser.add_argument('--gptq-bits', type=int, default=0, help='Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA.') parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') From 89e9493509547e6d92c1dab724b2b0d39f389c27 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 11:23:20 -0300 Subject: [PATCH 61/69] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 50d07cd6..3d0c3a23 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ Optionally, you can use the following command-line flags: | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| | `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA.| -| `--llama-bits` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. | +| `--gptq-bits` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.| | `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. | From 8403152257b3e0c405f88a0cbf08dc640e29e206 Mon Sep 17 00:00:00 2001 From: HideLord Date: Sun, 12 Mar 2023 17:28:15 +0200 Subject: [PATCH 62/69] Fixing compatibility with GPTQ repo commit 2f667f7da051967566a5fb0546f8614bcd3a1ccd. 
Expects string and breaks on --- modules/quantized_LLaMA.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/quantized_LLaMA.py b/modules/quantized_LLaMA.py index ca4eebf2..9ab7f333 100644 --- a/modules/quantized_LLaMA.py +++ b/modules/quantized_LLaMA.py @@ -41,7 +41,7 @@ def load_quantized_LLaMA(model_name): print(f"Could not find {pt_model}, exiting...") exit() - model = load_quant(path_to_model, pt_path, bits) + model = load_quant(path_to_model, str(pt_path), bits) # Multi-GPU setup if shared.args.gpu_memory: From fda376d9c386aebffe6966ed72cf7202c491bd3f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 12:41:04 -0300 Subject: [PATCH 63/69] Use os.path.abspath() instead of str() --- modules/quantized_LLaMA.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/quantized_LLaMA.py b/modules/quantized_LLaMA.py index 9ab7f333..5e4a38e8 100644 --- a/modules/quantized_LLaMA.py +++ b/modules/quantized_LLaMA.py @@ -41,7 +41,7 @@ def load_quantized_LLaMA(model_name): print(f"Could not find {pt_model}, exiting...") exit() - model = load_quant(path_to_model, str(pt_path), bits) + model = load_quant(path_to_model, os.path.abspath(pt_path), bits) # Multi-GPU setup if shared.args.gpu_memory: From 4dc1d8c091461de4489b29660930ae929d60b171 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 12:46:53 -0300 Subject: [PATCH 64/69] Update README.md --- README.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3d0c3a23..95fb85ae 100644 --- a/README.md +++ b/README.md @@ -179,14 +179,10 @@ Check the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/System- Pull requests, suggestions, and issue reports are welcome. -Before reporting a bug, make sure that you have created a conda environment and installed the dependencies exactly as in the *Installation* section above. +Before reporting a bug, make sure that you have: -These issues are known: - -* 8-bit doesn't work properly on Windows or older GPUs. -* DeepSpeed doesn't work properly on Windows. - -For these two, please try commenting on an existing issue instead of creating a new one. +1. Created a conda environment and installed the dependencies exactly as in the *Installation* section above. +2. [Searched](https://github.com/oobabooga/text-generation-webui/issues) to see if an issue already exists for the issue you encountered. 
## Credits From 4066ab4c0ca608bc4f95f50fe7c7f11334192946 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 13:36:18 -0300 Subject: [PATCH 65/69] Reorder the imports --- extensions/silero_tts/script.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/extensions/silero_tts/script.py b/extensions/silero_tts/script.py index 1a60c901..62d4b441 100644 --- a/extensions/silero_tts/script.py +++ b/extensions/silero_tts/script.py @@ -1,11 +1,12 @@ +import re +import time from pathlib import Path import gradio as gr import torch -import time -import re -import modules.shared as shared + import modules.chat as chat +import modules.shared as shared torch._C._jit_set_profiling_mode(False) From 441e993c51ac100fa4565419791cd9a88fd8d3df Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 14:25:14 -0300 Subject: [PATCH 66/69] Bump accelerate, RWKV and safetensors --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index ceaa0b70..b078ecf4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ -accelerate==0.16.0 +accelerate==0.17.0 bitsandbytes==0.37.0 flexgen==0.1.7 gradio==3.18.0 numpy requests -rwkv==0.1.0 -safetensors==0.2.8 +rwkv==0.3.1 +safetensors==0.3.0 sentencepiece tqdm git+https://github.com/zphang/transformers@llama_push From 17210ff88f55dc650c3dc2ff1c8692f27734851c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 14:31:24 -0300 Subject: [PATCH 67/69] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 95fb85ae..26e70d76 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. * [FlexGen offload](https://github.com/oobabooga/text-generation-webui/wiki/FlexGen). * [DeepSpeed ZeRO-3 offload](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed). * Get responses via API, [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-streaming.py) or [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming. -* [Supports the LLaMA model](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model). +* [Supports the LLaMA model, including 4-bit mode](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model). * [Supports the RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model). * Supports softprompts. * [Supports extensions](https://github.com/oobabooga/text-generation-webui/wiki/Extensions). From c7aa51faa6488f019447c7f2eba26013105281e7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 14:54:58 -0300 Subject: [PATCH 68/69] Use a list of eos_tokens instead of just a number This might be the cause of LLaMA ramblings that some people have experienced. 
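Previously only one id could act as a stop token at a time: when a custom stopping string was supplied, the model's own end-of-text id was no longer passed to generate(), so replies could run straight past it. For illustration, here is a minimal, self-contained sketch of the same idea outside the web UI; the gpt2 checkpoint and the newline stop string are placeholders chosen for this example, and recent transformers releases accept a list for eos_token_id, which is what this change relies on:

    # Illustrative sketch only, not part of the diff below.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    custom_stop = "\n"  # e.g. stop replies at a newline, as chat mode does
    eos_token_ids = [tokenizer.eos_token_id]                      # the model's own end-of-text id
    eos_token_ids.append(int(tokenizer.encode(custom_stop)[-1]))  # plus the custom stop id

    prompt = "Common sense questions and answers\n\nQuestion: What is 2+2?\nFactual answer:"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    # Whichever id in the list appears first ends the reply.
    output = model.generate(input_ids, max_new_tokens=50, eos_token_id=eos_token_ids)
    print(tokenizer.decode(output[0]))
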
--- modules/text_generation.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 6f53e416..7cf68c06 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -119,7 +119,9 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi original_input_ids = input_ids output = input_ids[0] cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()" - n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1]) + eos_token_ids = [shared.tokenizer.eos_token_id] + if eos_token is not None: + eos_token_ids.append(int(encode(eos_token)[0][-1])) stopping_criteria_list = transformers.StoppingCriteriaList() if stopping_string is not None: # Copied from https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py @@ -129,7 +131,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if not shared.args.flexgen: generate_params = [ f"max_new_tokens=max_new_tokens", - f"eos_token_id={n}", + f"eos_token_id={eos_token_ids}", f"stopping_criteria=stopping_criteria_list", f"do_sample={do_sample}", f"temperature={temperature}", @@ -149,7 +151,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi f"max_new_tokens={max_new_tokens if shared.args.no_stream else 8}", f"do_sample={do_sample}", f"temperature={temperature}", - f"stop={n}", + f"stop={eos_token_ids[-1]}", ] if shared.args.deepspeed: generate_params.append("synced_gpus=True") @@ -198,7 +200,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if not (shared.args.chat or shared.args.cai_chat): reply = original_question + apply_extensions(reply[len(question):], "output") - if output[-1] == n: + if output[-1] in eos_token_ids: break yield formatted_outputs(reply, shared.model_name) @@ -219,7 +221,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi if not (shared.args.chat or shared.args.cai_chat): reply = original_question + apply_extensions(reply[len(question):], "output") - if np.count_nonzero(input_ids[0] == n) < np.count_nonzero(output == n): + if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)): break yield formatted_outputs(reply, shared.model_name) From 3375eaece0851b318a7d77fade12ac6a264c6b64 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 12 Mar 2023 15:01:32 -0300 Subject: [PATCH 69/69] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 26e70d76..dc5ed659 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ Optionally, you can use the following command-line flags: | `--cpu` | Use the CPU to generate text.| | `--load-in-8bit` | Load the model with 8-bit precision.| | `--load-in-4bit` | Load the model with 4-bit precision. Currently only works with LLaMA.| -| `--gptq-bits` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8bit are supported. Currently only works with LLaMA. | +| `--gptq-bits GPTQ_BITS` | Load a pre-quantized model with specified precision. 2, 3, 4 and 8 (bit) are supported. Currently only works with LLaMA. | | `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. 
| | `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.| | `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
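
Usage note (an illustrative addition, not taken from the patches above): after the rename in PATCH 60/69 and the flag table entry above, 4-bit LLaMA loading is requested with the new name, for example with an invocation along the lines of python server.py --model llama-7b --gptq-bits 4 (the --model flag and the llama-7b folder name are assumptions made for this example). As modules/quantized_LLaMA.py from PATCH 59/69 shows, a model folder whose name starts with llama-7b is then expected to provide a matching llama-7b-4bit.pt either directly under models/ or inside the model folder itself, and the loader prints an error and exits if neither location has it.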