From 566898a79a0915879273f3d77017908bcf7d62ab Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Sat, 25 Mar 2023 12:08:26 -0700
Subject: [PATCH 01/21] initial lora training tab

---
 modules/training.py                           | 139 ++++++++++++++++++
 requirements.txt                              |   2 +
 server.py                                     |   7 +-
 .../datasets/put-trainer-datasets-here.txt    |   0
 training/formats/alpaca-chatbot-format.json   |   4 +
 training/formats/alpaca-format.json           |   4 +
 training/formats/put-trainer-formats-here.txt |   0
 7 files changed, 153 insertions(+), 3 deletions(-)
 create mode 100644 modules/training.py
 create mode 100644 training/datasets/put-trainer-datasets-here.txt
 create mode 100644 training/formats/alpaca-chatbot-format.json
 create mode 100644 training/formats/alpaca-format.json
 create mode 100644 training/formats/put-trainer-formats-here.txt

diff --git a/modules/training.py b/modules/training.py
new file mode 100644
index 00000000..96cd6e7c
--- /dev/null
+++ b/modules/training.py
@@ -0,0 +1,139 @@
+import sys, torch, json
+from pathlib import Path
+import gradio as gr
+from datasets import load_dataset
+import transformers
+from modules import ui, shared
+from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, get_peft_model_state_dict
+
+def get_json_dataset(path: str):
+    def get_set():
+        return ['None'] + sorted(set(map(lambda x : '.'.join(str(x.name).split('.')[:-1]), Path(path).glob('*.json'))), key=str.lower)
+    return get_set
+
+def create_train_interface():
+    with gr.Tab('Train LoRA', elem_id='lora-train-tab'):
+        loraName = gr.Textbox(label="Name", info="The name of your new LoRA file")
+        # TODO: Add explanations of batch sizes and recommendations. Note that batch/microBatch determines gradient accumulation and explain what that means. Note the effects on VRAM usage from changing these values.
+        microBatchSize = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='(TODO)')
+        batchSize = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='(TODO)')
+        epochs = gr.Slider(label='Epochs', value=1, minimum=1, maximum=1000, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
+        learningRate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
+        # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale.
+        loraRank = gr.Slider(label='LoRA Rank', value=8, minimum=1, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
+        loraAlpha = gr.Slider(label='LoRA Alpha', value=16, minimum=1, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
+        # TODO: Better explain what this does.
+        loraDropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers.')
+        cutoffLen = gr.Slider(label='Cutoff Length', minimum=1,maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
+        with gr.Row():
+            datasetFunction = get_json_dataset('training/datasets')
+            dataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Dataset')
+            ui.create_refresh_button(dataset, lambda : None, lambda : {'choices': datasetFunction()}, 'refresh-button')
+        with gr.Row():
+            evalDataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Evaluation Dataset')
+            ui.create_refresh_button(evalDataset, lambda : None, lambda : {'choices': datasetFunction()}, 'refresh-button')
+        with gr.Row():
+            formatsFunction = get_json_dataset('training/formats')
+            format = gr.Dropdown(choices=formatsFunction(), value='None', label='Data Format')
+            ui.create_refresh_button(format, lambda : None, lambda : {'choices': formatsFunction()}, 'refresh-button')
+        startButton = gr.Button("Start LoRA Training")
+        output = gr.Markdown(value="(...)")
+        startButton.click(do_train, [loraName, microBatchSize, batchSize, epochs, learningRate, loraRank, loraAlpha, loraDropout, cutoffLen, dataset, evalDataset, format], [output])
+
+def cleanPath(basePath: str, path: str):
+    """"Strips unusual symbols and forcibly builds a path as relative to the intended directory."""
+    # TODO: Probably could do with a security audit to guarantee there's no ways this can be bypassed to target an unwanted path.
+    # Or swap it to a strict whitelist of [a-zA-Z_0-9]
+    path = path.replace('\\', '/').replace('..', '_')
+    if basePath is None:
+        return path
+    return f'{Path(basePath).absolute()}/{path}'
+
+def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, learningRate: float, loraRank: int, loraAlpha: int, loraDropout: float, cutoffLen: int, dataset: str, evalDataset: str, format: str):
+    # Input validation / processing
+    # TODO: --lora-dir PR once pulled will need to be applied here
+    loraName = f"loras/{cleanPath(None, loraName)}"
+    if dataset is None:
+        return "**Missing dataset choice input, cannot continue.**"
+    if format is None:
+        return "**Missing format choice input, cannot continue.**"
+    gradientAccumulationSteps = batchSize // microBatchSize
+    actualLR = float(learningRate)
+    model = shared.model
+    tokenizer = shared.tokenizer
+    tokenizer.pad_token = 0
+    tokenizer.padding_side = "left"
+    # Prep the dataset, format, etc
+    with open(cleanPath('training/formats', f'{format}.json'), 'r') as formatFile:
+        formatData: dict[str, str] = json.load(formatFile)
+    def tokenize(prompt):
+        result = tokenizer(prompt, truncation=True, max_length=cutoffLen + 1, padding="max_length")
+        return {
+            "input_ids": result["input_ids"][:-1],
+            "attention_mask": result["attention_mask"][:-1],
+        }
+    def generate_prompt(data_point: dict[str, str]):
+        for options, data in formatData.items():
+            if set(options.split(',')) == set(data_point.keys()):
+                for key, val in data_point.items():
+                    data = data.replace(f'%{key}%', val)
+            return data
+        raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(formatData.keys())}"')
+    def generate_and_tokenize_prompt(data_point):
+        prompt = generate_prompt(data_point)
+        return tokenize(prompt)
+    data = load_dataset("json", data_files=cleanPath('training/datasets', f'{dataset}.json'))
+    train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)
+    if evalDataset == 'None':
+        evalData = None
+    else:
+        evalData = load_dataset("json", data_files=cleanPath('training/datasets', f'{evalDataset}.json'))
+        evalData = evalData['train'].shuffle().map(generate_and_tokenize_prompt)
+    # Start prepping the model itself
+    model = prepare_model_for_int8_training(model)
+    config = LoraConfig(
+        r=loraRank,
+        lora_alpha=loraAlpha,
+        # TODO: Should target_modules be configurable?
+        target_modules=[ "q_proj", "v_proj" ],
+        lora_dropout=loraDropout,
+        bias="none",
+        task_type="CAUSAL_LM"
+    )
+    model = get_peft_model(model, config)
+    trainer = transformers.Trainer(
+        model=model,
+        train_dataset=train_data,
+        eval_dataset=evalData,
+        args=transformers.TrainingArguments(
+            per_device_train_batch_size=microBatchSize,
+            gradient_accumulation_steps=gradientAccumulationSteps,
+            # TODO: Should more of these be configurable? Probably.
+            warmup_steps=100,
+            num_train_epochs=epochs,
+            learning_rate=actualLR,
+            fp16=True,
+            logging_steps=20,
+            evaluation_strategy="steps" if evalData is not None else "no",
+            save_strategy="steps",
+            eval_steps=200 if evalData is not None else None,
+            save_steps=200,
+            output_dir=loraName,
+            save_total_limit=3,
+            load_best_model_at_end=True if evalData is not None else False,
+            # TODO: Enable multi-device support
+            ddp_find_unused_parameters=None,
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+    )
+    model.config.use_cache = False
+    old_state_dict = model.state_dict
+    model.state_dict = (
+        lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
+    ).__get__(model, type(model))
+    if torch.__version__ >= "2" and sys.platform != "win32":
+        model = torch.compile(model)
+    # Actually start and run and save at the end
+    trainer.train()
+    model.save_pretrained(loraName)
+    return "Done!"
diff --git a/requirements.txt b/requirements.txt
index e5b3de69..c93ce671 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,6 @@ rwkv==0.7.0
 safetensors==0.3.0
 sentencepiece
 tqdm
+peft
+datasets
 git+https://github.com/huggingface/transformers
diff --git a/server.py b/server.py
index f423e368..cd95d5ef 100644
--- a/server.py
+++ b/server.py
@@ -8,10 +8,8 @@ from pathlib import Path
 
 import gradio as gr
 
-import modules.chat as chat
+from modules import chat, shared, ui, training
 import modules.extensions as extensions_module
-import modules.shared as shared
-import modules.ui as ui
 from modules.html_generator import generate_chat_html
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, load_soft_prompt
@@ -443,6 +441,9 @@ def create_interface():
 
             shared.gradio['reset_interface'].click(set_interface_arguments, [shared.gradio[k] for k in ['interface_modes_menu', 'extensions_menu', 'cmd_arguments_menu']], None)
             shared.gradio['reset_interface'].click(lambda : None, None, None, _js='() => {document.body.innerHTML=\'<h1 style="font-family:monospace;margin-top:20%;color:lightgray;text-align:center;">Reloading...</h1>\'; setTimeout(function(){location.reload()},2500)}')
+        
+        with gr.Tab("Training", elem_id="training-tab"):
+            training.create_train_interface()
 
         if shared.args.extensions is not None:
             extensions_module.create_extensions_block()
diff --git a/training/datasets/put-trainer-datasets-here.txt b/training/datasets/put-trainer-datasets-here.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/training/formats/alpaca-chatbot-format.json b/training/formats/alpaca-chatbot-format.json
new file mode 100644
index 00000000..4b38103f
--- /dev/null
+++ b/training/formats/alpaca-chatbot-format.json
@@ -0,0 +1,4 @@
+{
+    "instruction,output": "User: %instruction%\nAssistant: %output%",
+    "instruction,input,output": "User: %instruction%: %input%\nAssistant: %output%"
+}
diff --git a/training/formats/alpaca-format.json b/training/formats/alpaca-format.json
new file mode 100644
index 00000000..dd6df956
--- /dev/null
+++ b/training/formats/alpaca-format.json
@@ -0,0 +1,4 @@
+{
+    "instruction,output": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n%instruction%\n\n### Response:\n%output%",
+    "instruction,input,output": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n%instruction%\n\n### Input:\n%input%\n\n### Response:\n%output%"
+}
diff --git a/training/formats/put-trainer-formats-here.txt b/training/formats/put-trainer-formats-here.txt
new file mode 100644
index 00000000..e69de29b

From 7bf601107c1b9aebd5bbbb5d08aa3d20c697daf1 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Sat, 25 Mar 2023 12:28:46 -0700
Subject: [PATCH 02/21] automatically strip empty data entries (for better
 alpaca dataset compat)

---
 modules/training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/training.py b/modules/training.py
index 96cd6e7c..e2be18e8 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -74,7 +74,7 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
         }
     def generate_prompt(data_point: dict[str, str]):
         for options, data in formatData.items():
-            if set(options.split(',')) == set(data_point.keys()):
+            if set(options.split(',')) == set(x[0] for x in data_point.items() if len(x[1].strip()) > 0):
                 for key, val in data_point.items():
                     data = data.replace(f'%{key}%', val)
             return data

From 5c49a0dcd02c3cf2e31a00fdaf554f36895276d7 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Sat, 25 Mar 2023 12:37:32 -0700
Subject: [PATCH 03/21] fix error from prepare call running twice in a row

---
 modules/training.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/modules/training.py b/modules/training.py
index e2be18e8..0e210c52 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -90,7 +90,8 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
         evalData = load_dataset("json", data_files=cleanPath('training/datasets', f'{evalDataset}.json'))
         evalData = evalData['train'].shuffle().map(generate_and_tokenize_prompt)
     # Start prepping the model itself
-    model = prepare_model_for_int8_training(model)
+    if not hasattr(model, 'lm_head') or hasattr(model.lm_head, 'weight'):
+        model = prepare_model_for_int8_training(model)
     config = LoraConfig(
         r=loraRank,
         lora_alpha=loraAlpha,

From 8da237223ed008c418386a805524929ddebb59ba Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Sat, 25 Mar 2023 12:48:35 -0700
Subject: [PATCH 04/21] document options better

---
 modules/training.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index 0e210c52..250093a0 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -14,27 +14,27 @@ def get_json_dataset(path: str):
 def create_train_interface():
     with gr.Tab('Train LoRA', elem_id='lora-train-tab'):
         loraName = gr.Textbox(label="Name", info="The name of your new LoRA file")
-        # TODO: Add explanations of batch sizes and recommendations. Note that batch/microBatch determines gradient accumulation and explain what that means. Note the effects on VRAM usage from changing these values.
-        microBatchSize = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='(TODO)')
-        batchSize = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='(TODO)')
+        # TODO: Implement multi-device support.
+        microBatchSize = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.')
+        batchSize = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
         epochs = gr.Slider(label='Epochs', value=1, minimum=1, maximum=1000, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
         learningRate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
         # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale.
         loraRank = gr.Slider(label='LoRA Rank', value=8, minimum=1, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
         loraAlpha = gr.Slider(label='LoRA Alpha', value=16, minimum=1, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
-        # TODO: Better explain what this does.
+        # TODO: Better explain what this does, in terms of real world effect especially.
         loraDropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers.')
         cutoffLen = gr.Slider(label='Cutoff Length', minimum=1,maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
         with gr.Row():
             datasetFunction = get_json_dataset('training/datasets')
-            dataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Dataset')
+            dataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Dataset', info='The dataset file to use for training.')
             ui.create_refresh_button(dataset, lambda : None, lambda : {'choices': datasetFunction()}, 'refresh-button')
         with gr.Row():
-            evalDataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Evaluation Dataset')
+            evalDataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Evaluation Dataset', info='The dataset file used to evaluate the model after training.')
             ui.create_refresh_button(evalDataset, lambda : None, lambda : {'choices': datasetFunction()}, 'refresh-button')
         with gr.Row():
             formatsFunction = get_json_dataset('training/formats')
-            format = gr.Dropdown(choices=formatsFunction(), value='None', label='Data Format')
+            format = gr.Dropdown(choices=formatsFunction(), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.')
             ui.create_refresh_button(format, lambda : None, lambda : {'choices': formatsFunction()}, 'refresh-button')
         startButton = gr.Button("Start LoRA Training")
         output = gr.Markdown(value="(...)")

From f1ba2196b1a640bd094623120486b847ca59ccf5 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Sat, 25 Mar 2023 12:57:36 -0700
Subject: [PATCH 05/21] make 'model' variables less ambiguous

---
 modules/training.py | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index 250093a0..f9f0790f 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -59,15 +59,13 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
         return "**Missing format choice input, cannot continue.**"
     gradientAccumulationSteps = batchSize // microBatchSize
     actualLR = float(learningRate)
-    model = shared.model
-    tokenizer = shared.tokenizer
-    tokenizer.pad_token = 0
-    tokenizer.padding_side = "left"
+    shared.tokenizer.pad_token = 0
+    shared.tokenizer.padding_side = "left"
     # Prep the dataset, format, etc
     with open(cleanPath('training/formats', f'{format}.json'), 'r') as formatFile:
         formatData: dict[str, str] = json.load(formatFile)
     def tokenize(prompt):
-        result = tokenizer(prompt, truncation=True, max_length=cutoffLen + 1, padding="max_length")
+        result = shared.tokenizer(prompt, truncation=True, max_length=cutoffLen + 1, padding="max_length")
         return {
             "input_ids": result["input_ids"][:-1],
             "attention_mask": result["attention_mask"][:-1],
@@ -90,8 +88,8 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
         evalData = load_dataset("json", data_files=cleanPath('training/datasets', f'{evalDataset}.json'))
         evalData = evalData['train'].shuffle().map(generate_and_tokenize_prompt)
     # Start prepping the model itself
-    if not hasattr(model, 'lm_head') or hasattr(model.lm_head, 'weight'):
-        model = prepare_model_for_int8_training(model)
+    if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
+        prepare_model_for_int8_training(shared.model)
     config = LoraConfig(
         r=loraRank,
         lora_alpha=loraAlpha,
@@ -101,9 +99,9 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
         bias="none",
         task_type="CAUSAL_LM"
     )
-    model = get_peft_model(model, config)
+    loraModel = get_peft_model(shared.model, config)
     trainer = transformers.Trainer(
-        model=model,
+        model=loraModel,
         train_dataset=train_data,
         eval_dataset=evalData,
         args=transformers.TrainingArguments(
@@ -125,16 +123,16 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
             # TODO: Enable multi-device support
             ddp_find_unused_parameters=None,
         ),
-        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+        data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
     )
-    model.config.use_cache = False
-    old_state_dict = model.state_dict
-    model.state_dict = (
+    loraModel.config.use_cache = False
+    old_state_dict = loraModel.state_dict
+    loraModel.state_dict = (
         lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
-    ).__get__(model, type(model))
+    ).__get__(loraModel, type(loraModel))
     if torch.__version__ >= "2" and sys.platform != "win32":
-        model = torch.compile(model)
+        loraModel = torch.compile(loraModel)
     # Actually start and run and save at the end
     trainer.train()
-    model.save_pretrained(loraName)
+    loraModel.save_pretrained(loraName)
     return "Done!"

From d911c22af9019312eb05f3981ffee22c7243f1d8 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 08:31:49 -0700
Subject: [PATCH 06/21] use shared rows to make the LoRA Trainer interface a
 bit more compact / clean

---
 modules/training.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index f9f0790f..aa085fda 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -14,11 +14,13 @@ def get_json_dataset(path: str):
 def create_train_interface():
     with gr.Tab('Train LoRA', elem_id='lora-train-tab'):
         loraName = gr.Textbox(label="Name", info="The name of your new LoRA file")
-        # TODO: Implement multi-device support.
-        microBatchSize = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.')
-        batchSize = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
-        epochs = gr.Slider(label='Epochs', value=1, minimum=1, maximum=1000, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
-        learningRate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
+        with gr.Row():
+            # TODO: Implement multi-device support.
+            microBatchSize = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.')
+            batchSize = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
+        with gr.Row():
+            epochs = gr.Number(label='Epochs', value=1, minimum=1, maximum=1000, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
+            learningRate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
         # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale.
         loraRank = gr.Slider(label='LoRA Rank', value=8, minimum=1, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
         loraAlpha = gr.Slider(label='LoRA Alpha', value=16, minimum=1, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
@@ -29,10 +31,8 @@ def create_train_interface():
             datasetFunction = get_json_dataset('training/datasets')
             dataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Dataset', info='The dataset file to use for training.')
             ui.create_refresh_button(dataset, lambda : None, lambda : {'choices': datasetFunction()}, 'refresh-button')
-        with gr.Row():
             evalDataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Evaluation Dataset', info='The dataset file used to evaluate the model after training.')
             ui.create_refresh_button(evalDataset, lambda : None, lambda : {'choices': datasetFunction()}, 'refresh-button')
-        with gr.Row():
             formatsFunction = get_json_dataset('training/formats')
             format = gr.Dropdown(choices=formatsFunction(), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.')
             ui.create_refresh_button(format, lambda : None, lambda : {'choices': formatsFunction()}, 'refresh-button')

From 2afe1c13c143dd3e8c2d63c15fb8c1ef59895448 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 08:32:32 -0700
Subject: [PATCH 07/21] move Training to before Interface mode

Interface mode seems to be a core 'settings' page that naturally belongs at the very end, so the Training tab is moved ahead of it.
---
 server.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/server.py b/server.py
index 03158ac6..0e512c7b 100644
--- a/server.py
+++ b/server.py
@@ -468,6 +468,9 @@ def create_interface():
             shared.gradio['Stop'].click(None, None, None, cancels=gen_events)
             shared.gradio['interface'].load(None, None, None, _js=f"() => {{{ui.main_js}}}")
 
+        with gr.Tab("Training", elem_id="training-tab"):
+            training.create_train_interface()
+
         with gr.Tab("Interface mode", elem_id="interface-mode"):
             modes = ["default", "notebook", "chat", "cai_chat"]
             current_mode = "default"
@@ -488,9 +491,6 @@ def create_interface():
             shared.gradio['reset_interface'].click(set_interface_arguments, [shared.gradio[k] for k in ['interface_modes_menu', 'extensions_menu', 'cmd_arguments_menu']], None)
             shared.gradio['reset_interface'].click(lambda : None, None, None, _js='() => {document.body.innerHTML=\'<h1 style="font-family:monospace;margin-top:20%;color:lightgray;text-align:center;">Reloading...</h1>\'; setTimeout(function(){location.reload()},2500)}')
         
-        with gr.Tab("Training", elem_id="training-tab"):
-            training.create_train_interface()
-
         if shared.args.extensions is not None:
             extensions_module.create_extensions_block()
 

From c07bcd0850ce0312826f6195450a3b04eb1788f8 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 09:41:06 -0700
Subject: [PATCH 08/21] add some outputs to indicate progress updates (sorta)

An actual progress bar is still needed. Also includes minor formatting fixes.
---
 modules/training.py | 15 ++++++++++++---
 server.py           |  2 +-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index aa085fda..b9f3d192 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -19,7 +19,7 @@ def create_train_interface():
             microBatchSize = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.')
             batchSize = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
         with gr.Row():
-            epochs = gr.Number(label='Epochs', value=1, minimum=1, maximum=1000, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
+            epochs = gr.Number(label='Epochs', value=1, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
             learningRate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
         # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale.
         loraRank = gr.Slider(label='LoRA Rank', value=8, minimum=1, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
@@ -50,6 +50,7 @@ def cleanPath(basePath: str, path: str):
     return f'{Path(basePath).absolute()}/{path}'
 
 def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, learningRate: float, loraRank: int, loraAlpha: int, loraDropout: float, cutoffLen: int, dataset: str, evalDataset: str, format: str):
+    yield "Prepping..."
     # Input validation / processing
     # TODO: --lora-dir PR once pulled will need to be applied here
     loraName = f"loras/{cleanPath(None, loraName)}"
@@ -80,6 +81,7 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
     def generate_and_tokenize_prompt(data_point):
         prompt = generate_prompt(data_point)
         return tokenize(prompt)
+    print("Loading datasets...")
     data = load_dataset("json", data_files=cleanPath('training/datasets', f'{dataset}.json'))
     train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)
     if evalDataset == 'None':
@@ -89,7 +91,9 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
         evalData = evalData['train'].shuffle().map(generate_and_tokenize_prompt)
     # Start prepping the model itself
     if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
+        print("Getting model ready...")
         prepare_model_for_int8_training(shared.model)
+    print("Prepping for training...")
     config = LoraConfig(
         r=loraRank,
         lora_alpha=loraAlpha,
@@ -121,7 +125,7 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
             save_total_limit=3,
             load_best_model_at_end=True if evalData is not None else False,
             # TODO: Enable multi-device support
-            ddp_find_unused_parameters=None,
+            ddp_find_unused_parameters=None
         ),
         data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
     )
@@ -133,6 +137,11 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
     if torch.__version__ >= "2" and sys.platform != "win32":
         loraModel = torch.compile(loraModel)
     # Actually start and run and save at the end
+    # TODO: save/load checkpoints to resume from?
+    print("Starting training...")
+    yield "Running..."
     trainer.train()
+    print("Training complete, saving...")
     loraModel.save_pretrained(loraName)
-    return "Done!"
+    print("Training complete!")
+    yield f"Done! Lora saved to `{loraName}`"
diff --git a/server.py b/server.py
index 0e512c7b..caca85c9 100644
--- a/server.py
+++ b/server.py
@@ -490,7 +490,7 @@ def create_interface():
 
             shared.gradio['reset_interface'].click(set_interface_arguments, [shared.gradio[k] for k in ['interface_modes_menu', 'extensions_menu', 'cmd_arguments_menu']], None)
             shared.gradio['reset_interface'].click(lambda : None, None, None, _js='() => {document.body.innerHTML=\'<h1 style="font-family:monospace;margin-top:20%;color:lightgray;text-align:center;">Reloading...</h1>\'; setTimeout(function(){location.reload()},2500)}')
-        
+
         if shared.args.extensions is not None:
             extensions_module.create_extensions_block()
 

From 8fc723fc95d82755ae9280d9a8fe8b6feb804b1e Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 10:25:08 -0700
Subject: [PATCH 09/21] initial progress tracker in UI

---
 modules/training.py | 48 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index b9f3d192..c83427d6 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -1,4 +1,4 @@
-import sys, torch, json
+import sys, torch, json, threading, time
 from pathlib import Path
 import gradio as gr
 from datasets import load_dataset
@@ -6,6 +6,9 @@ import transformers
 from modules import ui, shared
 from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, get_peft_model_state_dict
 
+CURRENT_STEPS = 0
+MAX_STEPS = 0
+
 def get_json_dataset(path: str):
     def get_set():
         return ['None'] + sorted(set(map(lambda x : '.'.join(str(x.name).split('.')[:-1]), Path(path).glob('*.json'))), key=str.lower)
@@ -40,6 +43,12 @@ def create_train_interface():
         output = gr.Markdown(value="(...)")
         startButton.click(do_train, [loraName, microBatchSize, batchSize, epochs, learningRate, loraRank, loraAlpha, loraDropout, cutoffLen, dataset, evalDataset, format], [output])
 
+class Callbacks(transformers.TrainerCallback):
+    def on_step_begin(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
+        global CURRENT_STEPS, MAX_STEPS
+        CURRENT_STEPS = state.global_step
+        MAX_STEPS = state.max_steps
+
 def cleanPath(basePath: str, path: str):
     """"Strips unusual symbols and forcibly builds a path as relative to the intended directory."""
     # TODO: Probably could do with a security audit to guarantee there's no ways this can be bypassed to target an unwanted path.
@@ -50,8 +59,11 @@ def cleanPath(basePath: str, path: str):
     return f'{Path(basePath).absolute()}/{path}'
 
 def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, learningRate: float, loraRank: int, loraAlpha: int, loraDropout: float, cutoffLen: int, dataset: str, evalDataset: str, format: str):
+    global CURRENT_STEPS, MAX_STEPS
+    CURRENT_STEPS = 0
+    MAX_STEPS = 0
     yield "Prepping..."
-    # Input validation / processing
+    # == Input validation / processing ==
     # TODO: --lora-dir PR once pulled will need to be applied here
     loraName = f"loras/{cleanPath(None, loraName)}"
     if dataset is None:
@@ -62,7 +74,7 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
     actualLR = float(learningRate)
     shared.tokenizer.pad_token = 0
     shared.tokenizer.padding_side = "left"
-    # Prep the dataset, format, etc
+    # == Prep the dataset, format, etc ==
     with open(cleanPath('training/formats', f'{format}.json'), 'r') as formatFile:
         formatData: dict[str, str] = json.load(formatFile)
     def tokenize(prompt):
@@ -89,7 +101,7 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
     else:
         evalData = load_dataset("json", data_files=cleanPath('training/datasets', f'{evalDataset}.json'))
         evalData = evalData['train'].shuffle().map(generate_and_tokenize_prompt)
-    # Start prepping the model itself
+    # == Start prepping the model itself ==
     if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
         print("Getting model ready...")
         prepare_model_for_int8_training(shared.model)
@@ -128,6 +140,7 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
             ddp_find_unused_parameters=None
         ),
         data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
+        callbacks=list([Callbacks()])
     )
     loraModel.config.use_cache = False
     old_state_dict = loraModel.state_dict
@@ -136,12 +149,31 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
     ).__get__(loraModel, type(loraModel))
     if torch.__version__ >= "2" and sys.platform != "win32":
         loraModel = torch.compile(loraModel)
-    # Actually start and run and save at the end
+    # == Main run and monitor loop ==
     # TODO: save/load checkpoints to resume from?
     print("Starting training...")
-    yield "Running..."
-    trainer.train()
+    yield "Starting..."
+    def threadedRun():
+        trainer.train()
+    thread = threading.Thread(target=threadedRun)
+    thread.start()
+    lastStep = 0
+    startTime = time.perf_counter()
+    while thread.is_alive():
+        time.sleep(0.5)
+        if CURRENT_STEPS != lastStep:
+            lastStep = CURRENT_STEPS
+            timeElapsed = time.perf_counter() - startTime
+            if timeElapsed <= 0:
+                timerInfo = ""
+            else:
+                its = CURRENT_STEPS / timeElapsed
+                if its > 1:
+                    timerInfo = f"`{its:.2f}` it/s"
+                else:
+                    timerInfo = f"`{1.0/its:.2f}` s/it"
+            yield f"Running... **{CURRENT_STEPS}** / **{MAX_STEPS}** ... {timerInfo}, `{timeElapsed:.1f}` seconds"
     print("Training complete, saving...")
     loraModel.save_pretrained(loraName)
     print("Training complete!")
-    yield f"Done! Lora saved to `{loraName}`"
+    yield f"Done! LoRA saved to `{loraName}`"

From 16ea4fc36df9ec0cde796eaecf22db64c4d91fd8 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 10:43:01 -0700
Subject: [PATCH 10/21] interrupt button

---
 modules/training.py | 42 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index c83427d6..19f33220 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -6,8 +6,10 @@ import transformers
 from modules import ui, shared
 from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, get_peft_model_state_dict
 
+WANT_INTERRUPT = False
 CURRENT_STEPS = 0
 MAX_STEPS = 0
+CURRENT_GRADIENT_ACCUM = 1
 
 def get_json_dataset(path: str):
     def get_set():
@@ -39,15 +41,31 @@ def create_train_interface():
             formatsFunction = get_json_dataset('training/formats')
             format = gr.Dropdown(choices=formatsFunction(), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.')
             ui.create_refresh_button(format, lambda : None, lambda : {'choices': formatsFunction()}, 'refresh-button')
-        startButton = gr.Button("Start LoRA Training")
+        with gr.Row():
+            startButton = gr.Button("Start LoRA Training")
+            stopButton = gr.Button("Interrupt")
         output = gr.Markdown(value="(...)")
-        startButton.click(do_train, [loraName, microBatchSize, batchSize, epochs, learningRate, loraRank, loraAlpha, loraDropout, cutoffLen, dataset, evalDataset, format], [output])
+        startEvent = startButton.click(do_train, [loraName, microBatchSize, batchSize, epochs, learningRate, loraRank, loraAlpha, loraDropout, cutoffLen, dataset, evalDataset, format], [output])
+        stopButton.click(doInterrupt, [], [], cancels=[], queue=False)
+
+def doInterrupt():
+    global WANT_INTERRUPT
+    WANT_INTERRUPT = True
 
 class Callbacks(transformers.TrainerCallback):
     def on_step_begin(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
         global CURRENT_STEPS, MAX_STEPS
-        CURRENT_STEPS = state.global_step
-        MAX_STEPS = state.max_steps
+        CURRENT_STEPS = state.global_step * CURRENT_GRADIENT_ACCUM
+        MAX_STEPS = state.max_steps * CURRENT_GRADIENT_ACCUM
+        if WANT_INTERRUPT:
+            control.should_epoch_stop = True
+            control.should_training_stop = True
+    def on_substep_end(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
+        global CURRENT_STEPS
+        CURRENT_STEPS += 1
+        if WANT_INTERRUPT:
+            control.should_epoch_stop = True
+            control.should_training_stop = True
 
 def cleanPath(basePath: str, path: str):
     """"Strips unusual symbols and forcibly builds a path as relative to the intended directory."""
@@ -59,7 +77,8 @@ def cleanPath(basePath: str, path: str):
     return f'{Path(basePath).absolute()}/{path}'
 
 def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, learningRate: float, loraRank: int, loraAlpha: int, loraDropout: float, cutoffLen: int, dataset: str, evalDataset: str, format: str):
-    global CURRENT_STEPS, MAX_STEPS
+    global WANT_INTERRUPT, CURRENT_STEPS, MAX_STEPS, CURRENT_GRADIENT_ACCUM
+    WANT_INTERRUPT = False
     CURRENT_STEPS = 0
     MAX_STEPS = 0
     yield "Prepping..."
@@ -71,6 +90,7 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
     if format is None:
         return "**Missing format choice input, cannot continue.**"
     gradientAccumulationSteps = batchSize // microBatchSize
+    CURRENT_GRADIENT_ACCUM = gradientAccumulationSteps
     actualLR = float(learningRate)
     shared.tokenizer.pad_token = 0
     shared.tokenizer.padding_side = "left"
@@ -161,7 +181,9 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
     startTime = time.perf_counter()
     while thread.is_alive():
         time.sleep(0.5)
-        if CURRENT_STEPS != lastStep:
+        if WANT_INTERRUPT:
+            yield "Interrupting, please wait... *(Run will stop after the current training step completes.)*"
+        elif CURRENT_STEPS != lastStep:
             lastStep = CURRENT_STEPS
             timeElapsed = time.perf_counter() - startTime
             if timeElapsed <= 0:
@@ -175,5 +197,9 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
             yield f"Running... **{CURRENT_STEPS}** / **{MAX_STEPS}** ... {timerInfo}, `{timeElapsed:.1f}` seconds"
     print("Training complete, saving...")
     loraModel.save_pretrained(loraName)
-    print("Training complete!")
-    yield f"Done! LoRA saved to `{loraName}`"
+    if WANT_INTERRUPT:
+        print("Training interrupted.")
+        yield f"Interrupted. Incomplete LoRA saved to `{loraName}`"
+    else:
+        print("Training complete!")
+        yield f"Done! LoRA saved to `{loraName}`"

From 9ced75746de1335e383626d996ced7d0d17e489b Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 10:57:27 -0700
Subject: [PATCH 11/21] add total time estimate

---
 modules/training.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/modules/training.py b/modules/training.py
index 19f33220..f8846049 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -188,13 +188,15 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
             timeElapsed = time.perf_counter() - startTime
             if timeElapsed <= 0:
                 timerInfo = ""
+                totalTimeEstimate = 999
             else:
                 its = CURRENT_STEPS / timeElapsed
                 if its > 1:
                     timerInfo = f"`{its:.2f}` it/s"
                 else:
                     timerInfo = f"`{1.0/its:.2f}` s/it"
-            yield f"Running... **{CURRENT_STEPS}** / **{MAX_STEPS}** ... {timerInfo}, `{timeElapsed:.1f}` seconds"
+                totalTimeEstimate = (1.0/its) * (MAX_STEPS)
+            yield f"Running... **{CURRENT_STEPS}** / **{MAX_STEPS}** ... {timerInfo}, `{timeElapsed:.0f}`/`{totalTimeEstimate:.0f}` seconds"
     print("Training complete, saving...")
     loraModel.save_pretrained(loraName)
     if WANT_INTERRUPT:

From 2f0571bfa4a17300113b3e91f422cc8aa5471b4d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 27 Mar 2023 21:24:39 -0300
Subject: [PATCH 12/21] Small style changes

---
 css/main.css        |  2 +-
 modules/training.py | 23 ++++++++++++++++++-----
 server.py           |  2 +-
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/css/main.css b/css/main.css
index 3f044094..6aa3bc1a 100644
--- a/css/main.css
+++ b/css/main.css
@@ -41,7 +41,7 @@ ol li p, ul li p {
     display: inline-block;
 }
 
-#main, #parameters, #chat-settings, #interface-mode, #lora {
+#main, #parameters, #chat-settings, #interface-mode, #lora, #training-tab {
   border: 0;
 }
 
diff --git a/modules/training.py b/modules/training.py
index f8846049..bc5b3878 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -1,10 +1,17 @@
-import sys, torch, json, threading, time
+import json
+import sys
+import threading
+import time
 from pathlib import Path
+
 import gradio as gr
-from datasets import load_dataset
+import torch
 import transformers
-from modules import ui, shared
-from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, get_peft_model_state_dict
+from datasets import load_dataset
+from peft import (LoraConfig, get_peft_model, get_peft_model_state_dict,
+                  prepare_model_for_int8_training)
+
+from modules import shared, ui
 
 WANT_INTERRUPT = False
 CURRENT_STEPS = 0
@@ -44,7 +51,7 @@ def create_train_interface():
         with gr.Row():
             startButton = gr.Button("Start LoRA Training")
             stopButton = gr.Button("Interrupt")
-        output = gr.Markdown(value="(...)")
+        output = gr.Markdown(value="Ready")
         startEvent = startButton.click(do_train, [loraName, microBatchSize, batchSize, epochs, learningRate, loraRank, loraAlpha, loraDropout, cutoffLen, dataset, evalDataset, format], [output])
         stopButton.click(doInterrupt, [], [], cancels=[], queue=False)
 
@@ -169,16 +176,20 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
     ).__get__(loraModel, type(loraModel))
     if torch.__version__ >= "2" and sys.platform != "win32":
         loraModel = torch.compile(loraModel)
+
     # == Main run and monitor loop ==
     # TODO: save/load checkpoints to resume from?
     print("Starting training...")
     yield "Starting..."
+
     def threadedRun():
         trainer.train()
+
     thread = threading.Thread(target=threadedRun)
     thread.start()
     lastStep = 0
     startTime = time.perf_counter()
+
     while thread.is_alive():
         time.sleep(0.5)
         if WANT_INTERRUPT:
@@ -197,8 +208,10 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
                     timerInfo = f"`{1.0/its:.2f}` s/it"
                 totalTimeEstimate = (1.0/its) * (MAX_STEPS)
             yield f"Running... **{CURRENT_STEPS}** / **{MAX_STEPS}** ... {timerInfo}, `{timeElapsed:.0f}`/`{totalTimeEstimate:.0f}` seconds"
+
     print("Training complete, saving...")
     loraModel.save_pretrained(loraName)
+
     if WANT_INTERRUPT:
         print("Training interrupted.")
         yield f"Interrupted. Incomplete LoRA saved to `{loraName}`"
diff --git a/server.py b/server.py
index cf37dc50..c3c8d2c8 100644
--- a/server.py
+++ b/server.py
@@ -9,8 +9,8 @@ from pathlib import Path
 
 import gradio as gr
 
-from modules import chat, shared, ui, training
 import modules.extensions as extensions_module
+from modules import chat, shared, training, ui
 from modules.html_generator import generate_chat_html
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, load_soft_prompt

From 6368dad7dbf4a85d840930548bce5a28714f65e5 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 18:17:42 -0700
Subject: [PATCH 13/21] Fix camelCase to snake_case to match repo format
 standard

---
 modules/training.py | 132 +++++++++++++++++++++++++-------------------
 1 file changed, 74 insertions(+), 58 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index bc5b3878..f63f2990 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -25,35 +25,40 @@ def get_json_dataset(path: str):
 
 def create_train_interface():
     with gr.Tab('Train LoRA', elem_id='lora-train-tab'):
-        loraName = gr.Textbox(label="Name", info="The name of your new LoRA file")
+        lora_name = gr.Textbox(label="Name", info="The name of your new LoRA file")
         with gr.Row():
             # TODO: Implement multi-device support.
-            microBatchSize = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.')
-            batchSize = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
+            micro_batch_size = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.')
+            batch_size = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
+
         with gr.Row():
             epochs = gr.Number(label='Epochs', value=1, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
-            learningRate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
+            learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
+
         # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale.
-        loraRank = gr.Slider(label='LoRA Rank', value=8, minimum=1, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
-        loraAlpha = gr.Slider(label='LoRA Alpha', value=16, minimum=1, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
+        lora_rank = gr.Slider(label='LoRA Rank', value=8, minimum=1, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
+        lora_alpha = gr.Slider(label='LoRA Alpha', value=16, minimum=1, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
         # TODO: Better explain what this does, in terms of real world effect especially.
-        loraDropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers.')
-        cutoffLen = gr.Slider(label='Cutoff Length', minimum=1,maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
+        lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers.')
+        cutoff_len = gr.Slider(label='Cutoff Length', minimum=1,maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
+
         with gr.Row():
-            datasetFunction = get_json_dataset('training/datasets')
-            dataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Dataset', info='The dataset file to use for training.')
-            ui.create_refresh_button(dataset, lambda : None, lambda : {'choices': datasetFunction()}, 'refresh-button')
-            evalDataset = gr.Dropdown(choices=datasetFunction(), value='None', label='Evaluation Dataset', info='The dataset file used to evaluate the model after training.')
-            ui.create_refresh_button(evalDataset, lambda : None, lambda : {'choices': datasetFunction()}, 'refresh-button')
-            formatsFunction = get_json_dataset('training/formats')
-            format = gr.Dropdown(choices=formatsFunction(), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.')
-            ui.create_refresh_button(format, lambda : None, lambda : {'choices': formatsFunction()}, 'refresh-button')
+            dataset_function = get_json_dataset('training/datasets')
+            dataset = gr.Dropdown(choices=dataset_function(), value='None', label='Dataset', info='The dataset file to use for training.')
+            ui.create_refresh_button(dataset, lambda : None, lambda : {'choices': dataset_function()}, 'refresh-button')
+            eval_dataset = gr.Dropdown(choices=dataset_function(), value='None', label='Evaluation Dataset', info='The dataset file used to evaluate the model after training.')
+            ui.create_refresh_button(eval_dataset, lambda : None, lambda : {'choices': dataset_function()}, 'refresh-button')
+            formats_function = get_json_dataset('training/formats')
+            format = gr.Dropdown(choices=formats_function(), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.')
+            ui.create_refresh_button(format, lambda : None, lambda : {'choices': formats_function()}, 'refresh-button')
+
         with gr.Row():
-            startButton = gr.Button("Start LoRA Training")
-            stopButton = gr.Button("Interrupt")
+            start_button = gr.Button("Start LoRA Training")
+            stop_button = gr.Button("Interrupt")
+
         output = gr.Markdown(value="Ready")
-        startEvent = startButton.click(do_train, [loraName, microBatchSize, batchSize, epochs, learningRate, loraRank, loraAlpha, loraDropout, cutoffLen, dataset, evalDataset, format], [output])
-        stopButton.click(doInterrupt, [], [], cancels=[], queue=False)
+        startEvent = start_button.click(do_train, [lora_name, micro_batch_size, batch_size, epochs, learning_rate, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format], [output])
+        stop_button.click(doInterrupt, [], [], cancels=[], queue=False)
 
 def doInterrupt():
     global WANT_INTERRUPT
@@ -74,108 +79,119 @@ class Callbacks(transformers.TrainerCallback):
             control.should_epoch_stop = True
             control.should_training_stop = True
 
-def cleanPath(basePath: str, path: str):
+def cleanPath(base_path: str, path: str):
     """"Strips unusual symbols and forcibly builds a path as relative to the intended directory."""
     # TODO: Probably could do with a security audit to guarantee there's no ways this can be bypassed to target an unwanted path.
     # Or swap it to a strict whitelist of [a-zA-Z_0-9]
     path = path.replace('\\', '/').replace('..', '_')
-    if basePath is None:
+    if base_path is None:
         return path
-    return f'{Path(basePath).absolute()}/{path}'
+    return f'{Path(base_path).absolute()}/{path}'
 
-def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, learningRate: float, loraRank: int, loraAlpha: int, loraDropout: float, cutoffLen: int, dataset: str, evalDataset: str, format: str):
+def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: float, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str):
     global WANT_INTERRUPT, CURRENT_STEPS, MAX_STEPS, CURRENT_GRADIENT_ACCUM
     WANT_INTERRUPT = False
     CURRENT_STEPS = 0
     MAX_STEPS = 0
-    yield "Prepping..."
+
     # == Input validation / processing ==
+    yield "Prepping..."
     # TODO: --lora-dir PR once pulled will need to be applied here
-    loraName = f"loras/{cleanPath(None, loraName)}"
+    lora_name = f"loras/{cleanPath(None, lora_name)}"
     if dataset is None:
         return "**Missing dataset choice input, cannot continue.**"
     if format is None:
         return "**Missing format choice input, cannot continue.**"
-    gradientAccumulationSteps = batchSize // microBatchSize
-    CURRENT_GRADIENT_ACCUM = gradientAccumulationSteps
-    actualLR = float(learningRate)
+    gradient_accumulation_steps = batch_size // micro_batch_size
+    CURRENT_GRADIENT_ACCUM = gradient_accumulation_steps
+    actual_lr = float(learning_rate)
     shared.tokenizer.pad_token = 0
     shared.tokenizer.padding_side = "left"
+
     # == Prep the dataset, format, etc ==
     with open(cleanPath('training/formats', f'{format}.json'), 'r') as formatFile:
-        formatData: dict[str, str] = json.load(formatFile)
+        format_data: dict[str, str] = json.load(formatFile)
+
     def tokenize(prompt):
-        result = shared.tokenizer(prompt, truncation=True, max_length=cutoffLen + 1, padding="max_length")
+        result = shared.tokenizer(prompt, truncation=True, max_length=cutoff_len + 1, padding="max_length")
         return {
             "input_ids": result["input_ids"][:-1],
             "attention_mask": result["attention_mask"][:-1],
         }
+
     def generate_prompt(data_point: dict[str, str]):
-        for options, data in formatData.items():
+        for options, data in format_data.items():
             if set(options.split(',')) == set(x[0] for x in data_point.items() if len(x[1].strip()) > 0):
                 for key, val in data_point.items():
                     data = data.replace(f'%{key}%', val)
             return data
-        raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(formatData.keys())}"')
+        raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"')
+
     def generate_and_tokenize_prompt(data_point):
         prompt = generate_prompt(data_point)
         return tokenize(prompt)
+
     print("Loading datasets...")
     data = load_dataset("json", data_files=cleanPath('training/datasets', f'{dataset}.json'))
     train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)
-    if evalDataset == 'None':
-        evalData = None
+
+    if eval_dataset == 'None':
+        eval_data = None
     else:
-        evalData = load_dataset("json", data_files=cleanPath('training/datasets', f'{evalDataset}.json'))
-        evalData = evalData['train'].shuffle().map(generate_and_tokenize_prompt)
+        eval_data = load_dataset("json", data_files=cleanPath('training/datasets', f'{eval_dataset}.json'))
+        eval_data = eval_data['train'].shuffle().map(generate_and_tokenize_prompt)
+    
     # == Start prepping the model itself ==
     if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
         print("Getting model ready...")
         prepare_model_for_int8_training(shared.model)
+    
     print("Prepping for training...")
     config = LoraConfig(
-        r=loraRank,
-        lora_alpha=loraAlpha,
+        r=lora_rank,
+        lora_alpha=lora_alpha,
         # TODO: Should target_modules be configurable?
         target_modules=[ "q_proj", "v_proj" ],
-        lora_dropout=loraDropout,
+        lora_dropout=lora_dropout,
         bias="none",
         task_type="CAUSAL_LM"
     )
-    loraModel = get_peft_model(shared.model, config)
+    lora_model = get_peft_model(shared.model, config)
     trainer = transformers.Trainer(
-        model=loraModel,
+        model=lora_model,
         train_dataset=train_data,
-        eval_dataset=evalData,
+        eval_dataset=eval_data,
         args=transformers.TrainingArguments(
-            per_device_train_batch_size=microBatchSize,
-            gradient_accumulation_steps=gradientAccumulationSteps,
+            per_device_train_batch_size=micro_batch_size,
+            gradient_accumulation_steps=gradient_accumulation_steps,
             # TODO: Should more of these be configurable? Probably.
             warmup_steps=100,
             num_train_epochs=epochs,
-            learning_rate=actualLR,
+            learning_rate=actual_lr,
             fp16=True,
             logging_steps=20,
-            evaluation_strategy="steps" if evalData is not None else "no",
+            evaluation_strategy="steps" if eval_data is not None else "no",
             save_strategy="steps",
-            eval_steps=200 if evalData is not None else None,
+            eval_steps=200 if eval_data is not None else None,
             save_steps=200,
-            output_dir=loraName,
+            output_dir=lora_name,
             save_total_limit=3,
-            load_best_model_at_end=True if evalData is not None else False,
+            load_best_model_at_end=True if eval_data is not None else False,
             # TODO: Enable multi-device support
             ddp_find_unused_parameters=None
         ),
         data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
         callbacks=list([Callbacks()])
     )
-    loraModel.config.use_cache = False
-    old_state_dict = loraModel.state_dict
-    loraModel.state_dict = (
+
+    lora_model.config.use_cache = False
+    old_state_dict = lora_model.state_dict
+    lora_model.state_dict = (
         lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
-    ).__get__(loraModel, type(loraModel))
+    ).__get__(lora_model, type(lora_model))
+
     if torch.__version__ >= "2" and sys.platform != "win32":
-        loraModel = torch.compile(loraModel)
+        lora_model = torch.compile(lora_model)
 
     # == Main run and monitor loop ==
     # TODO: save/load checkpoints to resume from?
@@ -210,11 +226,11 @@ def do_train(loraName: str, microBatchSize: int, batchSize: int, epochs: int, le
             yield f"Running... **{CURRENT_STEPS}** / **{MAX_STEPS}** ... {timerInfo}, `{timeElapsed:.0f}`/`{totalTimeEstimate:.0f}` seconds"
 
     print("Training complete, saving...")
-    loraModel.save_pretrained(loraName)
+    lora_model.save_pretrained(lora_name)
 
     if WANT_INTERRUPT:
         print("Training interrupted.")
-        yield f"Interrupted. Incomplete LoRA saved to `{loraName}`"
+        yield f"Interrupted. Incomplete LoRA saved to `{lora_name}`"
     else:
         print("Training complete!")
-        yield f"Done! LoRA saved to `{loraName}`"
+        yield f"Done! LoRA saved to `{lora_name}`"

From 7fab7ea1b64a262f63535d06b5e418910bed7edd Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 18:19:06 -0700
Subject: [PATCH 14/21] couple missed camelCases

---
 modules/training.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index f63f2990..e3976d8f 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -58,9 +58,9 @@ def create_train_interface():
 
         output = gr.Markdown(value="Ready")
         startEvent = start_button.click(do_train, [lora_name, micro_batch_size, batch_size, epochs, learning_rate, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format], [output])
-        stop_button.click(doInterrupt, [], [], cancels=[], queue=False)
+        stop_button.click(do_interrupt, [], [], cancels=[], queue=False)
 
-def doInterrupt():
+def do_interrupt():
     global WANT_INTERRUPT
     WANT_INTERRUPT = True
 
@@ -79,7 +79,7 @@ class Callbacks(transformers.TrainerCallback):
             control.should_epoch_stop = True
             control.should_training_stop = True
 
-def cleanPath(base_path: str, path: str):
+def clean_path(base_path: str, path: str):
     """"Strips unusual symbols and forcibly builds a path as relative to the intended directory."""
     # TODO: Probably could do with a security audit to guarantee there's no ways this can be bypassed to target an unwanted path.
     # Or swap it to a strict whitelist of [a-zA-Z_0-9]
@@ -97,7 +97,7 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
     # == Input validation / processing ==
     yield "Prepping..."
     # TODO: --lora-dir PR once pulled will need to be applied here
-    lora_name = f"loras/{cleanPath(None, lora_name)}"
+    lora_name = f"loras/{clean_path(None, lora_name)}"
     if dataset is None:
         return "**Missing dataset choice input, cannot continue.**"
     if format is None:
@@ -109,7 +109,7 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
     shared.tokenizer.padding_side = "left"
 
     # == Prep the dataset, format, etc ==
-    with open(cleanPath('training/formats', f'{format}.json'), 'r') as formatFile:
+    with open(clean_path('training/formats', f'{format}.json'), 'r') as formatFile:
         format_data: dict[str, str] = json.load(formatFile)
 
     def tokenize(prompt):
@@ -132,13 +132,13 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
         return tokenize(prompt)
 
     print("Loading datasets...")
-    data = load_dataset("json", data_files=cleanPath('training/datasets', f'{dataset}.json'))
+    data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json'))
     train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)
 
     if eval_dataset == 'None':
         eval_data = None
     else:
-        eval_data = load_dataset("json", data_files=cleanPath('training/datasets', f'{eval_dataset}.json'))
+        eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json'))
         eval_data = eval_data['train'].shuffle().map(generate_and_tokenize_prompt)
     
     # == Start prepping the model itself ==

From 8a97f6ba293228f7e33fd96670db81b0d2001f23 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 18:39:06 -0700
Subject: [PATCH 15/21] corrections per the PR comments

---
 modules/training.py | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index e3976d8f..52ecc55e 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -19,9 +19,7 @@ MAX_STEPS = 0
 CURRENT_GRADIENT_ACCUM = 1
 
 def get_json_dataset(path: str):
-    def get_set():
-        return ['None'] + sorted(set(map(lambda x : '.'.join(str(x.name).split('.')[:-1]), Path(path).glob('*.json'))), key=str.lower)
-    return get_set
+    return ['None'] + sorted(set(map(lambda x : '.'.join(str(x.name).split('.')[:-1]), Path(path).glob('*.json'))), key=str.lower)
 
 def create_train_interface():
     with gr.Tab('Train LoRA', elem_id='lora-train-tab'):
@@ -32,7 +30,7 @@ def create_train_interface():
             batch_size = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
 
         with gr.Row():
-            epochs = gr.Number(label='Epochs', value=1, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
+            epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
             learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
 
         # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale.
@@ -43,21 +41,19 @@ def create_train_interface():
         cutoff_len = gr.Slider(label='Cutoff Length', minimum=1,maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
 
         with gr.Row():
-            dataset_function = get_json_dataset('training/datasets')
-            dataset = gr.Dropdown(choices=dataset_function(), value='None', label='Dataset', info='The dataset file to use for training.')
-            ui.create_refresh_button(dataset, lambda : None, lambda : {'choices': dataset_function()}, 'refresh-button')
-            eval_dataset = gr.Dropdown(choices=dataset_function(), value='None', label='Evaluation Dataset', info='The dataset file used to evaluate the model after training.')
-            ui.create_refresh_button(eval_dataset, lambda : None, lambda : {'choices': dataset_function()}, 'refresh-button')
-            formats_function = get_json_dataset('training/formats')
-            format = gr.Dropdown(choices=formats_function(), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.')
-            ui.create_refresh_button(format, lambda : None, lambda : {'choices': formats_function()}, 'refresh-button')
+            dataset = gr.Dropdown(choices=get_json_dataset('training/datasets'), value='None', label='Dataset', info='The dataset file to use for training.')
+            ui.create_refresh_button(dataset, lambda : None, lambda : {'choices': get_json_dataset('training/datasets')}, 'refresh-button')
+            eval_dataset = gr.Dropdown(choices=get_json_dataset('training/datasets'), value='None', label='Evaluation Dataset', info='The dataset file used to evaluate the model after training.')
+            ui.create_refresh_button(eval_dataset, lambda : None, lambda : {'choices': get_json_dataset('training/datasets')}, 'refresh-button')
+            format = gr.Dropdown(choices=get_json_dataset('training/formats'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.')
+            ui.create_refresh_button(format, lambda : None, lambda : {'choices': get_json_dataset('training/formats')}, 'refresh-button')
 
         with gr.Row():
             start_button = gr.Button("Start LoRA Training")
             stop_button = gr.Button("Interrupt")
 
         output = gr.Markdown(value="Ready")
-        startEvent = start_button.click(do_train, [lora_name, micro_batch_size, batch_size, epochs, learning_rate, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format], [output])
+        start_button.click(do_train, [lora_name, micro_batch_size, batch_size, epochs, learning_rate, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format], [output])
         stop_button.click(do_interrupt, [], [], cancels=[], queue=False)
 
 def do_interrupt():

From ec6224f5561ce200ef8c98f967c3f6edafd2ffb0 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 20:04:16 -0700
Subject: [PATCH 16/21] use new shared.args.lora_dir

---
 modules/training.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index 52ecc55e..0d54a251 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -92,8 +92,7 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
 
     # == Input validation / processing ==
     yield "Prepping..."
-    # TODO: --lora-dir PR once pulled will need to be applied here
-    lora_name = f"loras/{clean_path(None, lora_name)}"
+    lora_name = f"{shared.args.lora_dir}/{clean_path(None, lora_name)}"
     if dataset is None:
         return "**Missing dataset choice input, cannot continue.**"
     if format is None:

From b749952fe3de309ca1b5ec98fe114608be4c8dce Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 21:22:43 -0700
Subject: [PATCH 17/21] change number minimums to 0

Gradio calculates slider 'step' positions relative to the minimum, so with a minimum of 1 the step values were all offset awkwardly. 0 isn't a valid value for these settings, so just don't drag the slider all the way to the left.
---
 modules/training.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index 0d54a251..656a8b3a 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -27,18 +27,18 @@ def create_train_interface():
         with gr.Row():
             # TODO: Implement multi-device support.
             micro_batch_size = gr.Slider(label='Micro Batch Size', value=4, minimum=1, maximum=128, step=1, info='Per-device batch size (NOTE: multiple devices not yet implemented). Increasing this will increase VRAM usage.')
-            batch_size = gr.Slider(label='Batch Size', value=128, minimum=1, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
+            batch_size = gr.Slider(label='Batch Size', value=128, minimum=0, maximum=1024, step=4, info='Global batch size. The two batch sizes together determine gradient accumulation (gradientAccum = batch / microBatch). Higher gradient accum values lead to better quality training.')
 
         with gr.Row():
             epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.')
             learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
 
         # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale.
-        lora_rank = gr.Slider(label='LoRA Rank', value=8, minimum=1, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
-        lora_alpha = gr.Slider(label='LoRA Alpha', value=16, minimum=1, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
+        lora_rank = gr.Slider(label='LoRA Rank', value=8, minimum=0, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
+        lora_alpha = gr.Slider(label='LoRA Alpha', value=16, minimum=0, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
         # TODO: Better explain what this does, in terms of real world effect especially.
         lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers.')
-        cutoff_len = gr.Slider(label='Cutoff Length', minimum=1,maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
+        cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
 
         with gr.Row():
             dataset = gr.Dropdown(choices=get_json_dataset('training/datasets'), value='None', label='Dataset', info='The dataset file to use for training.')

From 2e08af4edf07b5b79f3e105c0be892e518da28bd Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 22:15:32 -0700
Subject: [PATCH 18/21] implement initial Raw Text File Input

Also bump the default Rank & Alpha to values that will make sense in testing if you don't know what you're doing and leave the defaults.
---
 modules/training.py | 116 +++++++++++++++++++++++++++++---------------
 1 file changed, 76 insertions(+), 40 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index 656a8b3a..1949fa4e 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -7,7 +7,7 @@ from pathlib import Path
 import gradio as gr
 import torch
 import transformers
-from datasets import load_dataset
+from datasets import Dataset, load_dataset
 from peft import (LoraConfig, get_peft_model, get_peft_model_state_dict,
                   prepare_model_for_int8_training)
 
@@ -18,8 +18,8 @@ CURRENT_STEPS = 0
 MAX_STEPS = 0
 CURRENT_GRADIENT_ACCUM = 1
 
-def get_json_dataset(path: str):
-    return ['None'] + sorted(set(map(lambda x : '.'.join(str(x.name).split('.')[:-1]), Path(path).glob('*.json'))), key=str.lower)
+def get_dataset(path: str, ext: str):
+    return ['None'] + sorted(set(map(lambda x : '.'.join(str(x.name).split('.')[:-1]), Path(path).glob(f'*.{ext}'))), key=str.lower)
 
 def create_train_interface():
     with gr.Tab('Train LoRA', elem_id='lora-train-tab'):
@@ -40,20 +40,26 @@ def create_train_interface():
         lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers.')
         cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
 
-        with gr.Row():
-            dataset = gr.Dropdown(choices=get_json_dataset('training/datasets'), value='None', label='Dataset', info='The dataset file to use for training.')
-            ui.create_refresh_button(dataset, lambda : None, lambda : {'choices': get_json_dataset('training/datasets')}, 'refresh-button')
-            eval_dataset = gr.Dropdown(choices=get_json_dataset('training/datasets'), value='None', label='Evaluation Dataset', info='The dataset file used to evaluate the model after training.')
-            ui.create_refresh_button(eval_dataset, lambda : None, lambda : {'choices': get_json_dataset('training/datasets')}, 'refresh-button')
-            format = gr.Dropdown(choices=get_json_dataset('training/formats'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.')
-            ui.create_refresh_button(format, lambda : None, lambda : {'choices': get_json_dataset('training/formats')}, 'refresh-button')
+        with gr.Tab(label="Formatted Dataset"):
+            with gr.Row():
+                dataset = gr.Dropdown(choices=get_dataset('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.')
+                ui.create_refresh_button(dataset, lambda : None, lambda : {'choices': get_dataset('training/datasets', 'json')}, 'refresh-button')
+                eval_dataset = gr.Dropdown(choices=get_dataset('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The dataset file used to evaluate the model after training.')
+                ui.create_refresh_button(eval_dataset, lambda : None, lambda : {'choices': get_dataset('training/datasets', 'json')}, 'refresh-button')
+                format = gr.Dropdown(choices=get_dataset('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.')
+                ui.create_refresh_button(format, lambda : None, lambda : {'choices': get_dataset('training/formats', 'json')}, 'refresh-button')
+        with gr.Tab(label="Raw Text File"):
+            with gr.Row():
+                raw_text_file = gr.Dropdown(choices=get_dataset('training/datasets', 'txt'), value='None', label='Text File', info='The raw text file to use for training.')
+                ui.create_refresh_button(raw_text_file, lambda : None, lambda : {'choices': get_dataset('training/datasets', 'txt')}, 'refresh-button')
+                overlap_len = gr.Slider(label='Overlap Length', minimum=0,maximum=512, value=32, step=8, info='Overlap length - ie how many tokens from the prior chunk of text to include into the next chunk. (The chunks themselves will be of a size determined by Cutoff Length above)')
 
         with gr.Row():
             start_button = gr.Button("Start LoRA Training")
             stop_button = gr.Button("Interrupt")
 
         output = gr.Markdown(value="Ready")
-        start_button.click(do_train, [lora_name, micro_batch_size, batch_size, epochs, learning_rate, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format], [output])
+        start_button.click(do_train, [lora_name, micro_batch_size, batch_size, epochs, learning_rate, lora_rank, lora_alpha, lora_dropout, cutoff_len, dataset, eval_dataset, format, raw_text_file, overlap_len], [output])
         stop_button.click(do_interrupt, [], [], cancels=[], queue=False)
 
 def do_interrupt():
@@ -84,7 +90,8 @@ def clean_path(base_path: str, path: str):
         return path
     return f'{Path(base_path).absolute()}/{path}'
 
-def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: float, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str):
+def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lora_rank: int,
+             lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, raw_text_file: str, overlap_len: int):
     global WANT_INTERRUPT, CURRENT_STEPS, MAX_STEPS, CURRENT_GRADIENT_ACCUM
     WANT_INTERRUPT = False
     CURRENT_STEPS = 0
@@ -93,20 +100,17 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
     # == Input validation / processing ==
     yield "Prepping..."
     lora_name = f"{shared.args.lora_dir}/{clean_path(None, lora_name)}"
-    if dataset is None:
-        return "**Missing dataset choice input, cannot continue.**"
-    if format is None:
-        return "**Missing format choice input, cannot continue.**"
+    actual_lr = float(learning_rate)
+
+    if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0:
+        yield f"Cannot input zeroes."
+        return
+
     gradient_accumulation_steps = batch_size // micro_batch_size
     CURRENT_GRADIENT_ACCUM = gradient_accumulation_steps
-    actual_lr = float(learning_rate)
     shared.tokenizer.pad_token = 0
     shared.tokenizer.padding_side = "left"
 
-    # == Prep the dataset, format, etc ==
-    with open(clean_path('training/formats', f'{format}.json'), 'r') as formatFile:
-        format_data: dict[str, str] = json.load(formatFile)
-
     def tokenize(prompt):
         result = shared.tokenizer(prompt, truncation=True, max_length=cutoff_len + 1, padding="max_length")
         return {
@@ -114,27 +118,55 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
             "attention_mask": result["attention_mask"][:-1],
         }
 
-    def generate_prompt(data_point: dict[str, str]):
-        for options, data in format_data.items():
-            if set(options.split(',')) == set(x[0] for x in data_point.items() if len(x[1].strip()) > 0):
-                for key, val in data_point.items():
-                    data = data.replace(f'%{key}%', val)
-            return data
-        raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"')
-
-    def generate_and_tokenize_prompt(data_point):
-        prompt = generate_prompt(data_point)
-        return tokenize(prompt)
-
-    print("Loading datasets...")
-    data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json'))
-    train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)
-
-    if eval_dataset == 'None':
+    # == Prep the dataset, format, etc ==
+    if raw_text_file is not None:
+        print("Loading raw text file dataset...")
+        with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r') as file:
+            raw_text = file.read()
+        tokens = shared.tokenizer.encode(raw_text)
+        del raw_text # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM
+        tokens = list(split_chunks(tokens, cutoff_len - overlap_len))
+        for i in range(1, len(tokens)):
+            tokens[i] = tokens[i - 1][-overlap_len:] + tokens[i]
+        text_chunks = [shared.tokenizer.decode(x) for x in tokens]
+        del tokens
+        data = Dataset.from_list([tokenize(x) for x in text_chunks])
+        train_data = data.shuffle()
         eval_data = None
+        del text_chunks
+
     else:
-        eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json'))
-        eval_data = eval_data['train'].shuffle().map(generate_and_tokenize_prompt)
+        with open(clean_path('training/formats', f'{format}.json'), 'r') as formatFile:
+            format_data: dict[str, str] = json.load(formatFile)
+
+        if dataset is None:
+            yield "**Missing dataset choice input, cannot continue.**"
+            return
+        if format is None:
+            yield "**Missing format choice input, cannot continue.**"
+            return
+
+        def generate_prompt(data_point: dict[str, str]):
+            for options, data in format_data.items():
+                if set(options.split(',')) == set(x[0] for x in data_point.items() if len(x[1].strip()) > 0):
+                    for key, val in data_point.items():
+                        data = data.replace(f'%{key}%', val)
+                return data
+            raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"')
+
+        def generate_and_tokenize_prompt(data_point):
+            prompt = generate_prompt(data_point)
+            return tokenize(prompt)
+
+        print("Loading JSON datasets...")
+        data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json'))
+        train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)
+
+        if eval_dataset == 'None':
+            eval_data = None
+        else:
+            eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json'))
+            eval_data = eval_data['train'].shuffle().map(generate_and_tokenize_prompt)
     
     # == Start prepping the model itself ==
     if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
@@ -229,3 +261,7 @@ def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int
     else:
         print("Training complete!")
         yield f"Done! LoRA saved to `{lora_name}`"
+
+def split_chunks(arr, step):
+    for i in range(0, len(arr), step):
+        yield arr[i:i + step]

From 9cc811a0e6abbc32ef5699255db4127740ea1e8d Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 22:16:40 -0700
Subject: [PATCH 19/21] fix LoRA path typo in #549

---
 server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server.py b/server.py
index 311a624f..4d9ee5e8 100644
--- a/server.py
+++ b/server.py
@@ -55,7 +55,7 @@ def get_available_softprompts():
     return ['None'] + sorted(set(map(lambda x : '.'.join(str(x.name).split('.')[:-1]), Path('softprompts').glob('*.zip'))), key=str.lower)
 
 def get_available_loras():
-    return ['None'] + sorted([item.name for item in list(Path('shared.args.lora_dir').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json'))], key=str.lower)
+    return ['None'] + sorted([item.name for item in list(Path(shared.args.lora_dir).glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json'))], key=str.lower)
 
 def unload_model():
     shared.model = shared.tokenizer = None

From e817fac5424f7f19a5f20071dc08ce4e483d0636 Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 22:29:23 -0700
Subject: [PATCH 20/21] better defaults

---
 modules/training.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/training.py b/modules/training.py
index 1949fa4e..7bcecb38 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -34,8 +34,8 @@ def create_train_interface():
             learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.')
 
         # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale.
-        lora_rank = gr.Slider(label='LoRA Rank', value=8, minimum=0, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
-        lora_alpha = gr.Slider(label='LoRA Alpha', value=16, minimum=0, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
+        lora_rank = gr.Slider(label='LoRA Rank', value=32, minimum=0, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.')
+        lora_alpha = gr.Slider(label='LoRA Alpha', value=64, minimum=0, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.')
         # TODO: Better explain what this does, in terms of real world effect especially.
         lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers.')
         cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. Higher values require drastically more VRAM.')
@@ -52,7 +52,7 @@ def create_train_interface():
             with gr.Row():
                 raw_text_file = gr.Dropdown(choices=get_dataset('training/datasets', 'txt'), value='None', label='Text File', info='The raw text file to use for training.')
                 ui.create_refresh_button(raw_text_file, lambda : None, lambda : {'choices': get_dataset('training/datasets', 'txt')}, 'refresh-button')
-                overlap_len = gr.Slider(label='Overlap Length', minimum=0,maximum=512, value=32, step=8, info='Overlap length - ie how many tokens from the prior chunk of text to include into the next chunk. (The chunks themselves will be of a size determined by Cutoff Length above)')
+                overlap_len = gr.Slider(label='Overlap Length', minimum=0,maximum=512, value=128, step=16, info='Overlap length - ie how many tokens from the prior chunk of text to include into the next chunk. (The chunks themselves will be of a size determined by Cutoff Length above). Setting overlap to exactly half the cutoff length may be ideal.')
 
         with gr.Row():
             start_button = gr.Button("Start LoRA Training")

From b0f05046b307ce484c8fe8300a10e1909d94904d Mon Sep 17 00:00:00 2001
From: "Alex \"mcmonkey\" Goodwin" <git_commits@alexgoodwin.dev>
Date: Mon, 27 Mar 2023 22:50:37 -0700
Subject: [PATCH 21/21] remove duplicate import

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 8400250f..79da715d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,5 @@ rwkv==0.7.1
 safetensors==0.3.0
 sentencepiece
 tqdm
-peft
 datasets
 git+https://github.com/huggingface/transformers