Add Feature to Log Sample of Training Dataset for Inspection (#1711)

This commit is contained in:
practicaldreamer 2023-07-12 09:26:45 -05:00 committed by GitHub
parent b6ba68eda9
commit 73a0def4af
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: 4AEE18F83AFDEB23

View file

@ -579,8 +579,27 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
if WANT_INTERRUPT:
yield "Interrupted before start."
return
def log_train_dataset(trainer):
decoded_entries = []
# Try to decode the entries and write the log file
try:
# Iterate over the first 10 elements in the dataset (or fewer if there are less than 10)
for i in range(min(10, len(trainer.train_dataset))):
decoded_text = shared.tokenizer.decode(trainer.train_dataset[i]['input_ids'])
decoded_entries.append({"value": decoded_text})
# Write the log file
Path('logs').mkdir(exist_ok=True)
with open(Path('logs/train_dataset_sample.json'), 'w') as json_file:
json.dump(decoded_entries, json_file, indent=4)
logger.info("Log file 'train_dataset_sample.json' created in the 'logs' directory.")
except Exception as e:
logger.error(f"Failed to create log file due to error: {e}")
def threaded_run():
log_train_dataset(trainer)
trainer.train()
# Note: save in the thread in case the gradio thread breaks (eg browser closed)
lora_model.save_pretrained(lora_file_path)