Remove universal llama tokenizer support

Instead replace it with a warning if the tokenizer files look off
This commit is contained in:
oobabooga 2023-07-04 19:43:19 -07:00
parent 84d6c93d0d
commit 8705eba830
2 changed files with 24 additions and 29 deletions

View file

@@ -12,13 +12,7 @@ This guide will cover usage through the official `transformers` implementation.
* Torrent: https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789 * Torrent: https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789
* Direct download: https://huggingface.co/Neko-Institute-of-Science * Direct download: https://huggingface.co/Neko-Institute-of-Science
⚠️ The tokenizers for the Torrent source above and also for many LLaMA fine-tunes available on Hugging Face may be outdated, so I recommend downloading the following universal LLaMA tokenizer: ⚠️ The tokenizers for the Torrent source above and also for many LLaMA fine-tunes available on Hugging Face may be outdated, in particular the files called `tokenizer_config.json` and `special_tokens_map.json`. Here you can find those files: https://huggingface.co/oobabooga/llama-tokenizer
```
python download-model.py oobabooga/llama-tokenizer
```
Once downloaded, it will be automatically applied to **every** `LlamaForCausalLM` model that you try to load.
### Option 2: convert the weights yourself ### Option 2: convert the weights yourself

View file

@@ -3,6 +3,7 @@ import os
import re import re
import time import time
from pathlib import Path from pathlib import Path
import hashlib
import torch import torch
import transformers import transformers
@@ -14,7 +15,6 @@ from transformers import (
AutoModelForSeq2SeqLM, AutoModelForSeq2SeqLM,
AutoTokenizer, AutoTokenizer,
BitsAndBytesConfig, BitsAndBytesConfig,
LlamaTokenizer
) )
import modules.shared as shared import modules.shared as shared
@@ -91,30 +91,31 @@ def load_model(model_name, loader=None):
def load_tokenizer(model_name, model): def load_tokenizer(model_name, model):
tokenizer = None tokenizer = None
path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists(): if any(s in model_name.lower() for s in ['gpt-4chan', 'gpt4chan']) and Path(f"{shared.args.model_dir}/gpt-j-6B/").exists():
tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/")) tokenizer = AutoTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/gpt-j-6B/"))
elif model.__class__.__name__ in ['LlamaForCausalLM', 'LlamaGPTQForCausalLM', 'ExllamaHF']: elif path_to_model.exists():
# Try to load an universal LLaMA tokenizer tokenizer = AutoTokenizer.from_pretrained(
if not any(s in shared.model_name.lower() for s in ['llava', 'oasst']): path_to_model,
for p in [Path(f"{shared.args.model_dir}/llama-tokenizer/"), Path(f"{shared.args.model_dir}/oobabooga_llama-tokenizer/")]: trust_remote_code=shared.args.trust_remote_code,
if p.exists(): use_fast=False
logger.info(f"Loading the universal LLaMA tokenizer from {p}...") )
tokenizer = LlamaTokenizer.from_pretrained(p, clean_up_tokenization_spaces=True)
return tokenizer
# Otherwise, load it from the model folder and hope that these if tokenizer.__class__.__name__ == 'LlamaTokenizer':
# are not outdated tokenizer files. pairs = [
tokenizer = LlamaTokenizer.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}/"), clean_up_tokenization_spaces=True) ['tokenizer_config.json', '516c6167c884793a738c440e29ccb80c15e1493ffc965affc69a1a8ddef4572a'],
try: ['special_tokens_map.json', 'ff3b4a612c4e447acb02d40071bddd989fe0da87eb5b7fe0dbadfc4f74de7531']
tokenizer.eos_token_id = 2 ]
tokenizer.bos_token_id = 1
tokenizer.pad_token_id = 0 for pair in pairs:
except: p = path_to_model / pair[0]
pass if p.exists():
else: with open(p, "rb") as f:
path_to_model = Path(f"{shared.args.model_dir}/{model_name}/") bytes = f.read()
if path_to_model.exists():
tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code) file_hash = hashlib.sha256(bytes).hexdigest()
if file_hash != pair[1]:
logger.warning(f"{p} is different from the original LlamaTokenizer file. It is either customized or outdated.")
return tokenizer return tokenizer