Fix UnicodeDecodeError for BPE-based Models (especially GLM-4) (#6357)

This commit is contained in:
GralchemOz 2024-09-03 10:00:59 +08:00 committed by GitHub
parent 41a8eb4eeb
commit 4c74c7a116
WARNING! Although there is a key with this ID in the database it does not verify this commit! This commit is SUSPICIOUS.
GPG key ID: B5690EEEBB952194

View file

@@ -274,7 +274,12 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
if (hasattr(shared.tokenizer, 'convert_ids_to_tokens') and len(output_ids) > starting_from) and not reply.startswith(' '):
first_token = shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from]))
if isinstance(first_token, (bytes,)):
first_token = first_token.decode('utf8')
#try to decode the bytes to a string
try:
first_token = first_token.decode('utf8')
# if decoding fails, the bytes are not valid UTF-8 on their own (e.g. a partial multi-byte character), so ignore this token
except UnicodeDecodeError:
first_token = ''
if first_token.startswith('▁'):
reply = ' ' + reply