Fix UnicodeDecodeError for BPE-based Models (especially GLM-4) (#6357)

2024-09-20 10:35:10 +02:00 · 2024-09-03 10:00:59 +08:00 · 2024-09-03 10:00:59 +08:00 · 4c74c7a116
commit 4c74c7a116
parent 41a8eb4eeb
1 changed files with 6 additions and 1 deletions
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -274,7 +274,12 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
    if (hasattr(shared.tokenizer, 'convert_ids_to_tokens') and len(output_ids) > starting_from) and not reply.startswith(' '):
        first_token = shared.tokenizer.convert_ids_to_tokens(int(output_ids[starting_from]))
        if isinstance(first_token, (bytes,)):
-            first_token = first_token.decode('utf8')
+            #try to decode the bytes to a string
+            try:
+                first_token = first_token.decode('utf8')
+            # if decoding fails, the bytes are not valid UTF-8 on their own, so treat the token as empty
+            except UnicodeDecodeError:
+                first_token = ''

        if first_token.startswith('▁'):
            reply = ' ' + reply