diff --git a/characters/instruction-following/ChatGLM.yaml b/characters/instruction-following/ChatGLM.yaml
index 0e5d3f41..f25f4908 100644
--- a/characters/instruction-following/ChatGLM.yaml
+++ b/characters/instruction-following/ChatGLM.yaml
@@ -1,4 +1,4 @@
-user: "[Round <|round|>]\n问:"
-bot: "答:"
+user: "[Round <|round|>]\n问:"
+bot: "答:"
 turn_template: "<|user|><|user-message|>\n<|bot|><|bot-message|>\n"
 context: ""
diff --git a/extensions/openai/README.md b/extensions/openai/README.md
index b4d4ff3a..b20eba33 100644
--- a/extensions/openai/README.md
+++ b/extensions/openai/README.md
@@ -11,6 +11,15 @@ Optional (for flask_cloudflared, embeddings):
 pip3 install -r requirements.txt
 ```
 
+It listens on TCP port 5001 by default. You can use the OPENEDAI_PORT environment variable to change this.
+
+To enable bare-bones image generation (txt2img), set SD_WEBUI_URL to point to your Stable Diffusion API ([Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui)).
+
+Example:
+```
+SD_WEBUI_URL=http://127.0.0.1:7861
+```
+
 ### Embeddings (alpha)
 
 Embeddings requires ```sentence-transformers``` installed, but chat and completions will function without it loaded. The embeddings endpoint is currently using the HuggingFace model: ```sentence-transformers/all-mpnet-base-v2``` for embeddings. This produces 768 dimensional embeddings (the same as the text-davinci-002 embeddings), which is different from OpenAI's current default ```text-embedding-ada-002``` model which produces 1536 dimensional embeddings. The model is small-ish and fast-ish. This model and embedding size may change in the future.
@@ -67,17 +76,22 @@ const api = new ChatGPTAPI({
 
 ## Compatibility & not so compatibility
 
-What's working:
-
 | API endpoint | tested with | notes |
 | --- | --- | --- |
 | /v1/models | openai.Model.list() | returns the currently loaded model_name and some mock compatibility options |
 | /v1/models/{id} | openai.Model.get() | returns whatever you ask for, model does nothing yet anyways |
 | /v1/text_completion | openai.Completion.create() | the most tested, only supports single string input so far |
 | /v1/chat/completions | openai.ChatCompletion.create() | depending on the model, this may add leading linefeeds |
+| /v1/edits | openai.Edit.create() | Assumes an instruction-following model, but may work with others |
+| /v1/images/generations | openai.Image.create() | Bare bones, no model configuration, response_format='b64_json' only. |
 | /v1/embeddings | openai.Embedding.create() | Using Sentence Transformer, dimensions are different and may never be directly comparable to openai embeddings. |
 | /v1/moderations | openai.Moderation.create() | does nothing. successfully. |
 | /v1/engines/\*/... completions, embeddings, generate | python-openai v0.25 and earlier | Legacy engines endpoints |
+| /v1/images/edits | openai.Image.create_edit() | not supported |
+| /v1/images/variations | openai.Image.create_variation() | not supported |
+| /v1/audio/\* | openai.Audio.\* | not supported |
+| /v1/files\* | openai.Files.\* | not supported |
+| /v1/fine-tunes\* | openai.FineTune.\* | not supported |
 
 The model name setting is ignored in completions, but you may need to adjust the maximum token length to fit the model (ie. set to <2048 tokens instead of 4096, 8k, etc). To mitigate some of this, the max_tokens value is halved until it is less than truncation_length for the model (typically 2k).
 
@@ -99,6 +113,10 @@ Some hacky mappings:
 defaults are mostly from openai, so are different. I use the openai defaults where I can and try to scale them to the webui defaults with the same intent.
 
+### Models
+
+This has been successfully tested with Koala, Alpaca, gpt4-x-alpaca, GPT4all-snoozy, wizard-vicuna, stable-vicuna and Vicuna 1.1 - i.e. instruction-following models. If you test with other models, please let me know how it goes. Less than satisfying results (so far): RWKV-4-Raven, llama, mpt-7b-instruct/chat.
+
 ### Applications
 
 Everything needs OPENAI_API_KEY=dummy set.
 
@@ -120,4 +138,7 @@ Everything needs OPENAI_API_KEY=dummy set.
 * model changing, esp. something for swapping loras or embedding models
 * consider switching to FastAPI + starlette for SSE (openai SSE seems non-standard)
 * do something about rate limiting or locking requests for completions, most systems will only be able handle a single request at a time before OOM
-* the whole api, images (stable diffusion), audio (whisper), fine-tunes (training), edits, files, etc.
\ No newline at end of file
+
+## Bugs? Feedback? Comments? Pull requests?
+
+Are all appreciated; please tag @matatonic and I'll try to get back to you as soon as possible.
diff --git a/extensions/openai/cache_embedding_model.py b/extensions/openai/cache_embedding_model.py
new file mode 100755
index 00000000..44ac1dcd
--- /dev/null
+++ b/extensions/openai/cache_embedding_model.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python3
+# preload the embedding model, useful for Docker images to prevent re-download on config change
+# Dockerfile:
+# ENV OPENEDAI_EMBEDDING_MODEL=all-mpnet-base-v2 # Optional
+# RUN python3 cache_embedding_model.py
+import os, sentence_transformers
+st_model = os.environ["OPENEDAI_EMBEDDING_MODEL"] if "OPENEDAI_EMBEDDING_MODEL" in os.environ else "all-mpnet-base-v2"
+model = sentence_transformers.SentenceTransformer(st_model)
diff --git a/extensions/openai/script.py b/extensions/openai/script.py
index c46dbe04..711b76a2 100644
--- a/extensions/openai/script.py
+++ b/extensions/openai/script.py
@@ -2,6 +2,8 @@ import base64
 import json
 import os
 import time
+import requests
+import yaml
 
 from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 from threading import Thread
@@ -48,6 +50,31 @@ def clamp(value, minvalue, maxvalue):
     return max(minvalue, min(value, maxvalue))
 
 
+def deduce_template():
+    # Alpaca is verbose so a good default prompt
+    default_template = (
+        "Below is an instruction that describes a task, paired with an input that provides further context. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
+    )
+
+    # Use the special instruction/input/response template for anything trained like Alpaca
+    if shared.settings['instruction_template'] in ['Alpaca', 'Alpaca-Input']:
+        return default_template
+
+    try:
+        instruct = yaml.safe_load(open(f"characters/instruction-following/{shared.settings['instruction_template']}.yaml", 'r'))
+
+        template = instruct['turn_template']
+        template = template\
+            .replace('<|user|>', instruct.get('user', ''))\
+            .replace('<|bot|>', instruct.get('bot', ''))\
+            .replace('<|user-message|>', '{instruction}\n{input}')
+        return instruct.get('context', '') + template[:template.find('<|bot-message|>')]
+    except:
+        return default_template
+
+
 def float_list_to_base64(float_list):
     # Convert the list to a float32 array that the OpenAPI client expects
     float_array = np.array(float_list, dtype="float32")
@@ -120,11 +147,20 @@ class Handler(BaseHTTPRequestHandler):
             self.send_error(404)
 
     def do_POST(self):
+        # ... haaack. Temporarily force chat mode (shared.args.chat) for this request, then restore it.
+        is_chat = shared.args.chat
+        try:
+            shared.args.chat = True
+            self.do_POST_wrap()
+        finally:
+            shared.args.chat = is_chat
+
+    def do_POST_wrap(self):
+        if debug:
+            print(self.headers) # did you know... python-openai sends your linux kernel & python version?
         content_length = int(self.headers['Content-Length'])
         body = json.loads(self.rfile.read(content_length).decode('utf-8'))
 
-        if debug:
-            print(self.headers) # did you know... python-openai sends your linux kernel & python version?
         if debug:
             print(body)
 
@@ -150,7 +186,7 @@ class Handler(BaseHTTPRequestHandler):
         truncation_length = default(shared.settings, 'truncation_length', 2048)
         truncation_length = clamp(default(body, 'truncation_length', truncation_length), 1, truncation_length)
 
-        default_max_tokens = truncation_length if is_chat else 16 # completions default, chat default is 'inf' so we need to cap it., the default for chat is "inf"
+        default_max_tokens = truncation_length if is_chat else 16 # completions default, chat default is 'inf' so we need to cap it.
 
         max_tokens_str = 'length' if is_legacy else 'max_tokens'
         max_tokens = default(body, max_tokens_str, default(shared.settings, 'max_new_tokens', default_max_tokens))
@@ -440,6 +476,129 @@ class Handler(BaseHTTPRequestHandler):
             else:
                 resp[resp_list][0]["text"] = answer
 
+            response = json.dumps(resp)
+            self.wfile.write(response.encode('utf-8'))
+        elif '/edits' in self.path:
+            self.send_response(200)
+            self.send_header('Content-Type', 'application/json')
+            self.end_headers()
+
+            created_time = int(time.time())
+
+            # Using Alpaca format, this may work with other models too.
+            instruction = body['instruction']
+            input = body.get('input', '')
+
+            instruction_template = deduce_template()
+            edit_task = instruction_template.format(instruction=instruction, input=input)
+
+            truncation_length = default(shared.settings, 'truncation_length', 2048)
+            token_count = len(encode(edit_task)[0])
+            max_tokens = truncation_length - token_count
+
+            req_params = {
+                'max_new_tokens': max_tokens,
+                'temperature': clamp(default(body, 'temperature', 1.0), 0.001, 1.999),
+                'top_p': clamp(default(body, 'top_p', 1.0), 0.001, 1.0),
+                'top_k': 1,
+                'repetition_penalty': 1.18,
+                'encoder_repetition_penalty': 1.0,
+                'suffix': None,
+                'stream': False,
+                'echo': False,
+                'seed': shared.settings.get('seed', -1),
+                # 'n' : default(body, 'n', 1), # 'n' doesn't have a direct map
+                'truncation_length': truncation_length,
+                'add_bos_token': shared.settings.get('add_bos_token', True),
+                'do_sample': True,
+                'typical_p': 1.0,
+                'min_length': 0,
+                'no_repeat_ngram_size': 0,
+                'num_beams': 1,
+                'penalty_alpha': 0.0,
+                'length_penalty': 1,
+                'early_stopping': False,
+                'ban_eos_token': False,
+                'skip_special_tokens': True,
+                'custom_stopping_strings': [],
+            }
+
+            if debug:
+                print({'edit_template': edit_task, 'req_params': req_params, 'token_count': token_count})
+
+            generator = generate_reply(edit_task, req_params, stopping_strings=standard_stopping_strings)
+
+            answer = ''
+            for a in generator:
+                if isinstance(a, str):
+                    answer = a
+                else:
+                    answer = a[0]
+
+            completion_token_count = len(encode(answer)[0])
+
+            resp = {
+                "object": "edit",
+                "created": created_time,
+                "choices": [{
+                    "text": answer,
+                    "index": 0,
+                }],
+                "usage": {
+                    "prompt_tokens": token_count,
+                    "completion_tokens": completion_token_count,
+                    "total_tokens": token_count + completion_token_count
+                }
+            }
+
+            if debug:
+                print({'answer': answer, 'completion_token_count': completion_token_count})
+
+            response = json.dumps(resp)
+            self.wfile.write(response.encode('utf-8'))
+        elif '/images/generations' in self.path and 'SD_WEBUI_URL' in os.environ:
+            # Stable Diffusion callout wrapper for txt2img
+            # Low effort implementation for compatibility. With only "prompt" being passed and assuming DALL-E
+            # the results will be limited and likely poor. SD has hundreds of models and dozens of settings.
+            # If you want high quality tailored results you should just use the Stable Diffusion API directly.
+            # It's too general an API to try and shape the result with specific tags like "masterpiece", etc.,
+            # so it will probably work best with the stock SD models.
+            # SD configuration is beyond the scope of this API.
+            # At this point I will not add the edits and variations endpoints (ie. img2img) because they
+            # require changing the form data handling to accept multipart form data, and properly supporting
+            # url return types would require file management and a web server for files... Perhaps later!
+
+            self.send_response(200)
+            self.send_header('Content-Type', 'application/json')
+            self.end_headers()
+
+            width, height = [int(x) for x in default(body, 'size', '1024x1024').split('x')] # ignore the restrictions on size
+            response_format = default(body, 'response_format', 'url') # or b64_json
+
+            payload = {
+                'prompt': body['prompt'], # ignore prompt limit of 1000 characters
+                'width': width,
+                'height': height,
+                'batch_size': default(body, 'n', 1) # ignore the batch limits of max 10
+            }
+
+            resp = {
+                'created': int(time.time()),
+                'data': []
+            }
+
+            # TODO: support SD_WEBUI_AUTH username:password pair.
+            sd_url = f"{os.environ['SD_WEBUI_URL']}/sdapi/v1/txt2img"
+
+            response = requests.post(url=sd_url, json=payload)
+            r = response.json()
+            # r['parameters']...
+            for b64_json in r['images']:
+                if response_format == 'b64_json':
+                    resp['data'].extend([{'b64_json': b64_json}])
+                else:
+                    resp['data'].extend([{'url': f'data:image/png;base64,{b64_json}'}]) # yeah it's lazy. requests.get() will not work with this
+
             response = json.dumps(resp)
             self.wfile.write(response.encode('utf-8'))
         elif '/embeddings' in self.path and embedding_model is not None:
@@ -540,11 +699,12 @@ def run_server():
         try:
             from flask_cloudflared import _run_cloudflared
             public_url = _run_cloudflared(params['port'], params['port'] + 1)
-            print(f'Starting OpenAI compatible api at {public_url}/')
+            print(f'Starting OpenAI compatible api at\nOPENAI_API_BASE={public_url}/v1')
         except ImportError:
            print('You should install flask_cloudflared manually')
     else:
-        print(f'Starting OpenAI compatible api at http://{server_addr[0]}:{server_addr[1]}/')
+        print(f'Starting OpenAI compatible api:\nOPENAI_API_BASE=http://{server_addr[0]}:{server_addr[1]}/v1')
+
     server.serve_forever()
 
 
diff --git a/models/config.yaml b/models/config.yaml
index f5c9d508..2bef3ce5 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -54,6 +54,9 @@
 .*vicuna.*(1.1|1_1):
   mode: 'instruct'
   instruction_template: 'Vicuna-v1.1'
+.*wizard.*vicuna:
+  mode: 'instruct'
+  instruction_template: 'Vicuna-v1.1'
 .*stable.*vicuna:
   mode: 'instruct'
   instruction_template: 'StableVicuna'
@@ -135,4 +138,4 @@
   instruction_template: 'INCITE-Chat'
 .*incite.*instruct:
   mode: 'instruct'
-  instruction_template: 'INCITE-Instruct'
\ No newline at end of file
+  instruction_template: 'INCITE-Instruct'
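
For a quick end-to-end check of the new `/v1/edits` and `/v1/images/generations` routes, something like the sketch below should work. This is a minimal example, not part of the patch itself, and it assumes the README defaults: the server running locally on port 5001, `OPENAI_API_KEY=dummy`, the legacy (pre-1.0) `openai` Python client, and `SD_WEBUI_URL` configured on the server side so the image call gets routed. The `model` argument is passed only because the client expects one; this extension appears to ignore it.

```python
# Minimal smoke test for the new /v1/edits and /v1/images/generations endpoints.
# Assumes a local server on the default port 5001 and the legacy openai client.
import base64
import openai

openai.api_key = "dummy"                      # any non-empty value works
openai.api_base = "http://127.0.0.1:5001/v1"  # point the client at this extension

# /v1/edits -- instruction/input are formatted server-side via deduce_template()
edit = openai.Edit.create(
    model="ignored",                          # model name is not used by this server
    instruction="Fix the spelling mistakes.",
    input="The quik brown fox jumps ovr the lazy dog.",
)
print(edit["choices"][0]["text"])

# /v1/images/generations -- only works if SD_WEBUI_URL is set on the server
img = openai.Image.create(
    prompt="a watercolor painting of a fox",
    n=1,
    size="512x512",
    response_format="b64_json",               # the better-supported format here
)
with open("fox.png", "wb") as f:
    f.write(base64.b64decode(img["data"][0]["b64_json"]))
```

Note that with `response_format='url'` the image endpoint returns a `data:image/png;base64,...` URI rather than a fetchable URL (as the code comment above points out), so `b64_json` is the safer choice for clients.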