# text-generation-webui/extensions/api/util.py

import asyncio
import functools
import threading
import time
import traceback
from threading import Thread
from typing import Callable, Optional

from modules import shared
from modules.chat import load_character_memoized
from modules.presets import load_preset_memoized

# We use a thread local to store the asyncio lock, so that each thread
# has its own lock.  This isn't strictly necessary, but it means that
# if we support multiple worker threads in the future, each thread will
# only serialize its own requests, so requests on different threads can
# be handled in parallel.
api_tls = threading.local()


def build_parameters(body, chat=False):
    """
    Build the generation-parameter dict for an API request.

    Each sampling option is read from `body` (any object exposing a
    dict-style `.get`), coerced to a fixed type, and given a hard-coded
    fallback when the key is absent.  Four options consult an alternate
    key before falling back to the hard default: `max_length` for
    `max_new_tokens`, `typical` for `typical_p`, `rep_pen` for
    `repetition_penalty` and `max_context_length` for `truncation_length`.

    If `body['preset']` is anything other than 'None', None or '', the
    result of `load_preset_memoized` is merged on top, so preset values
    override values taken from the request.

    With `chat=True` the character and instruction template are loaded
    through `load_character_memoized` and the chat-specific keys are added;
    values supplied in `body` take precedence over the loaded ones.

    Returns the assembled dict.
    """
    lookup = body.get

    # (output key, coercion, fallback used when the request omits the key).
    # The order here is the key order of the resulting dict.
    schema = (
        ('max_new_tokens', int, lookup('max_length', 200)),
        ('auto_max_new_tokens', bool, False),
        ('do_sample', bool, True),
        ('temperature', float, 0.5),
        ('top_p', float, 1),
        ('typical_p', float, lookup('typical', 1)),
        ('epsilon_cutoff', float, 0),
        ('eta_cutoff', float, 0),
        ('tfs', float, 1),
        ('top_a', float, 0),
        ('repetition_penalty', float, lookup('rep_pen', 1.1)),
        ('repetition_penalty_range', int, 0),
        ('encoder_repetition_penalty', float, 1.0),
        ('top_k', int, 0),
        ('min_length', int, 0),
        ('no_repeat_ngram_size', int, 0),
        ('num_beams', int, 1),
        ('penalty_alpha', float, 0),
        ('length_penalty', float, 1),
        ('early_stopping', bool, False),
        ('mirostat_mode', int, 0),
        ('mirostat_tau', float, 5),
        ('mirostat_eta', float, 0.1),
        ('guidance_scale', float, 1),
        ('negative_prompt', str, ''),
        ('seed', int, -1),
        ('add_bos_token', bool, True),
        ('truncation_length', int, lookup('max_context_length', 2048)),
        ('ban_eos_token', bool, False),
        ('skip_special_tokens', bool, True),
    )

    generate_params = {}
    for key, coerce, fallback in schema:
        generate_params[key] = coerce(lookup(key, fallback))

    generate_params['custom_stopping_strings'] = ''  # leave this blank
    generate_params['stopping_strings'] = lookup('stopping_strings', [])

    preset_name = lookup('preset', 'None')
    if preset_name not in ('None', None, ''):
        generate_params.update(load_preset_memoized(preset_name))

    if not chat:
        return generate_params

    settings = shared.settings
    character = lookup('character')
    instruction_template = lookup('instruction_template', settings['instruction_template'])
    if str(instruction_template) == "None":
        instruction_template = "Vicuna-v1.1"

    your_name = str(lookup('your_name', settings['name1']))
    name1, name2, _, greeting, context, _ = load_character_memoized(character, your_name, settings['name2'], instruct=False)
    name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)

    chat_params = {
        'stop_at_newline': bool(lookup('stop_at_newline', settings['stop_at_newline'])),
        'chat_generation_attempts': int(lookup('chat_generation_attempts', settings['chat_generation_attempts'])),
        'mode': str(lookup('mode', 'chat')),
    }

    # Values loaded from the character / template files act only as
    # fallbacks; an explicit value in the request wins.
    loaded_fallbacks = (
        ('name1', name1),
        ('name2', name2),
        ('context', context),
        ('greeting', greeting),
        ('name1_instruct', name1_instruct),
        ('name2_instruct', name2_instruct),
        ('context_instruct', context_instruct),
        ('turn_template', turn_template),
    )
    for key, loaded in loaded_fallbacks:
        chat_params[key] = str(lookup(key, loaded))

    chat_params['chat-instruct_command'] = str(lookup('chat-instruct_command', settings['chat-instruct_command']))
    chat_params['history'] = lookup('history', {'internal': [], 'visible': []})

    generate_params.update(chat_params)
    return generate_params


def try_start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None):
    """
    Run `_start_cloudflared` on a background daemon thread.

    The three arguments are forwarded unchanged.  The call returns as soon
    as the thread has been started; nothing is returned.
    """
    worker = Thread(
        target=_start_cloudflared,
        args=(port, max_attempts, on_start),
        daemon=True,
    )
    worker.start()


def _start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None):
    """
    Start cloudflared for `port`, retrying up to `max_attempts` times.

    Args:
        port: Local port to expose; `port + 1` is passed to
            `_run_cloudflared` as its second argument.
        max_attempts: Maximum number of start attempts before giving up.
        on_start: Optional callback invoked with the public URL returned by
            `_run_cloudflared` once an attempt succeeds.

    Raises:
        Exception: If `flask_cloudflared` cannot be imported, or if every
            attempt failed (including when `max_attempts` is 0).
    """
    try:
        from flask_cloudflared import _run_cloudflared
    except ImportError:
        print('You should install flask_cloudflared manually')
        raise Exception(
            'flask_cloudflared not installed. Make sure you installed the requirements.txt for this extension.')

    for _ in range(max_attempts):
        try:
            public_url = _run_cloudflared(port, port + 1)

            if on_start:
                on_start(public_url)

            return
        except Exception:
            # Report the failure and pause before the next attempt.
            traceback.print_exc()
            time.sleep(3)

    # Only reached once every attempt has failed.  This must sit outside the
    # loop: raising inside it would abort after the first failure and make
    # `max_attempts` meaningless.
    raise Exception('Could not start cloudflared.')


def _get_api_lock(tls) -> asyncio.Lock:
    """
    Return the asyncio lock stored on `tls`, creating it on first use.

    The streaming and blocking API implementations each run on their own
    thread and multiplex requests using asyncio.  When several requests
    are outstanding at once, the same thread would try to acquire
    shared.generation_lock several times in succession, which deadlocks.

    To prevent that, callers first wait on this asyncio lock and only
    try to take the shared lock while holding it.
    """
    try:
        return tls.asyncio_lock
    except AttributeError:
        # First request seen for this `tls`: create its lock lazily.
        tls.asyncio_lock = asyncio.Lock()
        return tls.asyncio_lock


def with_api_lock(func):
    """
    Decorator for streaming API coroutines that need
    shared.generation_lock.  The wrapped coroutine runs only while the
    asyncio lock obtained from `_get_api_lock(api_tls)` is held; the lock
    is released again once the coroutine returns or raises.
    """
    async def api_wrapper(*args, **kwargs):
        # Look the lock up at call time, not at decoration time.
        lock = _get_api_lock(api_tls)
        async with lock:
            result = await func(*args, **kwargs)
        return result

    return functools.wraps(func)(api_wrapper)
fix for issue #2475: Streaming api deadlock (#3048) 2023-07-09 04:21:20 +02:00			`import asyncio`
			`import functools`
			`import threading`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`import time`
Make API extension print its exceptions 2023-04-26 04:23:47 +02:00			`import traceback`
			`from threading import Thread`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`from typing import Callable, Optional`
Make API extension print its exceptions 2023-04-26 04:23:47 +02:00
Add chat API (#2233) 2023-05-20 23:42:17 +02:00			`from modules import shared`
memoize load_character to speed up the chat API 2023-05-23 05:50:58 +02:00			`from modules.chat import load_character_memoized`
Allow API requests to use parameter presets 2023-06-14 01:34:35 +02:00			`from modules.presets import load_preset_memoized`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00
fix for issue #2475: Streaming api deadlock (#3048) 2023-07-09 04:21:20 +02:00			`# We use a thread local to store the asyncio lock, so that each thread`
			`# has its own lock. This isn't strictly necessary, but it makes it`
			`# such that if we can support multiple worker threads in the future,`
			`# thus handling multiple requests in parallel.`
			`api_tls = threading.local()`


Add chat API (#2233) 2023-05-20 23:42:17 +02:00			`def build_parameters(body, chat=False):`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00
			`generate_params = {`
			`'max_new_tokens': int(body.get('max_new_tokens', body.get('max_length', 200))),`
Add auto_max_new_tokens parameter (#3419) 2023-08-02 19:52:20 +02:00			`'auto_max_new_tokens': bool(body.get('auto_max_new_tokens', False)),`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`'do_sample': bool(body.get('do_sample', True)),`
			`'temperature': float(body.get('temperature', 0.5)),`
			`'top_p': float(body.get('top_p', 1)),`
			`'typical_p': float(body.get('typical_p', body.get('typical', 1))),`
Add epsilon_cutoff/eta_cutoff parameters (#2258) 2023-05-21 20:11:57 +02:00			`'epsilon_cutoff': float(body.get('epsilon_cutoff', 0)),`
			`'eta_cutoff': float(body.get('eta_cutoff', 0)),`
Add new parameters to API extension 2023-05-30 03:03:08 +02:00			`'tfs': float(body.get('tfs', 1)),`
			`'top_a': float(body.get('top_a', 0)),`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`'repetition_penalty': float(body.get('repetition_penalty', body.get('rep_pen', 1.1))),`
Add repetition penalty range parameter to transformers (#2916) 2023-06-29 18:40:13 +02:00			`'repetition_penalty_range': int(body.get('repetition_penalty_range', 0)),`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`'encoder_repetition_penalty': float(body.get('encoder_repetition_penalty', 1.0)),`
			`'top_k': int(body.get('top_k', 0)),`
			`'min_length': int(body.get('min_length', 0)),`
			`'no_repeat_ngram_size': int(body.get('no_repeat_ngram_size', 0)),`
			`'num_beams': int(body.get('num_beams', 1)),`
			`'penalty_alpha': float(body.get('penalty_alpha', 0)),`
			`'length_penalty': float(body.get('length_penalty', 1)),`
			`'early_stopping': bool(body.get('early_stopping', False)),`
Add mirostat parameters for llama.cpp (#2287) 2023-05-23 00:37:24 +02:00			`'mirostat_mode': int(body.get('mirostat_mode', 0)),`
			`'mirostat_tau': float(body.get('mirostat_tau', 5)),`
			`'mirostat_eta': float(body.get('mirostat_eta', 0.1)),`
Add Classifier Free Guidance (CFG) for Transformers/ExLlama (#3325) 2023-08-06 22:22:48 +02:00			`'guidance_scale': float(body.get('guidance_scale', 1)),`
			`'negative_prompt': str(body.get('negative_prompt', '')),`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`'seed': int(body.get('seed', -1)),`
Fix typing bug in api 2023-05-04 00:27:20 +02:00			`'add_bos_token': bool(body.get('add_bos_token', True)),`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00			`'truncation_length': int(body.get('truncation_length', body.get('max_context_length', 2048))),`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`'ban_eos_token': bool(body.get('ban_eos_token', False)),`
			`'skip_special_tokens': bool(body.get('skip_special_tokens', True)),`
			`'custom_stopping_strings': '', # leave this blank`
			`'stopping_strings': body.get('stopping_strings', []),`
			`}`

Allow API requests to use parameter presets 2023-06-14 01:34:35 +02:00			`preset_name = body.get('preset', 'None')`
			`if preset_name not in ['None', None, '']:`
			`preset = load_preset_memoized(preset_name)`
			`generate_params.update(preset)`

Add chat API (#2233) 2023-05-20 23:42:17 +02:00			`if chat:`
			`character = body.get('character')`
Add context_instruct to API. Load default model instruction template … (#2688) 2023-07-12 05:01:03 +02:00			`instruction_template = body.get('instruction_template', shared.settings['instruction_template'])`
			`if str(instruction_template) == "None":`
			`instruction_template = "Vicuna-v1.1"`

Fix spelling mistake in new name var of chat api (#2309) 2023-05-24 04:03:03 +02:00			`name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False)`
memoize load_character to speed up the chat API 2023-05-23 05:50:58 +02:00			`name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00			`generate_params.update({`
			`'stop_at_newline': bool(body.get('stop_at_newline', shared.settings['stop_at_newline'])),`
			`'chat_generation_attempts': int(body.get('chat_generation_attempts', shared.settings['chat_generation_attempts'])),`
			`'mode': str(body.get('mode', 'chat')),`
Use character settings from API properties if present (#3428) 2023-08-03 20:56:40 +02:00			`'name1': str(body.get('name1', name1)),`
			`'name2': str(body.get('name2', name2)),`
			`'context': str(body.get('context', context)),`
			`'greeting': str(body.get('greeting', greeting)),`
			`'name1_instruct': str(body.get('name1_instruct', name1_instruct)),`
			`'name2_instruct': str(body.get('name2_instruct', name2_instruct)),`
			`'context_instruct': str(body.get('context_instruct', context_instruct)),`
			`'turn_template': str(body.get('turn_template', turn_template)),`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00			`'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])),`
Update chat API (fixes #3006) 2023-07-05 02:36:47 +02:00			`'history': body.get('history', {'internal': [], 'visible': []})`
Add chat API (#2233) 2023-05-20 23:42:17 +02:00			`})`

New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`return generate_params`


			`def try_start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None):`
			`Thread(target=_start_cloudflared, args=[`
			`port, max_attempts, on_start], daemon=True).start()`


			`def _start_cloudflared(port: int, max_attempts: int = 3, on_start: Optional[Callable[[str], None]] = None):`
			`try:`
			`from flask_cloudflared import _run_cloudflared`
			`except ImportError:`
			`print('You should install flask_cloudflared manually')`
			`raise Exception(`
			`'flask_cloudflared not installed. Make sure you installed the requirements.txt for this extension.')`

			`for _ in range(max_attempts):`
			`try:`
			`public_url = _run_cloudflared(port, port + 1)`

			`if on_start:`
			`on_start(public_url)`

			`return`
			`except Exception:`
Make API extension print its exceptions 2023-04-26 04:23:47 +02:00			`traceback.print_exc()`
New universal API with streaming/blocking endpoints (#990) Previous title: Add api_streaming extension and update api-example-stream to use it * Merge with latest main * Add parameter capturing encoder_repetition_penalty * Change some defaults, minor fixes * Add --api, --public-api flags * remove unneeded/broken comment from blocking API startup. The comment is already correctly emitted in try_start_cloudflared by calling the lambda we pass in. * Update on_start message for blocking_api, it should say 'non-streaming' and not 'streaming' * Update the API examples * Change a comment * Update README * Remove the gradio API * Remove unused import * Minor change * Remove unused import --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> 2023-04-23 20:52:43 +02:00			`time.sleep(3)`

			`raise Exception('Could not start cloudflared.')`
fix for issue #2475: Streaming api deadlock (#3048) 2023-07-09 04:21:20 +02:00

			`def _get_api_lock(tls) -> asyncio.Lock:`
			`"""`
			`The streaming and blocking API implementations each run on their own`
			`thread, and multiplex requests using asyncio. If multiple outstanding`
			`requests are received at once, we will try to acquire the shared lock`
			`shared.generation_lock multiple times in succession in the same thread,`
			`which will cause a deadlock.`

			`To avoid this, we use this wrapper function to block on an asyncio`
			`lock, and then try and grab the shared lock only while holding`
			`the asyncio lock.`
			`"""`
			`if not hasattr(tls, "asyncio_lock"):`
			`tls.asyncio_lock = asyncio.Lock()`

			`return tls.asyncio_lock`


			`def with_api_lock(func):`
			`"""`
			`This decorator should be added to all streaming API methods which`
			`require access to the shared.generation_lock. It ensures that the`
			`tls.asyncio_lock is acquired before the method is called, and`
			`released afterwards.`
			`"""`
			`@functools.wraps(func)`
			`async def api_wrapper(*args, **kwargs):`
			`async with _get_api_lock(api_tls):`
			`return await func(*args, **kwargs)`
			`return api_wrapper`