# text-generation-webui/extensions/silero_tts/script.py

import html
import random
import time
from pathlib import Path

import gradio as gr
import torch

from extensions.silero_tts import tts_preprocessor
from modules import chat, shared, ui_chat
from modules.utils import gradio

torch._C._jit_set_profiling_mode(False)


# Extension settings. The UI callbacks in ui() mutate this dict in place and the
# modifier hooks read it on every call.
params = {
    # Master switch: when False, state_modifier/input_modifier/output_modifier
    # return their input untouched.
    'activate': True,
    # Voice name passed as `speaker` to model.save_wav().
    'speaker': 'en_56',
    # Language code passed as `language` to torch.hub.load().
    'language': 'en',
    # Silero model name: passed as `speaker` to torch.hub.load() and used as the
    # checkpoint file name (<model_id>.pt) when looking for a cached model.
    'model_id': 'v3_en',
    # Cast to int and passed as `sample_rate` to model.save_wav().
    'sample_rate': 48000,
    # Target passed to model.to() after loading.
    'device': 'cpu',
    # When True, the reply text is appended below the audio player.
    'show_text': False,
    # When True, the generated <audio> tag carries the `autoplay` attribute.
    'autoplay': True,
    # Inserted into the SSML <prosody> tag as `pitch` and `rate` respectively.
    'voice_pitch': 'medium',
    'voice_speed': 'medium',
    'local_cache_path': ''  # User can override the default cache path to something other via settings.json
}

# Snapshot of `params` taken when the model was last (re)loaded; output_modifier()
# and voice_preview() compare against it to decide whether to call load_model().
current_params = params.copy()

# Voice names offered in the "TTS voice" dropdown, one list per language.
# Each list is referenced from the matching entry of `languages` below.
voices_en = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
voices_es = ["es_0", "es_1", "es_2"]
voices_fr = ["fr_0", "fr_1", "fr_2", "fr_3", "fr_4", "fr_5"]
voices_de = ["bernd_ungerer", "eva_k", "friedrich", "hokuspokus", "karlsson"]
voices_ru = ["aidar", "baya", "kseniya", "xenia"]
voices_ua = ["mykyta"]
voices_uz = ["dilnavoz"]

# Per-language settings, keyed by the language code stored in params['language'].
#   label         - text shown in the "Language" dropdown
#   voices        - choices for the "TTS voice" dropdown
#   default_voice - voice selected when the user switches to this language
#   model_id      - value copied into params['model_id'] on a language switch
# Insertion order matters: the dropdown reports the selected *index* and
# language_change() maps it back to a key by position.
languages = {
    "en": {"label": "English", "voices": voices_en, "default_voice": "en_56", "model_id": "v3_en"},
    "es": {"label": "Español", "voices": voices_es, "default_voice": "es_0", "model_id": "v3_es"},
    "fr": {"label": "Français", "voices": voices_fr, "default_voice": "fr_0", "model_id": "v3_fr"},
    "de": {"label": "Deutsch", "voices": voices_de, "default_voice": "eva_k", "model_id": "v3_de"},
    "ru": {"label": "русский", "voices": voices_ru, "default_voice": "aidar", "model_id": "ru_v3"},
    "ua": {"label": "українська", "voices": voices_ua, "default_voice": "mykyta", "model_id": "v3_ua"},
    "uz": {"label": "Oʻzbekcha", "voices": voices_uz, "default_voice": "dilnavoz", "model_id": "v3_uz"},
}

# Choices for the "Voice pitch" / "Voice speed" dropdowns; the selected value is
# written verbatim into the SSML <prosody pitch="..." rate="..."> tag.
voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']

# Translation table that makes text safe to embed in the SSML sent to Silero
# (needed because pitch and speed are controlled through an XML <prosody> tag).
table = {
    ord(char): entity
    for char, entity in (
        ("<", "&lt;"),
        (">", "&gt;"),
        ("&", "&amp;"),
        ("'", "&apos;"),
        ('"', "&quot;"),
    )
}


def xmlesc(txt):
    """Return `txt` with the five XML special characters replaced by entities."""
    escaped = txt.translate(table)
    return escaped


def load_model():
    """
    Load the Silero TTS model selected by `params` and move it to params['device'].

    A checkpoint already present in the torch hub cache (or in
    params['local_cache_path'] when that is set) is loaded from disk; otherwise
    the model is requested from the 'snakers4/silero-models' hub repository.
    """
    cache_root = params['local_cache_path']
    if cache_root == '':
        cache_root = torch.hub.get_dir()

    repo_dir = cache_root + '/snakers4_silero-models_master/'
    checkpoint = repo_dir + "src/silero/model/" + params['model_id'] + ".pt"

    if Path(checkpoint).is_file():
        print(f'\nUsing Silero TTS cached checkpoint found at {cache_root}')
        source_args = dict(repo_or_dir=repo_dir, source='local', path=checkpoint, force_reload=True)
    else:
        print(f'\nSilero TTS cache not found at {cache_root}. Attempting to download...')
        source_args = dict(repo_or_dir='snakers4/silero-models')

    # The hub entry point returns (model, example_text); only the model is needed.
    loaded = torch.hub.load(model='silero_tts', language=params['language'], speaker=params['model_id'], **source_args)
    tts_model = loaded[0]
    tts_model.to(params['device'])
    return tts_model


def remove_tts_from_history(history):
    """
    Overwrite every visible bot reply with its internal (plain text) counterpart.

    `history` is modified in place and also returned. The user side of each
    visible row is kept as is.
    """
    visible_rows = history['visible']
    for idx, internal_row in enumerate(history['internal']):
        user_side = visible_rows[idx][0]
        visible_rows[idx] = [user_side, internal_row[1]]

    return history


def toggle_text_in_history(history):
    """
    Show or hide the reply text under every audio player already in the history.

    Only visible replies that start with '<audio' are touched. Each one is cut
    back to its audio tag and, when params['show_text'] is on, the matching
    internal reply is appended below it. `history` is modified in place and
    returned.
    """
    for idx, row in enumerate(history['visible']):
        shown = row[1]
        if not shown.startswith('<audio'):
            continue

        # Keep everything up to the first closing tag, i.e. just the player.
        rebuilt = f"{shown.split('</audio>')[0]}</audio>"
        if params['show_text']:
            plain_reply = history['internal'][idx][1]
            rebuilt = f"{rebuilt}\n\n{plain_reply}"

        history['visible'][idx] = [row[0], rebuilt]

    return history


def state_modifier(state):
    """Set state['stream'] to False while TTS is active; otherwise leave `state` alone."""
    if params['activate']:
        # NOTE(review): presumably so the reply reaches output_modifier() in one
        # piece instead of being streamed - confirm against the caller.
        state['stream'] = False

    return state


def input_modifier(string, state):
    """
    Return the user input unchanged.

    While TTS is active, this also replaces shared.processing_message with a
    "recording a voice message" notice.
    """
    if params['activate']:
        shared.processing_message = "*Is recording a voice message...*"

    return string


def history_modifier(history):
    """
    Strip the autoplay attribute from the most recent visible reply.

    Nothing happens when the internal history is empty. `history` is modified
    in place and returned.
    """
    if len(history['internal']) == 0:
        return history

    last_row = history['visible'][-1]
    without_autoplay = last_row[1].replace('controls autoplay>', 'controls>')
    history['visible'][-1] = [last_row[0], without_autoplay]
    return history


def output_modifier(string, state):
    """
    Turn a bot reply into an HTML audio player backed by a newly synthesized wav file.

    Args:
        string: the reply text. HTML entities are unescaped and the text is run
            through tts_preprocessor.preprocess() before synthesis.
        state: generation state; only state["character_menu"] is read, to
            prefix the wav file name.

    Returns:
        `string` unchanged when TTS is deactivated; a placeholder message when
        preprocessing leaves nothing to speak; otherwise an <audio> tag,
        followed by the original reply text when params['show_text'] is on.
    """
    global model, current_params

    # Nothing to synthesize, so do not pay for a model reload either.
    if not params['activate']:
        return string

    # Only the settings consumed by load_model() require a new model. Speaker,
    # pitch, speed, sample rate and the UI toggles are applied per call below,
    # so changing them must not trigger a (slow) reload.
    model_keys = ('language', 'model_id', 'device', 'local_cache_path')
    if any(params[key] != current_params[key] for key in model_keys):
        model = load_model()
        current_params = params.copy()

    original_string = string
    string = tts_preprocessor.preprocess(html.unescape(string))

    if string == '':
        string = '*Empty reply, try regenerating*'
    else:
        # The timestamp keeps file names unique across regenerations.
        output_file = Path(f'extensions/silero_tts/outputs/{state["character_menu"]}_{int(time.time())}.wav')
        # Make sure the target folder exists before Silero tries to write into it.
        output_file.parent.mkdir(parents=True, exist_ok=True)

        prosody = '<prosody rate="{}" pitch="{}">'.format(params['voice_speed'], params['voice_pitch'])
        silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
        model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))

        autoplay = 'autoplay' if params['autoplay'] else ''
        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
        if params['show_text']:
            string += f'\n\n{original_string}'

    # Restore the regular notice that input_modifier() replaced.
    shared.processing_message = "*Is typing...*"
    return string


def setup():
    """
    Load the Silero model and store it in the module-level `model` global.

    NOTE(review): presumably invoked once by the extension loader at startup - confirm.
    """
    global model
    model = load_model()


def random_sentence():
    """Return one randomly chosen line (line ending included) from harvard_sentences.txt."""
    sentences_file = Path("extensions/silero_tts/harvard_sentences.txt")
    with open(sentences_file) as handle:
        candidates = handle.readlines()

    return random.choice(candidates)


def voice_preview(preview_text):
    """
    Synthesize a sample with the current voice settings and return an autoplaying player.

    Args:
        preview_text: text to speak; when empty, a line from random_sentence()
            is used instead.

    Returns:
        An HTML <audio> tag pointing at outputs/voice_preview.wav. The file is
        overwritten on every call, so the current timestamp is appended as a
        query string to give each preview a distinct URL.
    """
    global model, current_params

    # Only the settings consumed by load_model() require a new model. Speaker,
    # pitch, speed, sample rate and the UI toggles are applied per call below,
    # so changing them must not trigger a (slow) reload.
    model_keys = ('language', 'model_id', 'device', 'local_cache_path')
    if any(params[key] != current_params[key] for key in model_keys):
        model = load_model()
        current_params = params.copy()

    string = tts_preprocessor.preprocess(preview_text or random_sentence())

    output_file = Path('extensions/silero_tts/outputs/voice_preview.wav')
    # Make sure the target folder exists before Silero tries to write into it.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    prosody = f"<prosody rate=\"{params['voice_speed']}\" pitch=\"{params['voice_pitch']}\">"
    silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
    model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))

    return f'<audio src="file/{output_file.as_posix()}?{int(time.time())}" controls autoplay></audio>'


def language_change(lang):
    """
    Switch the active language and reset the voice to that language's default.

    `lang` is the position of the selected entry in the language dropdown,
    which lists the entries of `languages` in insertion order. Returns a Gradio
    update that swaps the voice dropdown's choices and selected value.
    """
    lang_code = list(languages)[lang]
    entry = languages[lang_code]

    params["language"] = lang_code
    params["speaker"] = entry["default_voice"]
    params["model_id"] = entry["model_id"]

    return gr.update(choices=entry["voices"], value=entry["default_voice"])


def custom_css():
    """Return the contents of the style.css file located next to this script."""
    path_to_css = Path(__file__).parent.resolve() / 'style.css'
    # read_text() opens and closes the file itself; the previous bare
    # open(...).read() left the handle for the garbage collector to close.
    return path_to_css.read_text()


def ui():
    """
    Build the extension's Gradio controls and wire their events.

    Every control is initialised from `params`, and its change handler writes
    the new value back into `params`, where the modifier hooks pick it up.
    """
    # Gradio elements
    with gr.Accordion("Silero TTS"):
        with gr.Row():
            activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
            autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')

        show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')

        with gr.Row():
            # type="index": handlers receive the position of the selected label, not the label itself.
            language = gr.Dropdown(value=languages[params['language']]["label"], choices=[v["label"] for _, v in languages.items()], label='Language', type="index")
            # Offer the voices of the configured language, so the initial value
            # is a valid choice even when params['language'] is not 'en'.
            voice = gr.Dropdown(value=params['speaker'], choices=languages[params['language']]["voices"], label='TTS voice')
        with gr.Row():
            v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch')
            v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed')

        with gr.Row():
            preview_text = gr.Text(show_label=False, placeholder="Preview text", elem_id="silero_preview_text")
            preview_play = gr.Button("Preview")
            # Hidden element that receives the <audio> tag returned by voice_preview().
            preview_audio = gr.HTML(visible=False)

        with gr.Row():
            convert = gr.Button('Permanently replace audios with the message texts')
            convert_cancel = gr.Button('Cancel', visible=False)
            convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False)

    # Convert history with confirmation
    # The lambdas below return visibility updates in this order: confirm, convert, cancel.
    convert_arr = [convert_confirm, convert, convert_cancel]
    convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr)
    convert_confirm.click(
        lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then(
        remove_tts_from_history, gradio('history'), gradio('history')).then(
        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))

    convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)

    # Toggle message text in history
    show_text.change(
        lambda x: params.update({"show_text": x}), show_text, None).then(
        toggle_text_in_history, gradio('history'), gradio('history')).then(
        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))

    # Event functions to update the parameters in the backend
    activate.change(lambda x: params.update({"activate": x}), activate, None)
    autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
    language.change(language_change, language, voice, show_progress=False)
    voice.change(lambda x: params.update({"speaker": x}), voice, None)
    v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None)
    v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None)

    # Play preview
    preview_text.submit(voice_preview, preview_text, preview_audio)
    preview_play.click(voice_preview, preview_text, preview_audio)
-												Unescape model output for silero/elevenlabs

											
										
										
											2023-08-25 02:27:12 +02:00
+								import html
-												Add back silero preview by @missionfloyd (#3446)


											
										
										
											2023-08-04 07:29:14 +02:00
+								import random
-												Reorder the imports

											
										
										
											2023-03-12 17:36:18 +01:00
+								import time
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								from pathlib import Path
-												Add ui() function to extensions

											
										
										
											2023-02-24 23:00:11 +01:00
+								import gradio as gr
-												Minor changes
											
										
										
											2023-03-22 19:55:03 +01:00
+								import torch
-												Refactor chat functions (#2003)


											
										
										
											2023-05-11 20:37:04 +02:00
 								from extensions.silero_tts import tts_preprocessor
-												Unify the 3 interface modes (#3554)


											
										
										
											2023-08-13 06:12:15 +02:00
+								from modules import chat, shared, ui_chat
-												Implement sessions + add basic multi-user support (#2991)


											
										
										
											2023-07-04 05:03:30 +02:00
+								from modules.utils import gradio
-												Better TTS with autoplay

- Adds "still_streaming" to shared module for extensions to know if generation is complete
- Changed TTS extension with new options:
   - Show text under the audio widget
   - Automatically play the audio once text generation finishes
   - manage the generated wav files (only keep files for finished generations, optional max file limit)
   - [wip] ability to change voice pitch and speed
- added 'tensorboard' to requirements, since python sent "tensorboard not found" errors after a fresh installation.

											
										
										
											2023-03-08 12:02:17 +01:00
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								torch._C._jit_set_profiling_mode(False)
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								params = {
-												Make it possible to disable the TTS from within the interface

											
										
										
											2023-02-17 03:38:27 +01:00
+								    'activate': True,
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								    'speaker': 'en_56',
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								    'language': 'en',
 								    'model_id': 'v3_en',
 								    'sample_rate': 48000,
 								    'device': 'cpu',
-												Minor style changes to silero_tts

											
										
										
											2023-03-11 15:17:13 +01:00
+								    'show_text': False,
-												Working html autoplay, clean up, improve wav naming

- New autoplay using html tag, removed from old message when new input provided
- Add voice pitch and speed control
- Group settings together
- Use name + conversation history to match wavs to messages, minimize problems when changing characters

Current minor bugs:
- Gradio seems to cache the audio files, so using "clear history" and generating new messages will play the old audio (the new messages are saving correctly). Gradio will clear cache and use correct audio after a few messages or after a page refresh.
- Switching characters does not immediately update the message ID used for the audio. ID is updated after the first new message, but that message will use the wrong ID

											
										
										
											2023-03-11 06:34:59 +01:00
+								    'autoplay': True,
 								    'voice_pitch': 'medium',
 								    'voice_speed': 'medium',
-												Silero TTS offline cache (#628)


											
										
										
											2023-04-07 17:15:57 +02:00
+								    'local_cache_path': ''  # User can override the default cache path to something other via settings.json
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								}
-												Minor style changes to silero_tts

											
										
										
											2023-03-11 15:17:13 +01:00
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								current_params = params.copy()
-												silero_tts: Add language option (#3878)


											
										
										
											2023-09-12 20:49:46 +02:00
 								voices_en = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 'en_46', 'en_57', 'en_58', 'en_63', 'en_66', 'en_69', 'en_70', 'en_71', 'en_73', 'en_77', 'en_78', 'en_79', 'en_80', 'en_81', 'en_84', 'en_87', 'en_89', 'en_90', 'en_93', 'en_100', 'en_102', 'en_103', 'en_104', 'en_105', 'en_106', 'en_110', 'en_112', 'en_113', 'en_114', 'en_115']
 								voices_es = ["es_0", "es_1", "es_2"]
 								voices_fr = ["fr_0", "fr_1", "fr_2", "fr_3", "fr_4", "fr_5"]
 								voices_de = ["bernd_ungerer", "eva_k", "friedrich", "hokuspokus", "karlsson"]
 								voices_ru = ["aidar", "baya", "kseniya", "xenia"]
 								voices_ua = ["mykyta"]
 								voices_uz = ["dilnavoz"]
 								languages = {
 								    "en": {"label": "English", "voices": voices_en, "default_voice": "en_56", "model_id": "v3_en"},
 								    "es": {"label": "Español", "voices": voices_es, "default_voice": "es_0", "model_id": "v3_es"},
 								    "fr": {"label": "Français", "voices": voices_fr, "default_voice": "fr_0", "model_id": "v3_fr"},
 								    "de": {"label": "Deutsch", "voices": voices_de, "default_voice": "eva_k", "model_id": "v3_de"},
 								    "ru": {"label": "русский", "voices": voices_ru, "default_voice": "aidar", "model_id": "ru_v3"},
 								    "ua": {"label": "українська", "voices": voices_ua, "default_voice": "mykyta", "model_id": "v3_ua"},
 								    "uz": {"label": "Oʻzbekcha", "voices": voices_uz, "default_voice": "dilnavoz", "model_id": "v3_uz"},
 								}
-												Working html autoplay, clean up, improve wav naming

- New autoplay using html tag, removed from old message when new input provided
- Add voice pitch and speed control
- Group settings together
- Use name + conversation history to match wavs to messages, minimize problems when changing characters

Current minor bugs:
- Gradio seems to cache the audio files, so using "clear history" and generating new messages will play the old audio (the new messages are saving correctly). Gradio will clear cache and use correct audio after a few messages or after a page refresh.
- Switching characters does not immediately update the message ID used for the audio. ID is updated after the first new message, but that message will use the wrong ID

											
										
										
											2023-03-11 06:34:59 +01:00
+								voice_pitches = ['x-low', 'low', 'medium', 'high', 'x-high']
 								voice_speeds = ['x-slow', 'slow', 'medium', 'fast', 'x-fast']
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
-												Minor style changes to silero_tts

											
										
										
											2023-03-11 15:17:13 +01:00
+								# Used for making text xml compatible, needed for voice pitch and speed control
-												Better TTS with autoplay

- Adds "still_streaming" to shared module for extensions to know if generation is complete
- Changed TTS extension with new options:
   - Show text under the audio widget
   - Automatically play the audio once text generation finishes
   - manage the generated wav files (only keep files for finished generations, optional max file limit)
   - [wip] ability to change voice pitch and speed
- added 'tensorboard' to requirements, since python sent "tensorboard not found" errors after a fresh installation.

											
										
										
											2023-03-08 12:02:17 +01:00
+								table = str.maketrans({
 								    "<": "&lt;",
 								    ">": "&gt;",
 								    "&": "&amp;",
 								    "'": "&apos;",
 								    '"': "&quot;",
 								})
-												Minor style changes to silero_tts

											
										
										
											2023-03-11 15:17:13 +01:00
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
-												Better TTS with autoplay

- Adds "still_streaming" to shared module for extensions to know if generation is complete
- Changed TTS extension with new options:
   - Show text under the audio widget
   - Automatically play the audio once text generation finishes
   - manage the generated wav files (only keep files for finished generations, optional max file limit)
   - [wip] ability to change voice pitch and speed
- added 'tensorboard' to requirements, since python sent "tensorboard not found" errors after a fresh installation.

											
										
										
											2023-03-08 12:02:17 +01:00
+								def xmlesc(txt):
 								    return txt.translate(table)
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								def load_model():
-												Silero TTS offline cache (#628)


											
										
										
											2023-04-07 17:15:57 +02:00
+								    torch_cache_path = torch.hub.get_dir() if params['local_cache_path'] == '' else params['local_cache_path']
 								    model_path = torch_cache_path + "/snakers4_silero-models_master/src/silero/model/" + params['model_id'] + ".pt"
 								    if Path(model_path).is_file():
 								        print(f'\nUsing Silero TTS cached checkpoint found at {torch_cache_path}')
 								        model, example_text = torch.hub.load(repo_or_dir=torch_cache_path + '/snakers4_silero-models_master/', model='silero_tts', language=params['language'], speaker=params['model_id'], source='local', path=model_path, force_reload=True)
 								    else:
 								        print(f'\nSilero TTS cache not found at {torch_cache_path}. Attempting to download...')
 								        model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models', model='silero_tts', language=params['language'], speaker=params['model_id'])
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								    model.to(params['device'])
 								    return model
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
-												Implement sessions + add basic multi-user support (#2991)


											
										
										
											2023-07-04 05:03:30 +02:00
+								def remove_tts_from_history(history):
 								    for i, entry in enumerate(history['internal']):
 								        history['visible'][i] = [history['visible'][i][0], entry[1]]
 								    return history
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
-												Implement sessions + add basic multi-user support (#2991)


											
										
										
											2023-07-04 05:03:30 +02:00
+								def toggle_text_in_history(history):
 								    for i, entry in enumerate(history['visible']):
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								        visible_reply = entry[1]
 								        if visible_reply.startswith('<audio'):
 								            if params['show_text']:
-												Implement sessions + add basic multi-user support (#2991)


											
										
										
											2023-07-04 05:03:30 +02:00
+								                reply = history['internal'][i][1]
 								                history['visible'][i] = [history['visible'][i][0], f"{visible_reply.split('</audio>')[0]}</audio>\n\n{reply}"]
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								            else:
-												Implement sessions + add basic multi-user support (#2991)


											
										
										
											2023-07-04 05:03:30 +02:00
+								                history['visible'][i] = [history['visible'][i][0], f"{visible_reply.split('</audio>')[0]}</audio>"]
 								    return history
-												Add support for custom chat styles (#1917)


											
										
										
											2023-05-08 17:35:03 +02:00
-												Fixes and polish

- Change wav naming to be completely unique using timestamp instead of message ID, stops browser using cached audio when new audio is made with the same file name (eg after regenerate or clear history).
- Make the autoplay setting actually disable autoplay.
- Make Settings panel a bit more compact.
- Hide html errors when audio file of chat history is missing.
- Add button to permanently convert TTS history to normal text messages
- Changed the "show message text" toggle to affect the chat history.

											
										
										
											2023-03-12 07:56:57 +01:00
-												Refactor text_generation.py, add support for custom generation functions (#1817)


											
										
										
											2023-05-05 23:53:03 +02:00
+								def state_modifier(state):
-												FIX silero_tts/elevenlabs_tts activation/deactivation (#2313)


											
										
										
											2023-05-24 15:06:38 +02:00
+								    if not params['activate']:
 								        return state
-												Refactor text_generation.py, add support for custom generation functions (#1817)


											
										
										
											2023-05-05 23:53:03 +02:00
+								    state['stream'] = False
 								    return state
-												Implement sessions + add basic multi-user support (#2991)


											
										
										
											2023-07-04 05:03:30 +02:00
+								def input_modifier(string, state):
-												FIX silero_tts/elevenlabs_tts activation/deactivation (#2313)


											
										
										
											2023-05-24 15:06:38 +02:00
+								    if not params['activate']:
 								        return string
-												Minor style changes to silero_tts

											
										
										
											2023-03-11 15:17:13 +01:00
-												Fix silero tts autoplay (attempt #2)

											
										
										
											2023-05-21 18:24:54 +02:00
+								    shared.processing_message = "*Is recording a voice message...*"
 								    return string
 								def history_modifier(history):
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								    # Remove autoplay from the last reply
-												Fix silero tts autoplay (attempt #2)

											
										
										
											2023-05-21 18:24:54 +02:00
+								    if len(history['internal']) > 0:
 								        history['visible'][-1] = [
 								            history['visible'][-1][0],
 								            history['visible'][-1][1].replace('controls autoplay>', 'controls>')
-												Fix silero tts autoplay

											
										
										
											2023-05-21 17:11:59 +02:00
+								        ]
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
-												Fix silero tts autoplay (attempt #2)

											
										
										
											2023-05-21 18:24:54 +02:00
+								    return history
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
-												Implement sessions + add basic multi-user support (#2991)


											
										
										
											2023-07-04 05:03:30 +02:00
+								def output_modifier(string, state):
-												Silero_tts streaming fix

Temporarily suppress the streaming during the audio response as it would interfere with the audio (making it stutter and play anew)

											
										
										
											2023-03-25 19:31:13 +01:00
+								    global model, current_params, streaming_state
-												Add back silero preview by @missionfloyd (#3446)


											
										
										
											2023-08-04 07:29:14 +02:00
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								    for i in params:
 								        if params[i] != current_params[i]:
 								            model = load_model()
 								            current_params = params.copy()
 								            break
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
+								    if not params['activate']:
-												Make it possible to disable the TTS from within the interface

											
										
										
											2023-02-17 03:38:27 +01:00
+								        return string
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								    original_string = string
-												Fix silero_tts HTML unescaping

											
										
										
											2023-08-26 09:45:07 +02:00
+								    string = tts_preprocessor.preprocess(html.unescape(string))
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
 								    if string == '':
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								        string = '*Empty reply, try regenerating*'
-												Better TTS with autoplay

- Adds "still_streaming" to shared module for extensions to know if generation is complete
- Changed TTS extension with new options:
   - Show text under the audio widget
   - Automatically play the audio once text generation finishes
   - manage the generated wav files (only keep files for finished generations, optional max file limit)
   - [wip] ability to change voice pitch and speed
- added 'tensorboard' to requirements, since python sent "tensorboard not found" errors after a fresh installation.

											
										
										
											2023-03-08 12:02:17 +01:00
+								    else:
-												Implement sessions + add basic multi-user support (#2991)


											
										
										
											2023-07-04 05:03:30 +02:00
+								        output_file = Path(f'extensions/silero_tts/outputs/{state["character_menu"]}_{int(time.time())}.wav')
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								        prosody = '<prosody rate="{}" pitch="{}">'.format(params['voice_speed'], params['voice_pitch'])
-												Fix silero_tts HTML unescaping

											
										
										
											2023-08-26 09:45:07 +02:00
+								        silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
-												Use str(Path) instead of os.path.abspath(Path)

											
										
										
											2023-03-13 04:08:01 +01:00
+								        model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
-												Merge in audio generation only on text stream finish., postpone audioblock autoplay

- Keeping simpleaudio until audio block "autoplay" doesn't play previous messages
- Only generate audio for finished messages
- Better name for autoplay, clean up comments
- set default to unlimited wav files. Still a few bugs when wav id resets

Co-Authored-By: Christoph Hess <9931495+ChristophHess@users.noreply.github.com>

											
										
										
											2023-03-09 00:48:44 +01:00
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								        autoplay = 'autoplay' if params['autoplay'] else ''
 								        string = f'<audio src="file/{output_file.as_posix()}" controls {autoplay}></audio>'
 								        if params['show_text']:
 								            string += f'\n\n{original_string}'
-												Merge in audio generation only on text stream finish., postpone audioblock autoplay

- Keeping simpleaudio until audio block "autoplay" doesn't play previous messages
- Only generate audio for finished messages
- Better name for autoplay, clean up comments
- set default to unlimited wav files. Still a few bugs when wav id resets

Co-Authored-By: Christoph Hess <9931495+ChristophHess@users.noreply.github.com>

											
										
										
											2023-03-09 00:48:44 +01:00
-												Implement "*Is recording a voice message...*" for TTS #303

											
										
										
											2023-03-14 02:28:00 +01:00
+								    shared.processing_message = "*Is typing...*"
-												Add Silero TTS extension

											
										
										
											2023-02-14 19:06:06 +01:00
+								    return string
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
-												Silero TTS offline cache (#628)


											
										
										
											2023-04-07 17:15:57 +02:00
def setup():
    """Load the TTS model and store it in the module-level ``model``.

    Calls ``load_model()`` with no arguments and rebinds the global ``model``
    to whatever it returns.
    """
    global model
    # Rebinding the global makes the loaded model visible to the other
    # functions in this module that reference ``model``.
    model = load_model()
-												Add back silero preview by @missionfloyd (#3446)


											
										
										
											2023-08-04 07:29:14 +02:00
def random_sentence():
    """Return one randomly chosen line from the bundled Harvard sentences file.

    The line is returned exactly as read, including its trailing newline when
    the file has one. The path is relative to the current working directory.
    """
    sentences_path = Path("extensions/silero_tts/harvard_sentences.txt")
    with open(sentences_path) as handle:
        lines = handle.readlines()
    return random.choice(lines)
 								def voice_preview(preview_text):
 								    global model, current_params, streaming_state
 								    for i in params:
 								        if params[i] != current_params[i]:
 								            model = load_model()
 								            current_params = params.copy()
 								            break
 								    string = tts_preprocessor.preprocess(preview_text or random_sentence())
 								    output_file = Path('extensions/silero_tts/outputs/voice_preview.wav')
 								    prosody = f"<prosody rate=\"{params['voice_speed']}\" pitch=\"{params['voice_pitch']}\">"
 								    silero_input = f'<speak>{prosody}{xmlesc(string)}</prosody></speak>'
 								    model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
 								    return f'<audio src="file/{output_file.as_posix()}?{int(time.time())}" controls autoplay></audio>'
-												silero_tts: Add language option (#3878)


											
										
										
											2023-09-12 20:49:46 +02:00
def language_change(lang):
    """Switch the active language and reset the voice dropdown to match.

    Args:
        lang: Position of the selected language within ``languages``
            (its insertion order).

    Returns:
        A ``gr.update`` that replaces the voice choices with the selected
        language's voices and selects that language's default voice.
    """
    lang_code = list(languages)[lang]
    entry = languages[lang_code]
    default_voice = entry["default_voice"]

    # ``params`` is mutated in place, so no ``global`` declaration is needed.
    params.update({
        "language": lang_code,
        "speaker": default_voice,
        "model_id": entry["model_id"],
    })
    return gr.update(choices=entry["voices"], value=default_voice)
-												Add back silero preview by @missionfloyd (#3446)


											
										
										
											2023-08-04 07:29:14 +02:00
def custom_css():
    """Return the contents of the ``style.css`` file next to this module.

    Returns:
        The stylesheet text, read with the platform default encoding.

    Raises:
        OSError: If ``style.css`` is missing or cannot be read.
    """
    path_to_css = Path(__file__).parent.resolve() / 'style.css'
    # read_text() closes the file itself; a bare open(...).read() leaves the
    # handle open until it is garbage collected.
    return path_to_css.read_text()
-												Add ui() function to extensions

											
										
										
											2023-02-24 23:00:11 +01:00
+								def ui():
 								    # Gradio elements
-												Working html autoplay, clean up, improve wav naming

- New autoplay using html tag, removed from old message when new input provided
- Add voice pitch and speed control
- Group settings together
- Use name + conversation history to match wavs to messages, minimize problems when changing characters

Current minor bugs:
- Gradio seems to cache the audio files, so using "clear history" and generating new messages will play the old audio (the new messages are saving correctly). Gradio will clear cache and use correct audio after a few messages or after a page refresh.
- Switching characters does not immediately update the message ID used for the audio. ID is updated after the first new message, but that message will use the wrong ID

											
										
										
											2023-03-11 06:34:59 +01:00
+								    with gr.Accordion("Silero TTS"):
-												Fixes and polish

- Change wav naming to be completely unique using timestamp instead of message ID, stops browser using cached audio when new audio is made with the same file name (eg after regenerate or clear history).
- Make the autoplay setting actually disable autoplay.
- Make Settings panel a bit more compact.
- Hide html errors when audio file of chat history is missing.
- Add button to permanently convert TTS history to normal text messages
- Changed the "show message text" toggle to affect the chat history.

											
										
										
											2023-03-12 07:56:57 +01:00
+								        with gr.Row():
 								            activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
 								            autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically')
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
-												Working html autoplay, clean up, improve wav naming

- New autoplay using html tag, removed from old message when new input provided
- Add voice pitch and speed control
- Group settings together
- Use name + conversation history to match wavs to messages, minimize problems when changing characters

Current minor bugs:
- Gradio seems to cache the audio files, so using "clear history" and generating new messages will play the old audio (the new messages are saving correctly). Gradio will clear cache and use correct audio after a few messages or after a page refresh.
- Switching characters does not immediately update the message ID used for the audio. ID is updated after the first new message, but that message will use the wrong ID

											
										
										
											2023-03-11 06:34:59 +01:00
+								        show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player')
-												silero_tts: Add language option (#3878)


											
										
										
											2023-09-12 20:49:46 +02:00
 								        with gr.Row():
 								            language = gr.Dropdown(value=languages[params['language']]["label"], choices=[v["label"] for _, v in languages.items()], label='Language', type="index")
 								            voice = gr.Dropdown(value=params['speaker'], choices=voices_en, label='TTS voice')
-												Fixes and polish

- Change wav naming to be completely unique using timestamp instead of message ID, stops browser using cached audio when new audio is made with the same file name (eg after regenerate or clear history).
- Make the autoplay setting actually disable autoplay.
- Make Settings panel a bit more compact.
- Hide html errors when audio file of chat history is missing.
- Add button to permanently convert TTS history to normal text messages
- Changed the "show message text" toggle to affect the chat history.

											
										
										
											2023-03-12 07:56:57 +01:00
+								        with gr.Row():
 								            v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch')
 								            v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed')
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
-												Add back silero preview by @missionfloyd (#3446)


											
										
										
											2023-08-04 07:29:14 +02:00
+								        with gr.Row():
 								            preview_text = gr.Text(show_label=False, placeholder="Preview text", elem_id="silero_preview_text")
 								            preview_play = gr.Button("Preview")
 								            preview_audio = gr.HTML(visible=False)
-												Fixes and polish

- Change wav naming to be completely unique using timestamp instead of message ID, stops browser using cached audio when new audio is made with the same file name (eg after regenerate or clear history).
- Make the autoplay setting actually disable autoplay.
- Make Settings panel a bit more compact.
- Hide html errors when audio file of chat history is missing.
- Add button to permanently convert TTS history to normal text messages
- Changed the "show message text" toggle to affect the chat history.

											
										
										
											2023-03-12 07:56:57 +01:00
+								        with gr.Row():
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								            convert = gr.Button('Permanently replace audios with the message texts')
-												Fixes and polish

- Change wav naming to be completely unique using timestamp instead of message ID, stops browser using cached audio when new audio is made with the same file name (eg after regenerate or clear history).
- Make the autoplay setting actually disable autoplay.
- Make Settings panel a bit more compact.
- Hide html errors when audio file of chat history is missing.
- Add button to permanently convert TTS history to normal text messages
- Changed the "show message text" toggle to affect the chat history.

											
										
										
											2023-03-12 07:56:57 +01:00
+								            convert_cancel = gr.Button('Cancel', visible=False)
-												Clean up silero_tts

This should only be used with --no-stream.

The shared.still_streaming implementation was faulty by design:
output_modifier should never be called when streaming is already over.

											
										
										
											2023-03-13 03:42:49 +01:00
+								            convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False)
-												Fixes and polish

- Change wav naming to be completely unique using timestamp instead of message ID, stops browser using cached audio when new audio is made with the same file name (eg after regenerate or clear history).
- Make the autoplay setting actually disable autoplay.
- Make Settings panel a bit more compact.
- Hide html errors when audio file of chat history is missing.
- Add button to permanently convert TTS history to normal text messages
- Changed the "show message text" toggle to affect the chat history.

											
										
										
											2023-03-12 07:56:57 +01:00
-												Unify the 3 interface modes (#3554)


											
										
										
											2023-08-13 06:12:15 +02:00
+								    # Convert history with confirmation
 								    convert_arr = [convert_confirm, convert, convert_cancel]
 								    convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr)
 								    convert_confirm.click(
 								        lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr).then(
 								        remove_tts_from_history, gradio('history'), gradio('history')).then(
-												TTS extensions fixes after #4022

											
										
										
											2023-09-22 23:57:52 +02:00
+								        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-												Unify the 3 interface modes (#3554)


											
										
										
											2023-08-13 06:12:15 +02:00
+								        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))
 								    convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr)
 								    # Toggle message text in history
 								    show_text.change(
 								        lambda x: params.update({"show_text": x}), show_text, None).then(
 								        toggle_text_in_history, gradio('history'), gradio('history')).then(
-												TTS extensions fixes after #4022

											
										
										
											2023-09-22 23:57:52 +02:00
+								        chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then(
-												Unify the 3 interface modes (#3554)


											
										
										
											2023-08-13 06:12:15 +02:00
+								        chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))
-												Add ui() function to extensions

											
										
										
											2023-02-24 23:00:11 +01:00
 								    # Event functions to update the parameters in the backend
 								    activate.change(lambda x: params.update({"activate": x}), activate, None)
-												Working html autoplay, clean up, improve wav naming

- New autoplay using html tag, removed from old message when new input provided
- Add voice pitch and speed control
- Group settings together
- Use name + conversation history to match wavs to messages, minimize problems when changing characters

Current minor bugs:
- Gradio seems to cache the audio files, so using "clear history" and generating new messages will play the old audio (the new messages are saving correctly). Gradio will clear cache and use correct audio after a few messages or after a page refresh.
- Switching characters does not immediately update the message ID used for the audio. ID is updated after the first new message, but that message will use the wrong ID

											
										
										
											2023-03-11 06:34:59 +01:00
+								    autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None)
-												silero_tts: Add language option (#3878)


											
										
										
											2023-09-12 20:49:46 +02:00
+								    language.change(language_change, language, voice, show_progress=False)
-												Add ui() function to extensions

											
										
										
											2023-02-24 23:00:11 +01:00
+								    voice.change(lambda x: params.update({"speaker": x}), voice, None)
-												Working html autoplay, clean up, improve wav naming

- New autoplay using html tag, removed from old message when new input provided
- Add voice pitch and speed control
- Group settings together
- Use name + conversation history to match wavs to messages, minimize problems when changing characters

Current minor bugs:
- Gradio seems to cache the audio files, so using "clear history" and generating new messages will play the old audio (the new messages are saving correctly). Gradio will clear cache and use correct audio after a few messages or after a page refresh.
- Switching characters does not immediately update the message ID used for the audio. ID is updated after the first new message, but that message will use the wrong ID

											
										
										
											2023-03-11 06:34:59 +01:00
+								    v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None)
-												Extract the Preprocessing for Silero into a file and Improve it (#757)


											
										
										
											2023-04-07 16:46:29 +02:00
+								    v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None)
-												Add back silero preview by @missionfloyd (#3446)


											
										
										
											2023-08-04 07:29:14 +02:00
 								    # Play preview
 								    preview_text.submit(voice_preview, preview_text, preview_audio)
 								    preview_play.click(voice_preview, preview_text, preview_audio)