use Gradio microphone input instead

This commit is contained in:
EliasVincent 2023-03-12 21:03:07 +01:00
parent 3b4145966d
commit 48aa52849b
3 changed files with 17 additions and 50 deletions

View file

@ -1,39 +0,0 @@
# Installation instructions
- On all platforms, run `pip install -r requirements.txt` in this folder
- You need **PortAudio** to run speech recognition. Installation guides for each platform are below.
## Windows
- You don't need to do anything; `pyaudio` already comes with PortAudio included on Windows.
## Mac
```commandline
brew install portaudio
brew link --overwrite portaudio
pip install pyaudio
```
## Linux
- You have to use your distro's package manager to install PortAudio.
### Ubuntu / Debian / Mint
```commandline
sudo apt install portaudio19-dev python3-pyaudio
```
### Arch / Manjaro
```commandline
sudo pacman -S portaudio
```
### Fedora
```commandline
sudo dnf -y install portaudio
```

View file

@ -1,5 +1,4 @@
git+https://github.com/Uberi/speech_recognition.git@010382b git+https://github.com/Uberi/speech_recognition.git@010382b
PyAudio
openai-whisper openai-whisper
soundfile soundfile
ffmpeg ffmpeg

View file

@ -7,22 +7,24 @@ input_hijack = {
} }
def do_stt(): def do_stt(audio, text_state=""):
transcription = "" transcription = ""
r = sr.Recognizer() r = sr.Recognizer()
with sr.Microphone() as source:
r.adjust_for_ambient_noise(source, 0.2) # Convert to AudioData
audio = r.listen(source) audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)
try: try:
transcription = r.recognize_whisper(audio, language="english", model="base.en") transcription = r.recognize_whisper(audio_data, language="english", model="base.en")
except sr.UnknownValueError: except sr.UnknownValueError:
print("Whisper could not understand audio") print("Whisper could not understand audio")
except sr.RequestError as e: except sr.RequestError as e:
print("Could not request results from Whisper", e) print("Could not request results from Whisper", e)
input_hijack.update({"state": True, "value": [transcription, transcription]}) input_hijack.update({"state": True, "value": [transcription, transcription]})
return transcription
text_state += transcription + " "
return text_state, text_state
def update_hijack(val): def update_hijack(val):
@ -31,7 +33,12 @@ def update_hijack(val):
def ui(): def ui():
speech_button = gr.Button(value="🎙️") tr_state = gr.State(value="")
output_transcription = gr.Textbox(label="STT-Input", placeholder="Speech Preview. Click \"Generate\" to send", interactive=True) output_transcription = gr.Textbox(label="STT-Input",
output_transcription.change(fn=update_hijack, inputs=[output_transcription]) placeholder="Speech Preview. Click \"Generate\" to send",
speech_button.click(do_stt, outputs=[output_transcription]) interactive=True)
output_transcription.change(fn=update_hijack, inputs=[output_transcription], outputs=[tr_state])
with gr.Row():
audio = gr.Audio(source="microphone")
transcribe_button = gr.Button(value="Transcribe")
transcribe_button.click(do_stt, inputs=[audio, tr_state], outputs=[output_transcription, tr_state])