Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

asr/openai_whisper (cloud) support #41

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ See `servers` section of `configuration.yaml` file.
* [whisper](https://github.com/openai/whisper)
* [whisper-cpp](https://github.com/ggerganov/whisper.cpp/)
* [faster-whisper](https://github.com/guillaumekln/faster-whisper/)
* [openai-whisper](https://platform.openai.com/docs/guides/speech-to-text) (cloud)
* [vosk](https://alphacephei.com/vosk/)
* [coqui-stt](https://stt.readthedocs.io)
* [pocketsphinx](https://github.com/cmusphinx/pocketsphinx)
Expand Down
98 changes: 98 additions & 0 deletions programs/asr/openai-whisper/bin/openai_whisper_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python3
import argparse
import io
import logging
import os
import socket
import wave
from pathlib import Path

import openai

from rhasspy3.asr import Transcript
from rhasspy3.audio import AudioChunk, AudioStop
from rhasspy3.event import read_event, write_event

# Path of this script file.
_FILE = Path(__file__)
# Directory containing this script; main() derives the placeholder WAV name from its parent.
_DIR = _FILE.parent
# Module logger, named after the script's file stem.
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Serve OpenAI Whisper transcriptions over a Unix domain socket.

    Command-line arguments:
        api_key_path: path to a file holding the OpenAI API key.
        --model: model name passed to the transcription call
            (default ``whisper-1``).
        --socketfile: path of the Unix domain socket to listen on (required).
        --debug: log DEBUG messages.

    For each client connection, audio chunk events are collected into an
    in-memory WAV file until an audio-stop event (or end of stream) is seen.
    The WAV data is then sent to ``openai.Audio.transcribe`` and the resulting
    text is written back to the client as a ``Transcript`` event.

    Connections are handled one at a time. Errors while serving a client are
    logged and the server keeps accepting new connections; Ctrl+C stops it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("api_key_path", help="Path to OpenAI API key")
    parser.add_argument("--model", default="whisper-1", help="Model name to use")
    parser.add_argument(
        "--socketfile", required=True, help="Path to Unix domain socket file"
    )
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    openai.api_key_path = args.api_key_path

    # Need to unlink socket if it exists
    try:
        os.unlink(args.socketfile)
    except OSError:
        pass

    try:
        # Create socket server; the context manager closes the listening
        # socket on every exit path.
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
            sock.bind(args.socketfile)
            sock.listen()

            _LOGGER.info("Ready")

            # Listen for connections
            while True:
                try:
                    connection, client_address = sock.accept()
                    _LOGGER.debug("Connection from %s", client_address)

                    is_first_audio = True
                    with connection, connection.makefile(
                        mode="rwb"
                    ) as conn_file, io.BytesIO() as wav_io:
                        wav_file: wave.Wave_write = wave.open(wav_io, "wb")
                        with wav_file:
                            while True:
                                event = read_event(conn_file)  # type: ignore
                                if event is None:
                                    break

                                if AudioChunk.is_type(event.type):
                                    chunk = AudioChunk.from_event(event)

                                    if is_first_audio:
                                        # WAV parameters come from the first chunk
                                        _LOGGER.debug("Receiving audio")
                                        wav_file.setframerate(chunk.rate)
                                        wav_file.setsampwidth(chunk.width)
                                        wav_file.setnchannels(chunk.channels)
                                        is_first_audio = False

                                    wav_file.writeframes(chunk.audio)
                                elif AudioStop.is_type(event.type):
                                    _LOGGER.debug("Audio stopped")
                                    break

                        # The WAV writer is closed at this point, so the
                        # buffer holds a complete file; rewind it for upload.
                        wav_io.seek(0)
                        # NOTE(review): presumably the openai client only uses
                        # this name to infer the audio format -- confirm.
                        wav_io.name = _DIR.parent.as_posix()+"/share/dummy.wav"
                        text = openai.Audio.transcribe(args.model, wav_io).text
                        _LOGGER.info(text)

                        write_event(Transcript(text=text).event(), conn_file)  # type: ignore
                except KeyboardInterrupt:
                    break
                except Exception:
                    _LOGGER.exception("Error communicating with socket client")
    finally:
        # Best-effort cleanup: the socket file may not exist if bind() failed,
        # and that must not mask the original error.
        try:
            os.unlink(args.socketfile)
        except OSError:
            pass


# -----------------------------------------------------------------------------

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    main()
45 changes: 45 additions & 0 deletions programs/asr/openai-whisper/bin/openai_whisper_wav2text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python3
import argparse
import logging
import time
from pathlib import Path

import openai

# Path of this script file.
_FILE = Path(__file__)
# Directory containing this script.
_DIR = _FILE.parent
# Module logger, named after the script's file stem.
_LOGGER = logging.getLogger(_FILE.stem)


def main() -> None:
    """Transcribe WAV files with the OpenAI API and print each transcript.

    Command-line arguments:
        api_key_path: path to a file holding the OpenAI API key.
        wav_file: one or more paths to WAV files to transcribe.
        --model: model name passed to the transcription call
            (default ``whisper-1``).
        --debug: log DEBUG messages.

    Files are processed in the order given; each transcript is printed on its
    own line to standard output and flushed immediately.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("api_key_path", help="Path to OpenAI API key")
    parser.add_argument("wav_file", nargs="+", help="Path to WAV file(s) to transcribe")
    parser.add_argument("--model", default="whisper-1", help="Model name to use")
    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    openai.api_key_path = args.api_key_path

    # Send each WAV file to the transcription API in turn
    for wav_path in args.wav_file:
        _LOGGER.debug("Processing %s", wav_path)
        start_time = time.monotonic_ns()
        # Context manager closes the file even if the API call raises
        with open(wav_path, "rb") as audio_file:
            text = openai.Audio.transcribe(args.model, audio_file).text
        end_time = time.monotonic_ns()
        _LOGGER.debug(
            "Transcribed %s in %s second(s)", wav_path, (end_time - start_time) / 1e9
        )
        _LOGGER.debug(text)

        print(text, flush=True)


# -----------------------------------------------------------------------------

# Run the transcription only when this file is executed as a script.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions programs/asr/openai-whisper/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
openai
20 changes: 20 additions & 0 deletions programs/asr/openai-whisper/script/server
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
set -eo pipefail

# Resolve where this script lives, then the program root one level up.
script_dir="$(cd "$(dirname "$0")" && pwd)"
program_dir="$(realpath "${script_dir}/..")"

# Virtual environment location; may be overridden through $venv.
: "${venv:=${program_dir}/.venv}"

# Activate the virtual environment only if it has been created.
if [ -d "${venv}" ]; then
    . "${venv}/bin/activate"
fi

# The Unix domain socket is kept under var/run in the program root.
run_dir="${program_dir}/var/run"
mkdir -p "${run_dir}"

# Extra command-line arguments are forwarded to the Python server.
python3 "${program_dir}/bin/openai_whisper_server.py" \
    --socketfile "${run_dir}/openai-whisper.socket" \
    "$@"
48 changes: 48 additions & 0 deletions programs/asr/openai-whisper/script/setup
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
# Set up the openai-whisper program: create/activate the virtual environment,
# install Python dependencies, generate the placeholder WAV file and create
# the directory intended for API key storage.
set -eo pipefail

# Directory of *this* script
this_dir="$( cd "$( dirname "$0" )" && pwd )"

# Base directory of repo
base_dir="$(realpath "${this_dir}/..")"

# Path to virtual environment
: "${venv:=${base_dir}/.venv}"

# Python binary to use
: "${PYTHON=python3}"

python_version="$("${PYTHON}" --version)"

if [ ! -d "${venv}" ]; then
    # Create virtual environment
    echo "Creating virtual environment at ${venv} (${python_version})"
    rm -rf "${venv}"
    "${PYTHON}" -m venv "${venv}"
    source "${venv}/bin/activate"

    pip3 install --upgrade pip
    pip3 install --upgrade wheel setuptools
else
    source "${venv}/bin/activate"
fi


# Install Python dependencies
echo 'Installing Python dependencies'
pip3 install -r "${base_dir}/requirements.txt"

# Generate dummy wav file for openai API format check
DUMMY_RAW=$(mktemp)
# Remove the temporary raw audio even if a later command fails
trap 'rm -f "${DUMMY_RAW}"' EXIT
mkdir -p "${base_dir}/share"
dd if=/dev/zero bs=2048 count=1 of="$DUMMY_RAW"
# -y: overwrite an existing dummy.wav so that setup can be re-run
ffmpeg -y -f s16le -ar 16000 -ac 1 -i "$DUMMY_RAW" "${base_dir}/share/dummy.wav"

# Create directory for api key storage
# NOTE(review): configuration.yaml reads the key from "${data_dir}/api_key";
# confirm that data_dir resolves to this directory (note the underscore in
# "openai_whisper" versus the program name "openai-whisper").
mkdir -p "${this_dir}/../../../../data/asr/openai_whisper"

# -----------------------------------------------------------------------------

echo "OK"
17 changes: 17 additions & 0 deletions programs/asr/openai-whisper/script/wav2text
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
set -eo pipefail

# Resolve where this script lives, then the program root one level up.
script_dir="$(cd "$(dirname "$0")" && pwd)"
program_dir="$(realpath "${script_dir}/..")"

# Virtual environment location; may be overridden through $venv.
: "${venv:=${program_dir}/.venv}"

# Activate the virtual environment only if it has been created.
if [ -d "${venv}" ]; then
    . "${venv}/bin/activate"
fi

# All command-line arguments are forwarded to the Python script.
python3 "${program_dir}/bin/openai_whisper_wav2text.py" "$@"
24 changes: 24 additions & 0 deletions rhasspy3/configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,23 @@ programs:
command: |
client_unix_socket.py var/run/faster-whisper.socket

# https://platform.openai.com/docs/guides/speech-to-text
# Get API key on https://platform.openai.com/account/api-keys and save it as ${data_dir}/api_key
# Note: this is a paid cloud service; it is less secure than running locally and requires an internet connection.
openai-whisper:
  # Each argument is its own word: openai_whisper_wav2text.py takes the API
  # key path, then the WAV path(s), with --model as a separate option.
  command: |
    script/wav2text --model "${model}" "${api_key_path}" "{wav_file}"
  adapter: |
    asr_adapter_wav2text.py
  template_args:
    api_key_path: "${data_dir}/api_key"
    model: "whisper-1"

# Run server: asr openai-whisper
openai-whisper.client:
command: |
client_unix_socket.py var/run/openai-whisper.socket


# --------------
# Text to speech
Expand Down Expand Up @@ -491,6 +508,13 @@ servers:
model: "${data_dir}/tiny-int8"
device: "cpu" # or cuda

openai-whisper:
  # Each argument is its own word: openai_whisper_server.py takes the API key
  # path as a positional argument and --model as a separate option.
  command: |
    script/server --model "${model}" "${api_key_path}"
  template_args:
    api_key_path: "${data_dir}/api_key"
    model: "whisper-1"

tts:
mimic3:
command: |
Expand Down