rhasspy · Shulyaka · Aug 25, 2023 · Aug 25, 2023
diff --git a/README.md b/README.md
@@ -102,6 +102,7 @@ See `servers` section of `configuration.yaml` file.
     * [whisper](https://github.com/openai/whisper)
     * [whisper-cpp](https://github.com/ggerganov/whisper.cpp/)
     * [faster-whisper](https://github.com/guillaumekln/faster-whisper/)
+    * [openai-whisper](https://platform.openai.com/docs/guides/speech-to-text) (cloud)
     * [vosk](https://alphacephei.com/vosk/)
     * [coqui-stt](https://stt.readthedocs.io)
     * [pocketsphinx](https://github.com/cmusphinx/pocketsphinx)

diff --git a/programs/asr/openai-whisper/bin/openai_whisper_server.py b/programs/asr/openai-whisper/bin/openai_whisper_server.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+import argparse
+import io
+import logging
+import os
+import socket
+import wave
+from pathlib import Path
+
+import openai
+
+from rhasspy3.asr import Transcript
+from rhasspy3.audio import AudioChunk, AudioStop
+from rhasspy3.event import read_event, write_event
+
+_FILE = Path(__file__)  # path of this script
+_DIR = _FILE.parent  # directory containing this script
+_LOGGER = logging.getLogger(_FILE.stem)  # logger named after the script (no extension)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("api_key_path", help="Path to OpenAI API key")
+    parser.add_argument("--model", default="whisper-1", help="Model name to use")
+    parser.add_argument(
+        "--socketfile", required=True, help="Path to Unix domain socket file"
+    )
+    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    openai.api_key_path = args.api_key_path
+
+    # Need to unlink socket if it exists
+    try:
+        os.unlink(args.socketfile)
+    except OSError:
+        pass
+
+    try:
+        # Create socket server
+        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        sock.bind(args.socketfile)
+        sock.listen()
+
+        _LOGGER.info("Ready")
+
+        # Listen for connections
+        while True:
+            try:
+                connection, client_address = sock.accept()
+                _LOGGER.debug("Connection from %s", client_address)
+
+                is_first_audio = True
+                with connection, connection.makefile(
+                    mode="rwb"
+                ) as conn_file, io.BytesIO() as wav_io:
+                    wav_file: wave.Wave_write = wave.open(wav_io, "wb")
+                    with wav_file:
+                        while True:
+                            event = read_event(conn_file)  # type: ignore
+                            if event is None:
+                                break
+
+                            if AudioChunk.is_type(event.type):
+                                chunk = AudioChunk.from_event(event)
+
+                                if is_first_audio:
+                                    _LOGGER.debug("Receiving audio")
+                                    wav_file.setframerate(chunk.rate)
+                                    wav_file.setsampwidth(chunk.width)
+                                    wav_file.setnchannels(chunk.channels)
+                                    is_first_audio = False
+
+                                wav_file.writeframes(chunk.audio)
+                            elif AudioStop.is_type(event.type):
+                                _LOGGER.debug("Audio stopped")
+                                break
+
+                    wav_io.seek(0)
+                    wav_io.name = _DIR.parent.as_posix()+"/share/dummy.wav"
+                    text = openai.Audio.transcribe(args.model, wav_io).text
+                    _LOGGER.info(text)
+
+                    write_event(Transcript(text=text).event(), conn_file)  # type: ignore
+            except KeyboardInterrupt:
+                break
+            except Exception:
+                _LOGGER.exception("Error communicating with socket client")
+    finally:
+        os.unlink(args.socketfile)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":  # start the server only when run as a script, not on import
+    main()
diff --git a/programs/asr/openai-whisper/bin/openai_whisper_wav2text.py b/programs/asr/openai-whisper/bin/openai_whisper_wav2text.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import time
+from pathlib import Path
+
+import openai
+
+_FILE = Path(__file__)  # path of this script
+_DIR = _FILE.parent  # directory containing this script
+_LOGGER = logging.getLogger(_FILE.stem)  # logger named after the script (no extension)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("api_key_path", help="Path to OpenAI API key")
+    parser.add_argument("wav_file", nargs="+", help="Path to WAV file(s) to transcribe")
+    parser.add_argument("--model", default="whisper-1", help="Model name to use")
+    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    openai.api_key_path = args.api_key_path
+
+    # Load converted faster-whisper model
+    for wav_path in args.wav_file:
+        _LOGGER.debug("Processing %s", wav_path)
+        start_time = time.monotonic_ns()
+        audio_file = open(wav_path, "rb")
+        text = openai.Audio.transcribe(args.model, audio_file).text
+        audio_file.close()
+        end_time = time.monotonic_ns()
+        _LOGGER.debug(
+            "Transcribed %s in %s second(s)", wav_path, (end_time - start_time) / 1e9
+        )
+        _LOGGER.debug(text)
+
+        print(text, flush=True)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":  # transcribe only when run as a script, not on import
+    main()
diff --git a/programs/asr/openai-whisper/requirements.txt b/programs/asr/openai-whisper/requirements.txt
@@ -0,0 +1 @@
+openai
diff --git a/programs/asr/openai-whisper/script/server b/programs/asr/openai-whisper/script/server
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Launch the OpenAI Whisper socket server, using the local venv when present
+set -eo pipefail
+
+# Absolute path of the directory containing this script
+script_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+# Program root: one level above script/
+program_dir="$(realpath "${script_dir}/..")"
+
+# Virtual environment location; a pre-set, non-empty $venv wins
+venv="${venv:-${program_dir}/.venv}"
+if [[ -d "${venv}" ]]; then
+    . "${venv}/bin/activate"
+fi
+
+run_dir="${program_dir}/var/run"
+mkdir -p "${run_dir}"
+
+python3 "${program_dir}/bin/openai_whisper_server.py" --socketfile "${run_dir}/openai-whisper.socket" "$@"
diff --git a/programs/asr/openai-whisper/script/setup b/programs/asr/openai-whisper/script/setup
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+set -eo pipefail
+
+# Directory of *this* script
+this_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+# Base directory of repo
+base_dir="$(realpath "${this_dir}/..")"
+
+# Path to virtual environment
+: "${venv:=${base_dir}/.venv}"
+
+# Python binary to use
+: "${PYTHON=python3}"
+
+python_version="$(${PYTHON} --version)"
+
+if [ ! -d "${venv}" ]; then
+    # Create virtual environment
+    echo "Creating virtual environment at ${venv} (${python_version})"
+    rm -rf "${venv}"
+    "${PYTHON}" -m venv "${venv}"
+    source "${venv}/bin/activate"
+
+    pip3 install --upgrade pip
+    pip3 install --upgrade wheel setuptools
+else
+    source "${venv}/bin/activate"
+fi
+
+
+# Install Python dependencies
+echo 'Installing Python dependencies'
+pip3 install -r "${base_dir}/requirements.txt"
+
+# Generate dummy wav file for openai API format check
+DUMMY_RAW=$(mktemp)
+mkdir -p "${base_dir}/share"
+dd if=/dev/zero bs=2048 count=1 of="$DUMMY_RAW"
+ffmpeg -f s16le -ar 16000 -ac 1 -i "$DUMMY_RAW" "${base_dir}/share/dummy.wav"
+rm -f "$DUMMY_RAW"
+
+# Create directory for api key storage
+mkdir -p"$this_dir/../../../../data/asr/openai_whisper"
+
+# -----------------------------------------------------------------------------
+
+echo "OK"
diff --git a/programs/asr/openai-whisper/script/wav2text b/programs/asr/openai-whisper/script/wav2text
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Run the OpenAI Whisper WAV-to-text script, using the local venv when present
+set -eo pipefail
+
+# Absolute path of the directory containing this script
+script_dir="$( cd "$( dirname "$0" )" && pwd )"
+
+# Program root: one level above script/
+program_dir="$(realpath "${script_dir}/..")"
+
+# Virtual environment location; a pre-set, non-empty $venv wins
+venv="${venv:-${program_dir}/.venv}"
+if [[ -d "${venv}" ]]; then
+    . "${venv}/bin/activate"
+fi
+
+python3 "${program_dir}/bin/openai_whisper_wav2text.py" "$@"
diff --git a/rhasspy3/configuration.yaml b/rhasspy3/configuration.yaml
@@ -218,6 +218,23 @@ programs:
       command: |
         client_unix_socket.py var/run/faster-whisper.socket
 
+    # https://platform.openai.com/docs/guides/speech-to-text
+    # Get an API key at https://platform.openai.com/account/api-keys and save it as ${data_dir}/api_key
+    # Note: this is a cloud service: it is less secure, paid, and requires an internet connection
+    openai-whisper:
+      command: |
+        script/wav2text --model "${model}" "${api_key_path}" "{wav_file}"
+      adapter: |
+        asr_adapter_wav2text.py
+      template_args:
+        api_key_path: "${data_dir}/api_key"
+        model: "whisper-1"
+
+    # Run server: asr openai-whisper
+    openai-whisper.client:
+      command: |
+        client_unix_socket.py var/run/openai-whisper.socket
+
 
   # --------------
   # Text to speech
@@ -491,6 +508,13 @@ servers:
         model: "${data_dir}/tiny-int8"
         device: "cpu"  # or cuda
 
+    openai-whisper:  # cloud ASR server; each argument is quoted separately
+      command: |
+        script/server --model "${model}" "${api_key_path}"
+      template_args:
+        api_key_path: "${data_dir}/api_key"
+        model: "whisper-1"
+
   tts:
     mimic3:
       command: |