Refactoring + UDP

rhasspy · Aug 24, 2023 · f5372f7
1 parent 96a0b8a
commit f5372f7
Show file tree

Hide file tree

Showing 7 changed files with 159 additions and 46 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,11 @@
+.DS_Store
+.idea
+*.log
+tmp/
+
+*.py[cod]
+*.egg
+build
+htmlcov
+
+.venv/
diff --git a/assist_microphone/config.yaml b/assist_microphone/config.yaml
@@ -1,5 +1,5 @@
 ---
-version: 0.2.9
+version: 0.2.10
 slug: assist_microphone
 name: assist_microphone
 description: Stream microphone audio to Assist
@@ -16,6 +16,8 @@ options:
   awake_sound: true
   done_sound: true
   wake_buffer_seconds: 0.0
+  udp_mic: false
+  udp_snd: false
   debug_logging: false
 schema:
   token: str
@@ -26,8 +28,12 @@ schema:
   awake_sound: bool
   done_sound: bool
   wake_buffer_seconds: float
+  udp_mic: bool
+  udp_snd: bool
   debug_logging: bool
 init: false
 audio: true
 homeassistant_api: true
+ports:
+  "5000/udp": null
 homeassistant: 2023.9.0.dev20230809
diff --git a/assist_microphone/hass_satellite/__main__.py b/assist_microphone/hass_satellite/__main__.py
@@ -1,20 +1,22 @@
 #!/usr/bin/env python3
 import argparse
 import asyncio
+import contextlib
+import functools
 import logging
 import shutil
+import socket
 import sys
 import threading
 from collections import deque
-from dataclasses import dataclass
-from enum import Enum, auto
 from typing import Deque, Optional, Tuple
 
 import sounddevice as sd
 
-from .mic import record
+from .mic import record_stream, record_udp
 from .remote import stream
-from .snd import play
+from .snd import play_stream, play_udp
+from .state import State, MicState
 from .vad import (
     SileroVoiceActivityDetector,
     VoiceActivityDetector,
@@ -24,18 +26,6 @@
 _LOGGER = logging.getLogger(__name__)
 
 
-class MicState(str, Enum):
-    NOT_RECORDING = auto()
-    WAIT_FOR_VAD = auto()
-    RECORDING = auto()
-
-
-@dataclass
-class State:
-    is_running: bool = True
-    mic: MicState = MicState.NOT_RECORDING
-
-
 async def main() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument("host", help="Home Assistant server host")
@@ -76,6 +66,9 @@ async def main() -> None:
     #
     parser.add_argument("--wake-buffer-seconds", type=float, default=0)
     #
+    parser.add_argument("--udp-mic", type=int, help="UDP port to receive input audio")
+    parser.add_argument("--udp-snd", type=int, help="UDP port to send output audio")
+    #
     parser.add_argument(
         "--debug", action="store_true", help="Print DEBUG messages to the console"
     )
@@ -130,21 +123,45 @@ async def main() -> None:
     )
     mic_thread.start()
 
+    # Audio output
+    snd_socket: Optional[socket.socket] = None
+
     try:
         while True:
             try:
+                if args.udp_snd is not None:
+                    if snd_socket is None:
+                        snd_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+                    snd_stream = contextlib.nullcontext()
+                    play = functools.partial(
+                        play_udp,
+                        udp_socket=snd_socket,
+                        udp_port=args.udp_snd,
+                        state=state,
+                        sample_rate=16000,
+                        volume=args.volume,
+                    )
+                else:
+                    snd_stream = sd.RawOutputStream(
+                        device=args.snd_device,
+                        samplerate=snd_sample_rate,
+                        channels=1,
+                        dtype="int16",
+                    )
+                    play = functools.partial(
+                        play_stream,
+                        stream=snd_stream,
+                        sample_rate=snd_sample_rate,
+                        volume=args.volume,
+                    )
+
                 if args.vad:
                     _LOGGER.debug("Waiting for speech")
                     await speech_detected.wait()
                     speech_detected.clear()
                     _LOGGER.debug("Speech detected")
 
-                with sd.RawOutputStream(
-                    device=args.snd_device,
-                    samplerate=snd_sample_rate,
-                    channels=1,
-                    dtype="int16",
-                ) as snd_stream:
+                with snd_stream:
                     async for _timestamp, event_type, event_data in stream(
                         host=args.host,
                         token=args.token,
@@ -157,32 +174,19 @@ async def main() -> None:
                         if event_type == "wake_word-end":
                             if args.awake_sound:
                                 state.mic = MicState.NOT_RECORDING
-                                play(
-                                    media=args.awake_sound,
-                                    stream=snd_stream,
-                                    sample_rate=snd_sample_rate,
-                                    volume=args.volume,
-                                )
+                                play(media=args.awake_sound)
                                 state.mic = MicState.RECORDING
                         elif event_type == "stt-end":
                             # Stop recording until run ends
                             state.mic = MicState.NOT_RECORDING
                             if args.done_sound:
-                                play(
-                                    media=args.done_sound,
-                                    stream=snd_stream,
-                                    sample_rate=snd_sample_rate,
-                                    volume=args.volume,
-                                )
+                                play(media=args.done_sound)
                         elif event_type == "tts-end":
                             # Play TTS output
                             tts_url = event_data.get("tts_output", {}).get("url")
                             if tts_url:
                                 play(
-                                    media=f"{args.protocol}://{args.host}:{args.port}{tts_url}",
-                                    stream=snd_stream,
-                                    sample_rate=snd_sample_rate,
-                                    volume=args.volume,
+                                    media=f"{args.protocol}://{args.host}:{args.port}{tts_url}"
                                 )
                         elif event_type in ("run-end", "error"):
                             # Start recording for next wake word
@@ -220,7 +224,12 @@ def _mic_proc(
         else:
             _LOGGER.debug("No VAD")
 
-        for ts_chunk in record(args.mic_device):
+        if args.udp_mic is not None:
+            mic_stream = record_udp(args.udp_mic, state)
+        else:
+            mic_stream = record_stream(args.mic_device)
+
+        for ts_chunk in mic_stream:
             if not state.is_running:
                 break
 

diff --git a/assist_microphone/hass_satellite/mic.py b/assist_microphone/hass_satellite/mic.py
@@ -1,17 +1,18 @@
-import argparse
-import asyncio
-import sys
+import socket
 import time
 from typing import Final, Iterable, Optional, Tuple, Union
 
 import sounddevice as sd
 
+from .state import State
+
 _RATE: Final = 16000
+_WIDTH: Final = 2
 _CHANNELS: Final = 1
 _SAMPLES_PER_CHUNK = int(0.03 * _RATE)  # 30ms
 
 
-def record(
+def record_stream(
     device: Optional[Union[str, int]],
     samples_per_chunk: int = _SAMPLES_PER_CHUNK,
 ) -> Iterable[Tuple[int, bytes]]:
@@ -27,3 +28,21 @@ def record(
             chunk, _overflowed = stream.read(samples_per_chunk)
             chunk = bytes(chunk)
             yield time.monotonic_ns(), chunk
+
+
+def record_udp(
+    port: int,
+    state: State,
+    host: str = "0.0.0.0",
+    samples_per_chunk: int = _SAMPLES_PER_CHUNK,
+) -> Iterable[Tuple[int, bytes]]:
+    udp_socket = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+    udp_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    udp_socket.bind((host, port))
+
+    while True:
+        chunk, addr = udp_socket.recvfrom(samples_per_chunk * _WIDTH)
+        if state.mic_host is None:
+            state.mic_host = addr[0]
+
+        yield time.monotonic_ns(), chunk
diff --git a/assist_microphone/hass_satellite/snd.py b/assist_microphone/hass_satellite/snd.py
@@ -1,12 +1,16 @@
 import logging
+import socket
 import subprocess
 import wave
 
 import sounddevice as sd
 
-_LOGGER  = logging.getLogger()
+from .state import State
 
-def play(
+_LOGGER = logging.getLogger()
+
+
+def play_stream(
     media: str,
     stream: sd.RawOutputStream,
     sample_rate: int,
@@ -40,3 +44,43 @@ def play(
             while chunk:
                 stream.write(chunk)
                 chunk = wav_file.readframes(samples_per_chunk)
+
+
+def play_udp(
+    media: str,
+    udp_socket: socket.socket,
+    udp_port: int,
+    state: State,
+    sample_rate: int,
+    samples_per_chunk: int = 1024,
+    volume: float = 1.0,
+) -> None:
+    assert state.mic_host is not None
+
+    cmd = [
+        "ffmpeg",
+        "-i",
+        media,
+        "-f",
+        "wav",
+        "-ar",
+        str(sample_rate),
+        "-ac",
+        "1",
+        "-filter:a",
+        f"volume={volume}",
+        "-",
+    ]
+    _LOGGER.debug("play: %s", cmd)
+
+    with subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.DEVNULL,
+    ) as proc:
+        with wave.open(proc.stdout, "rb") as wav_file:
+            assert wav_file.getsampwidth() == 2
+            chunk = wav_file.readframes(samples_per_chunk)
+            while chunk:
+                udp_socket.sendto(chunk, (state.mic_host, udp_port))
+                chunk = wav_file.readframes(samples_per_chunk)
diff --git a/assist_microphone/hass_satellite/state.py b/assist_microphone/hass_satellite/state.py
@@ -0,0 +1,16 @@
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Optional
+
+
+class MicState(str, Enum):
+    NOT_RECORDING = auto()
+    WAIT_FOR_VAD = auto()
+    RECORDING = auto()
+
+
+@dataclass
+class State:
+    is_running: bool = True
+    mic: MicState = MicState.NOT_RECORDING
+    mic_host: Optional[str] = None
diff --git a/assist_microphone/rootfs/etc/s6-overlay/s6-rc.d/assist_microphone/run b/assist_microphone/rootfs/etc/s6-overlay/s6-rc.d/assist_microphone/run
@@ -18,6 +18,14 @@ if bashio::config.true 'done_sound'; then
     extra_args+=('--done-sound' '/usr/src/sounds/done.wav')
 fi
 
+if bashio::config.true 'udp_mic'; then
+    extra_args+=('--udp-mic' 5000)
+fi
+
+if bashio::config.true 'udp_snd'; then
+    extra_args+=('--udp-snd' 6055)
+fi
+
 exec python3 -m hass_satellite \
     'homeassistant' \
     "$(bashio::config 'token')" \