audio_stream.py: Added focus_side property. When set, the stream yields from the focused side regardless of energy (attention lock). When None, falls back to energy-based auto selection. multi_speaker.py: When beams lock onto 2 speakers, sets audio focus to the target speaker's side. Auto-switches target when the current target goes silent and the other starts talking. Manual focus via API. headmic.py: New endpoint POST /speakers/focus?speaker=0|1 to manually switch attention. /speakers/tracked now shows is_target, target_speaker, and audio_focus fields. The cocktail party effect: when 2 people are talking, the audio feed to Porcupine/VAD/transcription comes from the target speaker's direction, suppressing the other. XVF3800 beam gating silences the non-speaking beam, and audio_stream focus locks the ear facing the target. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
201 lines
6.7 KiB
Python
201 lines
6.7 KiB
Python
"""
|
|
Dual audio stream manager for two XVF3800 mic arrays.
|
|
|
|
Runs two arecord subprocesses (one per array) and provides best-beam selection:
|
|
the stream with higher energy is considered "active" (facing the speaker).
|
|
"""
|
|
|
|
import logging
|
|
import struct
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
from typing import Optional, Generator
|
|
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger("headmic.audio")

# Capture format shared by both mic streams.
SAMPLE_RATE = 16000  # Hz, passed to ``arecord -r``
FRAME_SIZE = 512  # samples per frame; Porcupine requires 512 samples
BYTES_PER_FRAME = FRAME_SIZE * 2  # 16-bit = 2 bytes per sample
ENERGY_WINDOW = 10  # frames to average for energy comparison
|
|
|
|
|
|
class MicStream:
    """Audio stream from a single ALSA device via an ``arecord`` subprocess.

    A daemon thread reads fixed-size frames from the subprocess's stdout,
    keeps the most recent frame, and maintains the mean of the per-frame RMS
    (scaled by 1/32768) over the last ``ENERGY_WINDOW`` frames.
    """

    def __init__(self, label: str, alsa_device: str):
        """Create a stopped stream.

        Args:
            label: Short name used as a prefix in log messages.
            alsa_device: Device string passed to ``arecord -D``.
        """
        self.label = label
        self.alsa_device = alsa_device
        self.proc: Optional[subprocess.Popen] = None
        self.running = False
        self.current_frame: Optional[bytes] = None
        self.energy: float = 0.0
        self._energy_history: list[float] = []
        # Guards current_frame, energy and _energy_history.
        self._lock = threading.Lock()
        self._thread: Optional[threading.Thread] = None

    def start(self):
        """Spawn ``arecord`` and the reader thread.

        A call while the previous subprocess is still alive is ignored so the
        first ``arecord`` is never orphaned. If the previous subprocess has
        already exited, it is cleaned up before a new one is started.
        """
        if self.proc is not None:
            if self.proc.poll() is None:
                logger.warning("[%s] Already running; start() ignored", self.label)
                return
            # Previous arecord died on its own: reap it and close its pipe.
            self.stop()

        cmd = [
            "arecord",
            "-D", self.alsa_device,
            "-f", "S16_LE",
            "-r", str(SAMPLE_RATE),
            "-c", "1",
            "-t", "raw",
            "-q",
            "-",
        ]
        logger.info("[%s] Starting: %s", self.label, " ".join(cmd))
        self.proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            bufsize=BYTES_PER_FRAME,
        )
        self.running = True
        self._thread = threading.Thread(target=self._read_loop, daemon=True)
        self._thread.start()

    def _read_loop(self):
        """Reader-thread body: pull frames until stopped or the pipe ends.

        When the loop exits, the published frame and energy are cleared so
        consumers do not keep treating the last frame of a dead stream as live.
        """
        # Snapshot the process: stop() sets self.proc to None from another
        # thread, and re-reading the attribute could fail mid-loop.
        proc = self.proc
        try:
            while self.running and proc is not None:
                data = proc.stdout.read(BYTES_PER_FRAME)
                if len(data) < BYTES_PER_FRAME:
                    # EOF or truncated frame: the subprocess has gone away.
                    break

                # Compute frame energy (RMS). arecord is asked for S16_LE, so
                # decode explicitly as little-endian 16-bit.
                samples = np.frombuffer(data, dtype="<i2").astype(np.float32)
                rms = float(np.sqrt(np.mean(samples * samples))) / 32768.0

                with self._lock:
                    self.current_frame = data
                    self._energy_history.append(rms)
                    if len(self._energy_history) > ENERGY_WINDOW:
                        self._energy_history.pop(0)
                    self.energy = sum(self._energy_history) / len(self._energy_history)
        except Exception as e:
            logger.error("[%s] Read error: %s", self.label, e)
        finally:
            with self._lock:
                self.current_frame = None
                self.energy = 0.0
                self._energy_history.clear()
            logger.info("[%s] Stream ended", self.label)

    def get_frame(self) -> Optional[bytes]:
        """Return the most recent frame, or None if none is available."""
        with self._lock:
            return self.current_frame

    def get_energy(self) -> float:
        """Return the averaged normalised RMS of recent frames (0.0 if none)."""
        with self._lock:
            return self.energy

    def stop(self):
        """Stop the subprocess and reader thread and release the pipe.

        Best-effort: failures while terminating are logged, never raised.
        Safe to call when the stream was never started.
        """
        self.running = False
        proc, self.proc = self.proc, None
        thread, self._thread = self._thread, None

        if proc is not None:
            try:
                proc.terminate()
                proc.wait(timeout=2)
            except Exception:
                # Did not exit in time (or terminate failed): force-kill and
                # reap so no zombie process is left behind.
                try:
                    proc.kill()
                    proc.wait(timeout=2)
                except Exception as e:
                    logger.warning("[%s] Could not kill arecord: %s", self.label, e)

        # Once the process is gone the pipe hits EOF and the reader returns.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=1)

        # Close the pipe only when the reader is no longer using it; closing a
        # buffered reader that another thread is blocked on could hang.
        if proc is not None and proc.stdout is not None:
            if thread is None or not thread.is_alive():
                proc.stdout.close()
|
|
|
|
|
|
class DualAudioStream:
    """
    Manages two MicStreams and provides best-beam selection.

    Usage:
        stream = DualAudioStream(left_alsa, right_alsa)
        stream.start()
        for frame_data, side in stream.frames():
            # frame_data is 512 samples (1024 bytes) of int16 PCM
            # side is "left" or "right": the side the frame came from
            ...
        stream.stop()

    Set ``focus_side`` to "left" or "right" to lock onto one side regardless
    of energy; set it to None to return to energy-based selection.
    """

    def __init__(self, left_device: str, right_device: Optional[str] = None):
        """Create the streams without starting them.

        Args:
            left_device: ALSA device for the left array.
            right_device: ALSA device for the right array; when omitted the
                manager runs with the left stream only.
        """
        self.left = MicStream("left", left_device)
        self.right = MicStream("right", right_device) if right_device else None
        self.active_side: str = "left"
        # None = auto (energy); "left"/"right" = locked attention.
        self.focus_side: Optional[str] = None
        self._running = False

    def start(self):
        """Start the stream(s) and wait briefly for the first frames."""
        self._running = True
        self.left.start()
        if self.right:
            self.right.start()
        # Short delay so first frames are populated
        time.sleep(0.1)

    def stop(self):
        """Stop the stream(s); a running ``frames()`` loop will then exit."""
        self._running = False
        self.left.stop()
        if self.right:
            self.right.stop()

    def _select_side(self, frame_left: Optional[bytes],
                     frame_right: Optional[bytes]) -> str:
        """Pick which side to yield, given the frames currently available."""
        # A side with no frame can never be chosen, whatever the focus says.
        if frame_right is None:
            return "left"
        if frame_left is None:
            return "right"

        # Read once: another thread may change focus_side at any time.
        focus = self.focus_side
        if focus:
            # Cocktail party mode: locked onto a specific side
            return "right" if focus == "right" else "left"

        # Auto mode: switch only when one side is more than 10% louder, which
        # keeps the current side when the energies are close.
        left_energy = self.left.get_energy()
        right_energy = self.right.get_energy()
        if right_energy > left_energy * 1.1:
            return "right"
        if left_energy > right_energy * 1.1:
            return "left"
        return "right" if self.active_side == "right" else "left"

    def frames(self) -> Generator[tuple[bytes, str], None, None]:
        """
        Yield (frame_bytes, side) at Porcupine's expected rate.

        Yields from ``focus_side`` when it is set, otherwise from the
        higher-energy side (best beam). Falls back to whichever side has a
        frame if the chosen side has none, so the frame is never None.
        Updates ``active_side`` to the side actually yielded.
        """
        interval = FRAME_SIZE / SAMPLE_RATE  # 0.032s = 32ms
        last_left = None
        last_right = None

        while self._running:
            t0 = time.monotonic()

            frame_left = self.left.get_frame()
            frame_right = self.right.get_frame() if self.right else None

            # Wait for at least one frame
            if frame_left is None and frame_right is None:
                time.sleep(0.005)
                continue

            # Skip if no new data since last yield. MicStream publishes a new
            # bytes object for every frame it reads, so identity detects a new
            # frame even when its content equals the previous one.
            if frame_left is last_left and frame_right is last_right:
                time.sleep(0.002)
                continue

            last_left = frame_left
            last_right = frame_right

            side = self._select_side(frame_left, frame_right)
            self.active_side = side
            if side == "right":
                yield frame_right, "right"
            else:
                yield frame_left, "left"

            # Pace to ~32ms per frame
            elapsed = time.monotonic() - t0
            if elapsed < interval:
                time.sleep(interval - elapsed)

    def get_side_frame(self, side: str) -> Optional[bytes]:
        """Get the latest frame from a specific side (left if no right stream)."""
        if side == "right" and self.right:
            return self.right.get_frame()
        return self.left.get_frame()
|