updates for dual mic array

This commit is contained in:
Alex
2026-04-11 15:11:22 -05:00
parent 1cb3bd6833
commit 6c10e75cbc
5 changed files with 710 additions and 123 deletions

192
audio_stream.py Normal file
View File

@@ -0,0 +1,192 @@
"""
Dual audio stream manager for two XVF3800 mic arrays.
Runs two arecord subprocesses (one per array) and provides best-beam selection:
the stream with higher energy is considered "active" (facing the speaker).
"""
import logging
import struct
import subprocess
import threading
import time
from typing import Optional, Generator
import numpy as np
logger = logging.getLogger("headmic.audio")
SAMPLE_RATE = 16000
FRAME_SIZE = 512 # Porcupine requires 512 samples
BYTES_PER_FRAME = FRAME_SIZE * 2 # 16-bit = 2 bytes per sample
ENERGY_WINDOW = 10 # frames to average for energy comparison
class MicStream:
    """Audio stream from a single ALSA device via an arecord subprocess.

    A daemon thread reads fixed-size frames from arecord's stdout, keeps
    the most recent frame, and maintains a rolling RMS energy average
    (over ENERGY_WINDOW frames) used for best-beam selection.
    """

    def __init__(self, label: str, alsa_device: str):
        self.label = label
        self.alsa_device = alsa_device
        self.proc: Optional[subprocess.Popen] = None
        self.running = False
        self.current_frame: Optional[bytes] = None  # latest raw PCM frame
        self.energy: float = 0.0                    # rolling mean RMS, 0..1
        self._energy_history: list[float] = []
        self._lock = threading.Lock()
        self._thread: Optional[threading.Thread] = None

    def start(self):
        """Spawn arecord and start the background reader thread."""
        cmd = [
            "arecord",
            "-D", self.alsa_device,
            "-f", "S16_LE",
            "-r", str(SAMPLE_RATE),
            "-c", "1",
            "-t", "raw",
            "-q",
            "-"
        ]
        logger.info("[%s] Starting: %s", self.label, " ".join(cmd))
        self.proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
            bufsize=BYTES_PER_FRAME
        )
        self.running = True
        self._thread = threading.Thread(target=self._read_loop, daemon=True)
        self._thread.start()

    def _read_loop(self):
        # Snapshot the pipe once: stop() sets self.proc = None from another
        # thread, and reading through self.proc here could raise
        # AttributeError mid-loop. The local reference avoids that race.
        proc = self.proc
        stdout = proc.stdout if proc else None
        try:
            while self.running and stdout:
                data = stdout.read(BYTES_PER_FRAME)
                if not data or len(data) < BYTES_PER_FRAME:
                    break  # EOF or short read: arecord exited
                # Compute frame energy (RMS, normalized to 0..1 for int16)
                samples = np.frombuffer(data, dtype=np.int16).astype(np.float32)
                rms = float(np.sqrt(np.mean(samples * samples))) / 32768.0
                with self._lock:
                    self.current_frame = data
                    self._energy_history.append(rms)
                    if len(self._energy_history) > ENERGY_WINDOW:
                        self._energy_history.pop(0)
                    self.energy = sum(self._energy_history) / len(self._energy_history)
        except Exception as e:
            logger.error("[%s] Read error: %s", self.label, e)
        finally:
            logger.info("[%s] Stream ended", self.label)

    def get_frame(self) -> Optional[bytes]:
        """Latest raw frame (FRAME_SIZE int16 samples), or None if none yet."""
        with self._lock:
            return self.current_frame

    def get_energy(self) -> float:
        """Rolling average RMS energy over the last ENERGY_WINDOW frames."""
        with self._lock:
            return self.energy

    def stop(self):
        """Terminate arecord and join the reader thread."""
        self.running = False
        # Swap-and-null atomically w.r.t. other calls into stop().
        proc, self.proc = self.proc, None
        if proc:
            try:
                proc.terminate()
                proc.wait(timeout=2)
            except Exception:
                try:
                    proc.kill()
                except Exception:
                    pass
        # Join so callers know no background reads remain after stop().
        if self._thread:
            self._thread.join(timeout=1)
            self._thread = None
class DualAudioStream:
    """
    Manages two MicStreams and provides best-beam selection.

    Usage:
        stream = DualAudioStream(left_alsa, right_alsa)
        stream.start()
        for frame_data, side in stream.frames():
            # frame_data is 512 samples (1024 bytes) of int16 PCM
            # side is "left" or "right" (whichever has more energy)
            ...
        stream.stop()
    """

    def __init__(self, left_device: str, right_device: Optional[str] = None):
        self.left = MicStream("left", left_device)
        self.right = MicStream("right", right_device) if right_device else None
        self.active_side: str = "left"  # side currently considered "facing the speaker"
        self._running = False

    def start(self):
        """Start both capture streams (right is optional)."""
        self._running = True
        self.left.start()
        if self.right:
            self.right.start()
        # Short delay so first frames are populated
        time.sleep(0.1)

    def stop(self):
        """Stop both capture streams."""
        self._running = False
        self.left.stop()
        if self.right:
            self.right.stop()

    def frames(self) -> Generator[tuple[bytes, str], None, None]:
        """
        Yield (frame_bytes, side) at Porcupine's expected rate.
        Always yields from the higher-energy side (best beam), falling
        back to the other side when the chosen one has no frame yet —
        a yielded frame is never None.
        """
        interval = FRAME_SIZE / SAMPLE_RATE  # 0.032s = 32ms
        last_frame_left = None
        last_frame_right = None
        while self._running:
            t0 = time.monotonic()
            frame_left = self.left.get_frame()
            frame_right = self.right.get_frame() if self.right else None
            # Wait for at least one new frame
            if frame_left is None and frame_right is None:
                time.sleep(0.005)
                continue
            # Skip if no new data since last yield
            if frame_left == last_frame_left and frame_right == last_frame_right:
                time.sleep(0.002)
                continue
            last_frame_left = frame_left
            last_frame_right = frame_right
            # Pick best beam
            if frame_right is None:
                self.active_side = "left"
                yield frame_left, "left"
            else:
                left_energy = self.left.get_energy()
                right_energy = self.right.get_energy()
                if right_energy > left_energy * 1.1:  # 10% hysteresis
                    self.active_side = "right"
                elif left_energy > right_energy * 1.1:
                    self.active_side = "left"
                # else: keep current active_side (hysteresis prevents flapping)
                if self.active_side == "right":
                    yield frame_right, "right"
                elif frame_left is not None:
                    yield frame_left, "left"
                else:
                    # BUG FIX: hysteresis chose "left" but the left stream has
                    # produced no frame yet (e.g. its arecord died); yield the
                    # right frame instead of handing the consumer None.
                    yield frame_right, "right"
            # Pace to ~32ms per frame
            elapsed = time.monotonic() - t0
            if elapsed < interval:
                time.sleep(interval - elapsed)

    def get_side_frame(self, side: str) -> Optional[bytes]:
        """Get the latest frame from a specific side (left if right unavailable)."""
        if side == "right" and self.right:
            return self.right.get_frame()
        return self.left.get_frame()

View File

@@ -7,27 +7,32 @@ Runs on head-vixy (Raspberry Pi 5).
Wake word: "Hey Vivi" (trained via Picovoice Porcupine) Wake word: "Hey Vivi" (trained via Picovoice Porcupine)
Architecture: Single shared audio stream feeds both Porcupine (wake word) Architecture: Dual XVF3800 mic arrays (left/right ear), best-beam selection.
and recording buffer. This avoids device conflicts. Single shared audio stream feeds Porcupine, VAD, sound classification, and speaker ID.
Flow: Flow:
1. Continuous audio stream from ReSpeaker 1. Dual audio streams from two XVF3800 arrays
2. Feed frames to Porcupine for wake word detection 2. Best-beam selection (higher energy side)
3. On "Hey Vivi" → start buffering audio 3. Feed frames to Porcupine for wake word detection
4. Use VAD to detect end of speech 4. On "Hey Vivi" → start buffering from active side
5. Send buffer to EarTail for transcription 5. Use VAD to detect end of speech
6. Return to listening mode 6. Send buffer to EarTail for transcription
7. Return to listening mode
Hardware: 2× ReSpeaker XVF3800 4-Mic Array (USB, 2-channel firmware)
DoA + LEDs via USB vendor control (xvf3800.py)
Built by Vixy on Day 77 (January 17, 2026) 💜 Built by Vixy on Day 77 (January 17, 2026) 💜
Upgraded to dual XVF3800 on Day 160 (April 2026)
""" """
import asyncio import asyncio
import collections import collections
import io import io
import json
import logging import logging
import os import os
import struct import struct
import subprocess
import threading import threading
import time import time
import wave import wave
@@ -53,7 +58,8 @@ PORCUPINE_ACCESS_KEY = os.environ.get("PORCUPINE_ACCESS_KEY", "")
WAKE_WORD_PATH = os.environ.get("WAKE_WORD_PATH", "/home/alex/headmic/Hey-Vivi_en_raspberry-pi_v4_0_0.ppn") WAKE_WORD_PATH = os.environ.get("WAKE_WORD_PATH", "/home/alex/headmic/Hey-Vivi_en_raspberry-pi_v4_0_0.ppn")
SAMPLE_RATE = 16000 SAMPLE_RATE = 16000
ALSA_DEVICE = "plughw:ArrayUAC10,0" # ReSpeaker 4 Mic Array - by name, not card number (survives reboot order changes) CONFIG_DIR = os.path.expanduser("~/.vixy")
CONFIG_PATH = os.path.join(CONFIG_DIR, "headmic.json")
VAD_AGGRESSIVENESS = 2 # 0-3, higher = more aggressive VAD_AGGRESSIVENESS = 2 # 0-3, higher = more aggressive
SILENCE_FRAMES = 50 # ~1.5 sec of silence to stop (at 30ms frames) SILENCE_FRAMES = 50 # ~1.5 sec of silence to stop (at 30ms frames)
@@ -61,54 +67,73 @@ MAX_RECORDING_FRAMES = 1000 # ~30 sec max
EARTAIL_URL = os.environ.get("EARTAIL_URL", "http://bigorin.local:8764") EARTAIL_URL = os.environ.get("EARTAIL_URL", "http://bigorin.local:8764")
DOA_POLL_HZ = 10 # DoA polling rate
EYE_SERVICE_URL = os.environ.get("EYE_SERVICE_URL", "http://localhost:8780")
# ============================================================================ # ============================================================================
# LED Control # Config persistence
# ============================================================================ # ============================================================================
try: def load_config() -> dict:
from pixel_ring import pixel_ring if not os.path.exists(CONFIG_PATH):
LEDS_AVAILABLE = True return {}
pixel_ring.off() try:
except ImportError: with open(CONFIG_PATH) as f:
LEDS_AVAILABLE = False return json.load(f)
logger.warning("pixel_ring not available") except Exception as e:
logger.warning("Failed to read config: %s", e)
return {}
def save_config(cfg: dict):
os.makedirs(CONFIG_DIR, exist_ok=True)
with open(CONFIG_PATH, "w") as f:
json.dump(cfg, f, indent=2)
# ============================================================================
# XVF3800 + LED Control
# ============================================================================
from xvf3800 import XVF3800Manager, learn_devices
xvf_manager = XVF3800Manager()
LEDS_AVAILABLE = False
def leds_wakeup(): def leds_wakeup():
if LEDS_AVAILABLE: if LEDS_AVAILABLE:
try: try:
pixel_ring.wakeup() xvf_manager.all_leds_solid(0xFFFFFF)
except: pass except: pass
def leds_listening(): def leds_listening():
if LEDS_AVAILABLE: if LEDS_AVAILABLE:
try: try:
pixel_ring.set_color_palette(0x00FFFF, 0x000000) xvf_manager.all_leds_doa()
pixel_ring.think()
except: pass except: pass
def leds_processing(): def leds_processing():
if LEDS_AVAILABLE: if LEDS_AVAILABLE:
try: try:
pixel_ring.set_color_palette(0x9400D3, 0x000000) xvf_manager.all_leds_breath(0x9400D3)
pixel_ring.spin()
except: pass except: pass
def leds_enrolling(): def leds_enrolling():
if LEDS_AVAILABLE: if LEDS_AVAILABLE:
try: try:
pixel_ring.set_color_palette(0xFF8C00, 0x000000) xvf_manager.all_leds_solid(0xFF8C00)
pixel_ring.think()
except: pass except: pass
def leds_off(): def leds_off():
if LEDS_AVAILABLE: if LEDS_AVAILABLE:
try: try:
pixel_ring.off() xvf_manager.all_leds_off()
except: pass except: pass
@@ -132,6 +157,8 @@ class ServiceState:
self.speaker_confidence: float = 0.0 self.speaker_confidence: float = 0.0
self.speaker_recognition_enabled: bool = False self.speaker_recognition_enabled: bool = False
self.enrolling: bool = False self.enrolling: bool = False
self.active_side: str = "left" # which mic array is currently active
self.doa: dict = {} # latest DoA from both arrays
state = ServiceState() state = ServiceState()
@@ -144,48 +171,8 @@ speaker_recognizer = None
enrollment_buffer = None # list of frame bytes, set during enrollment enrollment_buffer = None # list of frame bytes, set during enrollment
enrollment_name = None enrollment_name = None
# Audio stream
# ============================================================================ dual_stream = None # DualAudioStream instance
# Audio Stream using ALSA directly (arecord)
# ============================================================================
def read_audio_stream():
"""
Generator that yields audio frames from ALSA using arecord.
Each frame is 512 samples (32ms at 16kHz) as required by Porcupine.
"""
frame_size = 512 # Porcupine requires 512 samples
bytes_per_frame = frame_size * 2 # 16-bit = 2 bytes per sample
cmd = [
"arecord",
"-D", ALSA_DEVICE,
"-f", "S16_LE",
"-r", str(SAMPLE_RATE),
"-c", "1", # Mono
"-t", "raw",
"-q", # Quiet
"-"
]
logger.info(f"Starting audio stream: {' '.join(cmd)}")
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
bufsize=bytes_per_frame
)
try:
while state.running:
data = proc.stdout.read(bytes_per_frame)
if len(data) < bytes_per_frame:
break
yield data
finally:
proc.terminate()
proc.wait()
# ============================================================================ # ============================================================================
@@ -198,22 +185,22 @@ async def transcribe_audio(audio_data: bytes) -> str:
files = {"audio": ("recording.wav", audio_data, "audio/wav")} files = {"audio": ("recording.wav", audio_data, "audio/wav")}
response = await client.post(f"{EARTAIL_URL}/transcribe/submit", files=files) response = await client.post(f"{EARTAIL_URL}/transcribe/submit", files=files)
response.raise_for_status() response.raise_for_status()
job_id = response.json().get("job_id") job_id = response.json().get("job_id")
logger.info(f"Transcription job: {job_id}") logger.info(f"Transcription job: {job_id}")
for _ in range(120): for _ in range(120):
status = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}") status = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}")
data = status.json() data = status.json()
if data.get("status") == "SUCCESS": if data.get("status") == "SUCCESS":
result = await client.get(f"{EARTAIL_URL}/transcribe/result/{job_id}") result = await client.get(f"{EARTAIL_URL}/transcribe/result/{job_id}")
return result.json().get("transcription", "") return result.json().get("transcription", "")
elif data.get("status") == "FAILURE": elif data.get("status") == "FAILURE":
raise Exception(f"Transcription failed: {data.get('error')}") raise Exception(f"Transcription failed: {data.get('error')}")
await asyncio.sleep(1) await asyncio.sleep(1)
raise Exception("Transcription timeout") raise Exception("Transcription timeout")
@@ -227,7 +214,7 @@ def transcribe_sync(audio_data: bytes) -> str:
# ============================================================================ # ============================================================================
# Main Listener Loop # Main Listener Loop (dual-stream)
# ============================================================================ # ============================================================================
def audio_to_wav(frames: List[bytes]) -> bytes: def audio_to_wav(frames: List[bytes]) -> bytes:
@@ -243,9 +230,9 @@ def audio_to_wav(frames: List[bytes]) -> bytes:
def listener_loop(): def listener_loop():
"""Main audio processing loop.""" """Main audio processing loop with dual-stream best-beam selection."""
global state global state, dual_stream
logger.info("Initializing Porcupine...") logger.info("Initializing Porcupine...")
try: try:
porcupine = pvporcupine.create( porcupine = pvporcupine.create(
@@ -256,26 +243,27 @@ def listener_loop():
logger.error(f"Failed to init Porcupine: {e}") logger.error(f"Failed to init Porcupine: {e}")
state.error = str(e) state.error = str(e)
return return
vad = webrtcvad.Vad(VAD_AGGRESSIVENESS) vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
# VAD needs 10/20/30ms frames. 30ms at 16kHz = 480 samples # VAD needs 10/20/30ms frames. 30ms at 16kHz = 480 samples
# Porcupine needs 512 samples. We'll use 480 for VAD. vad_frame_bytes = 480 * 2
vad_frame_size = 480
vad_frame_bytes = vad_frame_size * 2
state.listening = True state.listening = True
logger.info("🦊 Wake word listener active - say 'Hey Vivi'!") logger.info("🦊 Wake word listener active - say 'Hey Vivi'!")
recording_buffer: List[bytes] = [] recording_buffer: List[bytes] = []
silence_count = 0 silence_count = 0
is_recording = False is_recording = False
recording_side: str = "left"
try: try:
for frame_data in read_audio_stream(): for frame_data, side in dual_stream.frames():
if not state.running: if not state.running:
break break
state.active_side = side
# Convert bytes to int16 array for Porcupine # Convert bytes to int16 array for Porcupine
pcm = struct.unpack_from("h" * 512, frame_data) pcm = struct.unpack_from("h" * 512, frame_data)
@@ -289,52 +277,56 @@ def listener_loop():
# Check for wake word # Check for wake word
keyword_index = porcupine.process(pcm) keyword_index = porcupine.process(pcm)
if keyword_index >= 0 and not is_recording: if keyword_index >= 0 and not is_recording:
logger.info("🦊 Wake word detected: 'Hey Vivi'!") logger.info("🦊 Wake word detected: 'Hey Vivi'! (from %s ear)", side)
state.wake_count += 1 state.wake_count += 1
state.last_wake_time = time.time() state.last_wake_time = time.time()
recording_side = side
leds_wakeup() leds_wakeup()
time.sleep(0.2) time.sleep(0.2)
leds_listening() leds_listening()
is_recording = True is_recording = True
state.recording = True state.recording = True
recording_buffer = [] recording_buffer = []
silence_count = 0 silence_count = 0
logger.info("Recording started...") logger.info("Recording started (using %s ear)...", recording_side)
continue continue
if is_recording: if is_recording:
recording_buffer.append(frame_data) # During recording, use frames from the side that heard the wake word
rec_frame = dual_stream.get_side_frame(recording_side)
if rec_frame:
recording_buffer.append(rec_frame)
# Check VAD (use first 480 samples of the 512 frame) # Check VAD (use first 480 samples of the 512 frame)
vad_data = frame_data[:vad_frame_bytes] vad_data = (rec_frame or frame_data)[:vad_frame_bytes]
try: try:
is_speech = vad.is_speech(vad_data, SAMPLE_RATE) is_speech = vad.is_speech(vad_data, SAMPLE_RATE)
except: except:
is_speech = True # Assume speech on VAD error is_speech = True
if is_speech: if is_speech:
silence_count = 0 silence_count = 0
else: else:
silence_count += 1 silence_count += 1
# Stop conditions # Stop conditions
should_stop = ( should_stop = (
(len(recording_buffer) > 10 and silence_count >= SILENCE_FRAMES) or (len(recording_buffer) > 10 and silence_count >= SILENCE_FRAMES) or
len(recording_buffer) >= MAX_RECORDING_FRAMES len(recording_buffer) >= MAX_RECORDING_FRAMES
) )
if should_stop: if should_stop:
logger.info(f"Recording stopped: {len(recording_buffer)} frames") logger.info(f"Recording stopped: {len(recording_buffer)} frames")
is_recording = False is_recording = False
state.recording = False state.recording = False
leds_processing() leds_processing()
state.processing = True state.processing = True
try: try:
wav_data = audio_to_wav(recording_buffer) wav_data = audio_to_wav(recording_buffer)
transcription = transcribe_sync(wav_data) transcription = transcribe_sync(wav_data)
@@ -346,9 +338,9 @@ def listener_loop():
finally: finally:
state.processing = False state.processing = False
leds_off() leds_off()
recording_buffer = [] recording_buffer = []
except Exception as e: except Exception as e:
logger.error(f"Listener error: {e}") logger.error(f"Listener error: {e}")
state.error = str(e) state.error = str(e)
@@ -396,20 +388,82 @@ def sound_classifier_loop():
logger.info("Sound classifier thread stopped") logger.info("Sound classifier thread stopped")
# ============================================================================
# DoA Polling Thread
# ============================================================================
def doa_poll_loop():
"""Poll Direction of Arrival from both XVF3800 arrays."""
interval = 1.0 / DOA_POLL_HZ
while state.running:
try:
state.doa = xvf_manager.read_both_doa()
except Exception as e:
logger.debug("DoA poll error: %s", e)
time.sleep(interval)
def doa_to_gaze() -> Optional[tuple[int, int]]:
"""Convert the active side's DoA angle to gaze coordinates for the eye service."""
doa = state.doa
side = state.active_side
if not doa or side not in doa or doa[side] is None:
return None
if not doa[side].get("vad"):
return None
import math
angle = doa[side]["angle"]
rad = math.radians(angle)
x = int(127 - 80 * math.sin(rad))
y = int(127 - 40 * math.cos(rad))
return max(0, min(255, x)), max(0, min(255, y))
# ============================================================================ # ============================================================================
# FastAPI # FastAPI
# ============================================================================ # ============================================================================
app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂") app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂 (Dual XVF3800)")
@app.on_event("startup") @app.on_event("startup")
async def startup(): async def startup():
global sound_classifier, sound_ring_buffer, speaker_recognizer global sound_classifier, sound_ring_buffer, speaker_recognizer, dual_stream, LEDS_AVAILABLE
state.running = True state.running = True
# Init sound classifier (optional — graceful if model missing) # --- XVF3800 setup ---
cfg = load_config()
ears_cfg = cfg.get("ears", {})
if ears_cfg.get("left") and ears_cfg.get("right"):
xvf_manager.set_serial_mapping(
ears_cfg["left"]["usb_serial"],
ears_cfg["right"]["usb_serial"]
)
xvf_manager.assign()
LEDS_AVAILABLE = bool(xvf_manager.left or xvf_manager.right)
# Resolve ALSA devices
alsa = xvf_manager.get_alsa_devices()
left_dev = alsa.get("left")
right_dev = alsa.get("right")
if not left_dev:
logger.error("No left ear ALSA device found! Check USB connections and firmware.")
state.error = "No left ear audio device"
else:
logger.info("Left ear ALSA: %s", left_dev)
if right_dev:
logger.info("Right ear ALSA: %s", right_dev)
else:
logger.warning("Right ear ALSA device not found — running with left ear only")
# --- Dual audio stream ---
from audio_stream import DualAudioStream
dual_stream = DualAudioStream(left_dev or "plughw:0,0", right_dev)
dual_stream.start()
# --- Sound classifier (optional) ---
model_dir = Path(__file__).parent / "models" model_dir = Path(__file__).parent / "models"
model_path = model_dir / "yamnet.tflite" model_path = model_dir / "yamnet.tflite"
class_map_path = model_dir / "yamnet_class_map.csv" class_map_path = model_dir / "yamnet_class_map.csv"
@@ -417,7 +471,6 @@ async def startup():
try: try:
from sound_id import SoundClassifier from sound_id import SoundClassifier
sound_classifier = SoundClassifier(str(model_path), str(class_map_path)) sound_classifier = SoundClassifier(str(model_path), str(class_map_path))
# 31 frames of 512 samples = ~0.99s at 16kHz
sound_ring_buffer = collections.deque(maxlen=31) sound_ring_buffer = collections.deque(maxlen=31)
state.sound_classification_enabled = True state.sound_classification_enabled = True
logger.info("Sound classification enabled (YAMNet)") logger.info("Sound classification enabled (YAMNet)")
@@ -429,7 +482,7 @@ async def startup():
else: else:
logger.info("Sound classification models not found, skipping") logger.info("Sound classification models not found, skipping")
# Init speaker recognizer (optional — graceful if resemblyzer not installed) # --- Speaker recognizer (optional) ---
try: try:
from speaker_id import SpeakerRecognizer from speaker_id import SpeakerRecognizer
db_path = Path(__file__).parent / "voices.db" db_path = Path(__file__).parent / "voices.db"
@@ -439,22 +492,32 @@ async def startup():
except Exception as e: except Exception as e:
logger.warning("Speaker recognition unavailable: %s", e) logger.warning("Speaker recognition unavailable: %s", e)
# --- DoA polling ---
if xvf_manager.left or xvf_manager.right:
threading.Thread(target=doa_poll_loop, daemon=True).start()
logger.info("DoA polling started at %d Hz", DOA_POLL_HZ)
# --- Main listener ---
thread = threading.Thread(target=listener_loop, daemon=True) thread = threading.Thread(target=listener_loop, daemon=True)
thread.start() thread.start()
logger.info("HeadMic started") logger.info("HeadMic started (dual XVF3800)")
@app.on_event("shutdown") @app.on_event("shutdown")
async def shutdown(): async def shutdown():
state.running = False state.running = False
leds_off() leds_off()
if dual_stream:
dual_stream.stop()
# --- Info endpoints ---
@app.get("/") @app.get("/")
async def root(): async def root():
return { return {
"service": "HeadMic", "service": "HeadMic",
"description": "Vixy's Ears 🦊👂", "description": "Vixy's Ears 🦊👂 (Dual XVF3800)",
"wake_word": "Hey Vivi" "wake_word": "Hey Vivi"
} }
@@ -469,6 +532,7 @@ async def health():
"wake_count": state.wake_count, "wake_count": state.wake_count,
"sound_classification_enabled": state.sound_classification_enabled, "sound_classification_enabled": state.sound_classification_enabled,
"speaker_recognition_enabled": state.speaker_recognition_enabled, "speaker_recognition_enabled": state.speaker_recognition_enabled,
"active_side": state.active_side,
"error": state.error "error": state.error
} }
@@ -484,6 +548,7 @@ async def status():
"wake_count": state.wake_count, "wake_count": state.wake_count,
"audio_scene": state.audio_scene["dominant_category"] if state.audio_scene else None, "audio_scene": state.audio_scene["dominant_category"] if state.audio_scene else None,
"recognized_speaker": state.recognized_speaker, "recognized_speaker": state.recognized_speaker,
"active_side": state.active_side,
"error": state.error "error": state.error
} }
@@ -496,6 +561,41 @@ async def last():
} }
# --- DoA endpoints ---
@app.get("/doa")
async def doa():
"""Direction of Arrival from both mic arrays."""
return {
"doa": state.doa,
"active_side": state.active_side,
"gaze": doa_to_gaze(),
}
# --- Device info ---
@app.get("/devices")
async def devices():
"""Status of both XVF3800 arrays."""
alsa = xvf_manager.get_alsa_devices()
return {
"left": {
"connected": bool(xvf_manager.left),
"serial": xvf_manager.left.serial if xvf_manager.left else None,
"alsa": alsa.get("left"),
},
"right": {
"connected": bool(xvf_manager.right),
"serial": xvf_manager.right.serial if xvf_manager.right else None,
"alsa": alsa.get("right"),
},
"active_side": state.active_side,
}
# --- Sound endpoints ---
@app.get("/sounds") @app.get("/sounds")
async def sounds(): async def sounds():
"""Current audio scene classification.""" """Current audio scene classification."""
@@ -521,9 +621,7 @@ async def sounds_history(seconds: int = 30):
return {"history": sound_classifier.get_history(seconds)} return {"history": sound_classifier.get_history(seconds)}
# ============================================================================ # --- Speaker endpoints ---
# Speaker Endpoints
# ============================================================================
@app.post("/speakers/enroll") @app.post("/speakers/enroll")
async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)): async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
@@ -532,7 +630,6 @@ async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
raise HTTPException(status_code=503, detail="Speaker recognition not available") raise HTTPException(status_code=503, detail="Speaker recognition not available")
audio_bytes = await audio.read() audio_bytes = await audio.read()
# Convert to float32: try raw int16 first, fall back to wav
try: try:
import wave as _wave import wave as _wave
wav_io = io.BytesIO(audio_bytes) wav_io = io.BytesIO(audio_bytes)
@@ -540,7 +637,6 @@ async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
raw = wf.readframes(wf.getnframes()) raw = wf.readframes(wf.getnframes())
audio_f32 = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0 audio_f32 = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
except Exception: except Exception:
# Assume raw int16 PCM at 16kHz
audio_f32 = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0 audio_f32 = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
try: try:
@@ -553,7 +649,7 @@ async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
@app.post("/speakers/enroll-from-mic") @app.post("/speakers/enroll-from-mic")
async def enroll_from_mic(name: str): async def enroll_from_mic(name: str):
"""Record from live mic for 5 seconds and enroll speaker.""" """Record from live mic for 5 seconds and enroll speaker."""
global enrollment_buffer, enrollment_name, enrollment_event global enrollment_buffer, enrollment_name
if speaker_recognizer is None: if speaker_recognizer is None:
raise HTTPException(status_code=503, detail="Speaker recognition not available") raise HTTPException(status_code=503, detail="Speaker recognition not available")
@@ -567,10 +663,8 @@ async def enroll_from_mic(name: str):
leds_enrolling() leds_enrolling()
logger.info("Enrollment started for '%s' — recording 5 seconds", name) logger.info("Enrollment started for '%s' — recording 5 seconds", name)
# Wait 5 seconds for audio, non-blocking to the event loop
await asyncio.sleep(5.0) await asyncio.sleep(5.0)
# Collect what we have
frames = enrollment_buffer frames = enrollment_buffer
enrollment_buffer = None enrollment_buffer = None
enrollment_name = None enrollment_name = None
@@ -611,6 +705,25 @@ async def delete_speaker(name: str):
return {"deleted": name, "samples_removed": removed} return {"deleted": name, "samples_removed": removed}
# ============================================================================
# CLI
# ============================================================================
if __name__ == "__main__": if __name__ == "__main__":
import sys
if "--learn" in sys.argv:
logging.basicConfig(level=logging.INFO)
info = learn_devices()
if not info.get("left") or not info.get("right"):
print("[HEADMIC] Need 2 XVF3800 arrays connected for --learn")
sys.exit(1)
cfg = load_config()
cfg["ears"] = info
save_config(cfg)
print(f"[HEADMIC] Learned ear config → {CONFIG_PATH}")
print(json.dumps(info, indent=2))
sys.exit(0)
import uvicorn import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8446) uvicorn.run(app, host="0.0.0.0", port=8446)

View File

@@ -1,5 +1,5 @@
[Unit] [Unit]
Description=HeadMic - Vixy's Ears Service Description=HeadMic - Vixy's Ears Service (Dual XVF3800)
After=network.target sound.target After=network.target sound.target
[Service] [Service]

View File

@@ -15,9 +15,8 @@ pvporcupine>=3.0.0
# HTTP client for EarTail # HTTP client for EarTail
httpx>=0.25.0 httpx>=0.25.0
# ReSpeaker LED control # XVF3800 USB control (DoA + LEDs)
# pixel_ring - install from: https://github.com/respeaker/pixel_ring pyusb>=1.2.0
# pip install pixel_ring
# Pydantic for models # Pydantic for models
pydantic>=2.0.0 pydantic>=2.0.0

283
xvf3800.py Normal file
View File

@@ -0,0 +1,283 @@
"""
XVF3800 USB Control — DoA, LEDs, device identification.
Each ReSpeaker XVF3800 4-Mic Array is controlled via USB vendor commands (PyUSB).
Replaces the old pixel_ring / Tuning interface used by the XVF3000.
Reference: https://github.com/respeaker/reSpeaker_XVF3800_USB_4MIC_ARRAY/blob/master/python_control/xvf_host.py
"""
import logging
import struct
import time
from typing import Optional
try:
import usb.core
import usb.util
PYUSB_AVAILABLE = True
except ImportError:
PYUSB_AVAILABLE = False
logger = logging.getLogger("headmic.xvf3800")
VID = 0x2886
PID = 0x001A
# USB vendor control transfer parameters
CTRL_REQUEST_TYPE_OUT = usb.util.CTRL_OUT | usb.util.CTRL_TYPE_VENDOR | usb.util.CTRL_RECIPIENT_DEVICE if PYUSB_AVAILABLE else 0
CTRL_REQUEST_TYPE_IN = usb.util.CTRL_IN | usb.util.CTRL_TYPE_VENDOR | usb.util.CTRL_RECIPIENT_DEVICE if PYUSB_AVAILABLE else 0
# Resource IDs
GPO_RESID = 20
# Parameter indices (within resource)
DOA_VALUE_IDX = 18 # returns (angle 0-359, vad 0/1)
LED_EFFECT_IDX = 0 # 0=off, 1=breath, 2=rainbow, 3=solid, 4=doa, 5=ring
LED_BRIGHTNESS_IDX = 1
LED_COLOR_IDX = 3 # single uint32 color
LED_RING_COLOR_IDX = 5 # 12 × uint32
class XVF3800:
    """Control a single ReSpeaker XVF3800 via USB vendor commands."""

    def __init__(self, usb_device):
        self.dev = usb_device
        self.serial = usb_device.serial_number or "unknown"
        self.bus = usb_device.bus
        self.address = usb_device.address

    def _read(self, resid: int, param_idx: int, length: int) -> bytes:
        """Read a parameter via an IN vendor control transfer; b"" on failure."""
        selector = (resid << 8) | param_idx
        try:
            raw = self.dev.ctrl_transfer(
                CTRL_REQUEST_TYPE_IN, 0, selector, 0, length, timeout=1000)
            return bytes(raw)
        except Exception as exc:
            logger.debug("USB read error (resid=%d, param=%d): %s", resid, param_idx, exc)
            return b""

    def _write(self, resid: int, param_idx: int, data: bytes):
        """Write a parameter via an OUT vendor control transfer (best effort)."""
        selector = (resid << 8) | param_idx
        try:
            self.dev.ctrl_transfer(
                CTRL_REQUEST_TYPE_OUT, 0, selector, 0, data, timeout=1000)
        except Exception as exc:
            logger.debug("USB write error (resid=%d, param=%d): %s", resid, param_idx, exc)

    # --- DoA ---

    def read_doa(self) -> tuple[int, bool]:
        """Read Direction of Arrival. Returns (angle 0-359, vad True/False)."""
        payload = self._read(GPO_RESID, DOA_VALUE_IDX, 4)
        if len(payload) >= 4:
            angle, vad_flag = struct.unpack_from("<HH", payload)
            return angle % 360, bool(vad_flag)
        return 0, False

    # --- LEDs ---

    def _set_effect(self, effect: int):
        # Effect select is written last so any color/brightness set first applies.
        self._write(GPO_RESID, LED_EFFECT_IDX, struct.pack("<I", effect))

    def led_off(self):
        self._set_effect(0)

    def led_solid(self, color: int):
        """Solid color on all LEDs. color is 0xRRGGBB."""
        self._write(GPO_RESID, LED_COLOR_IDX, struct.pack("<I", color))
        self._set_effect(3)

    def led_breath(self, color: int, brightness: int = 128):
        """Breathing effect."""
        self._write(GPO_RESID, LED_COLOR_IDX, struct.pack("<I", color))
        self._write(GPO_RESID, LED_BRIGHTNESS_IDX, struct.pack("<I", brightness))
        self._set_effect(1)

    def led_doa(self, base_color: int = 0x003333, doa_color: int = 0x00FFFF):
        """DoA indicator mode — shows beam direction on LED ring."""
        # LED_DOA_COLOR (param 4) takes two uint32 values: base + indicator
        self._write(GPO_RESID, 4, struct.pack("<II", base_color, doa_color))
        self._set_effect(4)

    def led_rainbow(self, brightness: int = 128):
        self._write(GPO_RESID, LED_BRIGHTNESS_IDX, struct.pack("<I", brightness))
        self._set_effect(2)
class XVF3800Manager:
    """Manage two XVF3800 arrays, identified by USB serial number.

    Typical use: set_serial_mapping(...) from config, then assign() to bind
    the discovered devices to self.left / self.right.
    """

    def __init__(self):
        self.left: Optional[XVF3800] = None   # array assigned to the left ear
        self.right: Optional[XVF3800] = None  # array assigned to the right ear
        self._serials: dict[str, str] = {}  # {"left": "SN...", "right": "SN..."}

    def set_serial_mapping(self, left_serial: str, right_serial: str):
        """Pin left/right assignment by USB serial number."""
        self._serials = {"left": left_serial, "right": right_serial}

    def discover(self) -> list[XVF3800]:
        """Find all connected XVF3800 devices.

        Returns an empty list (with a warning) when pyusb is unavailable.
        """
        if not PYUSB_AVAILABLE:
            logger.warning("pyusb not installed — XVF3800 control disabled")
            return []
        devices = []
        for dev in usb.core.find(idVendor=VID, idProduct=PID, find_all=True):
            try:
                devices.append(XVF3800(dev))
            except Exception as e:
                # Keep scanning: one faulty device must not hide the other array.
                logger.warning("Failed to init XVF3800 at bus %d addr %d: %s",
                               dev.bus, dev.address, e)
        return devices

    def assign(self):
        """Discover devices and assign left/right based on serial mapping.

        Without a serial mapping, falls back to (bus, address) order — unstable
        across replug, but sufficient for the --learn workflow.
        """
        devices = self.discover()
        logger.info("Found %d XVF3800 device(s): %s",
                    len(devices), [d.serial for d in devices])
        if self._serials:
            for dev in devices:
                if dev.serial == self._serials.get("left"):
                    self.left = dev
                elif dev.serial == self._serials.get("right"):
                    self.right = dev
            if not self.left:
                logger.warning("Left XVF3800 (serial %s) not found", self._serials.get("left"))
            if not self.right:
                logger.warning("Right XVF3800 (serial %s) not found", self._serials.get("right"))
        else:
            # No serial mapping — assign by bus address order (unstable, but works for --learn)
            devices.sort(key=lambda d: (d.bus, d.address))
            if len(devices) >= 1:
                self.left = devices[0]
            if len(devices) >= 2:
                self.right = devices[1]
        if self.left:
            logger.info("Left ear: serial=%s bus=%d addr=%d", self.left.serial, self.left.bus, self.left.address)
        if self.right:
            logger.info("Right ear: serial=%s bus=%d addr=%d", self.right.serial, self.right.bus, self.right.address)

    @staticmethod
    def _read_sysfs(path: str) -> Optional[str]:
        """Read and strip one sysfs attribute; None if missing/unreadable.

        Uses a context manager so handles are closed promptly (the previous
        version left `open(...)` results to the garbage collector).
        """
        try:
            with open(path) as f:
                return f.read().strip()
        except OSError:
            return None

    def serial_to_alsa(self, serial: str) -> Optional[str]:
        """Find the ALSA card name for a device with a given USB serial number.

        Walks /sys/class/sound/card*/device, resolves each card's USB parent
        and compares its "serial" attribute. Returns the symbolic card id
        (preferred) or the numeric card index, or None when no card matches.
        """
        import os, glob
        for card_dir in sorted(glob.glob("/sys/class/sound/card*")):
            card_num = os.path.basename(card_dir).replace("card", "")
            device_path = os.path.join(card_dir, "device")
            if not os.path.islink(device_path):
                continue
            usb_path = os.path.realpath(device_path)
            # The card's device node is a USB *interface*; the "serial"
            # attribute lives on a parent USB device, one or two levels up.
            serial_file = os.path.join(usb_path, "..", "serial")
            if not os.path.exists(serial_file):
                serial_file = os.path.join(usb_path, "..", "..", "serial")
            if self._read_sysfs(serial_file) != serial:
                continue
            # Prefer the symbolic ALSA id; fall back to the numeric index.
            card_id = self._read_sysfs(os.path.join(card_dir, "id"))
            return card_id if card_id else card_num
        return None

    def get_alsa_devices(self) -> dict[str, Optional[str]]:
        """Return {"left": "plughw:Array,0", "right": "plughw:Array_1,0"} or similar.

        A side with no assigned device (or no matching ALSA card) maps to None.
        """
        result = {}
        for label, dev in [("left", self.left), ("right", self.right)]:
            if dev:
                card_name = self.serial_to_alsa(dev.serial)
                result[label] = f"plughw:{card_name},0" if card_name else None
            else:
                result[label] = None
        return result

    # --- Convenience: control both arrays ---
    def all_leds_off(self):
        """Turn LEDs off on every assigned array."""
        for dev in [self.left, self.right]:
            if dev:
                dev.led_off()

    def all_leds_solid(self, color: int):
        """Solid 0xRRGGBB color on every assigned array."""
        for dev in [self.left, self.right]:
            if dev:
                dev.led_solid(color)

    def all_leds_breath(self, color: int, brightness: int = 128):
        """Breathing effect on every assigned array."""
        for dev in [self.left, self.right]:
            if dev:
                dev.led_breath(color, brightness)

    def all_leds_doa(self):
        """DoA-indicator LED mode (default colors) on every assigned array."""
        for dev in [self.left, self.right]:
            if dev:
                dev.led_doa()

    def read_both_doa(self) -> dict:
        """Read DoA from both arrays.

        Returns {"left": {"angle": int, "vad": bool} | None, "right": ...}.
        """
        result = {}
        for label, dev in [("left", self.left), ("right", self.right)]:
            if dev:
                angle, vad = dev.read_doa()
                result[label] = {"angle": angle, "vad": vad}
            else:
                result[label] = None
        return result
def learn_devices() -> dict:
    """Discover connected XVF3800 devices and return their serials for config."""
    mgr = XVF3800Manager()
    mgr.assign()
    learned: dict = {}
    # Same shape for both sides: a side is present only when a device was found,
    # and "alsa_card" is included only when the sysfs lookup succeeds.
    for label, dev in (("left", mgr.left), ("right", mgr.right)):
        if dev is None:
            continue
        entry = {"usb_serial": dev.serial}
        card = mgr.serial_to_alsa(dev.serial)
        if card:
            entry["alsa_card"] = card
        learned[label] = entry
    return learned
# === CLI test ===
if __name__ == "__main__":
    import sys
    logging.basicConfig(level=logging.INFO)
    if "--learn" in sys.argv:
        # Print discovered serials/cards as JSON suitable for pasting into config.
        import json
        info = learn_devices()
        print(json.dumps(info, indent=2))
        sys.exit(0)
    if "--test-doa" in sys.argv:
        mgr = XVF3800Manager()
        mgr.assign()
        for _ in range(50):
            doa = mgr.read_both_doa()
            # flush=True: "\r" does not trigger a line-buffered flush, so without
            # it the live readout may not appear until the loop finishes.
            print(f"DoA: left={doa.get('left')} right={doa.get('right')}", end="\r", flush=True)
            time.sleep(0.1)
        print()
        sys.exit(0)
    if "--test-leds" in sys.argv:
        mgr = XVF3800Manager()
        mgr.assign()
        # Cycle a few solid colors on both arrays, then switch everything off.
        for color, name in [(0xFF0000, "red"), (0x00FF00, "green"), (0x0000FF, "blue"),
                            (0x00FFFF, "cyan"), (0x9400D3, "purple")]:
            print(f"  {name}")
            mgr.all_leds_solid(color)
            time.sleep(1)
        mgr.all_leds_off()
        sys.exit(0)
    # No recognized flag: say so instead of exiting silently.
    print("usage: python xvf3800.py [--learn | --test-doa | --test-leds]")