updates for dual mic array

This commit is contained in:
Alex
2026-04-11 15:11:22 -05:00
parent 1cb3bd6833
commit 6c10e75cbc
5 changed files with 710 additions and 123 deletions

View File

@@ -7,27 +7,32 @@ Runs on head-vixy (Raspberry Pi 5).
Wake word: "Hey Vivi" (trained via Picovoice Porcupine)
Architecture: Single shared audio stream feeds both Porcupine (wake word)
and recording buffer. This avoids device conflicts.
Architecture: Dual XVF3800 mic arrays (left/right ear), best-beam selection.
Single shared audio stream feeds Porcupine, VAD, sound classification, and speaker ID.
Flow:
1. Continuous audio stream from ReSpeaker
2. Feed frames to Porcupine for wake word detection
3. On "Hey Vivi" → start buffering audio
4. Use VAD to detect end of speech
5. Send buffer to EarTail for transcription
6. Return to listening mode
1. Dual audio streams from two XVF3800 arrays
2. Best-beam selection (higher energy side)
3. Feed frames to Porcupine for wake word detection
4. On "Hey Vivi" → start buffering from active side
5. Use VAD to detect end of speech
6. Send buffer to EarTail for transcription
7. Return to listening mode
Hardware: 2× ReSpeaker XVF3800 4-Mic Array (USB, 2-channel firmware)
DoA + LEDs via USB vendor control (xvf3800.py)
Built by Vixy on Day 77 (January 17, 2026) 💜
Upgraded to dual XVF3800 on Day 160 (April 2026)
"""
import asyncio
import collections
import io
import json
import logging
import os
import struct
import subprocess
import threading
import time
import wave
@@ -53,7 +58,8 @@ PORCUPINE_ACCESS_KEY = os.environ.get("PORCUPINE_ACCESS_KEY", "")
WAKE_WORD_PATH = os.environ.get("WAKE_WORD_PATH", "/home/alex/headmic/Hey-Vivi_en_raspberry-pi_v4_0_0.ppn")
SAMPLE_RATE = 16000
ALSA_DEVICE = "plughw:ArrayUAC10,0" # ReSpeaker 4 Mic Array - by name, not card number (survives reboot order changes)
CONFIG_DIR = os.path.expanduser("~/.vixy")
CONFIG_PATH = os.path.join(CONFIG_DIR, "headmic.json")
VAD_AGGRESSIVENESS = 2 # 0-3, higher = more aggressive
SILENCE_FRAMES = 50 # ~1.5 sec of silence to stop (at 30ms frames)
@@ -61,54 +67,73 @@ MAX_RECORDING_FRAMES = 1000 # ~30 sec max
EARTAIL_URL = os.environ.get("EARTAIL_URL", "http://bigorin.local:8764")
DOA_POLL_HZ = 10 # DoA polling rate
EYE_SERVICE_URL = os.environ.get("EYE_SERVICE_URL", "http://localhost:8780")
# ============================================================================
# LED Control
# Config persistence
# ============================================================================
try:
from pixel_ring import pixel_ring
LEDS_AVAILABLE = True
pixel_ring.off()
except ImportError:
LEDS_AVAILABLE = False
logger.warning("pixel_ring not available")
def load_config() -> dict:
    """Load persisted settings from CONFIG_PATH.

    Returns the parsed JSON dict, or an empty dict when the file is
    missing or unreadable/corrupt (the latter is logged as a warning).
    """
    try:
        with open(CONFIG_PATH) as fh:
            return json.load(fh)
    except FileNotFoundError:
        # No config yet — first run; not an error.
        return {}
    except Exception as e:
        logger.warning("Failed to read config: %s", e)
        return {}
def save_config(cfg: dict):
    """Atomically persist *cfg* as pretty-printed JSON at CONFIG_PATH.

    Writes to a sibling temp file first, then os.replace()s it into place,
    so a crash or power loss mid-write can never leave a truncated config
    for load_config() to choke on.
    """
    os.makedirs(CONFIG_DIR, exist_ok=True)
    tmp_path = CONFIG_PATH + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(cfg, f, indent=2)
    # os.replace is atomic on POSIX when src/dst share a filesystem.
    os.replace(tmp_path, CONFIG_PATH)
# ============================================================================
# XVF3800 + LED Control
# ============================================================================
from xvf3800 import XVF3800Manager, learn_devices
xvf_manager = XVF3800Manager()
LEDS_AVAILABLE = False
def leds_wakeup():
    """Set all ear-array LEDs solid white to acknowledge the wake word."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_solid(0xFFFFFF)
        except Exception:
            # Best-effort: never use bare except (it would swallow
            # KeyboardInterrupt/SystemExit); LED failures must not
            # interrupt audio handling.
            pass
def leds_listening():
    """Switch the ear-array LEDs to DoA mode while recording speech."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_doa()
        except Exception:
            # except Exception (not bare except) so Ctrl-C still propagates;
            # LED errors are cosmetic and intentionally ignored.
            pass
def leds_processing():
    """Breathe purple on the ear-array LEDs while transcribing."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_breath(0x9400D3)
        except Exception:
            # Narrowed from bare except: LED failures are ignored, but
            # interpreter-exit exceptions are no longer swallowed.
            pass
def leds_enrolling():
    """Set the ear-array LEDs solid orange during speaker enrollment."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_solid(0xFF8C00)
        except Exception:
            # Narrowed from bare except: LED failures are ignored, but
            # interpreter-exit exceptions are no longer swallowed.
            pass
def leds_off():
    """Turn off all ear-array LEDs (idle state)."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_off()
        except Exception:
            # Narrowed from bare except: LED failures are ignored, but
            # interpreter-exit exceptions are no longer swallowed.
            pass
@@ -132,6 +157,8 @@ class ServiceState:
self.speaker_confidence: float = 0.0
self.speaker_recognition_enabled: bool = False
self.enrolling: bool = False
self.active_side: str = "left" # which mic array is currently active
self.doa: dict = {} # latest DoA from both arrays
state = ServiceState()
@@ -144,48 +171,8 @@ speaker_recognizer = None
enrollment_buffer = None # list of frame bytes, set during enrollment
enrollment_name = None
# ============================================================================
# Audio Stream using ALSA directly (arecord)
# ============================================================================
def read_audio_stream():
    """
    Generator that yields audio frames from ALSA using arecord.
    Each frame is 512 samples (32ms at 16kHz) as required by Porcupine.
    """
    samples_per_frame = 512  # Porcupine requires 512 samples
    frame_bytes = samples_per_frame * 2  # S16_LE = 2 bytes per sample

    cmd = [
        "arecord",
        "-D", ALSA_DEVICE,
        "-f", "S16_LE",
        "-r", str(SAMPLE_RATE),
        "-c", "1",  # Mono
        "-t", "raw",
        "-q",  # Quiet
        "-"
    ]
    logger.info(f"Starting audio stream: {' '.join(cmd)}")

    capture = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        bufsize=frame_bytes
    )
    try:
        # A short read means arecord exited (device unplugged, etc.) —
        # stop the generator rather than yield a partial frame.
        while state.running and len(chunk := capture.stdout.read(frame_bytes)) == frame_bytes:
            yield chunk
    finally:
        # Always reap the child process, even if the consumer abandons
        # this generator early.
        capture.terminate()
        capture.wait()
# Audio stream
dual_stream = None # DualAudioStream instance
# ============================================================================
@@ -198,22 +185,22 @@ async def transcribe_audio(audio_data: bytes) -> str:
files = {"audio": ("recording.wav", audio_data, "audio/wav")}
response = await client.post(f"{EARTAIL_URL}/transcribe/submit", files=files)
response.raise_for_status()
job_id = response.json().get("job_id")
logger.info(f"Transcription job: {job_id}")
for _ in range(120):
status = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}")
data = status.json()
if data.get("status") == "SUCCESS":
result = await client.get(f"{EARTAIL_URL}/transcribe/result/{job_id}")
return result.json().get("transcription", "")
elif data.get("status") == "FAILURE":
raise Exception(f"Transcription failed: {data.get('error')}")
await asyncio.sleep(1)
raise Exception("Transcription timeout")
@@ -227,7 +214,7 @@ def transcribe_sync(audio_data: bytes) -> str:
# ============================================================================
# Main Listener Loop
# Main Listener Loop (dual-stream)
# ============================================================================
def audio_to_wav(frames: List[bytes]) -> bytes:
@@ -243,9 +230,9 @@ def audio_to_wav(frames: List[bytes]) -> bytes:
def listener_loop():
"""Main audio processing loop."""
global state
"""Main audio processing loop with dual-stream best-beam selection."""
global state, dual_stream
logger.info("Initializing Porcupine...")
try:
porcupine = pvporcupine.create(
@@ -256,26 +243,27 @@ def listener_loop():
logger.error(f"Failed to init Porcupine: {e}")
state.error = str(e)
return
vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
# VAD needs 10/20/30ms frames. 30ms at 16kHz = 480 samples
# Porcupine needs 512 samples. We'll use 480 for VAD.
vad_frame_size = 480
vad_frame_bytes = vad_frame_size * 2
vad_frame_bytes = 480 * 2
state.listening = True
logger.info("🦊 Wake word listener active - say 'Hey Vivi'!")
recording_buffer: List[bytes] = []
silence_count = 0
is_recording = False
recording_side: str = "left"
try:
for frame_data in read_audio_stream():
for frame_data, side in dual_stream.frames():
if not state.running:
break
state.active_side = side
# Convert bytes to int16 array for Porcupine
pcm = struct.unpack_from("h" * 512, frame_data)
@@ -289,52 +277,56 @@ def listener_loop():
# Check for wake word
keyword_index = porcupine.process(pcm)
if keyword_index >= 0 and not is_recording:
logger.info("🦊 Wake word detected: 'Hey Vivi'!")
logger.info("🦊 Wake word detected: 'Hey Vivi'! (from %s ear)", side)
state.wake_count += 1
state.last_wake_time = time.time()
recording_side = side
leds_wakeup()
time.sleep(0.2)
leds_listening()
is_recording = True
state.recording = True
recording_buffer = []
silence_count = 0
logger.info("Recording started...")
logger.info("Recording started (using %s ear)...", recording_side)
continue
if is_recording:
recording_buffer.append(frame_data)
# During recording, use frames from the side that heard the wake word
rec_frame = dual_stream.get_side_frame(recording_side)
if rec_frame:
recording_buffer.append(rec_frame)
# Check VAD (use first 480 samples of the 512 frame)
vad_data = frame_data[:vad_frame_bytes]
vad_data = (rec_frame or frame_data)[:vad_frame_bytes]
try:
is_speech = vad.is_speech(vad_data, SAMPLE_RATE)
except:
is_speech = True # Assume speech on VAD error
is_speech = True
if is_speech:
silence_count = 0
else:
silence_count += 1
# Stop conditions
should_stop = (
(len(recording_buffer) > 10 and silence_count >= SILENCE_FRAMES) or
len(recording_buffer) >= MAX_RECORDING_FRAMES
)
if should_stop:
logger.info(f"Recording stopped: {len(recording_buffer)} frames")
is_recording = False
state.recording = False
leds_processing()
state.processing = True
try:
wav_data = audio_to_wav(recording_buffer)
transcription = transcribe_sync(wav_data)
@@ -346,9 +338,9 @@ def listener_loop():
finally:
state.processing = False
leds_off()
recording_buffer = []
except Exception as e:
logger.error(f"Listener error: {e}")
state.error = str(e)
@@ -396,20 +388,82 @@ def sound_classifier_loop():
logger.info("Sound classifier thread stopped")
# ============================================================================
# DoA Polling Thread
# ============================================================================
def doa_poll_loop():
    """Poll Direction of Arrival from both XVF3800 arrays.

    Runs in a daemon thread; publishes the latest readings into
    state.doa at DOA_POLL_HZ until state.running goes False.
    """
    sleep_for = 1.0 / DOA_POLL_HZ
    while state.running:
        try:
            state.doa = xvf_manager.read_both_doa()
        except Exception as e:
            # Transient USB hiccups are expected; keep polling quietly.
            logger.debug("DoA poll error: %s", e)
        time.sleep(sleep_for)
def doa_to_gaze() -> Optional[tuple[int, int]]:
    """Convert the active side's DoA angle to gaze coordinates for the eye service.

    Returns an (x, y) pair clamped to 0-255, or None when the active side
    has no DoA reading, its VAD flag is unset, or no angle is reported.
    """
    import math  # hoisted to the top of the function (was mid-body)

    doa = state.doa
    side = state.active_side
    reading = doa.get(side) if doa else None
    if reading is None or not reading.get("vad"):
        return None
    angle = reading.get("angle")  # guard: "vad" may be set without an angle
    if angle is None:
        return None
    rad = math.radians(angle)
    # Center of the eye is (127, 127); x swings ±80 and y ±40 with the angle.
    x = int(127 - 80 * math.sin(rad))
    y = int(127 - 40 * math.cos(rad))
    return max(0, min(255, x)), max(0, min(255, y))
# ============================================================================
# FastAPI
# ============================================================================
app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂")
app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂 (Dual XVF3800)")
@app.on_event("startup")
async def startup():
global sound_classifier, sound_ring_buffer, speaker_recognizer
global sound_classifier, sound_ring_buffer, speaker_recognizer, dual_stream, LEDS_AVAILABLE
state.running = True
# Init sound classifier (optional — graceful if model missing)
# --- XVF3800 setup ---
cfg = load_config()
ears_cfg = cfg.get("ears", {})
if ears_cfg.get("left") and ears_cfg.get("right"):
xvf_manager.set_serial_mapping(
ears_cfg["left"]["usb_serial"],
ears_cfg["right"]["usb_serial"]
)
xvf_manager.assign()
LEDS_AVAILABLE = bool(xvf_manager.left or xvf_manager.right)
# Resolve ALSA devices
alsa = xvf_manager.get_alsa_devices()
left_dev = alsa.get("left")
right_dev = alsa.get("right")
if not left_dev:
logger.error("No left ear ALSA device found! Check USB connections and firmware.")
state.error = "No left ear audio device"
else:
logger.info("Left ear ALSA: %s", left_dev)
if right_dev:
logger.info("Right ear ALSA: %s", right_dev)
else:
logger.warning("Right ear ALSA device not found — running with left ear only")
# --- Dual audio stream ---
from audio_stream import DualAudioStream
dual_stream = DualAudioStream(left_dev or "plughw:0,0", right_dev)
dual_stream.start()
# --- Sound classifier (optional) ---
model_dir = Path(__file__).parent / "models"
model_path = model_dir / "yamnet.tflite"
class_map_path = model_dir / "yamnet_class_map.csv"
@@ -417,7 +471,6 @@ async def startup():
try:
from sound_id import SoundClassifier
sound_classifier = SoundClassifier(str(model_path), str(class_map_path))
# 31 frames of 512 samples = ~0.99s at 16kHz
sound_ring_buffer = collections.deque(maxlen=31)
state.sound_classification_enabled = True
logger.info("Sound classification enabled (YAMNet)")
@@ -429,7 +482,7 @@ async def startup():
else:
logger.info("Sound classification models not found, skipping")
# Init speaker recognizer (optional — graceful if resemblyzer not installed)
# --- Speaker recognizer (optional) ---
try:
from speaker_id import SpeakerRecognizer
db_path = Path(__file__).parent / "voices.db"
@@ -439,22 +492,32 @@ async def startup():
except Exception as e:
logger.warning("Speaker recognition unavailable: %s", e)
# --- DoA polling ---
if xvf_manager.left or xvf_manager.right:
threading.Thread(target=doa_poll_loop, daemon=True).start()
logger.info("DoA polling started at %d Hz", DOA_POLL_HZ)
# --- Main listener ---
thread = threading.Thread(target=listener_loop, daemon=True)
thread.start()
logger.info("HeadMic started")
logger.info("HeadMic started (dual XVF3800)")
@app.on_event("shutdown")
async def shutdown():
    """Stop background work and release audio devices on app shutdown."""
    # Flag polled by the listener and DoA threads; flipping it lets them exit.
    state.running = False
    leds_off()
    # dual_stream stays None until startup() completes — guard for the case
    # where shutdown fires before startup finished.
    if dual_stream:
        dual_stream.stop()
# --- Info endpoints ---
@app.get("/")
async def root():
    """Service identity endpoint."""
    payload = {
        "service": "HeadMic",
        "description": "Vixy's Ears 🦊👂 (Dual XVF3800)",
        "wake_word": "Hey Vivi",
    }
    return payload
@@ -469,6 +532,7 @@ async def health():
"wake_count": state.wake_count,
"sound_classification_enabled": state.sound_classification_enabled,
"speaker_recognition_enabled": state.speaker_recognition_enabled,
"active_side": state.active_side,
"error": state.error
}
@@ -484,6 +548,7 @@ async def status():
"wake_count": state.wake_count,
"audio_scene": state.audio_scene["dominant_category"] if state.audio_scene else None,
"recognized_speaker": state.recognized_speaker,
"active_side": state.active_side,
"error": state.error
}
@@ -496,6 +561,41 @@ async def last():
}
# --- DoA endpoints ---
@app.get("/doa")
async def doa():
    """Direction of Arrival from both mic arrays."""
    gaze = doa_to_gaze()
    return {
        "doa": state.doa,
        "active_side": state.active_side,
        "gaze": gaze,
    }
# --- Device info ---
@app.get("/devices")
async def devices():
    """Status of both XVF3800 arrays."""
    alsa = xvf_manager.get_alsa_devices()
    report: dict = {}
    # Same shape for each ear, so build both sides in one loop.
    for side in ("left", "right"):
        dev = getattr(xvf_manager, side)
        report[side] = {
            "connected": bool(dev),
            "serial": dev.serial if dev else None,
            "alsa": alsa.get(side),
        }
    report["active_side"] = state.active_side
    return report
# --- Sound endpoints ---
@app.get("/sounds")
async def sounds():
"""Current audio scene classification."""
@@ -521,9 +621,7 @@ async def sounds_history(seconds: int = 30):
return {"history": sound_classifier.get_history(seconds)}
# ============================================================================
# Speaker Endpoints
# ============================================================================
# --- Speaker endpoints ---
@app.post("/speakers/enroll")
async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
@@ -532,7 +630,6 @@ async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
raise HTTPException(status_code=503, detail="Speaker recognition not available")
audio_bytes = await audio.read()
# Convert to float32: try raw int16 first, fall back to wav
try:
import wave as _wave
wav_io = io.BytesIO(audio_bytes)
@@ -540,7 +637,6 @@ async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
raw = wf.readframes(wf.getnframes())
audio_f32 = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
except Exception:
# Assume raw int16 PCM at 16kHz
audio_f32 = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
try:
@@ -553,7 +649,7 @@ async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
@app.post("/speakers/enroll-from-mic")
async def enroll_from_mic(name: str):
"""Record from live mic for 5 seconds and enroll speaker."""
global enrollment_buffer, enrollment_name, enrollment_event
global enrollment_buffer, enrollment_name
if speaker_recognizer is None:
raise HTTPException(status_code=503, detail="Speaker recognition not available")
@@ -567,10 +663,8 @@ async def enroll_from_mic(name: str):
leds_enrolling()
logger.info("Enrollment started for '%s' — recording 5 seconds", name)
# Wait 5 seconds for audio, non-blocking to the event loop
await asyncio.sleep(5.0)
# Collect what we have
frames = enrollment_buffer
enrollment_buffer = None
enrollment_name = None
@@ -611,6 +705,25 @@ async def delete_speaker(name: str):
return {"deleted": name, "samples_removed": removed}
# ============================================================================
# CLI
# ============================================================================
if __name__ == "__main__":
    import sys

    if "--learn" in sys.argv:
        # One-shot device-learning mode: detect the two arrays, persist the
        # left/right USB-serial mapping, then exit without starting the API.
        logging.basicConfig(level=logging.INFO)
        learned = learn_devices()
        if not (learned.get("left") and learned.get("right")):
            print("[HEADMIC] Need 2 XVF3800 arrays connected for --learn")
            sys.exit(1)
        config = load_config()
        config["ears"] = learned
        save_config(config)
        print(f"[HEADMIC] Learned ear config → {CONFIG_PATH}")
        print(json.dumps(learned, indent=2))
        sys.exit(0)

    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8446)