updates for dual mic array

This commit is contained in:
Alex
2026-04-11 15:11:22 -05:00
parent 1cb3bd6833
commit 6c10e75cbc
5 changed files with 710 additions and 123 deletions

View File

@@ -7,27 +7,32 @@ Runs on head-vixy (Raspberry Pi 5).
Wake word: "Hey Vivi" (trained via Picovoice Porcupine)
Architecture: Single shared audio stream feeds both Porcupine (wake word)
and recording buffer. This avoids device conflicts.
Architecture: Dual XVF3800 mic arrays (left/right ear), best-beam selection.
Single shared audio stream feeds Porcupine, VAD, sound classification, and speaker ID.
Flow:
1. Continuous audio stream from ReSpeaker
2. Feed frames to Porcupine for wake word detection
3. On "Hey Vivi" → start buffering audio
4. Use VAD to detect end of speech
5. Send buffer to EarTail for transcription
6. Return to listening mode
1. Dual audio streams from two XVF3800 arrays
2. Best-beam selection (higher energy side)
3. Feed frames to Porcupine for wake word detection
4. On "Hey Vivi" → start buffering from active side
5. Use VAD to detect end of speech
6. Send buffer to EarTail for transcription
7. Return to listening mode
Hardware: 2× ReSpeaker XVF3800 4-Mic Array (USB, 2-channel firmware)
DoA + LEDs via USB vendor control (xvf3800.py)
Built by Vixy on Day 77 (January 17, 2026) 💜
Upgraded to dual XVF3800 on Day 160 (April 2026)
"""
import asyncio
import collections
import io
import json
import logging
import os
import struct
import subprocess
import threading
import time
import wave
@@ -53,7 +58,8 @@ PORCUPINE_ACCESS_KEY = os.environ.get("PORCUPINE_ACCESS_KEY", "")
WAKE_WORD_PATH = os.environ.get("WAKE_WORD_PATH", "/home/alex/headmic/Hey-Vivi_en_raspberry-pi_v4_0_0.ppn")
SAMPLE_RATE = 16000
ALSA_DEVICE = "plughw:ArrayUAC10,0" # ReSpeaker 4 Mic Array - by name, not card number (survives reboot order changes)
CONFIG_DIR = os.path.expanduser("~/.vixy")
CONFIG_PATH = os.path.join(CONFIG_DIR, "headmic.json")
VAD_AGGRESSIVENESS = 2 # 0-3, higher = more aggressive
SILENCE_FRAMES = 50 # ~1.5 sec of silence to stop (at 30ms frames)
@@ -61,54 +67,73 @@ MAX_RECORDING_FRAMES = 1000 # ~30 sec max
EARTAIL_URL = os.environ.get("EARTAIL_URL", "http://bigorin.local:8764")
DOA_POLL_HZ = 10 # DoA polling rate
EYE_SERVICE_URL = os.environ.get("EYE_SERVICE_URL", "http://localhost:8780")
# ============================================================================
# LED Control
# Config persistence
# ============================================================================
try:
from pixel_ring import pixel_ring
LEDS_AVAILABLE = True
pixel_ring.off()
except ImportError:
LEDS_AVAILABLE = False
logger.warning("pixel_ring not available")
def load_config() -> dict:
    """Load persisted settings from CONFIG_PATH.

    Returns the parsed JSON dict, or an empty dict when the file is
    missing or unreadable/corrupt (the latter is logged as a warning).
    """
    try:
        with open(CONFIG_PATH) as fh:
            return json.load(fh)
    except FileNotFoundError:
        # No config yet — first run; not an error.
        return {}
    except Exception as e:
        logger.warning("Failed to read config: %s", e)
        return {}
def save_config(cfg: dict):
    """Atomically persist *cfg* as pretty-printed JSON at CONFIG_PATH.

    Writes to a sibling temp file first, then os.replace()s it into place,
    so a crash or power loss mid-write can never leave a truncated config
    for load_config() to choke on.
    """
    os.makedirs(CONFIG_DIR, exist_ok=True)
    tmp_path = CONFIG_PATH + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(cfg, f, indent=2)
    # os.replace is atomic on POSIX when src/dst share a filesystem.
    os.replace(tmp_path, CONFIG_PATH)
# ============================================================================
# XVF3800 + LED Control
# ============================================================================
from xvf3800 import XVF3800Manager, learn_devices
xvf_manager = XVF3800Manager()
LEDS_AVAILABLE = False
def leds_wakeup():
    """Set all ear-array LEDs solid white to acknowledge the wake word."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_solid(0xFFFFFF)
        except Exception:
            # Best-effort: never use bare except (it would swallow
            # KeyboardInterrupt/SystemExit); LED failures must not
            # interrupt audio handling.
            pass
def leds_listening():
    """Switch the ear-array LEDs to DoA mode while recording speech."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_doa()
        except Exception:
            # except Exception (not bare except) so Ctrl-C still propagates;
            # LED errors are cosmetic and intentionally ignored.
            pass
def leds_processing():
    """Breathe purple on the ear-array LEDs while transcribing."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_breath(0x9400D3)
        except Exception:
            # Narrowed from bare except: LED failures are ignored, but
            # interpreter-exit exceptions are no longer swallowed.
            pass
def leds_enrolling():
    """Set the ear-array LEDs solid orange during speaker enrollment."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_solid(0xFF8C00)
        except Exception:
            # Narrowed from bare except: LED failures are ignored, but
            # interpreter-exit exceptions are no longer swallowed.
            pass
def leds_off():
    """Turn off all ear-array LEDs (idle state)."""
    if LEDS_AVAILABLE:
        try:
            xvf_manager.all_leds_off()
        except Exception:
            # Narrowed from bare except: LED failures are ignored, but
            # interpreter-exit exceptions are no longer swallowed.
            pass
@@ -132,6 +157,8 @@ class ServiceState:
self.speaker_confidence: float = 0.0
self.speaker_recognition_enabled: bool = False
self.enrolling: bool = False
self.active_side: str = "left" # which mic array is currently active
self.doa: dict = {} # latest DoA from both arrays
state = ServiceState()
@@ -144,48 +171,8 @@ speaker_recognizer = None
enrollment_buffer = None # list of frame bytes, set during enrollment
enrollment_name = None
# ============================================================================
# Audio Stream using ALSA directly (arecord)
# ============================================================================
def read_audio_stream():
    """
    Generator that yields audio frames from ALSA using arecord.
    Each frame is 512 samples (32ms at 16kHz) as required by Porcupine.
    """
    samples_per_frame = 512  # Porcupine requires 512 samples
    frame_bytes = samples_per_frame * 2  # S16_LE = 2 bytes per sample

    cmd = [
        "arecord",
        "-D", ALSA_DEVICE,
        "-f", "S16_LE",
        "-r", str(SAMPLE_RATE),
        "-c", "1",  # Mono
        "-t", "raw",
        "-q",  # Quiet
        "-"
    ]
    logger.info(f"Starting audio stream: {' '.join(cmd)}")

    capture = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        bufsize=frame_bytes
    )
    try:
        # A short read means arecord exited (device unplugged, etc.) —
        # stop the generator rather than yield a partial frame.
        while state.running and len(chunk := capture.stdout.read(frame_bytes)) == frame_bytes:
            yield chunk
    finally:
        # Always reap the child process, even if the consumer abandons
        # this generator early.
        capture.terminate()
        capture.wait()
# Audio stream
dual_stream = None # DualAudioStream instance
# ============================================================================
@@ -198,22 +185,22 @@ async def transcribe_audio(audio_data: bytes) -> str:
files = {"audio": ("recording.wav", audio_data, "audio/wav")}
response = await client.post(f"{EARTAIL_URL}/transcribe/submit", files=files)
response.raise_for_status()
job_id = response.json().get("job_id")
logger.info(f"Transcription job: {job_id}")
for _ in range(120):
status = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}")
data = status.json()
if data.get("status") == "SUCCESS":
result = await client.get(f"{EARTAIL_URL}/transcribe/result/{job_id}")
return result.json().get("transcription", "")
elif data.get("status") == "FAILURE":
raise Exception(f"Transcription failed: {data.get('error')}")
await asyncio.sleep(1)
raise Exception("Transcription timeout")
@@ -227,7 +214,7 @@ def transcribe_sync(audio_data: bytes) -> str:
# ============================================================================
# Main Listener Loop
# Main Listener Loop (dual-stream)
# ============================================================================
def audio_to_wav(frames: List[bytes]) -> bytes:
@@ -243,9 +230,9 @@ def audio_to_wav(frames: List[bytes]) -> bytes:
def listener_loop():
"""Main audio processing loop."""
global state
"""Main audio processing loop with dual-stream best-beam selection."""
global state, dual_stream
logger.info("Initializing Porcupine...")
try:
porcupine = pvporcupine.create(
@@ -256,26 +243,27 @@ def listener_loop():
logger.error(f"Failed to init Porcupine: {e}")
state.error = str(e)
return
vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
# VAD needs 10/20/30ms frames. 30ms at 16kHz = 480 samples
# Porcupine needs 512 samples. We'll use 480 for VAD.
vad_frame_size = 480
vad_frame_bytes = vad_frame_size * 2
vad_frame_bytes = 480 * 2
state.listening = True
logger.info("🦊 Wake word listener active - say 'Hey Vivi'!")
recording_buffer: List[bytes] = []
silence_count = 0
is_recording = False
recording_side: str = "left"
try:
for frame_data in read_audio_stream():
for frame_data, side in dual_stream.frames():
if not state.running:
break
state.active_side = side
# Convert bytes to int16 array for Porcupine
pcm = struct.unpack_from("h" * 512, frame_data)
@@ -289,52 +277,56 @@ def listener_loop():
# Check for wake word
keyword_index = porcupine.process(pcm)
if keyword_index >= 0 and not is_recording:
logger.info("🦊 Wake word detected: 'Hey Vivi'!")
logger.info("🦊 Wake word detected: 'Hey Vivi'! (from %s ear)", side)
state.wake_count += 1
state.last_wake_time = time.time()
recording_side = side
leds_wakeup()
time.sleep(0.2)
leds_listening()
is_recording = True
state.recording = True
recording_buffer = []
silence_count = 0
logger.info("Recording started...")
logger.info("Recording started (using %s ear)...", recording_side)
continue
if is_recording:
recording_buffer.append(frame_data)
# During recording, use frames from the side that heard the wake word
rec_frame = dual_stream.get_side_frame(recording_side)
if rec_frame:
recording_buffer.append(rec_frame)
# Check VAD (use first 480 samples of the 512 frame)
vad_data = frame_data[:vad_frame_bytes]
vad_data = (rec_frame or frame_data)[:vad_frame_bytes]
try:
is_speech = vad.is_speech(vad_data, SAMPLE_RATE)
except:
is_speech = True # Assume speech on VAD error
is_speech = True
if is_speech:
silence_count = 0
else:
silence_count += 1
# Stop conditions
should_stop = (
(len(recording_buffer) > 10 and silence_count >= SILENCE_FRAMES) or
len(recording_buffer) >= MAX_RECORDING_FRAMES
)
if should_stop:
logger.info(f"Recording stopped: {len(recording_buffer)} frames")
is_recording = False
state.recording = False
leds_processing()
state.processing = True
try:
wav_data = audio_to_wav(recording_buffer)
transcription = transcribe_sync(wav_data)
@@ -346,9 +338,9 @@ def listener_loop():
finally:
state.processing = False
leds_off()
recording_buffer = []
except Exception as e:
logger.error(f"Listener error: {e}")
state.error = str(e)
@@ -396,20 +388,82 @@ def sound_classifier_loop():
logger.info("Sound classifier thread stopped")
# ============================================================================
# DoA Polling Thread
# ============================================================================
def doa_poll_loop():
    """Poll Direction of Arrival from both XVF3800 arrays.

    Runs in a daemon thread; publishes the latest readings into
    state.doa at DOA_POLL_HZ until state.running goes False.
    """
    sleep_for = 1.0 / DOA_POLL_HZ
    while state.running:
        try:
            state.doa = xvf_manager.read_both_doa()
        except Exception as e:
            # Transient USB hiccups are expected; keep polling quietly.
            logger.debug("DoA poll error: %s", e)
        time.sleep(sleep_for)
def doa_to_gaze() -> Optional[tuple[int, int]]:
    """Convert the active side's DoA angle to gaze coordinates for the eye service.

    Returns an (x, y) pair clamped to 0-255, or None when the active side
    has no DoA reading, its VAD flag is unset, or no angle is reported.
    """
    import math  # hoisted to the top of the function (was mid-body)

    doa = state.doa
    side = state.active_side
    reading = doa.get(side) if doa else None
    if reading is None or not reading.get("vad"):
        return None
    angle = reading.get("angle")  # guard: "vad" may be set without an angle
    if angle is None:
        return None
    rad = math.radians(angle)
    # Center of the eye is (127, 127); x swings ±80 and y ±40 with the angle.
    x = int(127 - 80 * math.sin(rad))
    y = int(127 - 40 * math.cos(rad))
    return max(0, min(255, x)), max(0, min(255, y))
# ============================================================================
# FastAPI
# ============================================================================
app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂")
app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂 (Dual XVF3800)")
@app.on_event("startup")
async def startup():
global sound_classifier, sound_ring_buffer, speaker_recognizer
global sound_classifier, sound_ring_buffer, speaker_recognizer, dual_stream, LEDS_AVAILABLE
state.running = True
# Init sound classifier (optional — graceful if model missing)
# --- XVF3800 setup ---
cfg = load_config()
ears_cfg = cfg.get("ears", {})
if ears_cfg.get("left") and ears_cfg.get("right"):
xvf_manager.set_serial_mapping(
ears_cfg["left"]["usb_serial"],
ears_cfg["right"]["usb_serial"]
)
xvf_manager.assign()
LEDS_AVAILABLE = bool(xvf_manager.left or xvf_manager.right)
# Resolve ALSA devices
alsa = xvf_manager.get_alsa_devices()
left_dev = alsa.get("left")
right_dev = alsa.get("right")
if not left_dev:
logger.error("No left ear ALSA device found! Check USB connections and firmware.")
state.error = "No left ear audio device"
else:
logger.info("Left ear ALSA: %s", left_dev)
if right_dev:
logger.info("Right ear ALSA: %s", right_dev)
else:
logger.warning("Right ear ALSA device not found — running with left ear only")
# --- Dual audio stream ---
from audio_stream import DualAudioStream
dual_stream = DualAudioStream(left_dev or "plughw:0,0", right_dev)
dual_stream.start()
# --- Sound classifier (optional) ---
model_dir = Path(__file__).parent / "models"
model_path = model_dir / "yamnet.tflite"
class_map_path = model_dir / "yamnet_class_map.csv"
@@ -417,7 +471,6 @@ async def startup():
try:
from sound_id import SoundClassifier
sound_classifier = SoundClassifier(str(model_path), str(class_map_path))
# 31 frames of 512 samples = ~0.99s at 16kHz
sound_ring_buffer = collections.deque(maxlen=31)
state.sound_classification_enabled = True
logger.info("Sound classification enabled (YAMNet)")
@@ -429,7 +482,7 @@ async def startup():
else:
logger.info("Sound classification models not found, skipping")
# Init speaker recognizer (optional — graceful if resemblyzer not installed)
# --- Speaker recognizer (optional) ---
try:
from speaker_id import SpeakerRecognizer
db_path = Path(__file__).parent / "voices.db"
@@ -439,22 +492,32 @@ async def startup():
except Exception as e:
logger.warning("Speaker recognition unavailable: %s", e)
# --- DoA polling ---
if xvf_manager.left or xvf_manager.right:
threading.Thread(target=doa_poll_loop, daemon=True).start()
logger.info("DoA polling started at %d Hz", DOA_POLL_HZ)
# --- Main listener ---
thread = threading.Thread(target=listener_loop, daemon=True)
thread.start()
logger.info("HeadMic started")
logger.info("HeadMic started (dual XVF3800)")
@app.on_event("shutdown")
async def shutdown():
    """Stop background work and release audio devices on app shutdown."""
    # Flag polled by the listener and DoA threads; flipping it lets them exit.
    state.running = False
    leds_off()
    # dual_stream stays None until startup() completes — guard for the case
    # where shutdown fires before startup finished.
    if dual_stream:
        dual_stream.stop()
# --- Info endpoints ---
@app.get("/")
async def root():
    """Service identity endpoint."""
    payload = {
        "service": "HeadMic",
        "description": "Vixy's Ears 🦊👂 (Dual XVF3800)",
        "wake_word": "Hey Vivi",
    }
    return payload
@@ -469,6 +532,7 @@ async def health():
"wake_count": state.wake_count,
"sound_classification_enabled": state.sound_classification_enabled,
"speaker_recognition_enabled": state.speaker_recognition_enabled,
"active_side": state.active_side,
"error": state.error
}
@@ -484,6 +548,7 @@ async def status():
"wake_count": state.wake_count,
"audio_scene": state.audio_scene["dominant_category"] if state.audio_scene else None,
"recognized_speaker": state.recognized_speaker,
"active_side": state.active_side,
"error": state.error
}
@@ -496,6 +561,41 @@ async def last():
}
# --- DoA endpoints ---
@app.get("/doa")
async def doa():
    """Direction of Arrival from both mic arrays."""
    gaze = doa_to_gaze()
    return {
        "doa": state.doa,
        "active_side": state.active_side,
        "gaze": gaze,
    }
# --- Device info ---
@app.get("/devices")
async def devices():
    """Status of both XVF3800 arrays."""
    alsa = xvf_manager.get_alsa_devices()
    report: dict = {}
    # Same shape for each ear, so build both sides in one loop.
    for side in ("left", "right"):
        dev = getattr(xvf_manager, side)
        report[side] = {
            "connected": bool(dev),
            "serial": dev.serial if dev else None,
            "alsa": alsa.get(side),
        }
    report["active_side"] = state.active_side
    return report
# --- Sound endpoints ---
@app.get("/sounds")
async def sounds():
"""Current audio scene classification."""
@@ -521,9 +621,7 @@ async def sounds_history(seconds: int = 30):
return {"history": sound_classifier.get_history(seconds)}
# ============================================================================
# Speaker Endpoints
# ============================================================================
# --- Speaker endpoints ---
@app.post("/speakers/enroll")
async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
@@ -532,7 +630,6 @@ async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
raise HTTPException(status_code=503, detail="Speaker recognition not available")
audio_bytes = await audio.read()
# Convert to float32: try raw int16 first, fall back to wav
try:
import wave as _wave
wav_io = io.BytesIO(audio_bytes)
@@ -540,7 +637,6 @@ async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
raw = wf.readframes(wf.getnframes())
audio_f32 = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
except Exception:
# Assume raw int16 PCM at 16kHz
audio_f32 = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32768.0
try:
@@ -553,7 +649,7 @@ async def enroll_speaker(name: str = Form(...), audio: UploadFile = File(...)):
@app.post("/speakers/enroll-from-mic")
async def enroll_from_mic(name: str):
"""Record from live mic for 5 seconds and enroll speaker."""
global enrollment_buffer, enrollment_name, enrollment_event
global enrollment_buffer, enrollment_name
if speaker_recognizer is None:
raise HTTPException(status_code=503, detail="Speaker recognition not available")
@@ -567,10 +663,8 @@ async def enroll_from_mic(name: str):
leds_enrolling()
logger.info("Enrollment started for '%s' — recording 5 seconds", name)
# Wait 5 seconds for audio, non-blocking to the event loop
await asyncio.sleep(5.0)
# Collect what we have
frames = enrollment_buffer
enrollment_buffer = None
enrollment_name = None
@@ -611,6 +705,25 @@ async def delete_speaker(name: str):
return {"deleted": name, "samples_removed": removed}
# ============================================================================
# CLI
# ============================================================================
if __name__ == "__main__":
    import sys

    if "--learn" in sys.argv:
        # One-shot device-learning mode: detect the two arrays, persist the
        # left/right USB-serial mapping, then exit without starting the API.
        logging.basicConfig(level=logging.INFO)
        learned = learn_devices()
        if not (learned.get("left") and learned.get("right")):
            print("[HEADMIC] Need 2 XVF3800 arrays connected for --learn")
            sys.exit(1)
        config = load_config()
        config["ears"] = learned
        save_config(config)
        print(f"[HEADMIC] Learned ear config → {CONFIG_PATH}")
        print(json.dumps(learned, indent=2))
        sys.exit(0)

    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8446)