Add cocktail party spatial filtering (#7)

audio_stream.py: Added a focus_side property. When set, the stream yields from the focused side regardless of energy (attention lock). When None, it falls back to energy-based auto selection.

multi_speaker.py: When the beams lock onto 2 speakers, sets the audio focus to the target speaker's side. Auto-switches the target when the current target goes silent and the other speaker starts talking. Manual focus is available via the API.

headmic.py: New endpoint POST /speakers/focus?speaker=0|1 to manually switch attention. /speakers/tracked now shows the is_target, target_speaker, and audio_focus fields.

The cocktail party effect: when 2 people are talking, the audio feed to Porcupine/VAD/transcription comes from the target speaker's direction, suppressing the other. XVF3800 beam gating silences the non-speaking beam, and the audio_stream focus locks the ear facing the target.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 21:47:30 -05:00
parent 38d21ef53c
commit 0705b3818b
3 changed files with 64 additions and 8 deletions
--- a/headmic.py
+++ b/headmic.py
@@ -588,8 +588,8 @@ async def startup():
    # --- Multi-speaker tracking ---
    if xvf_manager.left or xvf_manager.right:
        from multi_speaker import MultiSpeakerTracker
-        multi_speaker = MultiSpeakerTracker(xvf_manager)
-        logger.info("Multi-speaker tracking enabled (2 beams per array)")
+        multi_speaker = MultiSpeakerTracker(xvf_manager, audio_stream=dual_stream)
+        logger.info("Multi-speaker tracking enabled (2 beams per array, cocktail party filtering)")

    # --- Binaural recording ---
    if os.environ.get("BINAURAL_RECORD", "").lower() in ("1", "true", "yes"):
@@ -692,6 +692,19 @@ async def tracked_speakers():
    return state.multi_speaker


+@app.post("/speakers/focus")
+async def focus_speaker(speaker: int = 0):
+    """Switch attention to a specific tracked speaker (0 or 1).
+    In cocktail party mode, the focused speaker's audio feeds wake word + transcription."""
+    if not multi_speaker:
+        raise HTTPException(status_code=503, detail="Multi-speaker tracking not available")
+    if speaker not in (0, 1):
+        raise HTTPException(status_code=400, detail="Speaker index must be 0 or 1")
+    multi_speaker.target_speaker_idx = speaker
+    multi_speaker._update_audio_focus()
+    return {"ok": True, "target_speaker": speaker}
+
+
 # --- Spatial scene ---

@app.get("/scene")