Add cocktail party spatial filtering (#7)

audio_stream.py: Added a focus_side property. When set, the stream yields from the focused side regardless of energy (attention lock). When None, it falls back to energy-based auto selection.

multi_speaker.py: When beams lock onto 2 speakers, the tracker sets audio focus to the target speaker's side. It auto-switches the target when the current target goes silent and the other speaker starts talking. Manual focus is available via the API.

headmic.py: New endpoint POST /speakers/focus?speaker=0|1 to manually switch attention. /speakers/tracked now shows the is_target, target_speaker, and audio_focus fields.

The cocktail party effect: when 2 people are talking, the audio feed to Porcupine/VAD/transcription comes from the target speaker's direction, suppressing the other. XVF3800 beam gating silences the non-speaking beam, and audio_stream focus locks the ear facing the target.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 21:47:30 -05:00
parent 38d21ef53c
commit 0705b3818b
3 changed files with 64 additions and 8 deletions
--- a/multi_speaker.py
+++ b/multi_speaker.py
@@ -80,10 +80,12 @@ class TrackedSpeaker:
 class MultiSpeakerTracker:
    """Track multiple speakers and manage beam steering."""

-    def __init__(self, xvf_manager):
+    def __init__(self, xvf_manager, audio_stream=None):
        self.xvf = xvf_manager
+        self.audio_stream = audio_stream  # for cocktail party focus control
        self.speakers: list[TrackedSpeaker] = []
        self.fixed_mode = False
+        self.target_speaker_idx: int = 0  # which speaker is the "target" (0 or 1)
        self._lock = threading.Lock()

    def update(self, doa: dict) -> dict:
@@ -144,7 +146,8 @@ class MultiSpeakerTracker:
        return best

    def _manage_beams(self):
-        """Switch between auto and fixed beam mode based on speaker count."""
+        """Switch between auto and fixed beam mode based on speaker count.
+        Also manages audio focus for cocktail party filtering."""
        stable_speakers = [s for s in self.speakers if s.stable]

        if len(stable_speakers) >= 2 and not self.fixed_mode:
@@ -157,34 +160,66 @@ class MultiSpeakerTracker:
            s2.beam_locked = True
            self.fixed_mode = True

+            # Focus audio on the target speaker's side
+            self._update_audio_focus()
+
        elif len(stable_speakers) >= 2 and self.fixed_mode:
            # Update beam directions if speakers moved
            s1, s2 = stable_speakers[0], stable_speakers[1]
            self.xvf.steer_beams(s1.angle, s2.angle)

+            # If the non-target speaker starts talking and target is silent,
+            # auto-switch target to the active one
+            target = stable_speakers[self.target_speaker_idx]
+            other_idx = 1 - self.target_speaker_idx
+            other = stable_speakers[other_idx]
+            if other.active and not target.active and target.silence_duration > 1.0:
+                self.target_speaker_idx = other_idx
+                logger.info("Attention shifted to speaker %d at %.0f°",
+                            other_idx + 1, other.angle)
+                self._update_audio_focus()
+
        elif len(stable_speakers) < 2 and self.fixed_mode:
            # Back to single speaker or silence — release beams
            logger.info("Releasing beams — back to auto mode")
            self.xvf.release_beams()
            self.fixed_mode = False
+            self.target_speaker_idx = 0
            for s in self.speakers:
                s.beam_locked = False
+            # Release audio focus
+            if self.audio_stream:
+                self.audio_stream.focus_side = None
+
+    def _update_audio_focus(self):
+        """Set the audio stream to focus on the target speaker's side."""
+        if not self.audio_stream or not self.speakers:
+            return
+        stable = [s for s in self.speakers if s.stable]
+        if self.target_speaker_idx < len(stable):
+            target = stable[self.target_speaker_idx]
+            self.audio_stream.focus_side = target.side
+            logger.debug("Audio focus: %s (speaker at %.0f°)", target.side, target.angle)

    def _get_state(self) -> dict:
        return {
            "speakers": [
                {
+                    "index": i,
                    "angle": round(s.angle, 1),
                    "side": s.side,
                    "active": s.active,
                    "beam_locked": s.beam_locked,
+                    "is_target": i == self.target_speaker_idx,
                    "age_seconds": round(s.age, 1),
                    "silence_seconds": round(s.silence_duration, 1),
                    "speaker_name": s.speaker_name,
                }
-                for s in self.speakers
+                for i, s in enumerate(self.speakers)
            ],
            "beam_mode": "fixed" if self.fixed_mode else "auto",
+            "target_speaker": self.target_speaker_idx,
+            "audio_focus": self.audio_stream.focus_side if self.audio_stream else None,
            "active_count": sum(1 for s in self.speakers if s.active),
            "total_tracked": len(self.speakers),
        }